From 83ba6762cc43d9db581b979bb5e3445669e46cc2 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Mon, 25 Nov 2024 18:33:56 +0100 Subject: Merging upstream version 2.0.3+dfsg (Closes: #923993, #1042533, #1045145). Signed-off-by: Daniel Baumann --- .../.page-level/_concept-page-template.md | 7 - docs/Demo-Sites.md | 66 +-- .../notifications/README.md | 2 + .../maintenance-operations-on-netdata-agents.md | 11 +- docs/category-overview-pages/working-with-logs.md | 2 +- docs/dashboards-and-charts/README.md | 2 +- docs/dashboards-and-charts/alerts-tab.md | 2 +- docs/dashboards-and-charts/anomaly-advisor-tab.md | 7 +- docs/dashboards-and-charts/events-feed.md | 4 +- .../import-export-print-snapshot.md | 19 +- docs/dashboards-and-charts/kubernetes-tab.md | 1 - docs/dashboards-and-charts/netdata-charts.md | 71 ++- docs/dashboards-and-charts/themes.md | 1 - docs/dashboards-and-charts/top-tab.md | 2 +- docs/deployment-guides/deployment-strategies.md | 27 +- docs/developer-and-contributor-corner/README.md | 2 +- .../build-the-netdata-agent-yourself.md | 2 +- .../collect-apache-nginx-web-logs.md | 15 +- .../collect-unbound-metrics.md | 14 +- docs/developer-and-contributor-corner/customize.md | 13 +- .../kubernetes-k8s-netdata.md | 237 -------- .../kubernetes-k8s-netdata.txt | 234 ++++++++ .../developer-and-contributor-corner/lamp-stack.md | 238 -------- .../lamp-stack.txt | 237 ++++++++ .../monitor-cockroachdb.md | 118 ---- .../monitor-cockroachdb.txt | 118 ++++ .../monitor-debug-applications-ebpf.md | 29 +- .../monitor-hadoop-cluster.md | 20 +- .../pi-hole-raspberry-pi.md | 140 ----- .../pi-hole-raspberry-pi.txt | 120 ++++ docs/developer-and-contributor-corner/process.md | 270 --------- docs/developer-and-contributor-corner/process.txt | 270 +++++++++ .../python-collector.md | 626 -------------------- .../python-collector.txt | 629 +++++++++++++++++++++ .../raspberry-pi-anomaly-detection.md | 96 ---- .../raspberry-pi-anomaly-detection.txt | 96 ++++ .../running-through-cf-tunnels.md | 2 +- .../style-guide.md | 42 +- docs/diagrams/netdata-overview.xml | 2 +- docs/exporting-metrics/README.md | 68 +-- .../enable-an-exporting-connector.md | 4 +- docs/glossary.md | 12 +- docs/guidelines.md | 2 +- docs/netdata-agent/README.md | 10 +- docs/netdata-agent/backup-and-restore-an-agent.md | 45 +- docs/netdata-agent/configuration/README.md | 20 +- .../configuration/anonymous-telemetry-events.md | 60 +- docs/netdata-agent/configuration/cheatsheet.md | 144 +---- .../configuration/common-configuration-changes.md | 16 +- .../configuration/dynamic-configuration.md | 12 +- .../optimize-the-netdata-agents-performance.md | 80 ++- .../optimizing-metrics-database/README.md | 2 +- .../change-metrics-storage.md | 51 +- .../organize-systems-metrics-and-alerts.md | 93 +-- .../README.md | 8 +- .../Running-behind-apache.md | 229 ++++---- .../Running-behind-caddy.md | 15 +- .../Running-behind-h2o.md | 47 +- .../Running-behind-haproxy.md | 54 +- .../Running-behind-lighttpd.md | 34 +- .../Running-behind-nginx.md | 71 ++- docs/netdata-agent/securing-netdata-agents.md | 116 ++-- docs/netdata-agent/sizing-netdata-agents/README.md | 106 ++-- .../bandwidth-requirements.md | 12 +- .../sizing-netdata-agents/cpu-requirements.md | 80 +-- .../disk-requirements-and-retention.md | 10 +- .../sizing-netdata-agents/ram-requirements.md | 20 +- docs/netdata-agent/start-stop-restart.md | 150 +---- docs/netdata-agent/versions-and-platforms.md | 13 +- docs/netdata-assistant.md | 8 +- .../authentication-and-authorization/api-tokens.md | 2 +- 
.../enterprise-sso-authentication.md | 31 +- .../role-based-access-model.md | 6 +- .../netdata-cloud-on-prem/installation.md | 61 +- docs/netdata-cloud/versions.md | 2 +- .../active-journal-source-without-encryption.md | 2 +- ...th-encryption-using-self-signed-certificates.md | 6 +- ...ve-journal-centralization-without-encryption.md | 4 +- ...ing-and-high-availability-of-netdata-parents.md | 6 +- .../metrics-centralization-points/configuration.md | 6 +- .../metrics-centralization-points/faq.md | 10 +- .../replication-of-past-samples.md | 4 +- docs/security-and-privacy-design/README.md | 81 ++- .../netdata-agent-security.md | 3 +- .../netdata-cloud-security.md | 2 +- docs/top-monitoring-netdata-functions.md | 2 +- 86 files changed, 2633 insertions(+), 2981 deletions(-) delete mode 100644 docs/developer-and-contributor-corner/kubernetes-k8s-netdata.md create mode 100644 docs/developer-and-contributor-corner/kubernetes-k8s-netdata.txt delete mode 100644 docs/developer-and-contributor-corner/lamp-stack.md create mode 100644 docs/developer-and-contributor-corner/lamp-stack.txt delete mode 100644 docs/developer-and-contributor-corner/monitor-cockroachdb.md create mode 100644 docs/developer-and-contributor-corner/monitor-cockroachdb.txt delete mode 100644 docs/developer-and-contributor-corner/pi-hole-raspberry-pi.md create mode 100644 docs/developer-and-contributor-corner/pi-hole-raspberry-pi.txt delete mode 100644 docs/developer-and-contributor-corner/process.md create mode 100644 docs/developer-and-contributor-corner/process.txt delete mode 100644 docs/developer-and-contributor-corner/python-collector.md create mode 100644 docs/developer-and-contributor-corner/python-collector.txt delete mode 100644 docs/developer-and-contributor-corner/raspberry-pi-anomaly-detection.md create mode 100644 docs/developer-and-contributor-corner/raspberry-pi-anomaly-detection.txt (limited to 'docs') diff --git a/docs/.templates/.page-level/_concept-page-template.md b/docs/.templates/.page-level/_concept-page-template.md index 685dd2ff3..d6c4babba 100644 --- a/docs/.templates/.page-level/_concept-page-template.md +++ b/docs/.templates/.page-level/_concept-page-template.md @@ -1,10 +1,3 @@ - - # Title Why should the reader care: “What’s in it for me?” diff --git a/docs/Demo-Sites.md b/docs/Demo-Sites.md index 291e3a5e3..91b9c514f 100644 --- a/docs/Demo-Sites.md +++ b/docs/Demo-Sites.md @@ -1,44 +1,34 @@ - # Live demos -See the live Netdata Cloud demo with Rooms (listed below) for specific use cases at **https://app.netdata.cloud/spaces/netdata-demo** +See the live Netdata Cloud demo with Rooms (listed below) for specific use cases at `https://app.netdata.cloud/spaces/netdata-demo` -| Location | Netdata Demo URL | 60 mins reqs | VM donated by | -| :------------------ | :-------------------------------------------------------------------------------------------------------------------------------------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| :------------------------------------------------- | -| Netdata Cloud | **[Netdata Demo - All nodes](https://app.netdata.cloud/spaces/netdata-demo/rooms/all-nodes/overview)** ||| -| Netdata Cloud | **[Netdata Demo - Active Directory](https://app.netdata.cloud/spaces/netdata-demo/rooms/active-directory/overview)** ||| -| Netdata Cloud | **[Netdata Demo - 
Apache](https://app.netdata.cloud/spaces/netdata-demo/rooms/apache/overview)** ||| -| Netdata Cloud | **[Netdata Demo - Cassandra](https://app.netdata.cloud/spaces/netdata-demo/rooms/cassandra/overview)** ||| -| Netdata Cloud | **[Netdata Demo - CoreDNS](https://app.netdata.cloud/spaces/netdata-demo/rooms/coredns/overview)** ||| -| Netdata Cloud | **[Netdata Demo - DNS Query](https://app.netdata.cloud/spaces/netdata-demo/rooms/dns-query/overview)** ||| -| Netdata Cloud | **[Netdata Demo - Docker](https://app.netdata.cloud/spaces/netdata-demo/rooms/docker/overview)** ||| -| Netdata Cloud | **[Netdata Demo - Host Reachability](https://app.netdata.cloud/spaces/netdata-demo/rooms/host-reachability/overview)** ||| -| Netdata Cloud | **[Netdata Demo - HTTP Endpoints](https://app.netdata.cloud/spaces/netdata-demo/rooms/http-endpoints/overview)** ||| -| Netdata Cloud | **[Netdata Demo - IIS](https://app.netdata.cloud/spaces/netdata-demo/rooms/iis/overview)** ||| -| Netdata Cloud | **[Netdata Demo - Kubernetes](https://app.netdata.cloud/spaces/netdata-demo/rooms/kubernetes/kubernetes)** ||| -| Netdata Cloud | **[Netdata Demo - Machine Learning](https://app.netdata.cloud/spaces/netdata-demo/rooms/machine-learning/overview)** ||| -| Netdata Cloud | **[Netdata Demo - MS Exchange](https://app.netdata.cloud/spaces/netdata-demo/rooms/ms-exchange/overview)** ||| -| Netdata Cloud | **[Netdata Demo - Nginx](https://app.netdata.cloud/spaces/netdata-demo/rooms/nginx/overview)** ||| -| Netdata Cloud | **[Netdata Demo - PostgreSQL](https://app.netdata.cloud/spaces/netdata-demo/rooms/postgresql/overview)** ||| -| Netdata Cloud | **[Netdata Demo - Redis](https://app.netdata.cloud/spaces/netdata-demo/rooms/redis/overview)** ||| -| Netdata Cloud | **[Netdata Demo - Windows](https://app.netdata.cloud/spaces/netdata-demo/rooms/windows/overview)** ||| -| London (UK) | **[london3.my-netdata.io](https://london3.my-netdata.io)**
(this is the global Netdata **registry** and has **named** and **mysql** charts) | [![Requests Per Second](https://london3.my-netdata.io/api/v1/badge.svg?chart=netdata.requests&dimensions=requests&after=-3600&options=unaligned&group=sum&label=reqs&units=empty&value_color=blue&precision=0&v42)](https://london3.my-netdata.io) | [DigitalOcean.com](https://m.do.co/c/83dc9f941745) | -| Atlanta (USA) | **[cdn77.my-netdata.io](https://cdn77.my-netdata.io)**
(with **named** and **mysql** charts) | [![Requests Per Second](https://cdn77.my-netdata.io/api/v1/badge.svg?chart=netdata.requests&dimensions=requests&after=-3600&options=unaligned&group=sum&label=reqs&units=empty&value_color=blue&precision=0&v42)](https://cdn77.my-netdata.io) | [CDN77.com](https://www.cdn77.com/) | -| Bangalore (India) | **[bangalore.my-netdata.io](https://bangalore.my-netdata.io)** | [![Requests Per Second](https://bangalore.my-netdata.io/api/v1/badge.svg?chart=netdata.requests&dimensions=requests&after=-3600&options=unaligned&group=sum&label=reqs&units=empty&value_color=blue&precision=0&v42)](https://bangalore.my-netdata.io) | [DigitalOcean.com](https://m.do.co/c/83dc9f941745) | -| Frankfurt (Germany) | **[frankfurt.my-netdata.io](https://frankfurt.my-netdata.io)** | [![Requests Per Second](https://frankfurt.my-netdata.io/api/v1/badge.svg?chart=netdata.requests&dimensions=requests&after=-3600&options=unaligned&group=sum&label=reqs&units=empty&value_color=blue&precision=0&v42)](https://frankfurt.my-netdata.io) | [DigitalOcean.com](https://m.do.co/c/83dc9f941745) | -| New York (USA) | **[newyork.my-netdata.io](https://newyork.my-netdata.io)** | [![Requests Per Second](https://newyork.my-netdata.io/api/v1/badge.svg?chart=netdata.requests&dimensions=requests&after=-3600&options=unaligned&group=sum&label=reqs&units=empty&value_color=blue&precision=0&v42)](https://newyork.my-netdata.io) | [DigitalOcean.com](https://m.do.co/c/83dc9f941745) | -| San Francisco (USA) | **[sanfrancisco.my-netdata.io](https://sanfrancisco.my-netdata.io)** | [![Requests Per Second](https://sanfrancisco.my-netdata.io/api/v1/badge.svg?chart=netdata.requests&dimensions=requests&after=-3600&options=unaligned&group=sum&label=reqs&units=empty&value_color=blue&precision=0&v42)](https://sanfrancisco.my-netdata.io) | [DigitalOcean.com](https://m.do.co/c/83dc9f941745) | -| Singapore | **[singapore.my-netdata.io](https://singapore.my-netdata.io)** | [![Requests Per Second](https://singapore.my-netdata.io/api/v1/badge.svg?chart=netdata.requests&dimensions=requests&after=-3600&options=unaligned&group=sum&label=reqs&units=empty&value_color=blue&precision=0&v42)](https://singapore.my-netdata.io) | [DigitalOcean.com](https://m.do.co/c/83dc9f941745) | -| Toronto (Canada) | **[toronto.my-netdata.io](https://toronto.my-netdata.io)** | [![Requests Per Second](https://toronto.my-netdata.io/api/v1/badge.svg?chart=netdata.requests&dimensions=requests&after=-3600&options=unaligned&group=sum&label=reqs&units=empty&value_color=blue&precision=0&v42)](https://toronto.my-netdata.io) | [DigitalOcean.com](https://m.do.co/c/83dc9f941745) | +| Location | Netdata Demo URL | 60 mins reqs | VM donated by | +|:--------------------|:------------------------------------------------------------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------| +| Netdata Cloud | **[Netdata Demo - All nodes](https://app.netdata.cloud/spaces/netdata-demo/rooms/all-nodes/overview)** | | | +| Netdata Cloud | **[Netdata Demo - Active Directory](https://app.netdata.cloud/spaces/netdata-demo/rooms/active-directory/overview)** | | | +| Netdata Cloud | **[Netdata Demo - 
Apache](https://app.netdata.cloud/spaces/netdata-demo/rooms/apache/overview)** | | | +| Netdata Cloud | **[Netdata Demo - Cassandra](https://app.netdata.cloud/spaces/netdata-demo/rooms/cassandra/overview)** | | | +| Netdata Cloud | **[Netdata Demo - CoreDNS](https://app.netdata.cloud/spaces/netdata-demo/rooms/coredns/overview)** | | | +| Netdata Cloud | **[Netdata Demo - DNS Query](https://app.netdata.cloud/spaces/netdata-demo/rooms/dns-query/overview)** | | | +| Netdata Cloud | **[Netdata Demo - Docker](https://app.netdata.cloud/spaces/netdata-demo/rooms/docker/overview)** | | | +| Netdata Cloud | **[Netdata Demo - Host Reachability](https://app.netdata.cloud/spaces/netdata-demo/rooms/host-reachability/overview)** | | | +| Netdata Cloud | **[Netdata Demo - HTTP Endpoints](https://app.netdata.cloud/spaces/netdata-demo/rooms/http-endpoints/overview)** | | | +| Netdata Cloud | **[Netdata Demo - IIS](https://app.netdata.cloud/spaces/netdata-demo/rooms/iis/overview)** | | | +| Netdata Cloud | **[Netdata Demo - Kubernetes](https://app.netdata.cloud/spaces/netdata-demo/rooms/kubernetes/kubernetes)** | | | +| Netdata Cloud | **[Netdata Demo - Machine Learning](https://app.netdata.cloud/spaces/netdata-demo/rooms/machine-learning/overview)** | | | +| Netdata Cloud | **[Netdata Demo - MS Exchange](https://app.netdata.cloud/spaces/netdata-demo/rooms/ms-exchange/overview)** | | | +| Netdata Cloud | **[Netdata Demo - Nginx](https://app.netdata.cloud/spaces/netdata-demo/rooms/nginx/overview)** | | | +| Netdata Cloud | **[Netdata Demo - PostgreSQL](https://app.netdata.cloud/spaces/netdata-demo/rooms/postgresql/overview)** | | | +| Netdata Cloud | **[Netdata Demo - Redis](https://app.netdata.cloud/spaces/netdata-demo/rooms/redis/overview)** | | | +| Netdata Cloud | **[Netdata Demo - Windows](https://app.netdata.cloud/spaces/netdata-demo/rooms/windows/overview)** | | | +| London (UK) | **[london3.my-netdata.io](https://london3.my-netdata.io)**
(this is the global Netdata **registry** and has **named** and **mysql** charts) | [![Requests Per Second](https://london3.my-netdata.io/api/v1/badge.svg?chart=netdata.requests&dimensions=requests&after=-3600&options=unaligned&group=sum&label=reqs&units=empty&value_color=blue&precision=0&v42)](https://london3.my-netdata.io) | [DigitalOcean.com](https://m.do.co/c/83dc9f941745) | +| Atlanta (USA) | **[cdn77.my-netdata.io](https://cdn77.my-netdata.io)**
(with **named** and **mysql** charts) | [![Requests Per Second](https://cdn77.my-netdata.io/api/v1/badge.svg?chart=netdata.requests&dimensions=requests&after=-3600&options=unaligned&group=sum&label=reqs&units=empty&value_color=blue&precision=0&v42)](https://cdn77.my-netdata.io) | [CDN77.com](https://www.cdn77.com/) | +| Bangalore (India) | **[bangalore.my-netdata.io](https://bangalore.my-netdata.io)** | [![Requests Per Second](https://bangalore.my-netdata.io/api/v1/badge.svg?chart=netdata.requests&dimensions=requests&after=-3600&options=unaligned&group=sum&label=reqs&units=empty&value_color=blue&precision=0&v42)](https://bangalore.my-netdata.io) | [DigitalOcean.com](https://m.do.co/c/83dc9f941745) | +| Frankfurt (Germany) | **[frankfurt.my-netdata.io](https://frankfurt.my-netdata.io)** | [![Requests Per Second](https://frankfurt.my-netdata.io/api/v1/badge.svg?chart=netdata.requests&dimensions=requests&after=-3600&options=unaligned&group=sum&label=reqs&units=empty&value_color=blue&precision=0&v42)](https://frankfurt.my-netdata.io) | [DigitalOcean.com](https://m.do.co/c/83dc9f941745) | +| New York (USA) | **[newyork.my-netdata.io](https://newyork.my-netdata.io)** | [![Requests Per Second](https://newyork.my-netdata.io/api/v1/badge.svg?chart=netdata.requests&dimensions=requests&after=-3600&options=unaligned&group=sum&label=reqs&units=empty&value_color=blue&precision=0&v42)](https://newyork.my-netdata.io) | [DigitalOcean.com](https://m.do.co/c/83dc9f941745) | +| San Francisco (USA) | **[sanfrancisco.my-netdata.io](https://sanfrancisco.my-netdata.io)** | [![Requests Per Second](https://sanfrancisco.my-netdata.io/api/v1/badge.svg?chart=netdata.requests&dimensions=requests&after=-3600&options=unaligned&group=sum&label=reqs&units=empty&value_color=blue&precision=0&v42)](https://sanfrancisco.my-netdata.io) | [DigitalOcean.com](https://m.do.co/c/83dc9f941745) | +| Singapore | **[singapore.my-netdata.io](https://singapore.my-netdata.io)** | [![Requests Per Second](https://singapore.my-netdata.io/api/v1/badge.svg?chart=netdata.requests&dimensions=requests&after=-3600&options=unaligned&group=sum&label=reqs&units=empty&value_color=blue&precision=0&v42)](https://singapore.my-netdata.io) | [DigitalOcean.com](https://m.do.co/c/83dc9f941745) | +| Toronto (Canada) | **[toronto.my-netdata.io](https://toronto.my-netdata.io)** | [![Requests Per Second](https://toronto.my-netdata.io/api/v1/badge.svg?chart=netdata.requests&dimensions=requests&after=-3600&options=unaligned&group=sum&label=reqs&units=empty&value_color=blue&precision=0&v42)](https://toronto.my-netdata.io) | [DigitalOcean.com](https://m.do.co/c/83dc9f941745) | Netdata dashboards are mobile- and touch-friendly. diff --git a/docs/alerts-and-notifications/notifications/README.md b/docs/alerts-and-notifications/notifications/README.md index 3368b4e14..870076b97 100644 --- a/docs/alerts-and-notifications/notifications/README.md +++ b/docs/alerts-and-notifications/notifications/README.md @@ -2,6 +2,8 @@ This section includes the documentation of the integrations for both of Netdata's notification methods. + + - Netdata Cloud provides centralized alert notifications, utilizing the health status data already sent to Netdata Cloud from connected nodes to send alerts to configured integrations. [Supported integrations](/docs/alerts-&-notifications/notifications/centralized-cloud-notifications) include Amazon SNS, Discord, Slack, Splunk, and others. 
- The Netdata Agent offers a [wider range of notification options](/docs/alerts-&-notifications/notifications/agent-dispatched-notifications) directly from the agent itself. You can choose from over a dozen services, including email, Slack, PagerDuty, Twilio, and others, for more granular control over notifications on each node. diff --git a/docs/category-overview-pages/maintenance-operations-on-netdata-agents.md b/docs/category-overview-pages/maintenance-operations-on-netdata-agents.md index 1867d863f..e989abc85 100644 --- a/docs/category-overview-pages/maintenance-operations-on-netdata-agents.md +++ b/docs/category-overview-pages/maintenance-operations-on-netdata-agents.md @@ -1,8 +1,7 @@ -# Maintenance operations on Netdata Agents Overview +# Netdata Agent Maintenance Operations Overview -This section provides information on various actions you can take while maintaining a Netdata Agent. +This section provides information on various actions to maintain a Netdata Agent: -- [Starting and Stopping Netdata Agents](/docs/netdata-agent/start-stop-restart.md) -- [Update Netdata Agents](/packaging/installer/UPDATE.md) -- [Reinstall Netdata Agents](/packaging/installer/REINSTALL.md) -- [Uninstall Netdata Agents](/packaging/installer/UNINSTALL.md) +- [Service Control](/docs/netdata-agent/start-stop-restart.md) +- [Update](/packaging/installer/UPDATE.md) +- [Uninstall](/packaging/installer/UNINSTALL.md) diff --git a/docs/category-overview-pages/working-with-logs.md b/docs/category-overview-pages/working-with-logs.md index e1f027529..d28074d2e 100644 --- a/docs/category-overview-pages/working-with-logs.md +++ b/docs/category-overview-pages/working-with-logs.md @@ -6,4 +6,4 @@ The [systemd journal plugin](/src/collectors/systemd-journal.plugin/) is the cor For structured logs, Netdata provides tools like [log2journal](/src/collectors/log2journal/README.md) and [systemd-cat-native](/src/libnetdata/log/systemd-cat-native.md) to convert them into compatible systemd journal entries. -You can also find useful guides on how to set up log centralization points in the [Observability Cetralization Points](/docs/observability-centralization-points/README.md) section of our docs. +You can also find useful guides on how to set up log centralization points in the [Observability Centralization Points](/docs/observability-centralization-points/README.md) section of our docs. diff --git a/docs/dashboards-and-charts/README.md b/docs/dashboards-and-charts/README.md index 372f2030b..f94d776a3 100644 --- a/docs/dashboards-and-charts/README.md +++ b/docs/dashboards-and-charts/README.md @@ -35,6 +35,6 @@ You can access the dashboard at and [sign-in with a ### Netdata Agent -To view your Netdata dashboard, open a web browser and enter the address `http://NODE:19999` - replace `NODE` with your Agent's IP address or hostname. If the Agent is on the same machine, use http://localhost:19999. +To view your Netdata dashboard, open a web browser and enter the address `http://NODE:19999` - replace `NODE` with your Agent's IP address or hostname. If the Agent is on the same machine, use `http://localhost:19999`. Documentation for previous Agent dashboard can still be found [here](/src/web/gui/README.md). 
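Before opening the dashboard in a browser, it can be useful to confirm that the Agent is actually up and serving its API. The sketch below assumes a default installation listening on the standard port `19999`; `NODE` is a placeholder for your Agent's IP address or hostname.

```bash
# Check that the local Agent responds on the default port.
curl -s http://localhost:19999/api/v1/info | head

# From another machine, replace NODE with the Agent's IP address or hostname.
curl -s http://NODE:19999/api/v1/info | head
```

If these calls return JSON describing the Agent, the dashboard at the same address will load as well.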
diff --git a/docs/dashboards-and-charts/alerts-tab.md b/docs/dashboards-and-charts/alerts-tab.md index 00d3efcb7..66c019ec0 100644 --- a/docs/dashboards-and-charts/alerts-tab.md +++ b/docs/dashboards-and-charts/alerts-tab.md @@ -45,7 +45,7 @@ At the bottom of the panel you can click the green button "View alert page" to o ### Silence an alert -From this tab, the "Silencing" column shows if there is any rule present for each alert, and from the "Actions" column you can create a new [silencing rule](/docs/alerts-and-notifications/notifications/centralized-cloud-notifications/centralized-cloud-notifications-reference.md#alert-notifications-silencing-rules) for this alert, or get help and information about this alert from the [Netdata Assistant](/docs/netdata-assistant.md). +From this tab, the "Silencing" column shows if there is any rule present for each alert, and from the "Actions" column you can create a new [silencing rule](/docs/alerts-and-notifications/notifications/centralized-cloud-notifications/centralized-cloud-notifications-reference.md#alert-notification-silencing-rules) for this alert, or get help and information about this alert from the [Netdata Assistant](/docs/netdata-assistant.md). ## Alert Configurations tab diff --git a/docs/dashboards-and-charts/anomaly-advisor-tab.md b/docs/dashboards-and-charts/anomaly-advisor-tab.md index 51b58b23a..bf3243ef1 100644 --- a/docs/dashboards-and-charts/anomaly-advisor-tab.md +++ b/docs/dashboards-and-charts/anomaly-advisor-tab.md @@ -1,11 +1,10 @@ # Anomaly Advisor tab -The Anomaly Advisor tab lets you focus on potentially anomalous metrics and charts related to a particular highlighted window of interest. In addition to this tab, each chart in the [Metrics tab](/docs/dashboards-and-charts/metrics-tab-and-single-node-tabs.md) also has an [Anomaly Rate ribbon](/docs/dashboards-and-charts/netdata-charts.md#anomaly-rate-ribbon). +The Anomaly Advisor tab lets you focus on potentially anomalous metrics and charts related to a particular highlighted window of interest. In addition to this tab, each chart in the [Metrics tab](/docs/dashboards-and-charts/metrics-tab-and-single-node-tabs.md) also has an [Anomaly Rate ribbon](/docs/dashboards-and-charts/netdata-charts.md#anomaly-rate-ribbon). +More details about configuration can be found in the [ML documentation](/src/ml/README.md). -More details about configuration can be found in the [ML documentation](/src/ml/README.md#configuration). - -This tab uses our [Anomaly Rate ML feature](/src/ml/README.md#anomaly-rate---averageanomaly-bit) to score metrics in terms of anomalous behavior. +This tab uses our [Anomaly Rate ML feature](/src/ml/README.md#anomaly-bit) to score metrics in terms of anomalous behavior. - The "Anomaly Rate" chart shows the percentage of anomalous metrics over time per node. diff --git a/docs/dashboards-and-charts/events-feed.md b/docs/dashboards-and-charts/events-feed.md index a5386e80e..34d6ee0e6 100644 --- a/docs/dashboards-and-charts/events-feed.md +++ b/docs/dashboards-and-charts/events-feed.md @@ -66,8 +66,8 @@ All users will be able to see events from the Topology and Alerts domain but Aud ## How to use the events feed 1. Click on the **Events** tab (located near the top of your screen) -1. You will be presented with a table listing the events that occurred from the timeframe defined on the [date time picker](/docs/dashboards-and-charts/visualization-date-and-time-controls.md#date-and-time-selector) -1. 
You can use the filtering capabilities available on right-hand bar to slice through the results provided. See more details on [event types and filters](#event-types-and-filters) +2. You will be presented with a table listing the events that occurred from the timeframe defined on the [date time picker](/docs/dashboards-and-charts/visualization-date-and-time-controls.md#date-and-time-selector) +3. You can use the filtering capabilities available on right-hand bar to slice through the results provided > **Note** > diff --git a/docs/dashboards-and-charts/import-export-print-snapshot.md b/docs/dashboards-and-charts/import-export-print-snapshot.md index 80bf514ae..f2df15dab 100644 --- a/docs/dashboards-and-charts/import-export-print-snapshot.md +++ b/docs/dashboards-and-charts/import-export-print-snapshot.md @@ -1,22 +1,7 @@ - - # Import, export, and print a snapshot >❗This feature is only available on v1 dashboards, it hasn't been port-forwarded to v2. -> For more information on accessing dashboards check [this documentation](/docs/dashboards-and-charts/README.md). - +> For more information on accessing dashboards check [this documentation](/docs/dashboards-and-charts/README.md). Netdata can export snapshots of the contents of your dashboard at a given time, which you can then import into any other node running Netdata. Or, you can create a print-ready version of your dashboard to save to PDF or actually print to @@ -44,7 +29,7 @@ Select the Netdata snapshot file to import. Once the file is loaded, the modal u snapshot and the system from which it was taken. Click **Import** to begin to process. Netdata takes the data embedded inside the snapshot and re-creates a static replica on your dashboard. When the import -finishes, you're free to move around and examine the charts. +finishes, you're free to move around and examine the charts. Some caveats and tips to keep in mind: diff --git a/docs/dashboards-and-charts/kubernetes-tab.md b/docs/dashboards-and-charts/kubernetes-tab.md index 9b5df87d8..3289615f0 100644 --- a/docs/dashboards-and-charts/kubernetes-tab.md +++ b/docs/dashboards-and-charts/kubernetes-tab.md @@ -27,7 +27,6 @@ Netdata Cloud organizes and visualizes the following metrics from your Kubernete | `k8s.cgroup.net_net` | Sum of `received` and `sent` bandwidth per second. | | `k8s.cgroup.net_packets` | Sum of `multicast`, `received`, and `sent` packets. | - When viewing the [overview of this dashboard](#kubernetes-containers-overview), Netdata presents the above metrics per container, or aggregated based on their associated pods. diff --git a/docs/dashboards-and-charts/netdata-charts.md b/docs/dashboards-and-charts/netdata-charts.md index 5536f83b2..c7563aa29 100644 --- a/docs/dashboards-and-charts/netdata-charts.md +++ b/docs/dashboards-and-charts/netdata-charts.md @@ -19,14 +19,14 @@ These charts provide a lot of useful information, so that you can: - View individual metric collection status about a chart These charts are available on Netdata Cloud's -[Metrics tab](/docs/dashboards-and-charts/metrics-tab-and-single-node-tabs.md), [single sode tabs](/docs/dashboards-and-charts/metrics-tab-and-single-node-tabs.md) and +[Metrics tab](/docs/dashboards-and-charts/metrics-tab-and-single-node-tabs.md), [single node tabs](/docs/dashboards-and-charts/metrics-tab-and-single-node-tabs.md) and on your [Custom Dashboards](/docs/dashboards-and-charts/dashboards-tab.md). 
## Overview A Netdata chart looks like this: - +A Netdata Chart With a quick glance you have immediate information available at your disposal: @@ -37,7 +37,7 @@ With a quick glance you have immediate information available at your disposal: - [Chart area](#hover-over-the-chart) - [Legend with dimensions](#dimensions-bar) -## Fundemental elements +## Fundamental elements While Netdata's charts require no configuration and are easy to interact with, they have a lot of underlying complexity. To meaningfully organize charts out of the box based on what's happening in your nodes, Netdata uses the concepts of [dimensions](#dimensions), [contexts](#contexts), and [families](#families). @@ -100,7 +100,7 @@ names: When you start interacting with a chart, you'll notice valuable information on the Title bar: - +Netdata Chart Title bar Title bar elements: @@ -110,8 +110,7 @@ Title bar elements: Along with viewing chart type, context and units, on this bar you have access to immediate actions over the chart: - - +Netdata Chart Title bar immediate actions - **Manage Alerts**: manage [Alert configurations](/docs/dashboards-and-charts/alerts-tab.md#alert-configurations-tab) for this chart. - **Chart info**: get more information relevant to the chart you are interacting with. @@ -119,14 +118,14 @@ Along with viewing chart type, context and units, on this bar you have access to - **Enter fullscreen mode**: expand the current chart to the full size of your screen. - **User settings**: save your settings for the chart at hand, so it persists across dashboard reloads. - Personal has the top priority. - - Room and Space settings for a chart are shared across all users who don't have personal settings for it. + - Room and Space settings for a chart are shared across all users who don't have personal settings for it. - **Drag and Drop the chart to a Dashboard**: add the chart to an existing custom [Dashboard](/docs/dashboards-and-charts/dashboards-tab.md) or directly create a new one that includes the chart. ## Definition bar Each composite chart has a definition bar to provide information and options about the following: - +Netdata Chart Definition bar - Group by option - Aggregate function to be applied in case multiple data sources exist @@ -145,14 +144,14 @@ To help users instantly understand and validate the data they see on charts, we > allowing you to zoom in to the different parts of it. > > -> +> Netdata NIDL Framework > You can rapidly access condensed information for collected metrics, grouped by node, monitored instances, dimension, or any key/value label pair. At the Definition bar of each chart, there are a few dropdown menus: - +Netdata Chart NIDL Dropdown menus These dropdown menus have 2 functions: @@ -171,7 +170,7 @@ All of these dropdown menus can be used for instantly filtering the information The "Group by" dropdown menu allows selecting 1 or more groupings to be applied at once on the same dataset. - +Netdata Chart Group by dropdown It supports: @@ -188,7 +187,7 @@ Using this menu, you can slice and dice the data in any possible way, to quickly > You have the means to change the default group by or apply filtering to get a better view into what data your are trying to analyze. > For example, if you change the group by to _instance_ you get a view with the data of all the instances (cgroups) that contribute to that chart. > Then you can use further filtering tools to focus the data that is important to you and even save the result to your own dashboards. 
- +> > ### Tip > > Group by instance, dimension to see the time series of every individual collected metric participating in the chart. @@ -197,7 +196,7 @@ Using this menu, you can slice and dice the data in any possible way, to quickly Each chart uses an opinionated-but-valuable default aggregate function over the data sources. - +Netdata Chart Aggregate functions over data For example, the `system.cpu` chart shows the average for each dimension from every contributing chart, while the `net.net` chart shows the sum for each dimension from every contributing chart, which can also come from multiple networking interfaces. @@ -218,7 +217,7 @@ The following aggregate functions are available for each selected dimension: In this dropdown, you can view or filter the nodes contributing time-series metrics to the chart. This menu also provides the contribution of each node to the volume of the chart, and a break down of the anomaly rate of the queried data per node. - +Netdata Chart Nodes dropdown If one or more nodes can't contribute to a given chart, the definition bar shows a warning symbol plus the number of affected nodes, then lists them in the dropdown along with the associated error. Nodes might return errors because of @@ -229,38 +228,38 @@ networking issues, a stopped `netdata` service, or because that node does not ha In this dropdown, you can view or filter the instances contributing time-series metrics to the chart. This menu also provides the contribution of each instance to the volume of the chart, and a break down of the anomaly rate of the queried data per instance. - +Netdata Chart Instances dropdown ### Dimensions dropdown In this dropdown, you can view or filter the original dimensions contributing time-series metrics to the chart. This menu also presents the contribution of each original dimensions on the chart, and a break down of the anomaly rate of the data per dimension. - +Netdata Chart Dimensions Dropdown ### Labels dropdown In this dropdown, you can view or filter the contributing time-series labels of the chart. This menu also presents the contribution of each label on the chart,and a break down of the anomaly rate of the data per label. - +Netdata Chart Labels Dropdown ### Aggregate functions over time When the granularity of the data collected is higher than the plotted points on the chart an aggregation function over time is applied. - +Netdata Chart Aggregate functions over time By default the aggregation applied is _average_ but the user can choose different options from the following: - Min, Max, Average or Sum - Percentile - you can specify the percentile you want to focus on: 25th, 50th, 75th, 80th, 90th, 95th, 97th, 98th and 99th. - + Netdata Chart Aggregate functions over time Percentile selection - Trimmed Mean or Trimmed Median - you can choose the percentage of data tha you want to focus on: 1%, 2%, 3%, 5%, 10%, 15%, 20% and 25%. - + Netdata Chart Aggregate functions over time Trimmed Mean or Median selection - Median - Standard deviation - Coefficient of variation @@ -280,7 +279,7 @@ It then uses these unique models during data collection to predict the value tha If the value collected is an outlier, it is marked as anomalous. - +Netdata Chart Anomaly Rate Ribbon This unmatched capability of real-time predictions as data is collected allows you to **detect anomalies for potentially millions of metrics across your entire infrastructure within a second of occurrence**. 
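The anomaly information visualized by the ribbon can also be retrieved programmatically. The following is a sketch that queries the Agent's data API with the `anomaly-bit` option described in the ML documentation; the chart name `system.cpu`, the local address, and the 60-second window are example values, not requirements.

```bash
# Return anomaly rates (0-100%) instead of raw values for the last 60
# seconds of the system.cpu chart on a local Agent.
curl -s "http://localhost:19999/api/v1/data?chart=system.cpu&after=-60&options=anomaly-bit"
```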
@@ -297,29 +296,29 @@ It includes a bar indicating the volume percentage of each time series compared This overlay sorts all dimensions by value, makes bold the closest dimension to the mouse and presents a histogram based on the values of the dimensions. - +Netdata Chart Hover over Chart When hovering the anomaly ribbon, the overlay sorts all dimensions by anomaly rate, and presents a histogram of these anomaly rates. -#### Info column +### Info column Additionally, when hovering over the chart, the overlay may display an indication in the "Info" column. Currently, this column is used to inform users of any data collection issues that might affect the chart. Below each chart, there is an information ribbon. This ribbon currently shows 3 states related to the points presented in the chart: -1. **[P]: Partial Data** +1. **Partial Data** At least one of the dimensions in the chart has partial data, meaning that not all instances available contributed data to this point. This can happen when a container is stopped, or when a node is restarted. This indicator helps to gain confidence of the dataset, in situations when unusual spikes or dives appear due to infrastructure maintenance, or due to failures to part of the infrastructure. -2. **[O]: Overflown** +2. **Overflown** At least one of the data sources included in the chart has a counter that has overflowed at this point. -3. **[E]: Empty Data** +3. **Empty Data** At least one of the dimensions included in the chart has no data at all for the given points. All these indicators are also visualized per dimension, in the pop-over that appears when hovering the chart. - +Netdata Chart Hover over the chart Info Column ## Play, Pause and Reset @@ -346,7 +345,7 @@ Note: These interactions are available when the default "Pan" action is used fro While exploring the chart, a tool bar will appear. This tool bar is there to support you on this task. The available manipulation tools you can select are: - +Netdata Chart Tool bar - Pan - Highlight @@ -382,10 +381,10 @@ Selecting timeframes is useful when you see an interesting spike or change in a You can zoom to a specific timeframe, either horizontally of vertically, by selecting a timeframe. -| Interaction | Keyboard/mouse | Touchpad/touchscreen | -|:-------------------------------------------|:-------------------------------------|:-----------------------------------------------------| -| **Zoom** to a specific timeframe | `Shift + mouse vertical selection` | `n/a` | -| **Horizontal Zoom** a specific Y-axis area | `Shift + mouse horizontal selection` | `n/a` | +| Interaction | Keyboard/mouse | Touchpad/touchscreen | +|:-------------------------------------------|:-------------------------------------|:---------------------| +| **Zoom** to a specific timeframe | `Shift + mouse vertical selection` | `n/a` | +| **Horizontal Zoom** a specific Y-axis area | `Shift + mouse horizontal selection` | `n/a` | ### Chart zoom @@ -394,9 +393,9 @@ of an anomaly or outage. Zooming out lets you see metrics within the larger context, such as the last hour, day, or week, which is useful in understanding what "normal" looks like, or to identify long-term trends, like a slow creep in memory usage. -| Interaction | Keyboard/mouse | Touchpad/touchscreen | -|:-------------------------------------------|:-------------------------------------|:-----------------------------------------------------| -| **Zoom** in or out | `Shift + mouse scrollwheel` | `two-finger pinch`
`Shift + two-finger scroll` | +| Interaction | Keyboard/mouse | Touchpad/touchscreen | +|:-------------------|:----------------------------|:-----------------------------------------------------| +| **Zoom** in or out | `Shift + mouse scrollwheel` | `two-finger pinch`
`Shift + two-finger scroll` | ## Dimensions bar @@ -404,7 +403,7 @@ Zooming out lets you see metrics within the larger context, such as the last hou The bottom legend where you can see the dimensions of the chart can be ordered by: - +Netdata Chart order dimensions legend - Dimension name (Ascending or Descending) - Dimension value (Ascending or Descending) diff --git a/docs/dashboards-and-charts/themes.md b/docs/dashboards-and-charts/themes.md index 0ca7425ae..bdce5db6f 100644 --- a/docs/dashboards-and-charts/themes.md +++ b/docs/dashboards-and-charts/themes.md @@ -12,4 +12,3 @@ tab, and then choose your preferred theme: **Light** or **Dark**. **Light**: ![Light theme](https://github.com/netdata/netdata/assets/70198089/eb0fb8c1-5695-450a-8ba8-a185874e8496) - diff --git a/docs/dashboards-and-charts/top-tab.md b/docs/dashboards-and-charts/top-tab.md index 4edaf32f9..6b96010a7 100644 --- a/docs/dashboards-and-charts/top-tab.md +++ b/docs/dashboards-and-charts/top-tab.md @@ -6,7 +6,7 @@ They can be used to retrieve additional information to help you troubleshoot or > **Tip** > > You can also execute a Function from the [Nodes tab](/docs/dashboards-and-charts/nodes-tab.md), by pressing the `f(x)` button. - +> > **Note** > > If you get an error saying that your node can't execute Functions please check the [prerequisites](/docs/top-monitoring-netdata-functions.md#prerequisites). diff --git a/docs/deployment-guides/deployment-strategies.md b/docs/deployment-guides/deployment-strategies.md index 1a3c67164..5c7afda20 100644 --- a/docs/deployment-guides/deployment-strategies.md +++ b/docs/deployment-guides/deployment-strategies.md @@ -32,7 +32,7 @@ In this example, Machine Learning and Alerting are disabled for the Child, so th ##### netdata.conf -On the child node, edit `netdata.conf` by using the [edit-config](/docs/netdata-agent/configuration/README.md#edit-netdataconf) script and set the following parameters: +On the child node, edit `netdata.conf` by using the [edit-config](/docs/netdata-agent/configuration/README.md#edit-a-configuration-file-using-edit-config) script and set the following parameters: ```yaml [db] @@ -63,7 +63,7 @@ On the child node, edit `netdata.conf` by using the [edit-config](/docs/netdata- ##### stream.conf -To edit `stream.conf`, use again the [edit-config](/docs/netdata-agent/configuration/README.md#edit-netdataconf) script and set the following parameters: +To edit `stream.conf`, use again the [edit-config](/docs/netdata-agent/configuration/README.md#edit-a-configuration-file-using-edit-config) script and set the following parameters: ```yaml [stream] @@ -77,7 +77,7 @@ To edit `stream.conf`, use again the [edit-config](/docs/netdata-agent/configura #### Parent config -For the Parent, besides setting up streaming, this example also provides configuration for multiple [tiers of metrics storage](/docs/netdata-agent/configuration/optimizing-metrics-database/change-metrics-storage.md#calculate-the-system-resources-ram-disk-space-needed-to-store-metrics), for 10 Children, with about 2k metrics each. This allows for: +For the Parent, besides setting up streaming, this example also provides configuration for multiple [tiers of metrics storage](/docs/netdata-agent/configuration/optimizing-metrics-database/change-metrics-storage.md), for 10 Children, with about 2k metrics each. 
This allows for: - 1s granularity at tier 0 for 1 week - 1m granularity at tier 1 for 1 month @@ -90,28 +90,23 @@ Requiring: ##### netdata.conf -On the Parent, edit `netdata.conf` by using the [edit-config](/docs/netdata-agent/configuration/README.md#edit-netdataconf) script and set the following parameters: +On the Parent, edit `netdata.conf` by using the [edit-config](/docs/netdata-agent/configuration/README.md#edit-a-configuration-file-using-edit-config) script and set the following parameters: ```yaml [db] mode = dbengine + dbengine tier backfill = new storage tiers = 3 - # To allow memory pressure to offload index from ram - dbengine page descriptors in file mapped memory = yes + dbengine page cache size = 1.4GiB # storage tier 0 update every = 1 - dbengine multihost disk space MB = 12000 - dbengine page cache size MB = 1400 + dbengine tier 0 retention space = 12GiB # storage tier 1 - dbengine tier 1 page cache size MB = 512 - dbengine tier 1 multihost disk space MB = 4096 dbengine tier 1 update every iterations = 60 - dbengine tier 1 backfill = new + dbengine tier 1 retention space = 4GiB # storage tier 2 - dbengine tier 2 page cache size MB = 128 - dbengine tier 2 multihost disk space MB = 2048 dbengine tier 2 update every iterations = 60 - dbengine tier 2 backfill = new + dbengine tier 2 retention space = 2GiB [ml] # Enabled by default # enabled = yes @@ -125,7 +120,7 @@ On the Parent, edit `netdata.conf` by using the [edit-config](/docs/netdata-agen ##### stream.conf -On the Parent node, edit `stream.conf` by using the [edit-config](/docs/netdata-agent/configuration/README.md#edit-netdataconf) script and set the following parameters: +On the Parent node, edit `stream.conf` by using the [edit-config](/docs/netdata-agent/configuration/README.md#edit-a-configuration-file-using-edit-config) script and set the following parameters: ```yaml [API_KEY] @@ -137,7 +132,7 @@ On the Parent node, edit `stream.conf` by using the [edit-config](/docs/netdata- In order to setup active–active streaming between Parent 1 and Parent 2, Parent 1 needs to be instructed to stream data to Parent 2 and Parent 2 to stream data to Parent 1. The Child Agents need to be configured with the addresses of both Parent Agents. An Agent will only connect to one Parent at a time, falling back to the next upon failure. These examples use the same API key between Parent Agents and for connections for Child Agents. -On both Netdata Parent and all Child Agents, edit `stream.conf` by using the [edit-config](/docs/netdata-agent/configuration/README.md#edit-netdataconf) script: +On both Netdata Parent and all Child Agents, edit `stream.conf` by using the [edit-config](/docs/netdata-agent/configuration/README.md#edit-a-configuration-file-using-edit-config) script: #### stream.conf on Parent 1 diff --git a/docs/developer-and-contributor-corner/README.md b/docs/developer-and-contributor-corner/README.md index d4d86382a..817938126 100644 --- a/docs/developer-and-contributor-corner/README.md +++ b/docs/developer-and-contributor-corner/README.md @@ -1,3 +1,3 @@ # Developer and Contributor Corner -In this section of our Documentation you will find more advanced information, suited for developers and contributors alike. \ No newline at end of file +In this section of our Documentation you will find more advanced information, suited for developers and contributors alike. 
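As a practical note on the streaming examples above: the `API_KEY` shared between Children and Parents is simply a UUID. A minimal sketch for producing one and opening `stream.conf`, assuming a default installation with the configuration directory at `/etc/netdata`:

```bash
# Generate a UUID to use as the streaming API key. The same value must
# appear in the child's [stream] section and the parent's [API_KEY] section.
uuidgen

# Edit stream.conf on both sides with the bundled helper script.
cd /etc/netdata
sudo ./edit-config stream.conf
```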
diff --git a/docs/developer-and-contributor-corner/build-the-netdata-agent-yourself.md b/docs/developer-and-contributor-corner/build-the-netdata-agent-yourself.md index 99166ad95..d98784ccd 100644 --- a/docs/developer-and-contributor-corner/build-the-netdata-agent-yourself.md +++ b/docs/developer-and-contributor-corner/build-the-netdata-agent-yourself.md @@ -1,3 +1,3 @@ # Build the Netdata Agent yourself -This section contains documentation on all the ways that you can build the Netdata Agent. \ No newline at end of file +This section contains documentation on all the ways that you can build the Netdata Agent. diff --git a/docs/developer-and-contributor-corner/collect-apache-nginx-web-logs.md b/docs/developer-and-contributor-corner/collect-apache-nginx-web-logs.md index 55af82fb7..9a307b0b3 100644 --- a/docs/developer-and-contributor-corner/collect-apache-nginx-web-logs.md +++ b/docs/developer-and-contributor-corner/collect-apache-nginx-web-logs.md @@ -81,18 +81,13 @@ jobs: log_type: auto ``` -Restart Netdata with `sudo systemctl restart netdata`, or the [appropriate -method](/packaging/installer/README.md#maintaining-a-netdata-agent-installation) for your system. Netdata should pick up your web server's access log and -begin showing real-time charts! +Restart Netdata with `sudo systemctl restart netdata`, or the [appropriate method](/docs/netdata-agent/start-stop-restart.md) for your system. Netdata should pick up your web server's access log and begin showing real-time charts! ### Custom log formats and fields -The web log collector is capable of parsing custom Nginx and Apache log formats and presenting them as charts, but we'll -leave that topic for a separate guide. +The web log collector is capable of parsing custom Nginx and Apache log formats and presenting them as charts, but we'll leave that topic for a separate guide. -We do have [extensive -documentation](/src/go/plugin/go.d/modules/weblog/README.md#custom-log-format) on how -to build custom parsing for Nginx and Apache logs. +We do have [extensive documentation](/src/go/plugin/go.d/modules/weblog/README.md) on how to build custom parsing for Nginx and Apache logs. ## Tweak web log collector alerts @@ -100,7 +95,7 @@ Over time, we've created some default alerts for web log monitoring. These alert web server is receiving more than 120 requests per minute. Otherwise, there's simply not enough data to make conclusions about what is "too few" or "too many." -- [web log alerts](https://raw.githubusercontent.com/netdata/netdata/master/src/health/health.d/web_log.conf). +- [web log alerts](https://raw.githubusercontent.com/netdata/netdata/master/src/health/health.d/web_log.conf). You can also edit this file directly with `edit-config`: @@ -108,5 +103,5 @@ You can also edit this file directly with `edit-config`: ./edit-config health.d/weblog.conf ``` -For more information about editing the defaults or writing new alert entities, see our +For more information about editing the defaults or writing new alert entities, see our [health monitoring documentation](/src/health/README.md). 
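If your access log lives in a non-standard location, the collector won't pick it up automatically. A minimal sketch of a manual job definition follows; the job name, the log path, and the use of `tee` to replace the override file are all assumptions to adapt to your system.

```bash
# Write a minimal override config for the web log collector.
# Note: this replaces any existing /etc/netdata/go.d/web_log.conf.
sudo tee /etc/netdata/go.d/web_log.conf >/dev/null <<'EOF'
jobs:
  - name: nginx_custom
    path: /var/log/nginx/custom_access.log
    log_type: auto
EOF

# Restart the Agent so the new job is loaded.
sudo systemctl restart netdata
```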
diff --git a/docs/developer-and-contributor-corner/collect-unbound-metrics.md b/docs/developer-and-contributor-corner/collect-unbound-metrics.md index ac997b7f9..abfaca723 100644 --- a/docs/developer-and-contributor-corner/collect-unbound-metrics.md +++ b/docs/developer-and-contributor-corner/collect-unbound-metrics.md @@ -1,13 +1,3 @@ - - # Monitor Unbound DNS servers with Netdata [Unbound](https://nlnetlabs.nl/projects/unbound/about/) is a "validating, recursive, caching DNS resolver" from NLNet @@ -35,7 +25,7 @@ the TLS key files that will encrypt connections to the remote interface. Then ad documentation](https://nlnetlabs.nl/documentation/unbound/howto-setup/#setup-remote-control) for more details on using `unbound-control`, such as how to handle situations when Unbound is run under a unique user. -```conf +```text # enable remote-control remote-control: control-enable: yes @@ -137,5 +127,3 @@ Now that you're collecting metrics from your Unbound servers, let us know how it for improvement or refinement based on real-world use cases. Feel free to [file an issue](https://github.com/netdata/netdata/issues/new?assignees=&labels=bug%2Cneeds+triage&template=BUG_REPORT.yml) with your thoughts. - - diff --git a/docs/developer-and-contributor-corner/customize.md b/docs/developer-and-contributor-corner/customize.md index 03a6a842a..7d9895dc0 100644 --- a/docs/developer-and-contributor-corner/customize.md +++ b/docs/developer-and-contributor-corner/customize.md @@ -1,15 +1,15 @@ # Customize the standard dashboard -> ### Disclaimer +> **Disclaimer** > > This document is only applicable to the v1 version of the dashboard and doesn't affect the [Netdata Dashboard](/docs/dashboards-and-charts/README.md). -While the [Netdata dashboard](/src/web/gui/README.md) comes preconfigured with hundreds of charts and +While the [Netdata dashboard](/src/web/gui/README.md) comes pre-configured with hundreds of charts and thousands of metrics, you may want to alter your experience based on a particular use case or preferences. ## Dashboard settings -To change dashboard settings, click the on the **settings** icon +To change dashboard settings, click the on the **settings** icon ![Import icon](https://raw.githubusercontent.com/netdata/netdata-ui/98e31799c1ec0983f433537ff16d2ac2b0d994aa/src/components/icon/assets/gear.svg) in the top panel. @@ -21,10 +21,9 @@ Here are a few popular settings: ### Change chart legend position -Find this setting under the **Visual** tab. By default, Netdata places the legend of dimensions _below_ charts. +Find this setting under the **Visual** tab. By default, Netdata places the legend of dimensions _below_ charts. Click this toggle to move the legend to the _right_ of charts. - ### Change theme Find this setting under the **Visual** tab. Choose between Dark (the default) and White. @@ -67,9 +66,9 @@ dashboard. Save the file, then navigate to your [Netdata config directory](/docs/netdata-agent/configuration/README.md) to edit `netdata.conf`. Add the following line to the `[web]` section to tell Netdata where to find your custom configuration. -```conf +```text [web] custom dashboard_info.js = your_dashboard_info_file.js ``` -Reload your browser tab to see your custom configuration. \ No newline at end of file +Reload your browser tab to see your custom configuration. 
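To make the customization workflow concrete, here is a minimal sketch of such a file. The `netdataDashboard.menu` object mirrors the structure used by the stock `dashboard_info.js`; the target path and the example text are assumptions for a typical package install, so adjust them to your setup.

```bash
# Create a small custom dashboard file that overrides the System section's
# title and description (v1 dashboard only).
sudo tee /usr/share/netdata/web/your_dashboard_info_file.js >/dev/null <<'EOF'
netdataDashboard.menu = {
    'system': {
        title: 'My System Overview',
        info: 'Custom description shown at the top of the System section.'
    }
};
EOF
```

After adding the `[web]` setting shown above and reloading the browser tab, the override should take effect.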
diff --git a/docs/developer-and-contributor-corner/kubernetes-k8s-netdata.md b/docs/developer-and-contributor-corner/kubernetes-k8s-netdata.md deleted file mode 100644 index 011aac8da..000000000 --- a/docs/developer-and-contributor-corner/kubernetes-k8s-netdata.md +++ /dev/null @@ -1,237 +0,0 @@ -# Kubernetes monitoring with Netdata - -This document gives an overview of what visualizations Netdata provides on Kubernetes deployments. - -At Netdata, we've built Kubernetes monitoring tools that add visibility without complexity while also helping you -actively troubleshoot anomalies or outages. This guide walks you through each of the visualizations and offers best -practices on how to use them to start Kubernetes monitoring in a matter of minutes, not hours or days. - -Netdata's Kubernetes monitoring solution uses a handful of [complementary tools and -collectors](#related-reference-documentation) for peeling back the many complex layers of a Kubernetes cluster, -_entirely for free_. These methods work together to give you every metric you need to troubleshoot performance or -availability issues across your Kubernetes infrastructure. - -## Challenge - -While Kubernetes (k8s) might simplify the way you deploy, scale, and load-balance your applications, not all clusters -come with "batteries included" when it comes to monitoring. Doubly so for a monitoring stack that helps you actively -troubleshoot issues with your cluster. - -Some k8s providers, like GKE (Google Kubernetes Engine), do deploy clusters bundled with monitoring capabilities, such -as Google Stackdriver Monitoring. However, these pre-configured solutions might not offer the depth of metrics, -customization, or integration with your preferred alerting methods. - -Without this visibility, it's like you built an entire house and _then_ smashed your way through the finished walls to -add windows. - -## Solution - -In this tutorial, you'll learn how to navigate Netdata's Kubernetes monitoring features, using -[robot-shop](https://github.com/instana/robot-shop) as an example deployment. Deploying robot-shop is purely optional. -You can also follow along with your own Kubernetes deployment if you choose. While the metrics might be different, the -navigation and best practices are the same for every cluster. - -## What you need to get started - -To follow this tutorial, you need: - -- A free Netdata Cloud account. [Sign up](https://app.netdata.cloud/sign-up?cloudRoute=/spaces) if you don't have one - already. -- A working cluster running Kubernetes v1.9 or newer, with a Netdata deployment and connected parent/child nodes. See - our [Kubernetes deployment process](/packaging/installer/methods/kubernetes.md) for details on deployment and - conneting to Cloud. -- The [`kubectl`](https://kubernetes.io/docs/reference/kubectl/overview/) command line tool, within [one minor version - difference](https://kubernetes.io/docs/tasks/tools/install-kubectl/#before-you-begin) of your cluster, on an - administrative system. -- The [Helm package manager](https://helm.sh/) v3.0.0 or newer on the same administrative system. - -### Install the `robot-shop` demo (optional) - -Begin by downloading the robot-shop code and using `helm` to create a new deployment. - -```bash -git clone git@github.com:instana/robot-shop.git -cd robot-shop/K8s/helm -kubectl create ns robot-shop -helm install robot-shop --namespace robot-shop . -``` - -Running `kubectl get pods` shows both the Netdata and robot-shop deployments. 
- -```bash -kubectl get pods --all-namespaces -NAMESPACE NAME READY STATUS RESTARTS AGE -default netdata-child-29f9c 2/2 Running 0 10m -default netdata-child-8xphf 2/2 Running 0 10m -default netdata-child-jdvds 2/2 Running 0 11m -default netdata-parent-554c755b7d-qzrx4 1/1 Running 0 11m -kube-system aws-node-jnjv8 1/1 Running 0 17m -kube-system aws-node-svzdb 1/1 Running 0 17m -kube-system aws-node-ts6n2 1/1 Running 0 17m -kube-system coredns-559b5db75d-f58hp 1/1 Running 0 22h -kube-system coredns-559b5db75d-tkzj2 1/1 Running 0 22h -kube-system kube-proxy-9p9cd 1/1 Running 0 17m -kube-system kube-proxy-lt9ss 1/1 Running 0 17m -kube-system kube-proxy-n75t9 1/1 Running 0 17m -robot-shop cart-b4bbc8fff-t57js 1/1 Running 0 14m -robot-shop catalogue-8b5f66c98-mr85z 1/1 Running 0 14m -robot-shop dispatch-67d955c7d8-lnr44 1/1 Running 0 14m -robot-shop mongodb-7f65d86c-dsslc 1/1 Running 0 14m -robot-shop mysql-764c4c5fc7-kkbnf 1/1 Running 0 14m -robot-shop payment-67c87cb7d-5krxv 1/1 Running 0 14m -robot-shop rabbitmq-5bb66bb6c9-6xr5b 1/1 Running 0 14m -robot-shop ratings-94fd9c75b-42wvh 1/1 Running 0 14m -robot-shop redis-0 0/1 Pending 0 14m -robot-shop shipping-7d69cb88b-w7hpj 1/1 Running 0 14m -robot-shop user-79c445b44b-hwnm9 1/1 Running 0 14m -robot-shop web-8bb887476-lkcjx 1/1 Running 0 14m -``` - -## Explore Netdata's Kubernetes monitoring charts - -The Netdata Helm chart deploys and enables everything you need for monitoring Kubernetes on every layer. Once you deploy -Netdata and connect your cluster's nodes, you're ready to check out the visualizations **with zero configuration**. - -To get started, [sign in](https://app.netdata.cloud/sign-in?cloudRoute=/spaces) to your Netdata Cloud account. Head over -to the Room you connected your cluster to, if not **General**. - -Let's walk through monitoring each layer of a Kubernetes cluster using the Overview as our framework. - -## Cluster and node metrics - -The gauges and time-series charts you see right away in the Overview show aggregated metrics from every node in your -cluster. - -For example, the `apps.cpu` chart (in the **Applications** menu item), visualizes the CPU utilization of various -applications/services running on each of the nodes in your cluster. The **X Nodes** dropdown shows which nodes -contribute to the chart and links to jump a single-node dashboard for further investigation. - -![Per-application monitoring in a Kubernetes -cluster](https://user-images.githubusercontent.com/1153921/109042169-19c8fa00-768d-11eb-91a7-1a7afc41fea2.png) - -For example, the chart above shows a spike in the CPU utilization from `rabbitmq` every minute or so, along with a -baseline CPU utilization of 10-15% across the cluster. - - -## Pod and container metrics - -Click on the **Kubernetes xxxxxxx...** section to jump down to Netdata Cloud's unique Kubernetes visualizations for view -real-time resource utilization metrics from your Kubernetes pods and containers. - -![Navigating to the Kubernetes monitoring -visualizations](https://user-images.githubusercontent.com/1153921/109049195-349f6c80-7695-11eb-8902-52a029dca77f.png) - -### Health map - -The first visualization is the [health map](/docs/dashboards-and-charts/kubernetes-tab.md#health-map), -which places each container into its own box, then varies the intensity of their color to visualize the resource -utilization. By default, the health map shows the **average CPU utilization as a percentage of the configured limit** -for every container in your cluster. 
- -![The Kubernetes health map in Netdata -Cloud](https://user-images.githubusercontent.com/1153921/109050085-3f0e3600-7696-11eb-988f-52cb187f53ea.png) - -Let's explore the most colorful box by hovering over it. - -![Hovering over a -container](https://user-images.githubusercontent.com/1153921/109049544-a8417980-7695-11eb-80a7-109b4a645a27.png) - -The **Context** tab shows `rabbitmq-5bb66bb6c9-6xr5b` as the container's image name, which means this container is -running a [RabbitMQ](/src/go/plugin/go.d/modules/rabbitmq/README.md) workload. - -Click the **Metrics** tab to see real-time metrics from that container. Unsurprisingly, it shows a spike in CPU -utilization at regular intervals. - -![Viewing real-time container -metrics](https://user-images.githubusercontent.com/1153921/109050482-aa580800-7696-11eb-9e3e-d3bdf0f3eff7.png) - -### Time-series charts - -Beneath the health map is a variety of time-series charts that help you visualize resource utilization over time, which -is useful for targeted troubleshooting. - -The default is to display metrics grouped by the `k8s_namespace` label, which shows resource utilization based on your -different namespaces. - -![Time-series Kubernetes monitoring in Netdata -Cloud](https://user-images.githubusercontent.com/1153921/109075210-126a1680-76b6-11eb-918d-5acdcdac152d.png) - -Each composite chart has a [definition bar](/docs/dashboards-and-charts/netdata-charts.md#definition-bar) -for complete customization. For example, grouping the top chart by `k8s_container_name` reveals new information. - -![Changing time-series charts](https://user-images.githubusercontent.com/1153921/109075212-139b4380-76b6-11eb-836f-939482ae55fc.png) - -## Service metrics - -Netdata has a [service discovery plugin](https://github.com/netdata/agent-service-discovery), which discovers and -creates configuration files for [compatible -services](https://github.com/netdata/helmchart#service-discovery-and-supported-services) and any endpoints covered by -our [generic Prometheus collector](/src/go/plugin/go.d/modules/prometheus/README.md). -Netdata uses these files to collect metrics from any compatible application as they run _inside_ of a pod. Service -discovery happens without manual intervention as pods are created, destroyed, or moved between nodes. - -Service metrics show up on the Overview as well, beneath the **Kubernetes** section, and are labeled according to the -service in question. For example, the **RabbitMQ** section has numerous charts from the [`rabbitmq` -collector](/src/go/plugin/go.d/modules/rabbitmq/README.md): - -![Finding service discovery -metrics](https://user-images.githubusercontent.com/1153921/109054511-2eac8a00-769b-11eb-97f1-da93acb4b5fe.png) - -> The robot-shop cluster has more supported services, such as MySQL, which are not visible with zero configuration. This -> is usually because of services running on non-default ports, using non-default names, or required passwords. Read up -> on [configuring service discovery](/packaging/installer/methods/kubernetes.md#configure-service-discovery) to collect -> more service metrics. - -Service metrics are essential to infrastructure monitoring, as they're the best indicator of the end-user experience, -and key signals for troubleshooting anomalies or issues. - -## Kubernetes components - -Netdata also automatically collects metrics from two essential Kubernetes processes. 
- -### kubelet - -The **k8s kubelet** section visualizes metrics from the Kubernetes agent responsible for managing every pod on a given -node. This also happens without any configuration thanks to the [kubelet -collector](/src/go/plugin/go.d/modules/k8s_kubelet/README.md). - -Monitoring each node's kubelet can be invaluable when diagnosing issues with your Kubernetes cluster. For example, you -can see if the number of running containers/pods has dropped, which could signal a fault or crash in a particular -Kubernetes service or deployment (see `kubectl get services` or `kubectl get deployments` for more details). If the -number of pods increases, it may be because of something more benign, like another team member scaling up a -service with `kubectl scale`. - -You can also view charts for the Kubelet API server, the volume of runtime/Docker operations by type, -configuration-related errors, and the actual vs. desired numbers of volumes, plus a lot more. - -### kube-proxy - -The **k8s kube-proxy** section displays metrics about the network proxy that runs on each node in your Kubernetes -cluster. kube-proxy lets pods communicate with each other and accept sessions from outside your cluster. Its metrics are -collected by the [kube-proxy -collector](/src/go/plugin/go.d/modules/k8s_kubeproxy/README.md). - -With Netdata, you can monitor how often your k8s proxies are syncing proxy rules between nodes. Dramatic changes in -these figures could indicate an anomaly in your cluster that's worthy of further investigation. - -## What's next? - -After reading this guide, you should now be able to monitor any Kubernetes cluster with Netdata, including nodes, pods, -containers, services, and more. - -With the health map, time-series charts, and the ability to drill down into individual nodes, you can see hundreds of -per-second metrics with zero configuration and less time remembering all the `kubectl` options. Netdata moves with your -cluster, automatically picking up new nodes or services as your infrastructure scales. And it's entirely free for -clusters of all sizes. - -### Related reference documentation - -- [Netdata Helm chart](https://github.com/netdata/helmchart) -- [Netdata service discovery](https://github.com/netdata/agent-service-discovery) -- [Netdata Agent · `kubelet` - collector](/src/go/plugin/go.d/modules/k8s_kubelet/README.md) -- [Netdata Agent · `kube-proxy` - collector](/src/go/plugin/go.d/modules/k8s_kubeproxy/README.md) -- [Netdata Agent · `cgroups.plugin`](/src/collectors/cgroups.plugin/README.md) - - diff --git a/docs/developer-and-contributor-corner/kubernetes-k8s-netdata.txt b/docs/developer-and-contributor-corner/kubernetes-k8s-netdata.txt new file mode 100644 index 000000000..5ebb963c3 --- /dev/null +++ b/docs/developer-and-contributor-corner/kubernetes-k8s-netdata.txt @@ -0,0 +1,234 @@ +# Kubernetes monitoring with Netdata + +This document gives an overview of what visualizations Netdata provides on Kubernetes deployments. + +At Netdata, we've built Kubernetes monitoring tools that add visibility without complexity while also helping you +actively troubleshoot anomalies or outages. This guide walks you through each of the visualizations and offers best +practices on how to use them to start Kubernetes monitoring in a matter of minutes, not hours or days. + +Netdata's Kubernetes monitoring solution uses a handful of [complementary tools and +collectors](#related-reference-documentation) for peeling back the many complex layers of a Kubernetes cluster, +_entirely for free_. 
These methods work together to give you every metric you need to troubleshoot performance or +availability issues across your Kubernetes infrastructure. + +## Challenge + +While Kubernetes (k8s) might simplify the way you deploy, scale, and load-balance your applications, not all clusters +come with "batteries included" when it comes to monitoring. Doubly so for a monitoring stack that helps you actively +troubleshoot issues with your cluster. + +Some k8s providers, like GKE (Google Kubernetes Engine), do deploy clusters bundled with monitoring capabilities, such +as Google Stackdriver Monitoring. However, these pre-configured solutions might not offer the depth of metrics, +customization, or integration with your preferred alerting methods. + +Without this visibility, it's like you built an entire house and _then_ smashed your way through the finished walls to +add windows. + +## Solution + +In this tutorial, you'll learn how to navigate Netdata's Kubernetes monitoring features, using +[robot-shop](https://github.com/instana/robot-shop) as an example deployment. Deploying robot-shop is purely optional. +You can also follow along with your own Kubernetes deployment if you choose. While the metrics might be different, the +navigation and best practices are the same for every cluster. + +## What you need to get started + +To follow this tutorial, you need: + +- A free Netdata Cloud account. [Sign up](https://app.netdata.cloud/sign-up?cloudRoute=/spaces) if you don't have one + already. +- A working cluster running Kubernetes v1.9 or newer, with a Netdata deployment and connected parent/child nodes. See + our [Kubernetes deployment process](/packaging/installer/methods/kubernetes.md) for details on deployment and + connecting to Cloud. +- The [`kubectl`](https://kubernetes.io/docs/reference/kubectl/overview/) command line tool, within [one minor version + difference](https://kubernetes.io/docs/tasks/tools/install-kubectl/#before-you-begin) of your cluster, on an + administrative system. +- The [Helm package manager](https://helm.sh/) v3.0.0 or newer on the same administrative system. + +### Install the `robot-shop` demo (optional) + +Begin by downloading the robot-shop code and using `helm` to create a new deployment. + +```bash +git clone git@github.com:instana/robot-shop.git +cd robot-shop/K8s/helm +kubectl create ns robot-shop +helm install robot-shop --namespace robot-shop . +``` + +Running `kubectl get pods` shows both the Netdata and robot-shop deployments. 
+
+```bash
+kubectl get pods --all-namespaces
+NAMESPACE     NAME                              READY   STATUS    RESTARTS   AGE
+default       netdata-child-29f9c               2/2     Running   0          10m
+default       netdata-child-8xphf               2/2     Running   0          10m
+default       netdata-child-jdvds               2/2     Running   0          11m
+default       netdata-parent-554c755b7d-qzrx4   1/1     Running   0          11m
+kube-system   aws-node-jnjv8                    1/1     Running   0          17m
+kube-system   aws-node-svzdb                    1/1     Running   0          17m
+kube-system   aws-node-ts6n2                    1/1     Running   0          17m
+kube-system   coredns-559b5db75d-f58hp          1/1     Running   0          22h
+kube-system   coredns-559b5db75d-tkzj2          1/1     Running   0          22h
+kube-system   kube-proxy-9p9cd                  1/1     Running   0          17m
+kube-system   kube-proxy-lt9ss                  1/1     Running   0          17m
+kube-system   kube-proxy-n75t9                  1/1     Running   0          17m
+robot-shop    cart-b4bbc8fff-t57js              1/1     Running   0          14m
+robot-shop    catalogue-8b5f66c98-mr85z         1/1     Running   0          14m
+robot-shop    dispatch-67d955c7d8-lnr44         1/1     Running   0          14m
+robot-shop    mongodb-7f65d86c-dsslc            1/1     Running   0          14m
+robot-shop    mysql-764c4c5fc7-kkbnf            1/1     Running   0          14m
+robot-shop    payment-67c87cb7d-5krxv           1/1     Running   0          14m
+robot-shop    rabbitmq-5bb66bb6c9-6xr5b         1/1     Running   0          14m
+robot-shop    ratings-94fd9c75b-42wvh           1/1     Running   0          14m
+robot-shop    redis-0                           0/1     Pending   0          14m
+robot-shop    shipping-7d69cb88b-w7hpj          1/1     Running   0          14m
+robot-shop    user-79c445b44b-hwnm9             1/1     Running   0          14m
+robot-shop    web-8bb887476-lkcjx               1/1     Running   0          14m
+```
+
+## Explore Netdata's Kubernetes monitoring charts
+
+The Netdata Helm chart deploys and enables everything you need for monitoring Kubernetes on every layer. Once you deploy
+Netdata and connect your cluster's nodes, you're ready to check out the visualizations **with zero configuration**.
+
+To get started, [sign in](https://app.netdata.cloud/sign-in?cloudRoute=/spaces) to your Netdata Cloud account. Head over
+to the Room you connected your cluster to; if you didn't pick a specific Room, that's **General**.
+
+Let's walk through monitoring each layer of a Kubernetes cluster using the Overview as our framework.
+
+## Cluster and node metrics
+
+The gauges and time-series charts you see right away in the Overview show aggregated metrics from every node in your
+cluster.
+
+For example, the `apps.cpu` chart (in the **Applications** menu item) visualizes the CPU utilization of various
+applications/services running on each of the nodes in your cluster. The **X Nodes** dropdown shows which nodes
+contribute to the chart and links to jump to a single-node dashboard for further investigation.
+
+![Per-application monitoring in a Kubernetes
+cluster](https://user-images.githubusercontent.com/1153921/109042169-19c8fa00-768d-11eb-91a7-1a7afc41fea2.png)
+
+In the chart above, you can see a spike in CPU utilization from `rabbitmq` every minute or so, along with a
+baseline CPU utilization of 10-15% across the cluster.
+
+## Pod and container metrics
+
+Click on the **Kubernetes xxxxxxx...** section to jump down to Netdata Cloud's unique Kubernetes visualizations, where
+you can view real-time resource utilization metrics from your Kubernetes pods and containers.
+
+![Navigating to the Kubernetes monitoring
+visualizations](https://user-images.githubusercontent.com/1153921/109049195-349f6c80-7695-11eb-8902-52a029dca77f.png)
+
+### Health map
+
+The first visualization is the [health map](/docs/dashboards-and-charts/kubernetes-tab.md#health-map),
+which places each container into its own box, then varies the intensity of its color to visualize the resource
+utilization. By default, the health map shows the **average CPU utilization as a percentage of the configured limit**
+for every container in your cluster.
+ +![The Kubernetes health map in Netdata +Cloud](https://user-images.githubusercontent.com/1153921/109050085-3f0e3600-7696-11eb-988f-52cb187f53ea.png) + +Let's explore the most colorful box by hovering over it. + +![Hovering over a +container](https://user-images.githubusercontent.com/1153921/109049544-a8417980-7695-11eb-80a7-109b4a645a27.png) + +The **Context** tab shows `rabbitmq-5bb66bb6c9-6xr5b` as the container's image name, which means this container is +running a [RabbitMQ](/src/go/plugin/go.d/modules/rabbitmq/README.md) workload. + +Click the **Metrics** tab to see real-time metrics from that container. Unsurprisingly, it shows a spike in CPU +utilization at regular intervals. + +![Viewing real-time container +metrics](https://user-images.githubusercontent.com/1153921/109050482-aa580800-7696-11eb-9e3e-d3bdf0f3eff7.png) + +### Time-series charts + +Beneath the health map is a variety of time-series charts that help you visualize resource utilization over time, which +is useful for targeted troubleshooting. + +The default is to display metrics grouped by the `k8s_namespace` label, which shows resource utilization based on your +different namespaces. + +![Time-series Kubernetes monitoring in Netdata +Cloud](https://user-images.githubusercontent.com/1153921/109075210-126a1680-76b6-11eb-918d-5acdcdac152d.png) + +Each composite chart has a [definition bar](/docs/dashboards-and-charts/netdata-charts.md#definition-bar) +for complete customization. For example, grouping the top chart by `k8s_container_name` reveals new information. + +![Changing time-series charts](https://user-images.githubusercontent.com/1153921/109075212-139b4380-76b6-11eb-836f-939482ae55fc.png) + +## Service metrics + +Netdata has a [service discovery plugin](https://github.com/netdata/agent-service-discovery), which discovers and +creates configuration files for [compatible +services](https://github.com/netdata/helmchart#service-discovery-and-supported-services) and any endpoints covered by +our [generic Prometheus collector](/src/go/plugin/go.d/modules/prometheus/README.md). +Netdata uses these files to collect metrics from any compatible application as they run _inside_ of a pod. Service +discovery happens without manual intervention as pods are created, destroyed, or moved between nodes. + +Service metrics show up on the Overview as well, beneath the **Kubernetes** section, and are labeled according to the +service in question. For example, the **RabbitMQ** section has numerous charts from the [`rabbitmq` +collector](/src/go/plugin/go.d/modules/rabbitmq/README.md): + +![Finding service discovery +metrics](https://user-images.githubusercontent.com/1153921/109054511-2eac8a00-769b-11eb-97f1-da93acb4b5fe.png) + +> The robot-shop cluster has more supported services, such as MySQL, which are not visible with zero configuration. This +> is usually because of services running on non-default ports, using non-default names, or required passwords. Read up +> on [configuring service discovery](/packaging/installer/methods/kubernetes.md#configure-service-discovery) to collect +> more service metrics. + +Service metrics are essential to infrastructure monitoring, as they're the best indicator of the end-user experience, +and key signals for troubleshooting anomalies or issues. + +## Kubernetes components + +Netdata also automatically collects metrics from two essential Kubernetes processes. 
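+
+Both are scraped from each node's standard metrics endpoints. If you want to confirm those endpoints are reachable
+before reading the next two sections, you can probe them by hand. This is only a sketch: ports `10255` (kubelet
+read-only) and `10249` (kube-proxy) are the defaults these collectors look for, and some distributions disable or
+relocate them.
+
+```bash
+# kubelet metrics endpoint (default read-only port; may be disabled in favor of authenticated 10250)
+curl -s http://127.0.0.1:10255/metrics | head
+
+# kube-proxy metrics endpoint
+curl -s http://127.0.0.1:10249/metrics | head
+```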
+ +### kubelet + +The **k8s kubelet** section visualizes metrics from the Kubernetes agent responsible for managing every pod on a given +node. This also happens without any configuration thanks to the [kubelet +collector](/src/go/plugin/go.d/modules/k8s_kubelet/README.md). + +Monitoring each node's kubelet can be invaluable when diagnosing issues with your Kubernetes cluster. For example, you +can see if the number of running containers/pods has dropped, which could signal a fault or crash in a particular +Kubernetes service or deployment (see `kubectl get services` or `kubectl get deployments` for more details). If the +number of pods increases, it may be because of something more benign, like another team member scaling up a +service with `kubectl scale`. + +You can also view charts for the Kubelet API server, the volume of runtime/Docker operations by type, +configuration-related errors, and the actual vs. desired numbers of volumes, plus a lot more. + +### kube-proxy + +The **k8s kube-proxy** section displays metrics about the network proxy that runs on each node in your Kubernetes +cluster. kube-proxy lets pods communicate with each other and accept sessions from outside your cluster. Its metrics are +collected by the [kube-proxy +collector](/src/go/plugin/go.d/modules/k8s_kubeproxy/README.md). + +With Netdata, you can monitor how often your k8s proxies are syncing proxy rules between nodes. Dramatic changes in +these figures could indicate an anomaly in your cluster that's worthy of further investigation. + +## What's next? + +After reading this guide, you should now be able to monitor any Kubernetes cluster with Netdata, including nodes, pods, +containers, services, and more. + +With the health map, time-series charts, and the ability to drill down into individual nodes, you can see hundreds of +per-second metrics with zero configuration and less time remembering all the `kubectl` options. Netdata moves with your +cluster, automatically picking up new nodes or services as your infrastructure scales. And it's entirely free for +clusters of all sizes. + +### Related reference documentation + +- [Netdata Helm chart](https://github.com/netdata/helmchart) +- [Netdata service discovery](https://github.com/netdata/agent-service-discovery) +- [Netdata Agent · `kubelet` + collector](/src/go/plugin/go.d/modules/k8s_kubelet/README.md) +- [Netdata Agent · `kube-proxy` + collector](/src/go/plugin/go.d/modules/k8s_kubeproxy/README.md) +- [Netdata Agent · `cgroups.plugin`](/src/collectors/cgroups.plugin/README.md) diff --git a/docs/developer-and-contributor-corner/lamp-stack.md b/docs/developer-and-contributor-corner/lamp-stack.md deleted file mode 100644 index 2df5a7167..000000000 --- a/docs/developer-and-contributor-corner/lamp-stack.md +++ /dev/null @@ -1,238 +0,0 @@ -import { OneLineInstallWget } from '@site/src/components/OneLineInstall/' - -# LAMP stack monitoring with Netdata - -Set up robust LAMP stack monitoring (Linux, Apache, MySQL, PHP) in a few minutes using Netdata. - -The LAMP stack is the "hello world" for deploying dynamic web applications. It's fast, flexible, and reliable, which -means a developer or sysadmin won't go far in their career without interacting with the stack and its services. - -_LAMP_ is an acronym of the core services that make up the web application: **L**inux, **A**pache, **M**ySQL, and -**P**HP. - -- [Linux](https://en.wikipedia.org/wiki/Linux) is the operating system running the whole stack. 
-- [Apache](https://httpd.apache.org/) is a web server that responds to HTTP requests from users and returns web pages. -- [MySQL](https://www.mysql.com/) is a database that stores and returns information based on queries from the web - application. -- [PHP](https://www.php.net/) is a scripting language used to query the MySQL database and build new pages. - -LAMP stacks are the foundation for tons of end-user applications, with [Wordpress](https://wordpress.org/) being the -most popular. - -## Challenge - -You've already deployed a LAMP stack, either in testing or production. You want to monitor every service's performance -and availability to ensure the best possible experience for your end-users. You might also be particularly interested in -using a free, open-source monitoring tool. - -Depending on your monitoring experience, you may not even know what metrics you're looking for, much less how to build -dashboards using a query language. You need a robust monitoring experience that has the metrics you need without a ton -of required setup. - -## Solution - -In this tutorial, you'll set up robust LAMP stack monitoring with Netdata in just a few minutes. When you're done, -you'll have one dashboard to monitor every part of your web application, including each essential LAMP stack service. - -This dashboard updates every second with new metrics, and pairs those metrics up with preconfigured alerts to keep you -informed of any errors or odd behavior. - -## What you need to get started - -To follow this tutorial, you need: - -- A physical or virtual Linux system, which we'll call a _node_. -- A functional LAMP stack. There's plenty of tutorials for installing a LAMP stack, like [this - one](https://www.digitalocean.com/community/tutorials/how-to-install-linux-apache-mysql-php-lamp-stack-ubuntu-18-04) - from Digital Ocean. -- Optionally, a [Netdata Cloud](https://app.netdata.cloud/sign-up?cloudRoute=/spaces) account, which you can use to view - metrics from multiple nodes in one dashboard, and a whole lot more, for free. - -## Install the Netdata Agent - -If you don't have the free, open-source Netdata monitoring agent installed on your node yet, get started with a [single -kickstart command](/packaging/installer/README.md): - - - -The Netdata Agent is now collecting metrics from your node every second. You don't need to jump into the dashboard yet, -but if you're curious, open your favorite browser and navigate to `http://localhost:19999` or `http://NODE:19999`, -replacing `NODE` with the hostname or IP address of your system. - -## Enable hardware and Linux system monitoring - -There's nothing you need to do to enable system monitoring and Linux monitoring with -the Netdata Agent, which autodetects metrics from CPUs, memory, disks, networking devices, and Linux processes like -systemd without any configuration. If you're using containers, Netdata automatically collects resource utilization -metrics from each using the [cgroups data collector](/src/collectors/cgroups.plugin/README.md). - -## Enable Apache monitoring - -Let's begin by configuring Apache to work with Netdata's [Apache data -collector](/src/go/plugin/go.d/modules/apache/README.md). - -Actually, there's nothing for you to do to enable Apache monitoring with Netdata. - -Apache comes with `mod_status` enabled by default these days, and Netdata is smart enough to look for metrics at that -endpoint without you configuring it. 
Netdata is already collecting [`mod_status` -metrics](https://httpd.apache.org/docs/2.4/mod/mod_status.html), which is just _part_ of your web server monitoring. - -## Enable web log monitoring - -The Netdata Agent also comes with a [web log -collector](/src/go/plugin/go.d/modules/weblog/README.md), which reads Apache's access -log file, processes each line, and converts them into per-second metrics. On Debian systems, it reads the file at -`/var/log/apache2/access.log`. - -At installation, the Netdata Agent adds itself to the [`adm` -group](https://wiki.debian.org/SystemGroups#Groups_without_an_associated_user), which gives the `netdata` process the -right privileges to read Apache's log files. In other words, you don't need to do anything to enable Apache web log -monitoring. - -## Enable MySQL monitoring - -Because your MySQL database is password-protected, you do need to tell MySQL to allow the `netdata` user to connect to -without a password. Netdata's [MySQL data -collector](/src/go/plugin/go.d/modules/mysql/README.md) collects metrics in _read-only_ -mode, without being able to alter or affect operations in any way. - -First, log into the MySQL shell. Then, run the following three commands, one at a time: - -```mysql -CREATE USER 'netdata'@'localhost'; -GRANT USAGE, REPLICATION CLIENT, PROCESS ON *.* TO 'netdata'@'localhost'; -FLUSH PRIVILEGES; -``` - -Run `sudo systemctl restart netdata`, or the [appropriate alternative for your -system](/packaging/installer/README.md#maintaining-a-netdata-agent-installation), to collect dozens of metrics every second for robust MySQL monitoring. - -## Enable PHP monitoring - -Unlike Apache or MySQL, PHP isn't a service that you can monitor directly, unless you instrument a PHP-based application -with [StatsD](/src/collectors/statsd.plugin/README.md). - -However, if you use [PHP-FPM](https://php-fpm.org/) in your LAMP stack, you can monitor that process with our [PHP-FPM -data collector](/src/go/plugin/go.d/modules/phpfpm/README.md). - -Open your PHP-FPM configuration for editing, replacing `7.4` with your version of PHP: - -```bash -sudo nano /etc/php/7.4/fpm/pool.d/www.conf -``` - -> Not sure what version of PHP you're using? Run `php -v`. - -Find the line that reads `;pm.status_path = /status` and remove the `;` so it looks like this: - -```conf -pm.status_path = /status -``` - -Next, add a new `/status` endpoint to Apache. Open the Apache configuration file you're using for your LAMP stack. - -```bash -sudo nano /etc/apache2/sites-available/your_lamp_stack.conf -``` - -Add the following to the end of the file, again replacing `7.4` with your version of PHP: - -```apache -ProxyPass "/status" "unix:/run/php/php7.4-fpm.sock|fcgi://localhost" -``` - -Save and close the file. Finally, restart the PHP-FPM, Apache, and Netdata processes. - -```bash -sudo systemctl restart php7.4-fpm.service -sudo systemctl restart apache2 -sudo systemctl restart netdata -``` - -As the Netdata Agent starts up again, it automatically connects to the new `127.0.0.1/status` page and collects -per-second PHP-FPM metrics to get you started with PHP monitoring. - -## View LAMP stack metrics - -If the Netdata Agent isn't already open in your browser, open a new tab and navigate to `http://localhost:19999` or -`http://NODE:19999`, replacing `NODE` with the hostname or IP address of your system. 
- -> If you [signed up](https://app.netdata.cloud/sign-up?cloudRoute=/spaces) for Netdata Cloud earlier, you can also view -> the exact same LAMP stack metrics there, plus additional features, like drag-and-drop custom dashboards. Be sure to -> [connecting your node](/src/claim/README.md) to start streaming metrics to your browser through Netdata Cloud. - -Netdata automatically organizes all metrics and charts onto a single page for easy navigation. Peek at gauges to see -overall system performance, then scroll down to see more. Click-and-drag with your mouse to pan _all_ charts back and -forth through different time intervals, or hold `SHIFT` and use the scrollwheel (or two-finger scroll) to zoom in and -out. Check out our doc on [interacting with charts](/docs/dashboards-and-charts/netdata-charts.md) for all the details. - -![The Netdata dashboard](https://user-images.githubusercontent.com/1153921/109520555-98e17800-7a69-11eb-86ec-16f689da4527.png) - -The **System Overview** section, which you can also see in the right-hand menu, contains key hardware monitoring charts, -including CPU utilization, memory page faults, network monitoring, and much more. The **Applications** section shows you -exactly which Linux processes are using the most system resources. - -Next, let's check out LAMP-specific metrics. You should see four relevant sections: **Apache local**, **MySQL local**, -**PHP-FPM local**, and **web log apache**. Click on any of these to see metrics from each service in your LAMP stack. - -![LAMP stack monitoring in -Netdata](https://user-images.githubusercontent.com/1153921/109516332-49994880-7a65-11eb-807c-3cba045582e6.png) - -### Key LAMP stack monitoring charts - -Here's a quick reference for what charts you might want to focus on after setting up Netdata. - -| Chart name / context | Type | Why? | -|-------------------------------------------------------|---------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| System Load Average (`system.load`) | Hardware monitoring | A good baseline load average is `0.7`, while `1` (on a 1-core system, `2` on a 2-core system, and so on) means resources are "perfectly" utilized. Higher load indicates a bottleneck somewhere in your system. | -| System RAM (`system.ram`) | Hardware monitoring | Look at the `free` dimension. If that drops to `0`, your system will use swap memory and slow down. | -| Uptime (`apache_local.uptime`) | Apache monitoring | This chart should always be "climbing," indicating a continuous uptime. Investigate any drops back to `0`. | -| Requests By Type (`web_log_apache.requests_by_type`) | Apache monitoring | Check for increases in the `error` or `bad` dimensions, which could indicate users arriving at broken pages or PHP returning errors. | -| Queries (`mysql_local.queries`) | MySQL monitoring | Queries is the total number of queries (queries per second, QPS). Check this chart for sudden spikes or drops, which indicate either increases in traffic/demand or bottlenecks in hardware performance. | -| Active Connections (`mysql_local.connections_active`) | MySQL monitoring | If the `active` dimension nears the `limit`, your MySQL database will bottleneck responses. | -| Performance (phpfpm_local.performance) | PHP monitoring | The `slow requests` dimension lets you know if any requests exceed the configured `request_slowlog_timeout`. 
If so, users might be having a less-than-ideal experience. | - -## Get alerts for LAMP stack errors - -The Netdata Agent comes with hundreds of pre-configured alerts to help you keep tabs on your system, including 19 alerts -designed for smarter LAMP stack monitoring. - -Click the 🔔 icon in the top navigation to [see active alerts](/docs/dashboards-and-charts/alerts-tab.md). The **Active** tabs -shows any alerts currently triggered, while the **All** tab displays a list of _every_ pre-configured alert. The - -![An example of LAMP stack -alerts](https://user-images.githubusercontent.com/1153921/109524120-5883f900-7a6d-11eb-830e-0e7baaa28163.png) - -[Tweak alerts](/src/health/REFERENCE.md) based on your infrastructure monitoring needs, and to see these alerts -in other places, like your inbox or a Slack channel, [enable a notification -method](/docs/alerts-and-notifications/notifications/README.md). - -## What's next? - -You've now set up robust monitoring for your entire LAMP stack: Linux, Apache, MySQL, and PHP (-FPM, to be exact). These -metrics will help you keep tabs on the performance and availability of your web application and all its essential -services. The per-second metrics granularity means you have the most accurate information possible for troubleshooting -any LAMP-related issues. - -Another powerful way to monitor the availability of a LAMP stack is the [`httpcheck` -collector](/src/go/plugin/go.d/modules/httpcheck/README.md), which pings a web server at -a regular interval and tells you whether if and how quickly it's responding. The `response_match` option also lets you -monitor when the web server's response isn't what you expect it to be, which might happen if PHP-FPM crashes, for -example. - -The best way to use the `httpcheck` collector is from a separate node from the one running your LAMP stack, which is why -we're not covering it here, but it _does_ work in a single-node setup. Just don't expect it to tell you if your whole -node crashed. - -If you're planning on managing more than one node, or want to take advantage of advanced features, like finding the -source of issues faster with [Metric Correlations](/docs/metric-correlations.md), -[sign up](https://app.netdata.cloud/sign-up?cloudRoute=/spaces) for a free Netdata Cloud account. - -### Related reference documentation - -- [Netdata Agent · Get started](/packaging/installer/README.md) -- [Netdata Agent · Apache data collector](/src/go/plugin/go.d/modules/apache/README.md) -- [Netdata Agent · Web log collector](/src/go/plugin/go.d/modules/weblog/README.md) -- [Netdata Agent · MySQL data collector](/src/go/plugin/go.d/modules/mysql/README.md) -- [Netdata Agent · PHP-FPM data collector](/src/go/plugin/go.d/modules/phpfpm/README.md) - diff --git a/docs/developer-and-contributor-corner/lamp-stack.txt b/docs/developer-and-contributor-corner/lamp-stack.txt new file mode 100644 index 000000000..bc4611ac1 --- /dev/null +++ b/docs/developer-and-contributor-corner/lamp-stack.txt @@ -0,0 +1,237 @@ +import { OneLineInstallWget } from '@site/src/components/OneLineInstall/' + +# LAMP stack monitoring with Netdata + +Set up robust LAMP stack monitoring (Linux, Apache, MySQL, PHP) in a few minutes using Netdata. + +The LAMP stack is the "hello world" for deploying dynamic web applications. It's fast, flexible, and reliable, which +means a developer or sysadmin won't go far in their career without interacting with the stack and its services. 
+
+_LAMP_ is an acronym for the core services that make up the web application: **L**inux, **A**pache, **M**ySQL, and
+**P**HP.
+
+- [Linux](https://en.wikipedia.org/wiki/Linux) is the operating system running the whole stack.
+- [Apache](https://httpd.apache.org/) is a web server that responds to HTTP requests from users and returns web pages.
+- [MySQL](https://www.mysql.com/) is a database that stores and returns information based on queries from the web
+  application.
+- [PHP](https://www.php.net/) is a scripting language used to query the MySQL database and build new pages.
+
+LAMP stacks are the foundation for tons of end-user applications, with [WordPress](https://wordpress.org/) being the
+most popular.
+
+## Challenge
+
+You've already deployed a LAMP stack, either in testing or production. You want to monitor every service's performance
+and availability to ensure the best possible experience for your end-users. You might also be particularly interested in
+using a free, open-source monitoring tool.
+
+Depending on your monitoring experience, you may not even know what metrics you're looking for, much less how to build
+dashboards using a query language. You need a robust monitoring experience that has the metrics you need without a ton
+of required setup.
+
+## Solution
+
+In this tutorial, you'll set up robust LAMP stack monitoring with Netdata in just a few minutes. When you're done,
+you'll have one dashboard to monitor every part of your web application, including each essential LAMP stack service.
+
+This dashboard updates every second with new metrics, and pairs those metrics up with preconfigured alerts to keep you
+informed of any errors or odd behavior.
+
+## What you need to get started
+
+To follow this tutorial, you need:
+
+- A physical or virtual Linux system, which we'll call a _node_.
+- A functional LAMP stack. There are plenty of tutorials for installing a LAMP stack, like [this
+  one](https://www.digitalocean.com/community/tutorials/how-to-install-linux-apache-mysql-php-lamp-stack-ubuntu-18-04)
+  from Digital Ocean.
+- Optionally, a [Netdata Cloud](https://app.netdata.cloud/sign-up?cloudRoute=/spaces) account, which you can use to view
+  metrics from multiple nodes in one dashboard, and a whole lot more, for free.
+
+## Install the Netdata Agent
+
+If you don't have the free, open-source Netdata monitoring agent installed on your node yet, get started with a [single
+kickstart command](/packaging/installer/README.md):
+
+<OneLineInstallWget />
+
+The Netdata Agent is now collecting metrics from your node every second. You don't need to jump into the dashboard yet,
+but if you're curious, open your favorite browser and navigate to `http://localhost:19999` or `http://NODE:19999`,
+replacing `NODE` with the hostname or IP address of your system.
+
+## Enable hardware and Linux system monitoring
+
+There's nothing you need to do to enable system monitoring and Linux monitoring with
+the Netdata Agent, which autodetects metrics from CPUs, memory, disks, networking devices, and Linux processes like
+systemd without any configuration. If you're using containers, Netdata automatically collects resource utilization
+metrics from each using the [cgroups data collector](/src/collectors/cgroups.plugin/README.md).
+
+## Enable Apache monitoring
+
+Let's begin by configuring Apache to work with Netdata's [Apache data
+collector](/src/go/plugin/go.d/modules/apache/README.md).
+
+Actually, there's nothing for you to do to enable Apache monitoring with Netdata.
+
+Apache comes with `mod_status` enabled by default these days, and Netdata is smart enough to look for metrics at that
+endpoint without you configuring it. Netdata is already collecting [`mod_status`
+metrics](https://httpd.apache.org/docs/2.4/mod/mod_status.html), which is just _part_ of your web server monitoring.
+
+## Enable web log monitoring
+
+The Netdata Agent also comes with a [web log
+collector](/src/go/plugin/go.d/modules/weblog/README.md), which reads Apache's access
+log file, processes each line, and converts them into per-second metrics. On Debian systems, it reads the file at
+`/var/log/apache2/access.log`.
+
+At installation, the Netdata Agent adds itself to the [`adm`
+group](https://wiki.debian.org/SystemGroups#Groups_without_an_associated_user), which gives the `netdata` process the
+right privileges to read Apache's log files. In other words, you don't need to do anything to enable Apache web log
+monitoring.
+
+## Enable MySQL monitoring
+
+Because your MySQL database is password-protected, you do need to tell MySQL to allow the `netdata` user to connect
+without a password. Netdata's [MySQL data
+collector](/src/go/plugin/go.d/modules/mysql/README.md) collects metrics in _read-only_
+mode, without being able to alter or affect operations in any way.
+
+First, log into the MySQL shell. Then, run the following three commands, one at a time:
+
+```mysql
+CREATE USER 'netdata'@'localhost';
+GRANT USAGE, REPLICATION CLIENT, PROCESS ON *.* TO 'netdata'@'localhost';
+FLUSH PRIVILEGES;
+```
+
+Run `sudo systemctl restart netdata`, or the [appropriate alternative for your system](/docs/netdata-agent/start-stop-restart.md), to collect dozens of metrics every second for robust MySQL monitoring.
+
+## Enable PHP monitoring
+
+Unlike Apache or MySQL, PHP isn't a service that you can monitor directly, unless you instrument a PHP-based application
+with [StatsD](/src/collectors/statsd.plugin/README.md).
+
+However, if you use [PHP-FPM](https://php-fpm.org/) in your LAMP stack, you can monitor that process with our [PHP-FPM
+data collector](/src/go/plugin/go.d/modules/phpfpm/README.md).
+
+Open your PHP-FPM configuration for editing, replacing `7.4` with your version of PHP:
+
+```bash
+sudo nano /etc/php/7.4/fpm/pool.d/www.conf
+```
+
+> Not sure what version of PHP you're using? Run `php -v`.
+
+Find the line that reads `;pm.status_path = /status` and remove the `;` so it looks like this:
+
+```text
+pm.status_path = /status
+```
+
+Next, add a new `/status` endpoint to Apache. Open the Apache configuration file you're using for your LAMP stack.
+
+```bash
+sudo nano /etc/apache2/sites-available/your_lamp_stack.conf
+```
+
+Add the following to the end of the file, again replacing `7.4` with your version of PHP:
+
+```apache
+ProxyPass "/status" "unix:/run/php/php7.4-fpm.sock|fcgi://localhost"
+```
+
+Save and close the file. Finally, restart the PHP-FPM, Apache, and Netdata processes.
+
+```bash
+sudo systemctl restart php7.4-fpm.service
+sudo systemctl restart apache2
+sudo systemctl restart netdata
+```
+
+As the Netdata Agent starts up again, it automatically connects to the new `127.0.0.1/status` page and collects
+per-second PHP-FPM metrics to get you started with PHP monitoring.
+
+## View LAMP stack metrics
+
+If the Netdata Agent isn't already open in your browser, open a new tab and navigate to `http://localhost:19999` or
+`http://NODE:19999`, replacing `NODE` with the hostname or IP address of your system.
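+
+Before you go looking for the new charts, you can confirm that the PHP-FPM status page from the previous section is
+live, since `127.0.0.1/status` is the same endpoint the Agent scrapes. A minimal check, assuming the `/status` proxy
+you added to Apache above:
+
+```bash
+# Should print the PHP-FPM pool summary (pool name, process manager, active processes, and so on)
+curl -s http://127.0.0.1/status
+```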
+
+> If you [signed up](https://app.netdata.cloud/sign-up?cloudRoute=/spaces) for Netdata Cloud earlier, you can also view
+> the exact same LAMP stack metrics there, plus additional features, like drag-and-drop custom dashboards. Be sure to
+> [connect your node](/src/claim/README.md) to start streaming metrics to your browser through Netdata Cloud.
+
+Netdata automatically organizes all metrics and charts onto a single page for easy navigation. Peek at gauges to see
+overall system performance, then scroll down to see more. Click-and-drag with your mouse to pan _all_ charts back and
+forth through different time intervals, or hold `SHIFT` and use the scrollwheel (or two-finger scroll) to zoom in and
+out. Check out our doc on [interacting with charts](/docs/dashboards-and-charts/netdata-charts.md) for all the details.
+
+![The Netdata dashboard](https://user-images.githubusercontent.com/1153921/109520555-98e17800-7a69-11eb-86ec-16f689da4527.png)
+
+The **System Overview** section, which you can also see in the right-hand menu, contains key hardware monitoring charts,
+including CPU utilization, memory page faults, network monitoring, and much more. The **Applications** section shows you
+exactly which Linux processes are using the most system resources.
+
+Next, let's check out LAMP-specific metrics. You should see four relevant sections: **Apache local**, **MySQL local**,
+**PHP-FPM local**, and **web log apache**. Click on any of these to see metrics from each service in your LAMP stack.
+
+![LAMP stack monitoring in
+Netdata](https://user-images.githubusercontent.com/1153921/109516332-49994880-7a65-11eb-807c-3cba045582e6.png)
+
+### Key LAMP stack monitoring charts
+
+Here's a quick reference for what charts you might want to focus on after setting up Netdata.
+
+| Chart name / context                                   | Type                | Why?                                                                                                                                                                                                            |
+|--------------------------------------------------------|---------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| System Load Average (`system.load`)                    | Hardware monitoring | A good baseline load average is `0.7`, while `1` (on a 1-core system, `2` on a 2-core system, and so on) means resources are "perfectly" utilized. Higher load indicates a bottleneck somewhere in your system. |
+| System RAM (`system.ram`)                              | Hardware monitoring | Look at the `free` dimension. If that drops to `0`, your system will use swap memory and slow down.                                                                                                             |
+| Uptime (`apache_local.uptime`)                         | Apache monitoring   | This chart should always be "climbing," indicating a continuous uptime. Investigate any drops back to `0`.                                                                                                      |
+| Requests By Type (`web_log_apache.requests_by_type`)   | Apache monitoring   | Check for increases in the `error` or `bad` dimensions, which could indicate users arriving at broken pages or PHP returning errors.                                                                            |
+| Queries (`mysql_local.queries`)                        | MySQL monitoring    | Queries shows the total number of queries per second (QPS). Check this chart for sudden spikes or drops, which indicate either increases in traffic/demand or bottlenecks in hardware performance.              |
+| Active Connections (`mysql_local.connections_active`)  | MySQL monitoring    | If the `active` dimension nears the `limit`, your MySQL database will bottleneck responses.                                                                                                                     |
+| Performance (`phpfpm_local.performance`)               | PHP monitoring      | The `slow requests` dimension lets you know if any requests exceed the configured `request_slowlog_timeout`. If so, users might be having a less-than-ideal experience.                                        |
+
+## Get alerts for LAMP stack errors
+
+The Netdata Agent comes with hundreds of pre-configured alerts to help you keep tabs on your system, including 19 alerts
+designed for smarter LAMP stack monitoring.
+
+Click the 🔔 icon in the top navigation to [see active alerts](/docs/dashboards-and-charts/alerts-tab.md). The **Active** tab
+shows any alerts currently triggered, while the **All** tab displays a list of _every_ pre-configured alert.
+
+![An example of LAMP stack
+alerts](https://user-images.githubusercontent.com/1153921/109524120-5883f900-7a6d-11eb-830e-0e7baaa28163.png)
+
+[Tweak alerts](/src/health/REFERENCE.md) based on your infrastructure monitoring needs; to see these alerts
+in other places, like your inbox or a Slack channel, [enable a notification
+method](/docs/alerts-and-notifications/notifications/README.md).
+
+## What's next?
+
+You've now set up robust monitoring for your entire LAMP stack: Linux, Apache, MySQL, and PHP (-FPM, to be exact). These
+metrics will help you keep tabs on the performance and availability of your web application and all its essential
+services. The per-second metrics granularity means you have the most accurate information possible for troubleshooting
+any LAMP-related issues.
+
+Another powerful way to monitor the availability of a LAMP stack is the [`httpcheck`
+collector](/src/go/plugin/go.d/modules/httpcheck/README.md), which pings a web server at
+a regular interval and tells you whether, and how quickly, it's responding. The `response_match` option also lets you
+monitor when the web server's response isn't what you expect it to be, which might happen if PHP-FPM crashes, for
+example.
+
+The best way to use the `httpcheck` collector is from a separate node from the one running your LAMP stack, which is why
+we're not covering it here, but it _does_ work in a single-node setup. Just don't expect it to tell you if your whole
+node crashed.
+
+If you're planning on managing more than one node, or want to take advantage of advanced features, like finding the
+source of issues faster with [Metric Correlations](/docs/metric-correlations.md),
+[sign up](https://app.netdata.cloud/sign-up?cloudRoute=/spaces) for a free Netdata Cloud account.
+
+### Related reference documentation
+
+- [Netdata Agent · Get started](/packaging/installer/README.md)
+- [Netdata Agent · Apache data collector](/src/go/plugin/go.d/modules/apache/README.md)
+- [Netdata Agent · Web log collector](/src/go/plugin/go.d/modules/weblog/README.md)
+- [Netdata Agent · MySQL data collector](/src/go/plugin/go.d/modules/mysql/README.md)
+- [Netdata Agent · PHP-FPM data collector](/src/go/plugin/go.d/modules/phpfpm/README.md)
+
diff --git a/docs/developer-and-contributor-corner/monitor-cockroachdb.md b/docs/developer-and-contributor-corner/monitor-cockroachdb.md
deleted file mode 100644
index f0db12cc4..000000000
--- a/docs/developer-and-contributor-corner/monitor-cockroachdb.md
+++ /dev/null
@@ -1,118 +0,0 @@
-
-# Monitor CockroachDB metrics with Netdata
-
-[CockroachDB](https://github.com/cockroachdb/cockroach) is an open-source project that brings SQL databases into
-scalable, disaster-resilient cloud deployments. Thanks to
-a [new CockroachDB collector](/src/go/plugin/go.d/modules/cockroachdb/README.md)
-released in
-[v1.20](https://blog.netdata.cloud/posts/release-1.20/), you can now monitor any number of CockroachDB databases with
-maximum granularity using Netdata.
Collect more than 50 unique metrics and put them on interactive visualizations -designed for better visual anomaly detection. - -Netdata itself uses CockroachDB as part of its Netdata Cloud infrastructure, so we're happy to introduce this new -collector and help others get started with it straight away. - -Let's dive in and walk through the process of monitoring CockroachDB metrics with Netdata. - -## What's in this guide - -- [Monitor CockroachDB metrics with Netdata](#monitor-cockroachdb-metrics-with-netdata) - - [What's in this guide](#whats-in-this-guide) - - [Configure the CockroachDB collector](#configure-the-cockroachdb-collector) - - [Manual setup for a local CockroachDB database](#manual-setup-for-a-local-cockroachdb-database) - - [Tweak CockroachDB alerts](#tweak-cockroachdb-alerts) - -## Configure the CockroachDB collector - -Because _all_ of Netdata's collectors can auto-detect the services they monitor, you _shouldn't_ need to worry about -configuring CockroachDB. Netdata only needs to regularly query the database's `_status/vars` page to gather metrics and -display them on the dashboard. - -If your CockroachDB instance is accessible through `http://localhost:8080/` or `http://127.0.0.1:8080`, your setup is -complete. Restart Netdata with `sudo systemctl restart netdata`, or the [appropriate -method](/packaging/installer/README.md#maintaining-a-netdata-agent-installation) for your system, and refresh your browser. You should see CockroachDB -metrics in your Netdata dashboard! - -
-_Figure: CPU utilization charts from a CockroachDB database monitored by Netdata_
- -> Note: Netdata collects metrics from CockroachDB every 10 seconds, instead of our usual 1 second, because CockroachDB -> only updates `_status/vars` every 10 seconds. You can't change this setting in CockroachDB. - -If you don't see CockroachDB charts, you may need to configure the collector manually. - -### Manual setup for a local CockroachDB database - -To configure Netdata's CockroachDB collector, navigate to your Netdata configuration directory (typically at -`/etc/netdata/`) and use `edit-config` to initialize and edit your CockroachDB configuration file. - -```bash -cd /etc/netdata/ # Replace with your Netdata configuration directory, if not /etc/netdata/ -./edit-config go.d/cockroachdb.conf -``` - -Scroll down to the `[JOBS]` section at the bottom of the file. You will see the two default jobs there, which you can -edit, or create a new job with any of the parameters listed above in the file. Both the `name` and `url` values are -required, and everything else is optional. - -For a production cluster, you'll use either an IP address or the system's hostname. Be sure that your remote system -allows TCP communication on port 8080, or whichever port you have configured CockroachDB's -[Admin UI](https://www.cockroachlabs.com/docs/stable/monitoring-and-alerting.html#prometheus-endpoint) to listen on. - -```yaml -# [ JOBS ] -jobs: - - name: remote - url: http://203.0.113.0:8080/_status/vars - - - name: remote_hostname - url: http://cockroachdb.example.com:8080/_status/vars -``` - -For a secure cluster, use `https` in the `url` field instead. - -```yaml -# [ JOBS ] -jobs: - - name: remote - url: https://203.0.113.0:8080/_status/vars - tls_skip_verify: yes # If your certificate is self-signed - - - name: remote_hostname - url: https://cockroachdb.example.com:8080/_status/vars - tls_skip_verify: yes # If your certificate is self-signed -``` - -You can add as many jobs as you'd like based on how many CockroachDB databases you have—Netdata will create separate -charts for each job. Once you've edited `cockroachdb.conf` according to the needs of your infrastructure, restart -Netdata to see your new charts. - -
-_Figure: Charts showing a node failure during a simulated test_
- -## Tweak CockroachDB alerts - -This release also includes eight pre-configured alerts for live nodes, such as whether the node is live, storage -capacity, issues with replication, and the number of SQL connections/statements. See [health.d/cockroachdb.conf on -GitHub](https://raw.githubusercontent.com/netdata/netdata/master/src/health/health.d/cockroachdb.conf) for details. - -You can also edit these files directly with `edit-config`: - -```bash -cd /etc/netdata/ # Replace with your Netdata configuration directory, if not /etc/netdata/ -./edit-config health.d/cockroachdb.conf # You may need to use `sudo` for write privileges -``` - -For more information about editing the defaults or writing new alert entities, see our documentation on [configuring health alerts](/src/health/REFERENCE.md). diff --git a/docs/developer-and-contributor-corner/monitor-cockroachdb.txt b/docs/developer-and-contributor-corner/monitor-cockroachdb.txt new file mode 100644 index 000000000..d677c376c --- /dev/null +++ b/docs/developer-and-contributor-corner/monitor-cockroachdb.txt @@ -0,0 +1,118 @@ + + +# Monitor CockroachDB metrics with Netdata + +[CockroachDB](https://github.com/cockroachdb/cockroach) is an open-source project that brings SQL databases into +scalable, disaster-resilient cloud deployments. Thanks to +a [new CockroachDB collector](/src/go/plugin/go.d/modules/cockroachdb/README.md) +released in +[v1.20](https://blog.netdata.cloud/posts/release-1.20/), you can now monitor any number of CockroachDB databases with +maximum granularity using Netdata. Collect more than 50 unique metrics and put them on interactive visualizations +designed for better visual anomaly detection. + +Netdata itself uses CockroachDB as part of its Netdata Cloud infrastructure, so we're happy to introduce this new +collector and help others get started with it straight away. + +Let's dive in and walk through the process of monitoring CockroachDB metrics with Netdata. + +## What's in this guide + +- [Monitor CockroachDB metrics with Netdata](#monitor-cockroachdb-metrics-with-netdata) + - [What's in this guide](#whats-in-this-guide) + - [Configure the CockroachDB collector](#configure-the-cockroachdb-collector) + - [Manual setup for a local CockroachDB database](#manual-setup-for-a-local-cockroachdb-database) + - [Tweak CockroachDB alerts](#tweak-cockroachdb-alerts) + +## Configure the CockroachDB collector + +Because _all_ of Netdata's collectors can auto-detect the services they monitor, you _shouldn't_ need to worry about +configuring CockroachDB. Netdata only needs to regularly query the database's `_status/vars` page to gather metrics and +display them on the dashboard. + +If your CockroachDB instance is accessible through `http://localhost:8080/` or `http://127.0.0.1:8080`, your setup is +complete. Restart Netdata with `sudo systemctl restart netdata`, or the appropriate +method for your system, and refresh your browser. You should see CockroachDB +metrics in your Netdata dashboard! + +
+_Figure: CPU utilization charts from a CockroachDB database monitored by Netdata_
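+
+You can also confirm that the endpoint Netdata scrapes is responding. A quick check, assuming the default Admin UI port
+mentioned above:
+
+```bash
+# CockroachDB serves Prometheus-style metrics on this page; Netdata's collector reads the same URL
+curl -s http://localhost:8080/_status/vars | head
+```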
+ +> Note: Netdata collects metrics from CockroachDB every 10 seconds, instead of our usual 1 second, because CockroachDB +> only updates `_status/vars` every 10 seconds. You can't change this setting in CockroachDB. + +If you don't see CockroachDB charts, you may need to configure the collector manually. + +### Manual setup for a local CockroachDB database + +To configure Netdata's CockroachDB collector, navigate to your Netdata configuration directory (typically at +`/etc/netdata/`) and use `edit-config` to initialize and edit your CockroachDB configuration file. + +```bash +cd /etc/netdata/ # Replace with your Netdata configuration directory, if not /etc/netdata/ +./edit-config go.d/cockroachdb.conf +``` + +Scroll down to the `[JOBS]` section at the bottom of the file. You will see the two default jobs there, which you can +edit, or create a new job with any of the parameters listed above in the file. Both the `name` and `url` values are +required, and everything else is optional. + +For a production cluster, you'll use either an IP address or the system's hostname. Be sure that your remote system +allows TCP communication on port 8080, or whichever port you have configured CockroachDB's +[Admin UI](https://www.cockroachlabs.com/docs/stable/monitoring-and-alerting.html#prometheus-endpoint) to listen on. + +```yaml +# [ JOBS ] +jobs: + - name: remote + url: http://203.0.113.0:8080/_status/vars + + - name: remote_hostname + url: http://cockroachdb.example.com:8080/_status/vars +``` + +For a secure cluster, use `https` in the `url` field instead. + +```yaml +# [ JOBS ] +jobs: + - name: remote + url: https://203.0.113.0:8080/_status/vars + tls_skip_verify: yes # If your certificate is self-signed + + - name: remote_hostname + url: https://cockroachdb.example.com:8080/_status/vars + tls_skip_verify: yes # If your certificate is self-signed +``` + +You can add as many jobs as you'd like based on how many CockroachDB databases you have—Netdata will create separate +charts for each job. Once you've edited `cockroachdb.conf` according to the needs of your infrastructure, restart +Netdata to see your new charts. + +
_Charts showing a node failure during a simulated test_
+ +## Tweak CockroachDB alerts + +This release also includes eight pre-configured alerts for live nodes, such as whether the node is live, storage +capacity, issues with replication, and the number of SQL connections/statements. See [health.d/cockroachdb.conf on +GitHub](https://raw.githubusercontent.com/netdata/netdata/master/src/health/health.d/cockroachdb.conf) for details. + +You can also edit these files directly with `edit-config`: + +```bash +cd /etc/netdata/ # Replace with your Netdata configuration directory, if not /etc/netdata/ +./edit-config health.d/cockroachdb.conf # You may need to use `sudo` for write privileges +``` + +For more information about editing the defaults or writing new alert entities, see our documentation on [configuring health alerts](/src/health/REFERENCE.md). diff --git a/docs/developer-and-contributor-corner/monitor-debug-applications-ebpf.md b/docs/developer-and-contributor-corner/monitor-debug-applications-ebpf.md index 91d2a2ef2..56f0276bb 100644 --- a/docs/developer-and-contributor-corner/monitor-debug-applications-ebpf.md +++ b/docs/developer-and-contributor-corner/monitor-debug-applications-ebpf.md @@ -1,13 +1,3 @@ - - # Monitor, troubleshoot, and debug applications with eBPF metrics When trying to troubleshoot or debug a finicky application, there's no such thing as too much information. At Netdata, @@ -48,7 +38,7 @@ your application's process name. Your file should now look like this: -```conf +```text ... # ----------------------------------------------------------------------------- # Custom applications to monitor with apps.plugin and ebpf.plugin @@ -60,15 +50,14 @@ dev: custom-app ... ``` -Restart Netdata with `sudo systemctl restart netdata`, or the [appropriate -method](/packaging/installer/README.md#maintaining-a-netdata-agent-installation) for your system, to begin seeing metrics for this particular +Restart Netdata with `sudo systemctl restart netdata`, or the [appropriate method](/docs/netdata-agent/start-stop-restart.md) for your system, to begin seeing metrics for this particular group+process. You can also add additional processes to the same group. You can set up `apps_groups.conf` to more show more precise eBPF metrics for any application or service running on your system, even if it's a standard package like Redis, Apache, or any other [application/service Netdata collects from](/src/collectors/COLLECTORS.md). -```conf +```text # ----------------------------------------------------------------------------- # Custom applications to monitor with apps.plugin and ebpf.plugin @@ -99,7 +88,7 @@ sudo ./edit-config ebpf.d.conf Replace `entry` with `return`: -```conf +```text [global] ebpf load mode = return disable apps = no @@ -109,8 +98,7 @@ Replace `entry` with `return`: network viewer = yes ``` -Restart Netdata with `sudo systemctl restart netdata`, or the [appropriate -method](/packaging/installer/README.md#maintaining-a-netdata-agent-installation) for your system. +Restart Netdata with `sudo systemctl restart netdata`, or the [appropriate method](/docs/netdata-agent/start-stop-restart.md) for your system. ## Get familiar with per-application eBPF metrics and charts @@ -139,7 +127,7 @@ In these charts, you can see first a spike in syscalls to open and close files f followed by a similar spike from the Apache benchmark. > 👋 Don't forget that you can view chart data directly via Netdata's API! 
-> +> > For example, open your browser and navigate to `http://NODE:19999/api/v1/data?chart=apps.file_open`, replacing `NODE` > with the IP address or hostname of your Agent. The API returns JSON of that chart's dimensions and metrics, which you > can use in other operations. @@ -245,10 +233,7 @@ Once you've added one or more nodes to a Space in Netdata Cloud, you can see agg dashboard under the same **Applications** or **eBPF** sections that you find on the local Agent dashboard. Or, [create new dashboards](/docs/dashboards-and-charts/dashboards-tab.md) using eBPF metrics from any number of distributed nodes to see how your application interacts with multiple Linux kernels on multiple Linux -systems. +systems. Now that you can see eBPF metrics in Netdata Cloud, you can [invite your team](/docs/netdata-cloud/organize-your-infrastructure-invite-your-team.md#invite-your-team) and share your findings with others. - - - diff --git a/docs/developer-and-contributor-corner/monitor-hadoop-cluster.md b/docs/developer-and-contributor-corner/monitor-hadoop-cluster.md index 98bf3d21f..8638f6d66 100644 --- a/docs/developer-and-contributor-corner/monitor-hadoop-cluster.md +++ b/docs/developer-and-contributor-corner/monitor-hadoop-cluster.md @@ -1,12 +1,3 @@ - - # Monitor a Hadoop cluster with Netdata Hadoop is an [Apache project](https://hadoop.apache.org/) is a framework for processing large sets of data across a @@ -27,8 +18,8 @@ alternative, like the guide available from For more specifics on the collection modules used in this guide, read the respective pages in our documentation: -- [HDFS](/src/go/plugin/go.d/modules/hdfs/README.md) -- [Zookeeper](/src/go/plugin/go.d/modules/zookeeper/README.md) +- [HDFS](/src/go/plugin/go.d/modules/hdfs/README.md) +- [Zookeeper](/src/go/plugin/go.d/modules/zookeeper/README.md) ## Set up your HDFS and Zookeeper installations @@ -164,7 +155,7 @@ jobs: address : 203.0.113.10:2182 ``` -Finally, [restart Netdata](/packaging/installer/README.md#maintaining-a-netdata-agent-installation). +Finally, [restart Netdata](/docs/netdata-agent/start-stop-restart.md). ```sh sudo systemctl restart netdata @@ -178,7 +169,7 @@ showing real-time metrics for both in your Netdata dashboard. 🎉 The Netdata community helped us create sane defaults for alerts related to both HDFS and Zookeeper. You may want to investigate these to ensure they work well with your Hadoop implementation. -- [HDFS alerts](https://raw.githubusercontent.com/netdata/netdata/master/src/health/health.d/hdfs.conf) +- [HDFS alerts](https://raw.githubusercontent.com/netdata/netdata/master/src/health/health.d/hdfs.conf) You can also access/edit these files directly with `edit-config`: @@ -187,5 +178,4 @@ sudo /etc/netdata/edit-config health.d/hdfs.conf sudo /etc/netdata/edit-config health.d/zookeeper.conf ``` -For more information about editing the defaults or writing new alert entities, see our -[health monitoring documentation](/src/health/README.md). +For more information about editing the defaults or writing new alert entities, see our [health monitoring documentation](/src/health/README.md). 
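If the Zookeeper charts stay empty even though the collector is configured, it's worth confirming that Zookeeper answers the `mntr` command the collector relies on. A quick check, assuming the default client port `2181` and that `mntr` is in Zookeeper's four-letter-word whitelist:

```bash
# A healthy Zookeeper responds to "mntr" with a block of zk_* statistics.
echo mntr | nc localhost 2181
```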
diff --git a/docs/developer-and-contributor-corner/pi-hole-raspberry-pi.md b/docs/developer-and-contributor-corner/pi-hole-raspberry-pi.md deleted file mode 100644 index df6bb0809..000000000 --- a/docs/developer-and-contributor-corner/pi-hole-raspberry-pi.md +++ /dev/null @@ -1,140 +0,0 @@ - - -# Monitor Pi-hole (and a Raspberry Pi) with Netdata - -import { OneLineInstallWget } from '@site/src/components/OneLineInstall/' - -Between intrusive ads, invasive trackers, and vicious malware, many techies and homelab enthusiasts are advancing their -networks' security and speed with a tiny computer and a powerful piece of software: [Pi-hole](https://pi-hole.net/). - -Pi-hole is a DNS sinkhole that prevents unwanted content from even reaching devices on your home network. It blocks ads -and malware at the network, instead of using extensions/add-ons for individual browsers, so you'll stop seeing ads in -some of the most intrusive places, like your smart TV. Pi-hole can even [improve your network's speed and reduce -bandwidth](https://discourse.pi-hole.net/t/will-pi-hole-slow-down-my-network/2048). - -Most Pi-hole users run it on a [Raspberry Pi](https://www.raspberrypi.org/products/raspberry-pi-4-model-b/) (hence the -name), a credit card-sized, super-capable computer that costs about $35. - -And to keep tabs on how both Pi-hole and the Raspberry Pi are working to protect your network, you can use the -open-source [Netdata monitoring agent](https://github.com/netdata/netdata). - -To get started, all you need is a [Raspberry Pi](https://www.raspberrypi.org/products/raspberry-pi-4-model-b/) with -Raspbian installed. This guide uses a Raspberry Pi 4 Model B and Raspbian GNU/Linux 10 (buster). This guide assumes -you're connecting to a Raspberry Pi remotely over SSH, but you could also complete all these steps on the system -directly using a keyboard, mouse, and monitor. - -## Why monitor Pi-hole and a Raspberry Pi with Netdata? - -Netdata helps you monitor and troubleshoot all kinds of devices and the applications they run, including IoT devices -like the Raspberry Pi and applications like Pi-hole. - -After a two-minute installation and with zero configuration, you'll be able to see all of Pi-hole's metrics, including -the volume of queries, connected clients, DNS queries per type, top clients, top blocked domains, and more. - -With Netdata installed, you can also monitor system metrics and any other applications you might be running. By default, -Netdata collects metrics on CPU usage, disk IO, bandwidth, per-application resource usage, and a ton more. With the -Raspberry Pi used for this guide, Netdata automatically collects about 1,500 metrics every second! - -![Real-time Pi-hole monitoring with -Netdata](https://user-images.githubusercontent.com/1153921/90447745-c8fe9600-e098-11ea-8a57-4f07339f002b.png) - -## Install Netdata - -Let's start by installing Netdata first so that it can start collecting system metrics as soon as possible for the most -possible historic data. - -> ⚠️ Don't install Netdata using `apt` and the default package available in Raspbian. The Netdata team does not maintain -> this package, and can't guarantee it works properly. - -On Raspberry Pis running Raspbian, the best way to install Netdata is our one-line kickstart script. This script asks -you to install dependencies, then compiles Netdata from source via [GitHub](https://github.com/netdata/netdata). 
- - - -Once installed on a Raspberry Pi 4 with no accessories, Netdata starts collecting roughly 1,500 metrics every second and -populates its dashboard with more than 250 charts. - -Open your browser of choice and navigate to `http://NODE:19999/`, replacing `NODE` with the IP address of your Raspberry -Pi. Not sure what that IP is? Try running `hostname -I | awk '{print $1}'` from the Pi itself. - -You'll see Netdata's dashboard and a few hundred real-time, interactive charts. Feel free to explore, but let's turn our attention to installing Pi-hole. - -## Install Pi-Hole - -Like Netdata, Pi-hole has a one-line script for simple installation. From your Raspberry Pi, run the following: - -```bash -curl -sSL https://install.pi-hole.net | bash -``` - -The installer will help you set up Pi-hole based on the topology of your network. Once finished, you should set up your -devices—or your router for system-wide sinkhole protection—to [use Pi-hole as their DNS -service](https://discourse.pi-hole.net/t/how-do-i-configure-my-devices-to-use-pi-hole-as-their-dns-server/245). You've -finished setting up Pi-hole at this point. - -As far as configuring Netdata to monitor Pi-hole metrics, there's nothing you actually need to do. Netdata's [Pi-hole -collector](/src/go/plugin/go.d/modules/pihole/README.md) will autodetect the new service -running on your Raspberry Pi and immediately start collecting metrics every second. - -Restart Netdata with `sudo systemctl restart netdata`, which will then recognize that Pi-hole is running and start a -per-second collection job. When you refresh your Netdata dashboard or load it up again in a new tab, you'll see a new -entry in the menu for **Pi-hole** metrics. - -## Use Netdata to explore and monitor your Raspberry Pi and Pi-hole - -By the time you've reached this point in the guide, Netdata has already collected a ton of valuable data about your -Raspberry Pi, Pi-hole, and any other apps/services you might be running. Even a few minutes of collecting 1,500 metrics -per second adds up quickly. - -You can now use Netdata's synchronized charts to zoom, highlight, scrub through time, and discern how an anomaly in one -part of your system might affect another. - -![The Netdata dashboard in -action](https://user-images.githubusercontent.com/1153921/80827388-b9fee100-8b98-11ea-8f60-0d7824667cd3.gif) - -### Enable temperature sensor monitoring - -You need to manually enable Netdata's built-in [temperature sensor -collector](/src/collectors/charts.d.plugin/sensors/README.md) to start collecting metrics. - -> Netdata uses a few plugins to manage its [collectors](/src/collectors/REFERENCE.md), each using a different language: Go, -> Python, Node.js, and Bash. While our Go collectors are undergoing the most active development, we still support the -> other languages. In this case, you need to enable a temperature sensor collector that's written in Bash. - -First, open the `charts.d.conf` file for editing. You should always use the `edit-config` script to edit Netdata's -configuration files, as it ensures your settings persist across updates to the Netdata Agent. - -```bash -cd /etc/netdata -sudo ./edit-config charts.d.conf -``` - -Uncomment the `sensors=force` line and save the file. Restart Netdata with `sudo systemctl restart netdata` to enable -Raspberry Pi temperature sensor monitoring. 
- -### Storing historical metrics on your Raspberry Pi - -By default, Netdata allocates 256 MiB in disk space to store historical metrics inside the [database -engine](/src/database/engine/README.md). On the Raspberry Pi used for this guide, Netdata collects 1,500 metrics every -second, which equates to storing 3.5 days worth of historical metrics. - -You can increase this allocation by editing `netdata.conf` and increasing the `dbengine multihost disk space` setting to -more than 256. - -```yaml -[global] - dbengine multihost disk space = 512 -``` - -Use our [database sizing -calculator](/docs/netdata-agent/configuration/optimizing-metrics-database/change-metrics-storage.md#calculate-the-system-resources-ram-disk-space-needed-to-store-metrics) -and the [Database configuration documentation](/src/database/README.md) to help you determine the right -setting for your Raspberry Pi. diff --git a/docs/developer-and-contributor-corner/pi-hole-raspberry-pi.txt b/docs/developer-and-contributor-corner/pi-hole-raspberry-pi.txt new file mode 100644 index 000000000..e150cebdc --- /dev/null +++ b/docs/developer-and-contributor-corner/pi-hole-raspberry-pi.txt @@ -0,0 +1,120 @@ + + +# Monitor Pi-hole (and a Raspberry Pi) with Netdata + +import { OneLineInstallWget } from '@site/src/components/OneLineInstall/' + +Between intrusive ads, invasive trackers, and vicious malware, many techies and homelab enthusiasts are advancing their +networks' security and speed with a tiny computer and a powerful piece of software: [Pi-hole](https://pi-hole.net/). + +Pi-hole is a DNS sinkhole that prevents unwanted content from even reaching devices on your home network. It blocks ads +and malware at the network, instead of using extensions/add-ons for individual browsers, so you'll stop seeing ads in +some of the most intrusive places, like your smart TV. Pi-hole can even [improve your network's speed and reduce +bandwidth](https://discourse.pi-hole.net/t/will-pi-hole-slow-down-my-network/2048). + +Most Pi-hole users run it on a [Raspberry Pi](https://www.raspberrypi.org/products/raspberry-pi-4-model-b/) (hence the +name), a credit card-sized, super-capable computer that costs about $35. + +And to keep tabs on how both Pi-hole and the Raspberry Pi are working to protect your network, you can use the +open-source [Netdata monitoring agent](https://github.com/netdata/netdata). + +To get started, all you need is a [Raspberry Pi](https://www.raspberrypi.org/products/raspberry-pi-4-model-b/) with +Raspbian installed. This guide uses a Raspberry Pi 4 Model B and Raspbian GNU/Linux 10 (buster). This guide assumes +you're connecting to a Raspberry Pi remotely over SSH, but you could also complete all these steps on the system +directly using a keyboard, mouse, and monitor. + +## Why monitor Pi-hole and a Raspberry Pi with Netdata? + +Netdata helps you monitor and troubleshoot all kinds of devices and the applications they run, including IoT devices +like the Raspberry Pi and applications like Pi-hole. + +After a two-minute installation and with zero configuration, you'll be able to see all of Pi-hole's metrics, including +the volume of queries, connected clients, DNS queries per type, top clients, top blocked domains, and more. + +With Netdata installed, you can also monitor system metrics and any other applications you might be running. By default, +Netdata collects metrics on CPU usage, disk IO, bandwidth, per-application resource usage, and a ton more. 
With the +Raspberry Pi used for this guide, Netdata automatically collects about 1,500 metrics every second! + +![Real-time Pi-hole monitoring with +Netdata](https://user-images.githubusercontent.com/1153921/90447745-c8fe9600-e098-11ea-8a57-4f07339f002b.png) + +## Install Netdata + +Let's start by installing Netdata first so that it can start collecting system metrics as soon as possible for the most +possible historic data. + +> ⚠️ Don't install Netdata using `apt` and the default package available in Raspbian. The Netdata team does not maintain +> this package, and can't guarantee it works properly. + +On Raspberry Pis running Raspbian, the best way to install Netdata is our one-line kickstart script. This script asks +you to install dependencies, then compiles Netdata from source via [GitHub](https://github.com/netdata/netdata). + + + +Once installed on a Raspberry Pi 4 with no accessories, Netdata starts collecting roughly 1,500 metrics every second and +populates its dashboard with more than 250 charts. + +Open your browser of choice and navigate to `http://NODE:19999/`, replacing `NODE` with the IP address of your Raspberry +Pi. Not sure what that IP is? Try running `hostname -I | awk '{print $1}'` from the Pi itself. + +You'll see Netdata's dashboard and a few hundred real-time, interactive charts. Feel free to explore, but let's turn our attention to installing Pi-hole. + +## Install Pi-Hole + +Like Netdata, Pi-hole has a one-line script for simple installation. From your Raspberry Pi, run the following: + +```bash +curl -sSL https://install.pi-hole.net | bash +``` + +The installer will help you set up Pi-hole based on the topology of your network. Once finished, you should set up your +devices—or your router for system-wide sinkhole protection—to [use Pi-hole as their DNS +service](https://discourse.pi-hole.net/t/how-do-i-configure-my-devices-to-use-pi-hole-as-their-dns-server/245). You've +finished setting up Pi-hole at this point. + +As far as configuring Netdata to monitor Pi-hole metrics, there's nothing you actually need to do. Netdata's [Pi-hole +collector](/src/go/plugin/go.d/modules/pihole/README.md) will autodetect the new service +running on your Raspberry Pi and immediately start collecting metrics every second. + +Restart Netdata with `sudo systemctl restart netdata`, which will then recognize that Pi-hole is running and start a +per-second collection job. When you refresh your Netdata dashboard or load it up again in a new tab, you'll see a new +entry in the menu for **Pi-hole** metrics. + +## Use Netdata to explore and monitor your Raspberry Pi and Pi-hole + +By the time you've reached this point in the guide, Netdata has already collected a ton of valuable data about your +Raspberry Pi, Pi-hole, and any other apps/services you might be running. Even a few minutes of collecting 1,500 metrics +per second adds up quickly. + +You can now use Netdata's synchronized charts to zoom, highlight, scrub through time, and discern how an anomaly in one +part of your system might affect another. + +![The Netdata dashboard in +action](https://user-images.githubusercontent.com/1153921/80827388-b9fee100-8b98-11ea-8f60-0d7824667cd3.gif) + +### Storing historical metrics on your Raspberry Pi + +By default, Netdata allocates 256 MiB in disk space to store historical metrics inside the [database +engine](/src/database/engine/README.md). On the Raspberry Pi used for this guide, Netdata collects 1,500 metrics every +second, which equates to storing 3.5 days worth of historical metrics. 
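To get a feel for what a larger allocation buys you, you can extrapolate from those numbers. A back-of-the-envelope sketch (actual retention varies with compression and metric churn):

```python
# 256 MiB at ~1,500 metrics/second lasted ~3.5 days in this guide,
# which works out to roughly 0.6 bytes per collected sample.
disk_bytes = 256 * 1024 * 1024
samples_per_day = 1500 * 86400
bytes_per_sample = disk_bytes / (3.5 * samples_per_day)  # ~0.59

# Retention scales roughly linearly, so doubling the space doubles the days:
days_at_512_mib = (512 * 1024 * 1024) / (bytes_per_sample * samples_per_day)
print(f"{bytes_per_sample:.2f} bytes/sample, ~{days_at_512_mib:.1f} days at 512 MiB")
```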
+ +You can increase this allocation by editing `netdata.conf` and increasing the `dbengine multihost disk space` setting to +more than 256. + +```yaml +[global] + dbengine multihost disk space = 512 +``` + +Use our [database sizing +calculator](/docs/netdata-agent/configuration/optimizing-metrics-database/change-metrics-storage.md#calculate-the-system-resources-ram-disk-space-needed-to-store-metrics) +and the [Database configuration documentation](/src/database/README.md) to help you determine the right +setting for your Raspberry Pi. diff --git a/docs/developer-and-contributor-corner/process.md b/docs/developer-and-contributor-corner/process.md deleted file mode 100644 index 2902a24f6..000000000 --- a/docs/developer-and-contributor-corner/process.md +++ /dev/null @@ -1,270 +0,0 @@ - - -# Monitor any process in real-time with Netdata - -Netdata is more than a multitude of generic system-level metrics and visualizations. Instead of providing only a bird's -eye view of your system, leaving you to wonder exactly _what_ is taking up 99% CPU, Netdata also gives you visibility -into _every layer_ of your node. These additional layers give you context, and meaningful insights, into the true health -and performance of your infrastructure. - -One of these layers is the _process_. Every time a Linux system runs a program, it creates an independent process that -executes the program's instructions in parallel with anything else happening on the system. Linux systems track the -state and resource utilization of processes using the [`/proc` filesystem](https://en.wikipedia.org/wiki/Procfs), and -Netdata is designed to hook into those metrics to create meaningful visualizations out of the box. - -While there are a lot of existing command-line tools for tracking processes on Linux systems, such as `ps` or `top`, -only Netdata provides dozens of real-time charts, at both per-second and event frequency, without you having to write -SQL queries or know a bunch of arbitrary command-line flags. - -With Netdata's process monitoring, you can: - -- Benchmark/optimize performance of standard applications, like web servers or databases -- Benchmark/optimize performance of custom applications -- Troubleshoot CPU/memory/disk utilization issues (why is my system's CPU spiking right now?) -- Perform granular capacity planning based on the specific needs of your infrastructure -- Search for leaking file descriptors -- Investigate zombie processes - -... and much more. Let's get started. - -## Prerequisites - -- One or more Linux nodes running [Netdata](/packaging/installer/README.md) -- A general understanding of how - to [configure the Netdata Agent](/docs/netdata-agent/configuration/README.md) - using `edit-config`. -- A Netdata Cloud account. [Sign up](https://app.netdata.cloud) if you don't have one already. - -## How does Netdata do process monitoring? - -The Netdata Agent already knows to look for hundreds -of [standard applications that we support via collectors](/src/collectors/COLLECTORS.md), -and groups them based on their -purpose. Let's say you want to monitor a MySQL -database using its process. The Netdata Agent already knows to look for processes with the string `mysqld` in their -name, along with a few others, and puts them into the `sql` group. This `sql` group then becomes a dimension in all -process-specific charts. - -The process and groups settings are used by two unique and powerful collectors. 
- -[**`apps.plugin`**](/src/collectors/apps.plugin/README.md) looks at the Linux -process tree every second, much like `top` or -`ps fax`, and collects resource utilization information on every running process. It then automatically adds a layer of -meaningful visualization on top of these metrics, and creates per-process/application charts. - -[**`ebpf.plugin`**](/src/collectors/ebpf.plugin/README.md): Netdata's extended -Berkeley Packet Filter (eBPF) collector -monitors Linux kernel-level metrics for file descriptors, virtual filesystem IO, and process management, and then hands -process-specific metrics over to `apps.plugin` for visualization. The eBPF collector also collects and visualizes -metrics on an _event frequency_, which means it captures every kernel interaction, and not just the volume of -interaction at every second in time. That's even more precise than Netdata's standard per-second granularity. - -### Per-process metrics and charts in Netdata - -With these collectors working in parallel, Netdata visualizes the following per-second metrics for _any_ process on your -Linux systems: - -- CPU utilization (`apps.cpu`) - - Total CPU usage - - User/system CPU usage (`apps.cpu_user`/`apps.cpu_system`) -- Disk I/O - - Physical reads/writes (`apps.preads`/`apps.pwrites`) - - Logical reads/writes (`apps.lreads`/`apps.lwrites`) - - Open unique files (if a file is found open multiple times, it is counted just once, `apps.files`) -- Memory - - Real Memory Used (non-shared, `apps.mem`) - - Virtual Memory Allocated (`apps.vmem`) - - Minor page faults (i.e. memory activity, `apps.minor_faults`) -- Processes - - Threads running (`apps.threads`) - - Processes running (`apps.processes`) - - Carried over uptime (since the last Netdata Agent restart, `apps.uptime`) - - Minimum uptime (`apps.uptime_min`) - - Average uptime (`apps.uptime_average`) - - Maximum uptime (`apps.uptime_max`) - - Pipes open (`apps.pipes`) -- Swap memory - - Swap memory used (`apps.swap`) - - Major page faults (i.e. swap activity, `apps.major_faults`) -- Network - - Sockets open (`apps.sockets`) -- eBPF file - - Number of calls to open files. (`apps.file_open`) - - Number of files closed. (`apps.file_closed`) - - Number of calls to open files that returned errors. - - Number of calls to close files that returned errors. -- eBPF syscall - - Number of calls to delete files. (`apps.file_deleted`) - - Number of calls to `vfs_write`. (`apps.vfs_write_call`) - - Number of calls to `vfs_read`. (`apps.vfs_read_call`) - - Number of bytes written with `vfs_write`. (`apps.vfs_write_bytes`) - - Number of bytes read with `vfs_read`. (`apps.vfs_read_bytes`) - - Number of calls to write a file that returned errors. - - Number of calls to read a file that returned errors. -- eBPF process - - Number of process created with `do_fork`. (`apps.process_create`) - - Number of threads created with `do_fork` or `__x86_64_sys_clone`, depending on your system's kernel - version. (`apps.thread_create`) - - Number of times that a process called `do_exit`. (`apps.task_close`) -- eBPF net - - Number of bytes sent. (`apps.bandwidth_sent`) - - Number of bytes received. (`apps.bandwidth_recv`) - -As an example, here's the per-process CPU utilization chart, including a `sql` group/dimension. 
- -![A per-process CPU utilization chart in Netdata Cloud](https://user-images.githubusercontent.com/1153921/101217226-3a5d5700-363e-11eb-8610-aa1640aefb5d.png) - -## Configure the Netdata Agent to recognize a specific process - -To monitor any process, you need to make sure the Netdata Agent is aware of it. As mentioned above, the Agent is already -aware of hundreds of processes, and collects metrics from them automatically. - -But, if you want to change the grouping behavior, add an application that isn't yet supported in the Netdata Agent, or -monitor a custom application, you need to edit the `apps_groups.conf` configuration file. - -Navigate to your [Netdata config directory](/docs/netdata-agent/configuration/README.md) and -use `edit-config` to edit the file. - -```bash -cd /etc/netdata # Replace this with your Netdata config directory if not at /etc/netdata. -sudo ./edit-config apps_groups.conf -``` - -Inside the file are lists of process names, oftentimes using wildcards (`*`), that the Netdata Agent looks for and -groups together. For example, the Netdata Agent looks for processes starting with `mysqld`, `mariad`, `postgres`, and -others, and groups them into `sql`. That makes sense, since all these processes are for SQL databases. - -```conf -sql: mysqld* mariad* postgres* postmaster* oracle_* ora_* sqlservr -``` - -These groups are then reflected as [dimensions](/src/web/README.md#dimensions) -within Netdata's charts. - -![An example per-process CPU utilization chart in Netdata -Cloud](https://user-images.githubusercontent.com/1153921/101369156-352e2100-3865-11eb-9f0d-b8fac162e034.png) - -See the following two sections for details based on your needs. If you don't need to configure `apps_groups.conf`, jump -down to [visualizing process metrics](#visualize-process-metrics). - -### Standard applications (web servers, databases, containers, and more) - -As explained above, the Netdata Agent is already aware of most standard applications you run on Linux nodes, and you -shouldn't need to configure it to discover them. - -However, if you're using multiple applications that the Netdata Agent groups together you may want to separate them for -more precise monitoring. If you're not running any other types of SQL databases on that node, you don't need to change -the grouping, since you know that any MySQL is the only process contributing to the `sql` group. - -Let's say you're using both MySQL and PostgreSQL databases on a single node, and want to monitor their processes -independently. Open the `apps_groups.conf` file as explained in -the [section above](#configure-the-netdata-agent-to-recognize-a-specific-process) and scroll down until you find -the `database servers` section. Create new groups for MySQL and PostgreSQL, and move their process queries into the -unique groups. - -```conf -# ----------------------------------------------------------------------------- -# database servers - -mysql: mysqld* -postgres: postgres* -sql: mariad* postmaster* oracle_* ora_* sqlservr -``` - -Restart Netdata with `sudo systemctl restart netdata`, or -the [appropriate method](/packaging/installer/README.md#maintaining-a-netdata-agent-installation) for your system, to start collecting utilization metrics -from your application. Time to [visualize your process metrics](#visualize-process-metrics). - -### Custom applications - -Let's assume you have an application that runs on the process `custom-app`. 
To monitor eBPF metrics for that application -separate from any others, you need to create a new group in `apps_groups.conf` and associate that process name with it. - -Open the `apps_groups.conf` file as explained in -the [section above](#configure-the-netdata-agent-to-recognize-a-specific-process). Scroll down -to `# NETDATA processes accounting`. -Above that, paste in the following text, which creates a new `custom-app` group with the `custom-app` process. Replace -`custom-app` with the name of your application's Linux process. `apps_groups.conf` should now look like this: - -```conf -... -# ----------------------------------------------------------------------------- -# Custom applications to monitor with apps.plugin and ebpf.plugin - -custom-app: custom-app - -# ----------------------------------------------------------------------------- -# NETDATA processes accounting -... -``` - -Restart Netdata with `sudo systemctl restart netdata`, or -the [appropriate method](/packaging/installer/README.md#maintaining-a-netdata-agent-installation) for your system, to start collecting utilization metrics -from your application. - -## Visualize process metrics - -Now that you're collecting metrics for your process, you'll want to visualize them using Netdata's real-time, -interactive charts. Find these visualizations in the same section regardless of whether you -use [Netdata Cloud](https://app.netdata.cloud) for infrastructure monitoring, or single-node monitoring with the local -Agent's dashboard at `http://localhost:19999`. - -If you need a refresher on all the available per-process charts, see -the [above list](#per-process-metrics-and-charts-in-netdata). - -### Using Netdata's application collector (`apps.plugin`) - -`apps.plugin` puts all of its charts under the **Applications** section of any Netdata dashboard. - -![Screenshot of the Applications section on a Netdata dashboard](https://user-images.githubusercontent.com/1153921/101401172-2ceadb80-388f-11eb-9e9a-88443894c272.png) - -Let's continue with the MySQL example. We can create a [test -database](https://www.digitalocean.com/community/tutorials/how-to-measure-mysql-query-performance-with-mysqlslap) in -MySQL to generate load on the `mysql` process. - -`apps.plugin` immediately collects and visualizes this activity `apps.cpu` chart, which shows an increase in CPU -utilization from the `sql` group. There is a parallel increase in `apps.pwrites`, which visualizes writes to disk. - -![Per-application CPU utilization metrics](https://user-images.githubusercontent.com/1153921/101409725-8527da80-389b-11eb-96e9-9f401535aafc.png) - -![Per-application disk writing metrics](https://user-images.githubusercontent.com/1153921/101409728-85c07100-389b-11eb-83fd-d79dd1545b5a.png) - -Next, the `mysqlslap` utility queries the database to provide some benchmarking load on the MySQL database. It won't -look exactly like a production database executing lots of user queries, but it gives you an idea into the possibility of -these visualizations. - -```bash -sudo mysqlslap --user=sysadmin --password --host=localhost --concurrency=50 --iterations=10 --create-schema=employees --query="SELECT * FROM dept_emp;" --verbose -``` - -The following per-process disk utilization charts show spikes under the `sql` group at the same time `mysqlslap` was run -numerous times, with slightly different concurrency and query options. 
- -![Per-application disk metrics](https://user-images.githubusercontent.com/1153921/101411810-d08fb800-389e-11eb-85b3-f3fa41f1f887.png) - -> 💡 Click on any dimension below a chart in Netdata Cloud (or to the right of a chart on a local Agent dashboard), to -> visualize only that dimension. This can be particularly useful in process monitoring to separate one process' -> utilization from the rest of the system. - -### Using Netdata's eBPF collector (`ebpf.plugin`) - -Netdata's eBPF collector puts its charts in two places. Of most importance to process monitoring are the **ebpf file**, -**ebpf syscall**, **ebpf process**, and **ebpf net** sub-sections under **Applications**, shown in the above screenshot. - -For example, running the above workload shows the entire "story" how MySQL interacts with the Linux kernel to open -processes/threads to handle a large number of SQL queries, then subsequently close the tasks as each query returns the -relevant data. - -![Per-process eBPF charts](https://user-images.githubusercontent.com/1153921/101412395-c8844800-389f-11eb-86d2-20c8a0f7b3c0.png) - -`ebpf.plugin` visualizes additional eBPF metrics, which are system-wide and not per-process, under the **eBPF** section. - - diff --git a/docs/developer-and-contributor-corner/process.txt b/docs/developer-and-contributor-corner/process.txt new file mode 100644 index 000000000..dbb36c550 --- /dev/null +++ b/docs/developer-and-contributor-corner/process.txt @@ -0,0 +1,270 @@ + + +# Monitor any process in real-time with Netdata + +Netdata is more than a multitude of generic system-level metrics and visualizations. Instead of providing only a bird's +eye view of your system, leaving you to wonder exactly _what_ is taking up 99% CPU, Netdata also gives you visibility +into _every layer_ of your node. These additional layers give you context, and meaningful insights, into the true health +and performance of your infrastructure. + +One of these layers is the _process_. Every time a Linux system runs a program, it creates an independent process that +executes the program's instructions in parallel with anything else happening on the system. Linux systems track the +state and resource utilization of processes using the [`/proc` filesystem](https://en.wikipedia.org/wiki/Procfs), and +Netdata is designed to hook into those metrics to create meaningful visualizations out of the box. + +While there are a lot of existing command-line tools for tracking processes on Linux systems, such as `ps` or `top`, +only Netdata provides dozens of real-time charts, at both per-second and event frequency, without you having to write +SQL queries or know a bunch of arbitrary command-line flags. + +With Netdata's process monitoring, you can: + +- Benchmark/optimize performance of standard applications, like web servers or databases +- Benchmark/optimize performance of custom applications +- Troubleshoot CPU/memory/disk utilization issues (why is my system's CPU spiking right now?) +- Perform granular capacity planning based on the specific needs of your infrastructure +- Search for leaking file descriptors +- Investigate zombie processes + +... and much more. Let's get started. + +## Prerequisites + +- One or more Linux nodes running [Netdata](/packaging/installer/README.md) +- A general understanding of how + to [configure the Netdata Agent](/docs/netdata-agent/configuration/README.md) + using `edit-config`. +- A Netdata Cloud account. [Sign up](https://app.netdata.cloud) if you don't have one already. 
+ +## How does Netdata do process monitoring? + +The Netdata Agent already knows to look for hundreds +of [standard applications that we support via collectors](/src/collectors/COLLECTORS.md), +and groups them based on their +purpose. Let's say you want to monitor a MySQL +database using its process. The Netdata Agent already knows to look for processes with the string `mysqld` in their +name, along with a few others, and puts them into the `sql` group. This `sql` group then becomes a dimension in all +process-specific charts. + +The process and groups settings are used by two unique and powerful collectors. + +[**`apps.plugin`**](/src/collectors/apps.plugin/README.md) looks at the Linux +process tree every second, much like `top` or +`ps fax`, and collects resource utilization information on every running process. It then automatically adds a layer of +meaningful visualization on top of these metrics, and creates per-process/application charts. + +[**`ebpf.plugin`**](/src/collectors/ebpf.plugin/README.md): Netdata's extended +Berkeley Packet Filter (eBPF) collector +monitors Linux kernel-level metrics for file descriptors, virtual filesystem IO, and process management, and then hands +process-specific metrics over to `apps.plugin` for visualization. The eBPF collector also collects and visualizes +metrics on an _event frequency_, which means it captures every kernel interaction, and not just the volume of +interaction at every second in time. That's even more precise than Netdata's standard per-second granularity. + +### Per-process metrics and charts in Netdata + +With these collectors working in parallel, Netdata visualizes the following per-second metrics for _any_ process on your +Linux systems: + +- CPU utilization (`apps.cpu`) + - Total CPU usage + - User/system CPU usage (`apps.cpu_user`/`apps.cpu_system`) +- Disk I/O + - Physical reads/writes (`apps.preads`/`apps.pwrites`) + - Logical reads/writes (`apps.lreads`/`apps.lwrites`) + - Open unique files (if a file is found open multiple times, it is counted just once, `apps.files`) +- Memory + - Real Memory Used (non-shared, `apps.mem`) + - Virtual Memory Allocated (`apps.vmem`) + - Minor page faults (i.e. memory activity, `apps.minor_faults`) +- Processes + - Threads running (`apps.threads`) + - Processes running (`apps.processes`) + - Carried over uptime (since the last Netdata Agent restart, `apps.uptime`) + - Minimum uptime (`apps.uptime_min`) + - Average uptime (`apps.uptime_average`) + - Maximum uptime (`apps.uptime_max`) + - Pipes open (`apps.pipes`) +- Swap memory + - Swap memory used (`apps.swap`) + - Major page faults (i.e. swap activity, `apps.major_faults`) +- Network + - Sockets open (`apps.sockets`) +- eBPF file + - Number of calls to open files. (`apps.file_open`) + - Number of files closed. (`apps.file_closed`) + - Number of calls to open files that returned errors. + - Number of calls to close files that returned errors. +- eBPF syscall + - Number of calls to delete files. (`apps.file_deleted`) + - Number of calls to `vfs_write`. (`apps.vfs_write_call`) + - Number of calls to `vfs_read`. (`apps.vfs_read_call`) + - Number of bytes written with `vfs_write`. (`apps.vfs_write_bytes`) + - Number of bytes read with `vfs_read`. (`apps.vfs_read_bytes`) + - Number of calls to write a file that returned errors. + - Number of calls to read a file that returned errors. +- eBPF process + - Number of process created with `do_fork`. 
(`apps.process_create`) + - Number of threads created with `do_fork` or `__x86_64_sys_clone`, depending on your system's kernel + version. (`apps.thread_create`) + - Number of times that a process called `do_exit`. (`apps.task_close`) +- eBPF net + - Number of bytes sent. (`apps.bandwidth_sent`) + - Number of bytes received. (`apps.bandwidth_recv`) + +As an example, here's the per-process CPU utilization chart, including a `sql` group/dimension. + +![A per-process CPU utilization chart in Netdata Cloud](https://user-images.githubusercontent.com/1153921/101217226-3a5d5700-363e-11eb-8610-aa1640aefb5d.png) + +## Configure the Netdata Agent to recognize a specific process + +To monitor any process, you need to make sure the Netdata Agent is aware of it. As mentioned above, the Agent is already +aware of hundreds of processes, and collects metrics from them automatically. + +But, if you want to change the grouping behavior, add an application that isn't yet supported in the Netdata Agent, or +monitor a custom application, you need to edit the `apps_groups.conf` configuration file. + +Navigate to your [Netdata config directory](/docs/netdata-agent/configuration/README.md) and +use `edit-config` to edit the file. + +```bash +cd /etc/netdata # Replace this with your Netdata config directory if not at /etc/netdata. +sudo ./edit-config apps_groups.conf +``` + +Inside the file are lists of process names, oftentimes using wildcards (`*`), that the Netdata Agent looks for and +groups together. For example, the Netdata Agent looks for processes starting with `mysqld`, `mariad`, `postgres`, and +others, and groups them into `sql`. That makes sense, since all these processes are for SQL databases. + +```text +sql: mysqld* mariad* postgres* postmaster* oracle_* ora_* sqlservr +``` + +These groups are then reflected as [dimensions](/src/web/README.md#dimensions) +within Netdata's charts. + +![An example per-process CPU utilization chart in Netdata +Cloud](https://user-images.githubusercontent.com/1153921/101369156-352e2100-3865-11eb-9f0d-b8fac162e034.png) + +See the following two sections for details based on your needs. If you don't need to configure `apps_groups.conf`, jump +down to [visualizing process metrics](#visualize-process-metrics). + +### Standard applications (web servers, databases, containers, and more) + +As explained above, the Netdata Agent is already aware of most standard applications you run on Linux nodes, and you +shouldn't need to configure it to discover them. + +However, if you're using multiple applications that the Netdata Agent groups together you may want to separate them for +more precise monitoring. If you're not running any other types of SQL databases on that node, you don't need to change +the grouping, since you know that any MySQL is the only process contributing to the `sql` group. + +Let's say you're using both MySQL and PostgreSQL databases on a single node, and want to monitor their processes +independently. Open the `apps_groups.conf` file as explained in +the [section above](#configure-the-netdata-agent-to-recognize-a-specific-process) and scroll down until you find +the `database servers` section. Create new groups for MySQL and PostgreSQL, and move their process queries into the +unique groups. 
+ +```text +# ----------------------------------------------------------------------------- +# database servers + +mysql: mysqld* +postgres: postgres* +sql: mariad* postmaster* oracle_* ora_* sqlservr +``` + +Restart Netdata with `sudo systemctl restart netdata`, or +the appropriate method for your system, to start collecting utilization metrics +from your application. Time to [visualize your process metrics](#visualize-process-metrics). + +### Custom applications + +Let's assume you have an application that runs on the process `custom-app`. To monitor eBPF metrics for that application +separate from any others, you need to create a new group in `apps_groups.conf` and associate that process name with it. + +Open the `apps_groups.conf` file as explained in +the [section above](#configure-the-netdata-agent-to-recognize-a-specific-process). Scroll down +to `# NETDATA processes accounting`. +Above that, paste in the following text, which creates a new `custom-app` group with the `custom-app` process. Replace +`custom-app` with the name of your application's Linux process. `apps_groups.conf` should now look like this: + +```text +... +# ----------------------------------------------------------------------------- +# Custom applications to monitor with apps.plugin and ebpf.plugin + +custom-app: custom-app + +# ----------------------------------------------------------------------------- +# NETDATA processes accounting +... +``` + +Restart Netdata with `sudo systemctl restart netdata`, or +the appropriate method for your system, to start collecting utilization metrics +from your application. + +## Visualize process metrics + +Now that you're collecting metrics for your process, you'll want to visualize them using Netdata's real-time, +interactive charts. Find these visualizations in the same section regardless of whether you +use [Netdata Cloud](https://app.netdata.cloud) for infrastructure monitoring, or single-node monitoring with the local +Agent's dashboard at `http://localhost:19999`. + +If you need a refresher on all the available per-process charts, see +the [above list](#per-process-metrics-and-charts-in-netdata). + +### Using Netdata's application collector (`apps.plugin`) + +`apps.plugin` puts all of its charts under the **Applications** section of any Netdata dashboard. + +![Screenshot of the Applications section on a Netdata dashboard](https://user-images.githubusercontent.com/1153921/101401172-2ceadb80-388f-11eb-9e9a-88443894c272.png) + +Let's continue with the MySQL example. We can create a [test +database](https://www.digitalocean.com/community/tutorials/how-to-measure-mysql-query-performance-with-mysqlslap) in +MySQL to generate load on the `mysql` process. + +`apps.plugin` immediately collects and visualizes this activity `apps.cpu` chart, which shows an increase in CPU +utilization from the `sql` group. There is a parallel increase in `apps.pwrites`, which visualizes writes to disk. + +![Per-application CPU utilization metrics](https://user-images.githubusercontent.com/1153921/101409725-8527da80-389b-11eb-96e9-9f401535aafc.png) + +![Per-application disk writing metrics](https://user-images.githubusercontent.com/1153921/101409728-85c07100-389b-11eb-83fd-d79dd1545b5a.png) + +Next, the `mysqlslap` utility queries the database to provide some benchmarking load on the MySQL database. It won't +look exactly like a production database executing lots of user queries, but it gives you an idea into the possibility of +these visualizations. 
+ +```bash +sudo mysqlslap --user=sysadmin --password --host=localhost --concurrency=50 --iterations=10 --create-schema=employees --query="SELECT * FROM dept_emp;" --verbose +``` + +The following per-process disk utilization charts show spikes under the `sql` group at the same time `mysqlslap` was run +numerous times, with slightly different concurrency and query options. + +![Per-application disk metrics](https://user-images.githubusercontent.com/1153921/101411810-d08fb800-389e-11eb-85b3-f3fa41f1f887.png) + +> 💡 Click on any dimension below a chart in Netdata Cloud (or to the right of a chart on a local Agent dashboard), to +> visualize only that dimension. This can be particularly useful in process monitoring to separate one process' +> utilization from the rest of the system. + +### Using Netdata's eBPF collector (`ebpf.plugin`) + +Netdata's eBPF collector puts its charts in two places. Of most importance to process monitoring are the **ebpf file**, +**ebpf syscall**, **ebpf process**, and **ebpf net** sub-sections under **Applications**, shown in the above screenshot. + +For example, running the above workload shows the entire "story" how MySQL interacts with the Linux kernel to open +processes/threads to handle a large number of SQL queries, then subsequently close the tasks as each query returns the +relevant data. + +![Per-process eBPF charts](https://user-images.githubusercontent.com/1153921/101412395-c8844800-389f-11eb-86d2-20c8a0f7b3c0.png) + +`ebpf.plugin` visualizes additional eBPF metrics, which are system-wide and not per-process, under the **eBPF** section. + + diff --git a/docs/developer-and-contributor-corner/python-collector.md b/docs/developer-and-contributor-corner/python-collector.md deleted file mode 100644 index 0b7aa96a6..000000000 --- a/docs/developer-and-contributor-corner/python-collector.md +++ /dev/null @@ -1,626 +0,0 @@ -# Develop a custom data collector in Python - -The Netdata Agent uses [data collectors](/src/collectors/README.md) to -fetch metrics from hundreds of system, container, and service endpoints. While the Netdata team and community has built -[powerful collectors](/src/collectors/COLLECTORS.md) for most system, container, -and service/application endpoints, some custom applications can't be monitored by default. - -In this tutorial, you'll learn how to leverage the [Python programming language](https://www.python.org/) to build a -custom data collector for the Netdata Agent. Follow along with your own dataset, using the techniques and best practices -covered here, or use the included examples for collecting and organizing either random or weather data. - -## Disclaimer - -If you're comfortable with Golang, consider instead writing a module for the [go.d.plugin](https://github.com/netdata/go.d.plugin). -Golang is more performant, easier to maintain, and simpler for users since it doesn't require a particular runtime on the node to -execute. Python plugins require Python on the machine to be executed. Netdata uses Go as the platform of choice for -production-grade collectors. - -We generally do not accept contributions of Python modules to the GitHub project netdata/netdata. If you write a Python collector and -want to make it available for other users, you should create the pull request in https://github.com/netdata/community. - -## What you need to get started - - - A physical or virtual Linux system, which we'll call a _node_. - - A working [installation of Netdata](/packaging/installer/README.md) monitoring agent. 
- -### Quick start - -For a quick start, you can look at the -[example plugin](https://raw.githubusercontent.com/netdata/netdata/master/src/collectors/python.d.plugin/example/example.chart.py). - -**Note**: If you are working 'locally' on a new collector and would like to run it in an already installed and running -Netdata (as opposed to having to install Netdata from source again with your new changes) you can copy over the relevant -file to where Netdata expects it and then either `sudo systemctl restart netdata` to have it be picked up and used by -Netdata or you can just run the updated collector in debug mode by following a process like below (this assumes you have -[installed Netdata from a GitHub fork](/packaging/installer/methods/manual.md) you -have made to do your development on). - -```bash -# clone your fork (done once at the start but shown here for clarity) -#git clone --branch my-example-collector https://github.com/mygithubusername/netdata.git --depth=100 --recursive -# go into your netdata source folder -cd netdata -# git pull your latest changes (assuming you built from a fork you are using to develop on) -git pull -# instead of running the installer we can just copy over the updated collector files -#sudo ./netdata-installer.sh --dont-wait -# copy over the file you have updated locally (pretending we are working on the 'example' collector) -sudo cp collectors/python.d.plugin/example/example.chart.py /usr/libexec/netdata/python.d/ -# become user netdata -sudo su -s /bin/bash netdata -# run your updated collector in debug mode to see if it works without having to reinstall netdata -/usr/libexec/netdata/plugins.d/python.d.plugin example debug trace nolock -``` - -## Jobs and elements of a Python collector - -A Python collector for Netdata is a Python script that gathers data from an external source and transforms these data -into charts to be displayed by Netdata dashboard. The basic jobs of the plugin are: - -- Gather the data from the service/application. -- Create the required charts. -- Parse the data to extract or create the actual data to be represented. -- Assign the correct values to the charts -- Set the order for the charts to be displayed. -- Give the charts data to Netdata for visualization. - -The basic elements of a Netdata collector are: - -- `ORDER[]`: A list containing the charts to be displayed. -- `CHARTS{}`: A dictionary containing the details for the charts to be displayed. -- `data{}`: A dictionary containing the values to be displayed. -- `get_data()`: The basic function of the plugin which will return to Netdata the correct values. - -**Note**: All names are better explained in the -[External Plugins Documentation](/src/collectors/plugins.d/README.md). -Parameters like `priority` and `update_every` mentioned in that documentation are handled by the `python.d.plugin`, -not by each collection module. - -Let's walk through these jobs and elements as independent elements first, then apply them to example Python code. - -### Determine how to gather metrics data - -Netdata can collect data from any program that can print to stdout. Common input sources for collectors can be logfiles, -HTTP requests, executables, and more. While this tutorial will offer some example inputs, your custom application will -have different inputs and metrics. 
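For instance, a quick way to investigate an HTTP-based source before writing any collector code is to pull its raw output and inspect it by hand. A sketch using a hypothetical endpoint; substitute whatever your application actually exposes:

```python
# Probe a (hypothetical) metrics endpoint and print the first few lines of raw
# output, to decide which values are worth turning into chart dimensions.
import urllib.request

with urllib.request.urlopen("http://127.0.0.1:8080/metrics") as resp:
    raw = resp.read().decode("utf-8")

for line in raw.splitlines()[:20]:
    print(line)
```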
- -A great deal of the work in developing a Netdata collector is investigating the target application and understanding -which metrics it exposes and how to - -### Create charts - -For the data to be represented in the Netdata dashboard, you need to create charts. Charts (in general) are defined by -several characteristics: title, legend, units, type, and presented values. Each chart is represented as a dictionary -entry: - -```python -chart= { - "chart_name": - { - "options": [option_list], - "lines": [ - [dimension_list] - ] - } - } -``` - -Use the `options` field to set the chart's options, which is a list in the form `options: [name, title, units, family, -context, charttype]`, where: - -- `name`: The name of the chart. -- `title` : The title to be displayed in the chart. -- `units` : The units for this chart. -- `family`: An identifier used to group charts together (can be null). -- `context`: An identifier used to group contextually similar charts together. The best practice is to provide a context - that is `A.B`, with `A` being the name of the collector, and `B` being the name of the specific metric. -- `charttype`: Either `line`, `area`, or `stacked`. If null line is the default value. - -You can read more about `family` and `context` in the [web dashboard](/src/web/README.md#families) doc. - -Once the chart has been defined, you should define the dimensions of the chart. Dimensions are basically the metrics to -be represented in this chart and each chart can have more than one dimension. In order to define the dimensions, the -"lines" list should be filled in with the required dimensions. Each dimension is a list: - -`dimension: [id, name, algorithm, multiplier, divisor]` -- `id` : The id of the dimension. Mandatory unique field (string) required in order to set a value. -- `name`: The name to be presented in the chart. If null id will be used. -- `algorithm`: Can be absolute or incremental. If null absolute is used. Incremental shows the difference from the - previous value. -- `multiplier`: an integer value to divide the collected value, if null, 1 is used -- `divisor`: an integer value to divide the collected value, if null, 1 is used - -The multiplier/divisor fields are used in cases where the value to be displayed should be decimal since Netdata only -gathers integer values. - -### Parse the data to extract or create the actual data to be represented - -Once the data is received, your collector should process it in order to get the values required. If, for example, the -received data is a JSON string, you should parse the data to get the required data to be used for the charts. - -### Assign the correct values to the charts - -Once you have process your data and get the required values, you need to assign those values to the charts you created. -This is done using the `data` dictionary, which is in the form: - -`"data": {dimension_id: value }`, where: -- `dimension_id`: The id of a defined dimension in a created chart. -- `value`: The numerical value to associate with this dimension. - -### Set the order for the charts to be displayed - -Next, set the order of chart appearance with the `ORDER` list, which is in the form: - -`"ORDER": [chart_name_1,chart_name_2, …., chart_name_X]`, where: -- `chart_name_x`: is the chart name to be shown in X order. - -### Give the charts data to Netdata for visualization - -Our plugin should just rerun the data dictionary. If everything is set correctly the charts should be updated with the -correct values. 
- -## Framework classes - -Every module needs to implement its own `Service` class. This class should inherit from one of the framework classes: - -- `SimpleService` -- `UrlService` -- `SocketService` -- `LogService` -- `ExecutableService` - -Also it needs to invoke the parent class constructor in a specific way as well as assign global variables to class variables. - -For example, the snippet below is from the -[RabbitMQ collector](https://github.com/netdata/netdata/blob/91f3268e9615edd393bd43de4ad8068111024cc9/collectors/python.d.plugin/rabbitmq/rabbitmq.chart.py#L273). -This collector uses an HTTP endpoint and uses the `UrlService` framework class, which only needs to define an HTTP -endpoint for data collection. - -```python -class Service(UrlService): - def __init__(self, configuration=None, name=None): - UrlService.__init__(self, configuration=configuration, name=name) - self.order = ORDER - self.definitions = CHARTS - self.url = '{0}://{1}:{2}'.format( - configuration.get('scheme', 'http'), - configuration.get('host', '127.0.0.1'), - configuration.get('port', 15672), - ) - self.node_name = str() - self.vhost = VhostStatsBuilder() - self.collected_vhosts = set() - self.collect_queues_metrics = configuration.get('collect_queues_metrics', False) - self.debug("collect_queues_metrics is {0}".format("enabled" if self.collect_queues_metrics else "disabled")) - if self.collect_queues_metrics: - self.queue = QueueStatsBuilder() - self.collected_queues = set() -``` - -In our use-case, we use the `SimpleService` framework, since there is no framework class that suits our needs. - -You can find below the [framework class reference](#framework-class-reference). - -## An example collector using weather station data - -Let's build a custom Python collector for visualizing data from a weather monitoring station. - -### Determine how to gather metrics data - -This example assumes you can gather metrics data through HTTP requests to a web server, and that the data provided are -numeric values for temperature, humidity and pressure. It also assumes you can get the `min`, `max`, and `average` -values for these metrics. - -### Chart creation - -First, create a single chart that shows the latest temperature metric: - -```python -CHARTS = { - "temp_current": { - "options": ["my_temp", "Temperature", "Celsius", "TEMP", "weather_station.temperature", "line"], - "lines": [ - ["current_temp_id","current_temperature"] - ] - } -} -``` - -## Parse the data to extract or create the actual data to be represented - -Every collector must implement `_get_data`. This method should grab raw data from `_get_raw_data`, -parse it, and return a dictionary where keys are unique dimension names, or `None` if no data is collected. - -For example: -```py -def _get_data(self): - try: - raw = self._get_raw_data().split(" ") - return {'active': int(raw[2])} - except (ValueError, AttributeError): - return None -``` - -In our weather data collector we declare `_get_data` as follows: - -```python - def get_data(self): - #The data dict is basically all the values to be represented - # The entries are in the format: { "dimension": value} - #And each "dimension" should belong to a chart. - data = dict() - - self.populate_data() - - data['current_temperature'] = self.weather_data["temp"] - - return data -``` - -A standard practice would be to either get the data on JSON format or transform them to JSON format. We use a dictionary -to give this format and issue random values to simulate received data. 
- -The following code iterates through the names of the expected values and creates a dictionary with the name of the value -as `key`, and a random value as `value`. - -```python - weather_data=dict() - weather_metrics=[ - "temp","av_temp","min_temp","max_temp", - "humid","av_humid","min_humid","max_humid", - "pressure","av_pressure","min_pressure","max_pressure", - ] - - def populate_data(self): - for metric in self.weather_metrics: - self.weather_data[metric]=random.randint(0,100) -``` - -### Assign the correct values to the charts - -Our chart has a dimension called `current_temp_id`, which should have the temperature value received. - -```python -data['current_temp_id'] = self.weather_data["temp"] -``` - -### Set the order for the charts to be displayed - -```python -ORDER = [ - "temp_current" -] -``` - -### Give the charts data to Netdata for visualization - -```python -return data -``` - -A snapshot of the chart created by this plugin: - -![A snapshot of the chart created by this plugin](https://i.imgur.com/2tR9KvF.png) - -Here's the current source code for the data collector: - -```python -# -*- coding: utf-8 -*- -# Description: howto weather station netdata python.d module -# Author: Panagiotis Papaioannou (papajohn-uop) -# SPDX-License-Identifier: GPL-3.0-or-later - -from bases.FrameworkServices.SimpleService import SimpleService - -import random - -NETDATA_UPDATE_EVERY=1 -priority = 90000 - -ORDER = [ - "temp_current" -] - -CHARTS = { - "temp_current": { - "options": ["my_temp", "Temperature", "Celsius", "TEMP", "weather_station.temperature", "line"], - "lines": [ - ["current_temperature"] - ] - } -} - -class Service(SimpleService): - def __init__(self, configuration=None, name=None): - SimpleService.__init__(self, configuration=configuration, name=name) - self.order = ORDER - self.definitions = CHARTS - #values to show at graphs - self.values=dict() - - @staticmethod - def check(): - return True - - weather_data=dict() - weather_metrics=[ - "temp","av_temp","min_temp","max_temp", - "humid","av_humid","min_humid","max_humid", - "pressure","av_pressure","min_pressure","max_pressure", - ] - - def logMe(self,msg): - self.debug(msg) - - def populate_data(self): - for metric in self.weather_metrics: - self.weather_data[metric]=random.randint(0,100) - - def get_data(self): - #The data dict is basically all the values to be represented - # The entries are in the format: { "dimension": value} - #And each "dimension" should belong to a chart. - data = dict() - - self.populate_data() - - data['current_temperature'] = self.weather_data["temp"] - - return data -``` - -## Add more charts to the existing weather station collector - -To enrich the example, add another chart the collector which to present the humidity metric. - -Add a new entry in the `CHARTS` dictionary with the definition for the new chart. - -```python -CHARTS = { - 'temp_current': { - 'options': ['my_temp', 'Temperature', 'Celsius', 'TEMP', 'weather_station.temperature', 'line'], - 'lines': [ - ['current_temperature'] - ] - }, - 'humid_current': { - 'options': ['my_humid', 'Humidity', '%', 'HUMIDITY', 'weather_station.humidity', 'line'], - 'lines': [ - ['current_humidity'] - ] - } -} -``` - -The data has already been created and parsed by the `weather_data=dict()` function, so you only need to populate the -`current_humidity` dimension `self.weather_data["humid"]`. 
- -```python - data['current_temperature'] = self.weather_data["temp"] - data['current_humidity'] = self.weather_data["humid"] -``` - -Next, put the new `humid_current` chart into the `ORDER` list: - -```python -ORDER = [ - 'temp_current', - 'humid_current' -] -``` - -[Restart Netdata](/packaging/installer/README.md#maintaining-a-netdata-agent-installation) with `sudo systemctl restart netdata` to see the new humidity -chart: - -![A snapshot of the modified chart](https://i.imgur.com/XOeCBmg.png) - -Next, time to add one more chart that visualizes the average, minimum, and maximum temperature values. - -Add a new entry in the `CHARTS` dictionary with the definition for the new chart. Since you want three values -represented in this this chart, add three dimensions. You should also use the same `FAMILY` value in the charts (`TEMP`) -so that those two charts are grouped together. - -```python -CHARTS = { - 'temp_current': { - 'options': ['my_temp', 'Temperature', 'Celsius', 'TEMP', 'weather_station.temperature', 'line'], - 'lines': [ - ['current_temperature'] - ] - }, - 'temp_stats': { - 'options': ['stats_temp', 'Temperature', 'Celsius', 'TEMP', 'weather_station.temperature_stats', 'line'], - 'lines': [ - ['min_temperature'], - ['max_temperature'], - ['avg_temperature'] - ] - }, - 'humid_current': { - 'options': ['my_humid', 'Humidity', '%', 'HUMIDITY', 'weather_station.humidity', 'line'], - 'lines': [ - ['current_humidity'] - ] - } - -} -``` - -As before, initiate new dimensions and add data to them: - -```python - data['current_temperature'] = self.weather_data["temp"] - data['min_temperature'] = self.weather_data["min_temp"] - data['max_temperature'] = self.weather_data["max_temp"] - data['avg_temperature`'] = self.weather_data["av_temp"] - data['current_humidity'] = self.weather_data["humid"] -``` - -Finally, set the order for the `temp_stats` chart: - -```python -ORDER = [ - 'temp_current', - ‘temp_stats’ - 'humid_current' -] -``` - -[Restart Netdata](/packaging/installer/README.md#maintaining-a-netdata-agent-installation) with `sudo systemctl restart netdata` to see the new -min/max/average temperature chart with multiple dimensions: - -![A snapshot of the modified chart](https://i.imgur.com/g7E8lnG.png) - -## Add a configuration file - -The last piece of the puzzle to create a fully robust Python collector is the configuration file. Python.d uses -configuration in [YAML](https://www.tutorialspoint.com/yaml/yaml_basics.htm) format and is used as follows: - -- Create a configuration file in the same directory as the `.chart.py`. Name it `.conf`. -- Define a `job`, which is an instance of the collector. It is useful when you want to collect data from different - sources with different attributes. For example, we could gather data from 2 different weather stations, which use - different temperature measures: Fahrenheit and Celsius. -- You can define many different jobs with the same name, but with different attributes. Netdata will try each job - serially and will stop at the first job that returns data. If multiple jobs have the same name, only one of them can - run. This enables you to define different "ways" to fetch data from a particular data source so that the collector has - more chances to work out-of-the-box. For example, if the data source supports both `HTTP` and `linux socket`, you can - define 2 jobs named `local`, with each using a different method. 
-- Check the `example` collector configuration file on - [GitHub](https://github.com/netdata/netdata/blob/master/src/collectors/python.d.plugin/example/example.conf) to get a - sense of the structure. - -```yaml -weather_station_1: - name: 'Greece' - endpoint: 'https://endpoint_1.com' - port: 67 - type: 'celsius' -weather_station_2: - name: 'Florida USA' - endpoint: 'https://endpoint_2.com' - port: 67 - type: 'fahrenheit' -``` - -Next, access the above configuration variables in the `__init__` function: - -```python -def __init__(self, configuration=None, name=None): - SimpleService.__init__(self, configuration=configuration, name=name) - self.endpoint = self.configuration.get('endpoint', ) -``` - -Because you initiate the `framework class` (e.g `SimpleService.__init__`), the configuration will be available -throughout the whole `Service` class of your module, as `self.configuration`. Finally, note that the `configuration.get` -function takes 2 arguments, one with the name of the configuration field and one with a default value in case it doesn't -find the configuration field. This allows you to define sane defaults for your collector. - -Moreover, when creating the configuration file, create a large comment section that describes the configuration -variables and inform the user about the defaults. For example, take a look at the `example` collector on -[GitHub](https://github.com/netdata/netdata/blob/master/src/collectors/python.d.plugin/example/example.conf). - -You can read more about the configuration file on the [`python.d.plugin` -documentation](/src/collectors/python.d.plugin/README.md). - -You can find the source code for the above examples on [GitHub](https://github.com/papajohn-uop/netdata). - -## Pull Request Checklist for Python Plugins - -Pull requests should be created in https://github.com/netdata/community. - -This is a generic checklist for submitting a new Python plugin for Netdata. It is by no means comprehensive. - -At minimum, to be buildable and testable, the PR needs to include: - -- The module itself, following proper naming conventions: `collectors/python.d.plugin//.chart.py` -- A README.md file for the plugin under `collectors/python.d.plugin/`. -- The configuration file for the module: `collectors/python.d.plugin//.conf`. Python config files are in YAML format, and should include comments describing what options are present. The instructions are also needed in the configuration section of the README.md -- A basic configuration for the plugin in the appropriate global config file: `collectors/python.d.plugin/python.d.conf`, which is also in YAML format. Either add a line that reads `# : yes` if the module is to be enabled by default, or one that reads `: no` if it is to be disabled by default. -- A makefile for the plugin at `collectors/python.d.plugin//Makefile.inc`. Check an existing plugin for what this should look like. -- A line in `collectors/python.d.plugin/Makefile.am` including the above-mentioned makefile. Place it with the other plugin includes (please keep the includes sorted alphabetically). -- Optionally, chart information in `src/web/gui/dashboard_info.js`. This generally involves specifying a name and icon for the section, and may include descriptions for the section or individual charts. -- Optionally, some default alert configurations for your collector in `health/health.d/.conf` and a line adding `.conf` in `health/Makefile.am`. 
- -## Framework class reference - -Every framework class has some user-configurable variables which are specific to this particular class. Those variables should have default values initialized in the child class constructor. - -If module needs some additional user-configurable variable, it can be accessed from the `self.configuration` list and assigned in constructor or custom `check` method. Example: - -```py -def __init__(self, configuration=None, name=None): - UrlService.__init__(self, configuration=configuration, name=name) - try: - self.baseurl = str(self.configuration['baseurl']) - except (KeyError, TypeError): - self.baseurl = "http://localhost:5001" -``` - -Classes implement `_get_raw_data` which should be used to grab raw data. This method usually returns a list of strings. - -### `SimpleService` - -This is last resort class, if a new module cannot be written by using other framework class this one can be used. - -Example: `ceph`, `sensors` - -It is the lowest-level class which implements most of module logic, like: - -- threading -- handling run times -- chart formatting -- logging -- chart creation and updating - -### `LogService` - -Examples: `apache_cache`, `nginx_log`_ - -Variable from config file: `log_path`. - -Object created from this class reads new lines from file specified in `log_path` variable. It will check if file exists and is readable. Also `_get_raw_data` returns list of strings where each string is one line from file specified in `log_path`. - -### `ExecutableService` - -Examples: `exim`, `postfix`_ - -Variable from config file: `command`. - -This allows to execute a shell command in a secure way. It will check for invalid characters in `command` variable and won't proceed if there is one of: - -- '&' -- '|' -- ';' -- '>' -- '\<' - -For additional security it uses python `subprocess.Popen` (without `shell=True` option) to execute command. Command can be specified with absolute or relative name. When using relative name, it will try to find `command` in `PATH` environment variable as well as in `/sbin` and `/usr/sbin`. - -`_get_raw_data` returns list of decoded lines returned by `command`. - -### UrlService - -Examples: `apache`, `nginx`, `tomcat`_ - -Variables from config file: `url`, `user`, `pass`. - -If data is grabbed by accessing service via HTTP protocol, this class can be used. It can handle HTTP Basic Auth when specified with `user` and `pass` credentials. - -Please note that the config file can use different variables according to the specification of each module. - -`_get_raw_data` returns list of utf-8 decoded strings (lines). - -### SocketService - -Examples: `dovecot`, `redis` - -Variables from config file: `unix_socket`, `host`, `port`, `request`. - -Object will try execute `request` using either `unix_socket` or TCP/IP socket with combination of `host` and `port`. This can access unix sockets with SOCK_STREAM or SOCK_DGRAM protocols and TCP/IP sockets in version 4 and 6 with SOCK_STREAM setting. - -Sockets are accessed in non-blocking mode with 15 second timeout. - -After every execution of `_get_raw_data` socket is closed, to prevent this module needs to set `_keep_alive` variable to `True` and implement custom `_check_raw_data` method. - -`_check_raw_data` should take raw data and return `True` if all data is received otherwise it should return `False`. Also it should do it in fast and efficient way. 
diff --git a/docs/developer-and-contributor-corner/python-collector.txt b/docs/developer-and-contributor-corner/python-collector.txt
new file mode 100644
index 000000000..f846b347b
--- /dev/null
+++ b/docs/developer-and-contributor-corner/python-collector.txt
@@ -0,0 +1,629 @@
+# Develop a custom data collector in Python
+
+The Netdata Agent uses [data collectors](/src/collectors/README.md) to
+fetch metrics from hundreds of system, container, and service endpoints. While the Netdata team and community have built
+[powerful collectors](/src/collectors/COLLECTORS.md) for most system, container,
+and service/application endpoints, some custom applications can't be monitored by default.
+
+In this tutorial, you'll learn how to leverage the [Python programming language](https://www.python.org/) to build a
+custom data collector for the Netdata Agent. Follow along with your own dataset, using the techniques and best practices
+covered here, or use the included examples for collecting and organizing either random or weather data.
+
+## Disclaimer
+
+If you're comfortable with Golang, consider instead writing a module for the [go.d.plugin](https://github.com/netdata/go.d.plugin).
+Golang is more performant, easier to maintain, and simpler for users, since it doesn't require a particular runtime on the node to
+execute; Python plugins require Python to be installed on the machine in order to run. Netdata uses Go as the platform of choice for
+production-grade collectors.
+
+We generally do not accept contributions of Python modules to the GitHub project netdata/netdata. If you write a Python collector and
+want to make it available for other users, you should create the pull request in https://github.com/netdata/community.
+
+## What you need to get started
+
+- A physical or virtual Linux system, which we'll call a _node_.
+- A working [installation](/packaging/installer/README.md) of the Netdata monitoring Agent.
+
+### Quick start
+
+For a quick start, you can look at the
+[example plugin](https://raw.githubusercontent.com/netdata/netdata/master/src/collectors/python.d.plugin/example/example.chart.py).
+
+**Note**: If you are working 'locally' on a new collector and would like to run it in an already installed and running
+Netdata (as opposed to having to install Netdata from source again with your new changes), you can copy the relevant
+file to where Netdata expects it, and then either run `sudo systemctl restart netdata` to have it picked up and used by
+Netdata, or run the updated collector in debug mode by following a process like the one below. This assumes you have
+[installed Netdata from a GitHub fork](/packaging/installer/methods/manual.md) that you
+have made to do your development on.
+
+```bash
+# clone your fork (done once at the start but shown here for clarity)
+#git clone --branch my-example-collector https://github.com/mygithubusername/netdata.git --depth=100 --recursive
+# go into your netdata source folder
+cd netdata
+# git pull your latest changes (assuming you built from a fork you are using to develop on)
+git pull
+# instead of running the installer we can just copy over the updated collector files
+#sudo ./netdata-installer.sh --dont-wait
+# copy over the file you have updated locally (pretending we are working on the 'example' collector)
+sudo cp collectors/python.d.plugin/example/example.chart.py /usr/libexec/netdata/python.d/
+# become user netdata
+sudo su -s /bin/bash netdata
+# run your updated collector in debug mode to see if it works without having to reinstall netdata
+/usr/libexec/netdata/plugins.d/python.d.plugin example debug trace nolock
+```
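+
+A successful debug run prints the external-plugin protocol on stdout. As a quick sanity check (a suggestion, not part of the original workflow), you could filter for the `CHART` and `SET` lines that carry the chart definitions and collected values:
+
+```bash
+# optional: confirm the collector is emitting chart definitions and values
+/usr/libexec/netdata/plugins.d/python.d.plugin example debug trace nolock | grep -E '^(CHART|SET)'
+```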
+
+## Jobs and elements of a Python collector
+
+A Python collector for Netdata is a Python script that gathers data from an external source and transforms this data
+into charts to be displayed on the Netdata dashboard. The basic jobs of the plugin are:
+
+- Gather the data from the service/application.
+- Create the required charts.
+- Parse the data to extract or create the actual data to be represented.
+- Assign the correct values to the charts.
+- Set the order for the charts to be displayed.
+- Give the charts data to Netdata for visualization.
+
+The basic elements of a Netdata collector are:
+
+- `ORDER[]`: A list containing the charts to be displayed.
+- `CHARTS{}`: A dictionary containing the details for the charts to be displayed.
+- `data{}`: A dictionary containing the values to be displayed.
+- `get_data()`: The basic function of the plugin, which returns the correct values to Netdata.
+
+**Note**: All of these names are explained in more detail in the
+[External Plugins Documentation](/src/plugins.d/README.md).
+Parameters like `priority` and `update_every` mentioned in that documentation are handled by the `python.d.plugin`,
+not by each collection module.
+
+Let's walk through these jobs and elements independently first, then apply them to example Python code.
+
+### Determine how to gather metrics data
+
+Netdata can collect data from any program that can print to stdout. Common input sources for collectors can be log files,
+HTTP requests, executables, and more. While this tutorial will offer some example inputs, your custom application will
+have different inputs and metrics.
+
+A great deal of the work in developing a Netdata collector is investigating the target application and understanding
+which metrics it exposes and how to collect them.
+
+### Create charts
+
+For the data to be represented in the Netdata dashboard, you need to create charts. Charts (in general) are defined by
+several characteristics: title, legend, units, type, and presented values. Each chart is represented as a dictionary
+entry:
+
+```python
+chart = {
+    "chart_name":
+    {
+        "options": [option_list],
+        "lines": [
+            [dimension_list]
+        ]
+    }
+ }
+```
+
+Use the `options` field to set the chart's options, which is a list in the form `options: [name, title, units, family,
+context, charttype]`, where:
+
+- `name`: The name of the chart.
+- `title`: The title to be displayed in the chart.
+- `units`: The units for this chart.
+- `family`: An identifier used to group charts together (can be null).
+- `context`: An identifier used to group contextually similar charts together. The best practice is to provide a context
+  that is `A.B`, with `A` being the name of the collector, and `B` being the name of the specific metric.
+- `charttype`: Either `line`, `area`, or `stacked`. If null, `line` is the default value.
+
+You can read more about `family` and `context` in the [Netdata Charts](/docs/dashboards-and-charts/netdata-charts.md) doc.
+
+Once the chart has been defined, you should define the dimensions of the chart. Dimensions are basically the metrics to
+be represented in this chart, and each chart can have more than one dimension. In order to define the dimensions, the
+"lines" list should be filled in with the required dimensions. Each dimension is a list:
+
+`dimension: [id, name, algorithm, multiplier, divisor]`
+
+- `id`: The id of the dimension. Mandatory unique field (string) required in order to set a value.
+- `name`: The name to be presented in the chart. If null, the id will be used.
+- `algorithm`: Can be absolute or incremental. If null, absolute is used. Incremental shows the difference from the
+  previous value.
+- `multiplier`: an integer value by which to multiply the collected value; if null, 1 is used.
+- `divisor`: an integer value by which to divide the collected value; if null, 1 is used.
+
+The multiplier/divisor fields are used in cases where the value to be displayed should be decimal, since Netdata only
+gathers integer values.
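+
+For instance, here is a minimal sketch of a dimension that uses the `divisor` field. It assumes a hypothetical sensor that reports temperature in milli-Celsius; dividing by 1000 presents the value as Celsius with decimal precision:
+
+```python
+# hypothetical dimension: the collected value is in milli-Celsius (e.g. 21500),
+# so dividing by 1000 displays it as 21.500 degrees Celsius
+["temp_milli_id", "temperature", "absolute", 1, 1000]
+```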
+
+### Parse the data to extract or create the actual data to be represented
+
+Once the data is received, your collector should process it in order to get the values required. If, for example, the
+received data is a JSON string, you should parse it to extract the values to be used for the charts.
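+
+As an illustration, here is a minimal sketch of a `_get_data` implementation that parses a JSON response; the payload and field names are hypothetical and not part of the example collector below:
+
+```python
+import json
+
+def _get_data(self):
+    # _get_raw_data is provided by the framework class and returns a list of strings
+    raw = self._get_raw_data()
+    if raw is None:
+        return None
+    try:
+        # hypothetical payload: {"temp": 22, "humid": 61}
+        parsed = json.loads(''.join(raw))
+        return {'current_temperature': int(parsed['temp'])}
+    except (ValueError, KeyError):
+        return None
+```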
+
+### Assign the correct values to the charts
+
+Once you have processed your data and have the required values, you need to assign those values to the charts you created.
+This is done using the `data` dictionary, which is in the form:
+
+`"data": {dimension_id: value}`, where:
+
+- `dimension_id`: The id of a defined dimension in a created chart.
+- `value`: The numerical value to associate with this dimension.
+
+### Set the order for the charts to be displayed
+
+Next, set the order of chart appearance with the `ORDER` list, which is in the form:
+
+`"ORDER": [chart_name_1, chart_name_2, …, chart_name_X]`, where:
+
+- `chart_name_x`: the name of the chart to be shown in position X.
+
+### Give the charts data to Netdata for visualization
+
+Our plugin should just return the `data` dictionary. If everything is set correctly, the charts should be updated with the
+correct values.
+
+## Framework classes
+
+Every module needs to implement its own `Service` class. This class should inherit from one of the framework classes:
+
+- `SimpleService`
+- `UrlService`
+- `SocketService`
+- `LogService`
+- `ExecutableService`
+
+It also needs to invoke the parent class constructor in a specific way, as well as assign global variables to class variables.
+
+For example, the snippet below is from the
+[RabbitMQ collector](https://github.com/netdata/netdata/blob/91f3268e9615edd393bd43de4ad8068111024cc9/collectors/python.d.plugin/rabbitmq/rabbitmq.chart.py#L273).
+This collector uses an HTTP endpoint and uses the `UrlService` framework class, which only needs to define an HTTP
+endpoint for data collection.
+
+```python
+class Service(UrlService):
+    def __init__(self, configuration=None, name=None):
+        UrlService.__init__(self, configuration=configuration, name=name)
+        self.order = ORDER
+        self.definitions = CHARTS
+        self.url = '{0}://{1}:{2}'.format(
+            configuration.get('scheme', 'http'),
+            configuration.get('host', '127.0.0.1'),
+            configuration.get('port', 15672),
+        )
+        self.node_name = str()
+        self.vhost = VhostStatsBuilder()
+        self.collected_vhosts = set()
+        self.collect_queues_metrics = configuration.get('collect_queues_metrics', False)
+        self.debug("collect_queues_metrics is {0}".format("enabled" if self.collect_queues_metrics else "disabled"))
+        if self.collect_queues_metrics:
+            self.queue = QueueStatsBuilder()
+            self.collected_queues = set()
+```
+
+In our use case, we use the `SimpleService` framework class, since none of the more specialized framework classes suits our needs.
+
+You can find the [framework class reference](#framework-class-reference) below.
+
+## An example collector using weather station data
+
+Let's build a custom Python collector for visualizing data from a weather monitoring station.
+
+### Determine how to gather metrics data
+
+This example assumes you can gather metrics data through HTTP requests to a web server, and that the data provided are
+numeric values for temperature, humidity, and pressure. It also assumes you can get the `min`, `max`, and `average`
+values for these metrics.
+
+### Chart creation
+
+First, create a single chart that shows the latest temperature metric:
+
+```python
+CHARTS = {
+    "temp_current": {
+        "options": ["my_temp", "Temperature", "Celsius", "TEMP", "weather_station.temperature", "line"],
+        "lines": [
+            ["current_temp_id", "current_temperature"]
+        ]
+    }
+}
+```
+
+### Parse the data to extract or create the actual data to be represented
+
+Every collector must implement `_get_data`. This method should grab raw data from `_get_raw_data`,
+parse it, and return a dictionary where keys are unique dimension names, or `None` if no data is collected.
+
+For example:
+
+```py
+def _get_data(self):
+    try:
+        raw = self._get_raw_data().split(" ")
+        return {'active': int(raw[2])}
+    except (ValueError, AttributeError):
+        return None
+```
+
+In our weather data collector we declare `_get_data` as follows:
+
+```python
+    def get_data(self):
+        # The data dict holds all the values to be represented.
+        # The entries are in the format: { "dimension": value }
+        # Each "dimension" should belong to a chart.
+        data = dict()
+
+        self.populate_data()
+
+        data['current_temperature'] = self.weather_data["temp"]
+
+        return data
+```
+
+A standard practice is to either receive the data in JSON format or transform it into JSON. Here we use a dictionary
+to model that format, and issue random values to simulate received data.
+
+The following code iterates through the names of the expected values and creates a dictionary with the name of the value
+as `key`, and a random value as `value`.
+
+```python
+    weather_data = dict()
+    weather_metrics = [
+        "temp", "av_temp", "min_temp", "max_temp",
+        "humid", "av_humid", "min_humid", "max_humid",
+        "pressure", "av_pressure", "min_pressure", "max_pressure",
+    ]
+
+    def populate_data(self):
+        for metric in self.weather_metrics:
+            self.weather_data[metric] = random.randint(0, 100)
+```
+
+### Assign the correct values to the charts
+
+Our chart has a dimension called `current_temp_id`, which should hold the temperature value received.
+
+```python
+data['current_temp_id'] = self.weather_data["temp"]
+```
+
+### Set the order for the charts to be displayed
+
+```python
+ORDER = [
+    "temp_current"
+]
+```
+
+### Give the charts data to Netdata for visualization
+
+```python
+return data
+```
+
+A snapshot of the chart created by this plugin:
+
+![A snapshot of the chart created by this plugin](https://i.imgur.com/2tR9KvF.png)
+
+Here's the current source code for the data collector:
+
+```python
+# -*- coding: utf-8 -*-
+# Description: howto weather station netdata python.d module
+# Author: Panagiotis Papaioannou (papajohn-uop)
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+from bases.FrameworkServices.SimpleService import SimpleService
+
+import random
+
+NETDATA_UPDATE_EVERY = 1
+priority = 90000
+
+ORDER = [
+    "temp_current"
+]
+
+CHARTS = {
+    "temp_current": {
+        "options": ["my_temp", "Temperature", "Celsius", "TEMP", "weather_station.temperature", "line"],
+        "lines": [
+            ["current_temperature"]
+        ]
+    }
+}
+
+class Service(SimpleService):
+    def __init__(self, configuration=None, name=None):
+        SimpleService.__init__(self, configuration=configuration, name=name)
+        self.order = ORDER
+        self.definitions = CHARTS
+        # values to show in the charts
+        self.values = dict()
+
+    @staticmethod
+    def check():
+        return True
+
+    weather_data = dict()
+    weather_metrics = [
+        "temp", "av_temp", "min_temp", "max_temp",
+        "humid", "av_humid", "min_humid", "max_humid",
+        "pressure", "av_pressure", "min_pressure", "max_pressure",
+    ]
+
+    def logMe(self, msg):
+        self.debug(msg)
+
+    def populate_data(self):
+        for metric in self.weather_metrics:
+            self.weather_data[metric] = random.randint(0, 100)
+
+    def get_data(self):
+        # The data dict holds all the values to be represented.
+        # The entries are in the format: { "dimension": value }
+        # Each "dimension" should belong to a chart.
+        data = dict()
+
+        self.populate_data()
+
+        data['current_temperature'] = self.weather_data["temp"]
+
+        return data
+```
+
+## Add more charts to the existing weather station collector
+
+To enrich the example, add another chart to the collector to present the humidity metric.
+
+Add a new entry in the `CHARTS` dictionary with the definition for the new chart.
+
+```python
+CHARTS = {
+    'temp_current': {
+        'options': ['my_temp', 'Temperature', 'Celsius', 'TEMP', 'weather_station.temperature', 'line'],
+        'lines': [
+            ['current_temperature']
+        ]
+    },
+    'humid_current': {
+        'options': ['my_humid', 'Humidity', '%', 'HUMIDITY', 'weather_station.humidity', 'line'],
+        'lines': [
+            ['current_humidity']
+        ]
+    }
+}
+```
+
+The data has already been created by the `populate_data()` method, so you only need to populate the
+`current_humidity` dimension with `self.weather_data["humid"]`:
+
+```python
+    data['current_temperature'] = self.weather_data["temp"]
+    data['current_humidity'] = self.weather_data["humid"]
+```
+
+Next, put the new `humid_current` chart into the `ORDER` list:
+
+```python
+ORDER = [
+    'temp_current',
+    'humid_current'
+]
+```
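+
+Before restarting, you can optionally re-run the collector in debug mode, as in the quick start, to confirm the new dimension emits values. This assumes your file has been copied to `/usr/libexec/netdata/python.d/` as the `example` collector from the quick start:
+
+```bash
+# become user netdata, then run the collector once in debug mode
+sudo su -s /bin/bash netdata
+/usr/libexec/netdata/plugins.d/python.d.plugin example debug trace nolock
+```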
+
+[Restart Netdata](/docs/netdata-agent/start-stop-restart.md) to see the new humidity
+chart:
+
+![A snapshot of the modified chart](https://i.imgur.com/XOeCBmg.png)
+
+Next, it's time to add one more chart that visualizes the average, minimum, and maximum temperature values.
+
+Add a new entry in the `CHARTS` dictionary with the definition for the new chart. Since you want three values
+represented in this chart, add three dimensions. You should also use the same `family` value in the charts (`TEMP`)
+so that those two charts are grouped together.
+
+```python
+CHARTS = {
+    'temp_current': {
+        'options': ['my_temp', 'Temperature', 'Celsius', 'TEMP', 'weather_station.temperature', 'line'],
+        'lines': [
+            ['current_temperature']
+        ]
+    },
+    'temp_stats': {
+        'options': ['stats_temp', 'Temperature', 'Celsius', 'TEMP', 'weather_station.temperature_stats', 'line'],
+        'lines': [
+            ['min_temperature'],
+            ['max_temperature'],
+            ['avg_temperature']
+        ]
+    },
+    'humid_current': {
+        'options': ['my_humid', 'Humidity', '%', 'HUMIDITY', 'weather_station.humidity', 'line'],
+        'lines': [
+            ['current_humidity']
+        ]
+    }
+}
+```
+
+As before, initialize the new dimensions and add data to them:
+
+```python
+    data['current_temperature'] = self.weather_data["temp"]
+    data['min_temperature'] = self.weather_data["min_temp"]
+    data['max_temperature'] = self.weather_data["max_temp"]
+    data['avg_temperature'] = self.weather_data["av_temp"]
+    data['current_humidity'] = self.weather_data["humid"]
+```
+
+Finally, set the order for the `temp_stats` chart:
+
+```python
+ORDER = [
+    'temp_current',
+    'temp_stats',
+    'humid_current'
+]
+```
+
+[Restart Netdata](/docs/netdata-agent/start-stop-restart.md) to see the new min/max/average temperature chart with multiple dimensions:
+
+![A snapshot of the modified chart](https://i.imgur.com/g7E8lnG.png)
+
+## Add a configuration file
+
+The last piece of the puzzle to create a fully robust Python collector is the configuration file. Python.d uses
+configuration in [YAML](https://www.tutorialspoint.com/yaml/yaml_basics.htm) format, which is used as follows:
+
+- Create a configuration file in the same directory as the `<module_name>.chart.py`. Name it `<module_name>.conf`.
+- Define a `job`, which is an instance of the collector. It is useful when you want to collect data from different
+  sources with different attributes. For example, we could gather data from 2 different weather stations, which use
+  different temperature measures: Fahrenheit and Celsius.
+- You can define many different jobs with the same name, but with different attributes. Netdata will try each job
+  serially and will stop at the first job that returns data. If multiple jobs have the same name, only one of them can
+  run. This enables you to define different "ways" to fetch data from a particular data source so that the collector has
+  more chances to work out-of-the-box. For example, if the data source supports both `HTTP` and `linux socket`, you can
+  define 2 jobs named `local`, with each using a different method.
+- Check the `example` collector configuration file on
+  [GitHub](https://github.com/netdata/netdata/blob/master/src/collectors/python.d.plugin/example/example.conf) to get a
+  sense of the structure.
+
+```yaml
+weather_station_1:
+  name: 'Greece'
+  endpoint: 'https://endpoint_1.com'
+  port: 67
+  type: 'celsius'
+weather_station_2:
+  name: 'Florida USA'
+  endpoint: 'https://endpoint_2.com'
+  port: 67
+  type: 'fahrenheit'
+```
+
+Next, access the above configuration variables in the `__init__` function:
+
+```python
+def __init__(self, configuration=None, name=None):
+    SimpleService.__init__(self, configuration=configuration, name=name)
+    self.endpoint = self.configuration.get('endpoint', 'https://endpoint_1.com')
+```
+
+Because you invoke the framework class constructor (e.g., `SimpleService.__init__`), the configuration will be available
+throughout the whole `Service` class of your module, as `self.configuration`. Finally, note that the `configuration.get`
+function takes 2 arguments: the name of the configuration field, and a default value in case it doesn't
+find the configuration field. This allows you to define sane defaults for your collector.
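+
+As a fuller sketch, using the hypothetical weather-station fields from the YAML above, each field can be read with its own fallback:
+
+```python
+def __init__(self, configuration=None, name=None):
+    SimpleService.__init__(self, configuration=configuration, name=name)
+    # the second argument to get() is the fallback used when the field is absent
+    self.endpoint = self.configuration.get('endpoint', 'https://endpoint_1.com')
+    self.port = self.configuration.get('port', 67)
+    self.type = self.configuration.get('type', 'celsius')
+```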
+
+Moreover, when creating the configuration file, create a large comment section that describes the configuration
+variables and informs the user about the defaults. For example, take a look at the `example` collector on
+[GitHub](https://github.com/netdata/netdata/blob/master/src/collectors/python.d.plugin/example/example.conf).
+
+You can read more about the configuration file in the [`python.d.plugin`
+documentation](/src/collectors/python.d.plugin/README.md).
+
+You can find the source code for the above examples on [GitHub](https://github.com/papajohn-uop/netdata).
+
+## Pull Request Checklist for Python Plugins
+
+Pull requests should be created in https://github.com/netdata/community.
+
+This is a generic checklist for submitting a new Python plugin for Netdata. It is by no means comprehensive.
+
+At minimum, to be buildable and testable, the PR needs to include:
+
+- The module itself, following proper naming conventions: `collectors/python.d.plugin/<module_name>/<module_name>.chart.py`
+- A README.md file for the plugin under `collectors/python.d.plugin/<module_name>/`.
+- The configuration file for the module: `collectors/python.d.plugin/<module_name>/<module_name>.conf`. Python config files are in YAML format, and should include comments describing what options are present. The instructions are also needed in the configuration section of the README.md.
+- A basic configuration for the plugin in the appropriate global config file: `collectors/python.d.plugin/python.d.conf`, which is also in YAML format. Either add a line that reads `# <module_name>: yes` if the module is to be enabled by default, or one that reads `<module_name>: no` if it is to be disabled by default.
+- A makefile for the plugin at `collectors/python.d.plugin/<module_name>/Makefile.inc`. Check an existing plugin for what this should look like.
+- A line in `collectors/python.d.plugin/Makefile.am` including the above-mentioned makefile. Place it with the other plugin includes (please keep the includes sorted alphabetically).
+- Optionally, chart information in `src/web/gui/dashboard_info.js`. This generally involves specifying a name and icon for the section, and may include descriptions for the section or individual charts.
+- Optionally, some default alert configurations for your collector in `health/health.d/<module_name>.conf` and a line adding `<module_name>.conf` in `health/Makefile.am`.
+
+## Framework class reference
+
+Every framework class has some user-configurable variables which are specific to this particular class. Those variables should have default values initialized in the child class constructor.
+
+If a module needs an additional user-configurable variable, it can be read from the `self.configuration` dictionary and assigned in the constructor or in a custom `check` method. Example:
+
+```py
+def __init__(self, configuration=None, name=None):
+    UrlService.__init__(self, configuration=configuration, name=name)
+    try:
+        self.baseurl = str(self.configuration['baseurl'])
+    except (KeyError, TypeError):
+        self.baseurl = "http://localhost:5001"
+```
+
+Classes implement `_get_raw_data`, which should be used to grab raw data. This method usually returns a list of strings.
+
+### `SimpleService`
+
+This is the last-resort class: if a new module cannot be written using any other framework class, this one can be used.
+
+Examples: `ceph`, `sensors`
+
+It is the lowest-level class, which implements most of the module logic, like:
+
+- threading
+- handling run times
+- chart formatting
+- logging
+- chart creation and updating
+
+### `LogService`
+
+Examples: `apache_cache`, `nginx_log`
+
+Variable from config file: `log_path`.
+
+An object created from this class reads new lines from the file specified by the `log_path` variable. It checks that the file exists and is readable, and `_get_raw_data` returns a list of strings, where each string is one line from that file.
+
+### `ExecutableService`
+
+Examples: `exim`, `postfix`
+
+Variable from config file: `command`.
+
+This class allows executing a shell command in a secure way. It checks the `command` variable for invalid characters and won't proceed if it contains any of:
+
+- '&'
+- '|'
+- ';'
+- '>'
+- '\<'
+
+For additional security, it uses Python's `subprocess.Popen` (without the `shell=True` option) to execute the command. The command can be specified with an absolute or relative name. When using a relative name, it will try to find `command` in the `PATH` environment variable, as well as in `/sbin` and `/usr/sbin`.
+
+`_get_raw_data` returns a list of decoded lines returned by `command`.
+
+### `UrlService`
+
+Examples: `apache`, `nginx`, `tomcat`
+
+Variables from config file: `url`, `user`, `pass`.
+
+If data is fetched by accessing a service over HTTP, this class can be used. It can handle HTTP Basic Auth when `user` and `pass` credentials are specified.
+
+Please note that the config file can use different variables according to the specification of each module.
+
+`_get_raw_data` returns a list of UTF-8 decoded strings (lines).
+
+### `SocketService`
+
+Examples: `dovecot`, `redis`
+
+Variables from config file: `unix_socket`, `host`, `port`, `request`.
+
+The object will try to execute `request` using either `unix_socket` or a TCP/IP socket with the combination of `host` and `port`. It can access Unix sockets with the SOCK_STREAM or SOCK_DGRAM protocols, and IPv4/IPv6 TCP/IP sockets with the SOCK_STREAM setting.
+
+Sockets are accessed in non-blocking mode with a 15-second timeout.
+
+After every execution of `_get_raw_data` the socket is closed. To prevent this, the module needs to set the `_keep_alive` variable to `True` and implement a custom `_check_raw_data` method.
+
+`_check_raw_data` should take the raw data and return `True` if all data has been received; otherwise, it should return `False`. It should also do this quickly and efficiently.
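+
+For instance, a minimal sketch of a custom `_check_raw_data`, assuming a hypothetical line-oriented service whose response ends with a newline:
+
+```py
+def _check_raw_data(self, data):
+    # return True only when the response is complete;
+    # here, completeness is signaled by a terminating newline
+    return data.endswith('\n')
+```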
diff --git a/docs/developer-and-contributor-corner/raspberry-pi-anomaly-detection.md b/docs/developer-and-contributor-corner/raspberry-pi-anomaly-detection.md
deleted file mode 100644
index 41cf007eb..000000000
--- a/docs/developer-and-contributor-corner/raspberry-pi-anomaly-detection.md
+++ /dev/null
@@ -1,96 +0,0 @@
-# Anomaly detection for RPi monitoring
-
-Learn how to use a low-overhead machine learning algorithm alongside Netdata to detect anomalous metrics on a Raspberry Pi.
-
-We love IoT and edge at Netdata, we also love machine learning. Even better if we can combine the two to ease the pain
-of monitoring increasingly complex systems.
-
-We recently explored what might be involved in enabling our Python-based [anomalies
-collector](/src/collectors/python.d.plugin/anomalies/README.md) on a Raspberry Pi. To our delight, it's actually quite
-straightforward!
-
-Read on to learn all the steps and enable unsupervised anomaly detection on your on Raspberry Pi(s).
-
-> Spoiler: It's just a couple of extra commands that will make you feel like a pro.
-
-## What you need to get started
-
-- A Raspberry Pi running Raspbian, which we'll call a _node_.
-- The [open-source Netdata](https://github.com/netdata/netdata) monitoring agent. If you don't have it installed on your
-  node yet, [get started now](/packaging/installer/README.md).
-
-## Install dependencies
-
-First make sure Netdata is using Python 3 when it runs Python-based data collectors.
-
-Next, open `netdata.conf` using [`edit-config`](/docs/netdata-agent/configuration/README.md#edit-netdataconf)
-from within the [Netdata config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory). Scroll down to the
-`[plugin:python.d]` section to pass in the `-ppython3` command option.
-
-```conf
-[plugin:python.d]
-    # update every = 1
-    command options = -ppython3
-```
-
-Next, install some of the underlying libraries used by the Python packages the collector depends upon.
-
-```bash
-sudo apt install llvm-9 libatlas3-base libgfortran5 libatlas-base-dev
-```
-
-Now you're ready to install the Python packages used by the collector itself. First, become the `netdata` user.
-
-```bash
-sudo su -s /bin/bash netdata
-```
-
-Then pass in the location to find `llvm` as an environment variable for `pip3`.
-
-```bash
-LLVM_CONFIG=llvm-config-9 pip3 install --user llvmlite numpy==1.20.1 netdata-pandas==0.0.38 numba==0.50.1 scikit-learn==0.23.2 pyod==0.8.3
-```
-
-## Enable the anomalies collector
-
-Now you're ready to enable the collector and [restart Netdata](/packaging/installer/README.md#maintaining-a-netdata-agent-installation).
-
-```bash
-sudo ./edit-config python.d.conf
-
-# restart netdata
-sudo systemctl restart netdata
-```
-
-And that should be it! Wait a minute or two, refresh your Netdata dashboard, you should see the default anomalies
-charts under the **Anomalies** section in the dashboard's menu.
-
-![Anomaly detection on the Raspberry
-Pi](https://user-images.githubusercontent.com/1153921/110149717-9d749c00-7d9b-11eb-853c-e041a36f0a41.png)
-
-## Overhead on system
-
-Of course one of the most important considerations when trying to do anomaly detection at the edge (as opposed to in a
-centralized cloud somewhere) is the resource utilization impact of running a monitoring tool.
-
-With the default configuration, the anomalies collector uses about 6.5% of CPU at each run. During the retraining step,
-CPU utilization jumps to between 20-30% for a few seconds, but you can [configure
-retraining](/src/collectors/python.d.plugin/anomalies/README.md#configuration) to happen less often if you wish.
-
-![CPU utilization of anomaly detection on the Raspberry
-Pi](https://user-images.githubusercontent.com/1153921/110149718-9d749c00-7d9b-11eb-9af8-46e2032cd1d0.png)
-
-In terms of the runtime of the collector, it was averaging around 250ms during each prediction step, jumping to about
-8-10 seconds during a retraining step. This jump equates only to a small gap in the anomaly charts for a few seconds.
-
-![Execution time of anomaly detection on the Raspberry
-Pi](https://user-images.githubusercontent.com/1153921/110149715-9cdc0580-7d9b-11eb-826d-faf6f620621a.png)
-
-The last consideration then is the amount of RAM the collector needs to store both the models and some of the data
-during training. By default, the anomalies collector, along with all other running Python-based collectors, uses about
-100MB of system memory.
-
-![RAM utilization of anomaly detection on the Raspberry
-Pi](https://user-images.githubusercontent.com/1153921/110149720-9e0d3280-7d9b-11eb-883d-b1d4d9b9b5e1.png)
-
-
diff --git a/docs/developer-and-contributor-corner/raspberry-pi-anomaly-detection.txt b/docs/developer-and-contributor-corner/raspberry-pi-anomaly-detection.txt
new file mode 100644
index 000000000..9bdacf274
--- /dev/null
+++ b/docs/developer-and-contributor-corner/raspberry-pi-anomaly-detection.txt
@@ -0,0 +1,96 @@
+# Anomaly detection for RPi monitoring
+
+Learn how to use a low-overhead machine learning algorithm alongside Netdata to detect anomalous metrics on a Raspberry Pi.
+
+We love IoT and edge at Netdata, and we also love machine learning. Even better if we can combine the two to ease the pain
+of monitoring increasingly complex systems.
+
+We recently explored what might be involved in enabling our Python-based [anomalies
+collector](/src/collectors/python.d.plugin/anomalies/README.md) on a Raspberry Pi. To our delight, it's actually quite
+straightforward!
+
+Read on to learn all the steps and enable unsupervised anomaly detection on your Raspberry Pi(s).
+
+> Spoiler: It's just a couple of extra commands that will make you feel like a pro.
+
+## What you need to get started
+
+- A Raspberry Pi running Raspbian, which we'll call a _node_.
+- The [open-source Netdata](https://github.com/netdata/netdata) monitoring agent. If you don't have it installed on your
+  node yet, [get started now](/packaging/installer/README.md).
+
+## Install dependencies
+
+First, make sure Netdata is using Python 3 when it runs Python-based data collectors.
+
+Next, open `netdata.conf` using [`edit-config`](/docs/netdata-agent/configuration/README.md#edit-a-configuration-file-using-edit-config)
+from within the [Netdata config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory). Scroll down to the
+`[plugin:python.d]` section to pass in the `-ppython3` command option.
+
+```text
+[plugin:python.d]
+    # update every = 1
+    command options = -ppython3
+```
+
+Next, install some of the underlying libraries used by the Python packages the collector depends upon.
+
+```bash
+sudo apt install llvm-9 libatlas3-base libgfortran5 libatlas-base-dev
+```
+
+Now you're ready to install the Python packages used by the collector itself. First, become the `netdata` user.
+
+```bash
+sudo su -s /bin/bash netdata
+```
+
+Then pass in the location to find `llvm` as an environment variable for `pip3`.
+
+```bash
+LLVM_CONFIG=llvm-config-9 pip3 install --user llvmlite numpy==1.20.1 netdata-pandas==0.0.38 numba==0.50.1 scikit-learn==0.23.2 pyod==0.8.3
+```
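+
+As an optional sanity check (not part of the original steps), you could confirm the packages import cleanly while still running as the `netdata` user:
+
+```bash
+# should print "ok" if the collector's dependencies are importable
+python3 -c 'import llvmlite, numba, sklearn, pyod; print("ok")'
+```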
+
+## Enable the anomalies collector
+
+Now you're ready to enable the collector and restart Netdata.
+
+```bash
+sudo ./edit-config python.d.conf
+
+# restart netdata
+sudo systemctl restart netdata
+```
+
+And that should be it! Wait a minute or two, refresh your Netdata dashboard, and you should see the default anomalies
+charts under the **Anomalies** section in the dashboard's menu.
+
+![Anomaly detection on the Raspberry
+Pi](https://user-images.githubusercontent.com/1153921/110149717-9d749c00-7d9b-11eb-853c-e041a36f0a41.png)
+
+## Overhead on system
+
+Of course, one of the most important considerations when trying to do anomaly detection at the edge (as opposed to in a
+centralized cloud somewhere) is the resource utilization impact of running a monitoring tool.
+
+With the default configuration, the anomalies collector uses about 6.5% of CPU at each run. During the retraining step,
+CPU utilization jumps to between 20-30% for a few seconds, but you can [configure
+retraining](/src/collectors/python.d.plugin/anomalies/README.md#configuration) to happen less often if you wish.
+
+![CPU utilization of anomaly detection on the Raspberry
+Pi](https://user-images.githubusercontent.com/1153921/110149718-9d749c00-7d9b-11eb-9af8-46e2032cd1d0.png)
+
+In terms of the runtime of the collector, it was averaging around 250ms during each prediction step, jumping to about
+8-10 seconds during a retraining step. This jump translates only to a small gap in the anomaly charts for a few seconds.
+
+![Execution time of anomaly detection on the Raspberry
+Pi](https://user-images.githubusercontent.com/1153921/110149715-9cdc0580-7d9b-11eb-826d-faf6f620621a.png)
+
+The last consideration, then, is the amount of RAM the collector needs to store both the models and some of the data
+during training. By default, the anomalies collector, along with all other running Python-based collectors, uses about
+100MB of system memory.
+
+![RAM utilization of anomaly detection on the Raspberry
+Pi](https://user-images.githubusercontent.com/1153921/110149720-9e0d3280-7d9b-11eb-883d-b1d4d9b9b5e1.png)
+
+
diff --git a/docs/developer-and-contributor-corner/running-through-cf-tunnels.md b/docs/developer-and-contributor-corner/running-through-cf-tunnels.md
index 3179d5805..588740bc9 100644
--- a/docs/developer-and-contributor-corner/running-through-cf-tunnels.md
+++ b/docs/developer-and-contributor-corner/running-through-cf-tunnels.md
@@ -102,7 +102,7 @@ You can edit the configuration file using the `edit-config` script from the Netd
     destination = tcp:127.0.0.1:19999
 ```
 
-[Restart the Agents](/packaging/installer/README.md#maintaining-a-netdata-agent-installation), and you are done!
+[Restart the Agents](/docs/netdata-agent/start-stop-restart.md), and you are done!
 
 You should now be able to have a Local Dashboard that gets its metrics from Child instances, running through Cloudflare tunnels.
 
diff --git a/docs/developer-and-contributor-corner/style-guide.md b/docs/developer-and-contributor-corner/style-guide.md
index 94656bd76..b64a9df0b 100644
--- a/docs/developer-and-contributor-corner/style-guide.md
+++ b/docs/developer-and-contributor-corner/style-guide.md
@@ -2,7 +2,7 @@
 
 The _Netdata style guide_ establishes editorial guidelines for any writing produced by the Netdata team or the Netdata
 community, including documentation, articles, in-product UX copy, and more.
 
-> ### Note
+> **Note**
 > This document is meant to be accompanied by the [Documentation Guidelines](/docs/guidelines.md). If you want to contribute to Netdata's documentation, please read it too.
 
 Both internal Netdata teams and external contributors to any of Netdata's open-source projects should reference and adhere to this style guide as much as possible.
 
@@ -30,7 +30,6 @@ you're around. In writing, you reflect tone in your word choice, punctuation, se
 
 The same idea about voice and tone applies to organizations, too. Our voice shouldn't change much between two pieces of
 content, no matter who wrote each, but the tone might be quite different based on who we think is reading.
 
-
 ### Voice
 
 Netdata's voice is authentic, passionate, playful, and respectful.
 
@@ -63,7 +62,7 @@ the [language, grammar, and mechanics](#language-grammar-and-mechanics) section
 
 - Would this language make sense to someone who doesn't work here?
 - Could someone quickly scan this document and understand the material?
-- Create an information hierarchy with key information presented first and clearly called out to improve scannability. +- Create an information hierarchy with key information presented first and clearly called out to improve clarity and readability. - Avoid directional language like "sidebar on the right of the page" or "header at the top of the page" since presentation elements may adapt for devices. - Use descriptive links rather than "click here" or "learn more". @@ -236,8 +235,8 @@ must reflect the _current state of [production](https://app.netdata.cloud). Every link should clearly state its destination. Don't use words like "here" to describe where a link will take your reader. -| | | -|-----------------|-----------------------------------------------------------------------------------------------------------------------------------------| +| | | +|-----------------|-------------------------------------------------------------------------------------------| | Not recommended | To install Netdata, click [here](/packaging/installer/README.md). | | **Recommended** | To install Netdata, read the [installation instructions](/packaging/installer/README.md). | @@ -300,9 +299,9 @@ universal. Don't include full paths, beginning from the system's root (`/`), as these might not work on certain systems. -| | | -|-----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Not recommended | Use `edit-config` to edit Netdata's configuration: `sudo /etc/netdata/edit-config netdata.conf`. | +| | | +|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Not recommended | Use `edit-config` to edit Netdata's configuration: `sudo /etc/netdata/edit-config netdata.conf`. | | **Recommended** | Use `edit-config` to edit Netdata's configuration by first navigating to your [Netdata config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory), which is typically at `/etc/netdata`, then running `sudo edit-config netdata.conf`. | ### `sudo` @@ -394,27 +393,26 @@ the [Docusaurus documentation](https://v2.docusaurus.io/docs/markdown-features#c Notes inside files should render properly both in GitHub and in Learn, to do that, it is best to use the format listed below: -``` -> ### Note +```md +> **Note** > This is an info or a note block. -> ### Tip, Best Practice +> **Tip, Best Practice** > This is a tip or a best practice block. -> ### Warning, Caution +> **Warning, Caution** > This is a warning or a caution block. ``` Which renders into: - -> ### Note +> **Note** > This is an info or a note block. -> ### Tip, Best Practice +> **Tip, Best Practice** > This is a tip or a best practice block. -> ### Warning, Caution +> **Warning, Caution** > This is a warning or a caution block. 
### Tabs

@@ -450,21 +448,21 @@ The following tables describe the standard spelling, capitalization, and usage o

| Term | Definition |
|-----------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| **claimed node** | A node that you've proved ownership of by completing the [connecting to Cloud process](/src/claim/README.md). The claimed node will then appear in your Space and any Rooms you added it to. |
+| **Connected Node** | A node that you've proved ownership of by completing the [connecting to Cloud process](/src/claim/README.md). The connected node will then appear in your Space and any Rooms you added it to. |
| **Netdata** | The company behind the open-source Netdata Agent and the Netdata Cloud web application. Never use _netdata_ or _NetData_.

In general, focus on the user's goals, actions, and solutions rather than what the company provides. For example, write _Learn more about enabling alert notifications on your preferred platforms_ instead of _Netdata sends alert notifications to your preferred platforms_. | | **Netdata Agent** | The free and open source [monitoring agent](https://github.com/netdata/netdata) that you can install on all of your distributed systems, whether they're physical, virtual, containerized, ephemeral, and more. The Agent monitors systems running Linux, Docker, Kubernetes, macOS, FreeBSD, and more, and collects metrics from hundreds of popular services and applications. | | **Netdata Cloud** | The web application hosted at [https://app.netdata.cloud](https://app.netdata.cloud) that helps you monitor an entire infrastructure of distributed systems in real time.

Never use _Cloud_ without the preceding _Netdata_ to avoid ambiguity. | | **Netdata community forum** | The Discourse-powered forum for feature requests, Netdata Cloud technical support, and conversations about Netdata's monitoring and troubleshooting products. | -| **node** | A system on which the Netdata Agent is installed. The system can be physical, virtual, in a Docker container, and more. Depending on your infrastructure, you may have one, dozens, or hundreds of nodes. Some nodes are _ephemeral_, in that they're created/destroyed automatically by an orchestrator service. | +| **Node** | A system on which the Netdata Agent is installed. The system can be physical, virtual, in a Docker container, and more. Depending on your infrastructure, you may have one, dozens, or hundreds of nodes. Some nodes are _ephemeral_, in that they're created/destroyed automatically by an orchestrator service. | | **Space** | The highest level container within Netdata Cloud for a user to organize their team members and nodes within their infrastructure. A Space likely represents an entire organization or a large team.

_Space_ is always capitalized. | -| **unreachable node** | A connected node with a disrupted [Agent-Cloud link](/src/aclk/README.md). Unreachable could mean the node no longer exists or is experiencing network connectivity issues with Cloud. | -| **visited node** | A node which has had its Agent dashboard directly visited by a user. A list of these is maintained on a per-user basis. | -| **Room** | A smaller grouping of nodes where users can view key metrics in real-time and monitor the health of many nodes with their alert status. Rooms can be used to organize nodes in any way that makes sense for your infrastructure, such as by a service, purpose, physical location, and more.

_Room_ is always capitalized. |
+| **Unreachable Node** | A connected node with a disrupted [Agent-Cloud link](/src/aclk/README.md). Unreachable could mean the node no longer exists or is experiencing network connectivity issues with Cloud. |
+| **Visited Node** | A node which has had its Agent dashboard directly visited by a user. A list of these is maintained on a per-user basis. |
+| **Room** | A smaller grouping of nodes where users can view key metrics in real-time and monitor the health of many nodes with their alert status. Rooms can be used to organize nodes in any way that makes sense for your infrastructure, such as by a service, purpose, physical location, and more.

_Room_ is always capitalized. | ### Other technical terms | Term | Definition | |-----------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | **filesystem** | Use instead of _file system_. | -| **preconfigured** | The concept that many of Netdata's features come with sane defaults that users don't need to configure to find immediate value. | +| **pre-configured** | The concept that many of Netdata's features come with sane defaults that users don't need to configure to find immediate value. | | **real time**/**real-time** | Use _real time_ as a noun phrase, most often with _in_: _Netdata collects metrics in real time_. Use _real-time_ as an adjective: _Netdata collects real-time metrics from hundreds of supported applications and services. | diff --git a/docs/diagrams/netdata-overview.xml b/docs/diagrams/netdata-overview.xml index 16c967e6e..2967f915c 100644 --- a/docs/diagrams/netdata-overview.xml +++ b/docs/diagrams/netdata-overview.xml @@ -78,7 +78,7 @@ - + diff --git a/docs/exporting-metrics/README.md b/docs/exporting-metrics/README.md index d667cea15..24e33ad46 100644 --- a/docs/exporting-metrics/README.md +++ b/docs/exporting-metrics/README.md @@ -3,7 +3,7 @@ Netdata allows you to export metrics to external time-series databases with the [exporting engine](/src/exporting/README.md). This system uses a number of **connectors** to initiate connections to [more than thirty](#supported-databases) supported databases, including InfluxDB, Prometheus, Graphite, ElasticSearch, and much -more. +more. The exporting engine resamples Netdata's thousands of per-second metrics at a user-configurable interval, and can export metrics to multiple time-series databases simultaneously. @@ -22,45 +22,45 @@ Netdata supports exporting metrics to the following databases through several [connectors](/src/exporting/README.md#features). Once you find the connector that works for your database, open its documentation and the [enabling a connector](/docs/exporting-metrics/enable-an-exporting-connector.md) doc for details on enabling it. 
-- **AppOptics**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) -- **AWS Kinesis**: [AWS Kinesis Data Streams](/src/exporting/aws_kinesis/README.md) -- **Azure Data Explorer**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) -- **Azure Event Hubs**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) -- **Blueflood**: [Graphite](/src/exporting/graphite/README.md) -- **Chronix**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) -- **Cortex**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) -- **CrateDB**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) -- **ElasticSearch**: [Graphite](/src/exporting/graphite/README.md), [Prometheus remote +- **AppOptics**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) +- **AWS Kinesis**: [AWS Kinesis Data Streams](/src/exporting/aws_kinesis/README.md) +- **Azure Data Explorer**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) +- **Azure Event Hubs**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) +- **Blueflood**: [Graphite](/src/exporting/graphite/README.md) +- **Chronix**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) +- **Cortex**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) +- **CrateDB**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) +- **ElasticSearch**: [Graphite](/src/exporting/graphite/README.md), [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) -- **Gnocchi**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) -- **Google BigQuery**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) -- **Google Cloud Pub/Sub**: [Google Cloud Pub/Sub Service](/src/exporting/pubsub/README.md) -- **Graphite**: [Graphite](/src/exporting/graphite/README.md), [Prometheus remote +- **Gnocchi**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) +- **Google BigQuery**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) +- **Google Cloud Pub/Sub**: [Google Cloud Pub/Sub Service](/src/exporting/pubsub/README.md) +- **Graphite**: [Graphite](/src/exporting/graphite/README.md), [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) -- **InfluxDB**: [Graphite](/src/exporting/graphite/README.md), [Prometheus remote +- **InfluxDB**: [Graphite](/src/exporting/graphite/README.md), [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) -- **IRONdb**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) -- **JSON**: [JSON document databases](/src/exporting/json/README.md) -- **Kafka**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) -- **KairosDB**: [Graphite](/src/exporting/graphite/README.md), [OpenTSDB](/src/exporting/opentsdb/README.md) -- **M3DB**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) -- **MetricFire**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) -- **MongoDB**: [MongoDB](/src/exporting/mongodb/README.md) -- **New Relic**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) -- **OpenTSDB**: [OpenTSDB](/src/exporting/opentsdb/README.md), [Prometheus remote +- **IRONdb**: [Prometheus remote 
write](/src/exporting/prometheus/remote_write/README.md) +- **JSON**: [JSON document databases](/src/exporting/json/README.md) +- **Kafka**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) +- **KairosDB**: [Graphite](/src/exporting/graphite/README.md), [OpenTSDB](/src/exporting/opentsdb/README.md) +- **M3DB**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) +- **MetricFire**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) +- **MongoDB**: [MongoDB](/src/exporting/mongodb/README.md) +- **New Relic**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) +- **OpenTSDB**: [OpenTSDB](/src/exporting/opentsdb/README.md), [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) -- **PostgreSQL**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) +- **PostgreSQL**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) via [PostgreSQL Prometheus Adapter](https://github.com/CrunchyData/postgresql-prometheus-adapter) -- **Prometheus**: [Prometheus scraper](/src/exporting/prometheus/README.md) -- **TimescaleDB**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md), +- **Prometheus**: [Prometheus scraper](/src/exporting/prometheus/README.md) +- **TimescaleDB**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md), [netdata-timescale-relay](/src/exporting/TIMESCALE.md) -- **QuasarDB**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) -- **SignalFx**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) -- **Splunk**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) -- **TiKV**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) -- **Thanos**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) -- **VictoriaMetrics**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) -- **Wavefront**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) +- **QuasarDB**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) +- **SignalFx**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) +- **Splunk**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) +- **TiKV**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) +- **Thanos**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) +- **VictoriaMetrics**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) +- **Wavefront**: [Prometheus remote write](/src/exporting/prometheus/remote_write/README.md) Can't find your preferred external time-series database? 
Ask our [community](https://community.netdata.cloud/) for solutions, or file an [issue on diff --git a/docs/exporting-metrics/enable-an-exporting-connector.md b/docs/exporting-metrics/enable-an-exporting-connector.md index 6a5542fdb..16fbe0b9b 100644 --- a/docs/exporting-metrics/enable-an-exporting-connector.md +++ b/docs/exporting-metrics/enable-an-exporting-connector.md @@ -19,7 +19,7 @@ Use `edit-config` from your [Netdata config directory](/docs/netdata-agent/confi Enable the exporting engine itself by setting `enabled` to `yes`: -```conf +```text [exporting:global] enabled = yes ``` @@ -30,7 +30,7 @@ Save the file but keep it open, as you will edit it again to enable specific con Use the following configuration as a starting point. Copy and paste it into `exporting.conf`. -```conf +```text [opentsdb:http:my_opentsdb_http_instance] enabled = yes destination = localhost:4242 diff --git a/docs/glossary.md b/docs/glossary.md index bcada6030..78ba18072 100644 --- a/docs/glossary.md +++ b/docs/glossary.md @@ -6,7 +6,7 @@ As such, we want to provide a little Glossary as a reference starting point for If you're here looking for the definition of a term you heard elsewhere in our community or products, or if you just want to learn Netdata from the ground up, you've come to the right page. -Use the alphabatized list below to find the answer to your single-term questions, and click the bolded list items to explore more on the topics! We'll be sure to keep constantly updating this list, so if you hear a word that you would like for us to cover, just let us know or submit a request! +Use the alphabetized list below to find the answer to your single-term questions, and click the bolded list items to explore more on the topics! We'll be sure to keep constantly updating this list, so if you hear a word that you would like for us to cover, just let us know or submit a request! [A](#a) | [B](#b) | [C](#c) | [D](#d)| [E](#e) | [F](#f) | [G](#g) | [H](#h) | [I](#i) | [J](#j) | [K](#k) | [L](#l) | [M](#m) | [N](#n) | [O](#o) | [P](#p) | [Q](#q) | [R](#r) | [S](#s) | [T](#t) | [U](#u) | [V](#v) | [W](#w) | [X](#x) | [Y](#y) | [Z](#z) @@ -53,7 +53,7 @@ Use the alphabatized list below to find the answer to your single-term questions ## E -- [**External Plugins**](/src/collectors/plugins.d/README.md): These gather metrics from external processes, such as a webserver or database, and run as independent processes that communicate with the Netdata daemon via pipes. +- [**External Plugins**](/src/plugins.d/README.md): These gather metrics from external processes, such as a webserver or database, and run as independent processes that communicate with the Netdata daemon via pipes. ## F @@ -65,7 +65,7 @@ Use the alphabatized list below to find the answer to your single-term questions ## G -- [**Group by**](/docs/dashboards-and-charts/netdata-charts.md#group-by-dimension-node-or-chart): The drop-down on the dimension bar of a composite chart that allows you to group metrics by dimension, node, or chart. +- [**Group by**](/docs/dashboards-and-charts/netdata-charts.md#group-by-dropdown): The drop-down on the dimension bar of a composite chart that allows you to group metrics by dimension, node, or chart. - [**Health Configuration Files**](/src/health/REFERENCE.md#edit-health-configuration-files): Files that you can edit to configure your Agent's health watchdog service. 
@@ -110,7 +110,7 @@ metrics, troubleshoot complex performance problems, and make data interoperable

## O

-- [**Obsoletion**(of nodes)](/docs/netdata-cloud/organize-your-infrastructure-invite-your-team.md#obsoleting-offline-nodes-from-a-space): Removing nodes from a space.
+- [**Obsoletion**(of nodes)](/docs/dashboards-and-charts/nodes-tab.md): Removing nodes from a space.

- [**Orchestrators**](/src/collectors/README.md#collector-architecture-and-terminology): External plugins that run and manage one or more modules. They run as independent processes.

@@ -145,8 +145,8 @@ even thousands of nodes. There are no actual bottlenecks especially if you retai

## V

-- [**Visualizations**](/docs/category-overview-pages/visualizations-overview.md): Netdata uses dimensions, contexts, and families to sort your metric data into graphs, charts, and alerts that maximize your understand of your infrastructure and your ability to troubleshoot it, along or on a team.
+- [**Visualizations**](/docs/dashboards-and-charts/README.md): Netdata uses dimensions, contexts, and families to sort your metric data into graphs, charts, and alerts that maximize your understanding of your infrastructure and your ability to troubleshoot it, alone or on a team.

## Z

-- **Zero Configuration**: Netdata is preconfigured and capable to autodetect and monitor any well known application that runs on your system. You just deploy and claim Netdata Agents in your Netdata space, and monitor them in seconds.
+- **Zero Configuration**: Netdata is pre-configured and capable of auto-detecting and monitoring any well-known application that runs on your system. You just deploy and claim Netdata Agents in your Netdata space, and monitor them in seconds.
diff --git a/docs/guidelines.md b/docs/guidelines.md
index b0e6759cc..02e7a386f 100644
--- a/docs/guidelines.md
+++ b/docs/guidelines.md
@@ -49,7 +49,7 @@ Please ensure that any links to a different documentation resource are fully exp

e.g.

-```
+```text
[Correct link to this document](/docs/guidelines.md)

vs

[Incorrect link to this document](https://learn.netdata.cloud/XYZ)
diff --git a/docs/netdata-agent/README.md b/docs/netdata-agent/README.md
index 75bd4898e..8096e911a 100644
--- a/docs/netdata-agent/README.md
+++ b/docs/netdata-agent/README.md
@@ -1,6 +1,6 @@
# Netdata Agent

-The Netdata Agent is the main building block in a Netdata ecosystem. It is installed on all monitored systems to monitor system components, containers and applications.
+The Netdata Agent is the main building block in the Netdata ecosystem. It is installed on all monitored systems to monitor system components, containers and applications.

The Netdata Agent is an **observability pipeline in a box** that can either operate standalone, or blend into a bigger pipeline made by more Netdata Agents (Children and Parents).

@@ -53,7 +53,7 @@ stateDiagram-v2
1. **Discover**: auto-detect metric sources on localhost, auto-discover metric sources on Kubernetes.
2. **Collect**: query data sources to collect metric samples, using the optimal protocol for each data source. 800+ integrations supported, including dozens of native application protocols, OpenMetrics and StatsD.
-3. **Detect Anomalies**: use the trained machine learning models for each metric, to detect in real-time if each sample collected is an outlier (an anomaly), or not.
+3. **Detect Anomalies**: use the trained machine learning models for each metric to detect in real-time if each sample collected is an outlier (an anomaly), or not.
4.
**Store**: keep collected samples and their anomaly status, in the time-series database (database mode `dbengine`) or a ring buffer (database modes `ram` and `alloc`). 5. **Learn**: train multiple machine learning models for each metric collected, learning behaviors and patterns for detecting anomalies. 6. **Check**: a health engine, triggering alerts and sending notifications. Netdata comes with hundreds of alert configurations that are automatically attached to metrics when they get collected, detecting errors, common configuration errors and performance issues. @@ -69,7 +69,7 @@ stateDiagram-v2 2. **Automation**: Netdata is designed to automate most of the process of setting up and running an observability solution. It is designed to instantly provide comprehensive dashboards and fully automated alerts, with zero configuration. -3. **High Fidelity Monitoring**: Netdata was born from our need to kill the console for observability. So, it provides metrics and logs in the same granularity and fidelity console tools do, but also comes with tools that go beyond metrics and logs, to provide a holistic view of the monitored infrastructure (e.g. check [Top Monitoring](/docs/top-monitoring-netdata-functions.md)). +3. **High Fidelity Monitoring**: Netdata was born from our need to kill the console for observability. So, it provides metrics and logs in the same granularity and fidelity console tools do, but also comes with tools that go beyond metrics and logs, to provide a holistic view of the monitored infrastructure (e.g., check [Top Monitoring](/docs/top-monitoring-netdata-functions.md)). 4. **Minimal impact on monitored systems and applications**: Netdata has been designed to have a minimal impact on the monitored systems and their applications. There are [independent studies](https://www.ivanomalavolta.com/files/papers/ICSOC_2023.pdf) reporting that Netdata excels in CPU usage, RAM utilization, Execution Time and the impact Netdata has on monitored applications and containers. @@ -77,8 +77,8 @@ stateDiagram-v2 ## Dashboard Versions -The Netdata agents (Standalone, Children and Parents) **share the dashboard** of Netdata Cloud. However, when the user is logged-in and the Netdata agent is connected to Netdata Cloud, the following are enabled (which are otherwise disabled): +The Netdata agents (Standalone, Children and Parents) **share the dashboard** of Netdata Cloud. However, when the user is logged in and the Netdata agent is connected to Netdata Cloud, the following are enabled (which are otherwise disabled): 1. **Access to Sensitive Data**: Some data, like systemd-journal logs and several [Top Monitoring](/docs/top-monitoring-netdata-functions.md) features expose sensitive data, like IPs, ports, process command lines and more. To access all these when the dashboard is served directly from a Netdata agent, Netdata Cloud is required to verify that the user accessing the dashboard has the required permissions. -2. **Dynamic Configuration**: Netdata agents are configured via configuration files, manually or through some provisioning system. The latest Netdata includes a feature to allow users change some of the configuration (collectors, alerts) via the dashboard. This feature is only available to users of paid Netdata Cloud plan. +2. **Dynamic Configuration**: Netdata agents are configured via configuration files, manually or through some provisioning system. The latest Netdata includes a feature to allow users to change some configurations (collectors, alerts) via the dashboard. 
This feature is only available to users of paid Netdata Cloud plan. diff --git a/docs/netdata-agent/backup-and-restore-an-agent.md b/docs/netdata-agent/backup-and-restore-an-agent.md index d17cad604..db9398b27 100644 --- a/docs/netdata-agent/backup-and-restore-an-agent.md +++ b/docs/netdata-agent/backup-and-restore-an-agent.md @@ -1,44 +1,43 @@ # Backing up a Netdata Agent > **Note** -> +> > Users are responsible for backing up, recovering, and ensuring their data's availability because Netdata stores data locally on each system due to its decentralized architecture. ## Introduction -When preparing to backup a Netdata Agent it is worth considering that there are different kinds of data that you may wish to backup independently or all together: +When planning a Netdata Agent backup, it's essential to recognize the types of data that can be backed up, either individually or collectively: -| Data type | Description | Location | -|---------------------|------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------| +| Data type | Description | Location | +|---------------------|------------------------------------------------------|-----------------------------------------------------------------| | Agent configuration | Files controlling configuration of the Netdata Agent | [config directory](/docs/netdata-agent/configuration/README.md) | -| Metrics | Database files | /var/cache/netdata | -| Identity | Claim token, API key and some other files | /var/lib/netdata | - +| Metrics | Database files | /var/cache/netdata | +| Identity | Claim token, API key and some other files | /var/lib/netdata | ## Scenarios ### Backing up to restore data in case of a node failure -In this standard scenario, you are backing up your Netdata Agent in case of a node failure or data corruption so that the metrics and the configuration can be recovered. The purpose is not to backup/restore the application itself. +In this standard scenario, you’re backing up your Netdata Agent in case of a node failure or data corruption so that the metrics and the configuration can be recovered. The purpose is not to backup/restore the application itself. -1. Verify that the directory paths in the table above contain the information you expect. +1. Verify that the directory paths in the table above contain the information you expect. > **Note** > The specific paths may vary depending on installation method, Operating System, and whether it is a Docker/Kubernetes deployment. 2. It is recommended that you [stop the Netdata Agent](/docs/netdata-agent/start-stop-restart.md) when backing up the Metrics/database files. - Backing up the Agent configuration and Identity folders is straightforward as they should not be changing very frequently. + Backing up the Agent configuration and Identity folders is straightforward as they shouldn’t be changing very frequently. 3. Using a backup tool such as `tar` you will need to run the backup as _root_ or as the _netdata_ user to access all the files in the directories. - - ``` + + ```bash sudo tar -cvpzf netdata_backup.tar.gz /etc/netdata/ /var/cache/netdata /var/lib/netdata ``` - + Stopping the Netdata agent is typically necessary to back up the database files of the Netdata Agent. 
If you want to minimize the gap in metrics caused by stopping the Netdata Agent, consider implementing a backup job or script that follows this sequence:
-
+
- Back up the Agent configuration and Identity directories
- Stop the Netdata service
- Back up the database files
- Restart Netdata

### Restoring Netdata

-1. Ensure that the Netdata agent is installed and is [stopped](/packaging/installer/README.md#maintaining-a-netdata-agent-installation)
+1. Ensure that the Netdata agent is installed and is [stopped](/docs/netdata-agent/start-stop-restart.md)

   If you plan to deploy the Agent and restore a backup on top of it, then you might find it helpful to use the [`--dont-start-it`](/packaging/installer/methods/kickstart.md#other-options) option upon installation.

-   ```
+   ```bash
   wget -O /tmp/netdata-kickstart.sh https://get.netdata.cloud/kickstart.sh && sh /tmp/netdata-kickstart.sh --dont-start-it
   ```
-
-   > **Note**
-   > If you are going to restore the database files then you should first ensure that the Metrics directory is empty.
-   >
-   > ```
+
+   > **Note**
+   > If you are going to restore the database files, then you should first ensure that the Metrics directory is empty.
+   >
+   > ```bash
   > sudo rm -Rf /var/cache/netdata
   > ```

2. Restore the backup from the archive

-   ```
+   ```bash
   sudo tar -xvpzf /path/to/netdata_backup.tar.gz -C /
   ```

-3. [Start the Netdata agent](/packaging/installer/README.md#maintaining-a-netdata-agent-installation)
+3. [Start the Netdata agent](/docs/netdata-agent/start-stop-restart.md)
diff --git a/docs/netdata-agent/configuration/README.md b/docs/netdata-agent/configuration/README.md
index 097fb9310..abe511313 100644
--- a/docs/netdata-agent/configuration/README.md
+++ b/docs/netdata-agent/configuration/README.md
@@ -1,21 +1,28 @@
# Netdata Agent Configuration

-The main Netdata agent configuration is `netdata.conf`.
+> **Info**
+>
+> Netdata Cloud lets you configure Agents on the fly. Check the [Dynamic Configuration Manager](/docs/netdata-agent/configuration/dynamic-configuration.md) documentation for details.
+
+The main Netdata Agent configuration is `netdata.conf`.

## The Netdata config directory

-On most Linux systems, by using our [recommended one-line installation](/packaging/installer/README.md#install-on-linux-with-one-line-installer), the **Netdata config
+On most Linux systems, the **Netdata config
directory** will be `/etc/netdata/`. The config directory contains several configuration files with the `.conf`
extension, a few directories, and a shell script named `edit-config`.

> Some operating systems will use `/opt/netdata/etc/netdata/` as the config directory. If you're not sure where yours
> is, navigate to `http://NODE:19999/netdata.conf` in your browser, replacing `NODE` with the IP address or hostname of
-> your node, and find the `# config directory = ` setting. The value listed is the config directory for your system.
+> your node, and find the `# config directory =` setting. The value listed is the config directory for your system.

All of Netdata's documentation assumes that your config directory is at `/etc/netdata`, and that you're running any
scripts from inside that directory.

+## Edit a configuration file using `edit-config`
+
+We recommend the use of the `edit-config` script for configuration changes.
-## edit `netdata.conf`
+It exists inside your config directory (read above) and helps manage and safely edit configuration files.
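+For example, to edit a collector configuration file that may not exist in your config directory yet (a quick sketch, assuming a default install at `/etc/netdata`; `edit-config` copies the stock file into place before opening it):
+
+```bash
+# Navigate to the Netdata config directory and safely edit a collector config.
+# edit-config creates a local copy of the stock file if one doesn't exist yet.
+cd /etc/netdata
+sudo ./edit-config go.d/nginx.conf
+```
+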
To edit `netdata.conf`, run this on your terminal: @@ -28,9 +35,9 @@ Your editor will open. ## downloading `netdata.conf` -The running version of `netdata.conf` can be downloaded from a running Netdata agent, at this URL: +The running version of `netdata.conf` can be downloaded from a running Netdata Agent, at this URL: -``` +```url http://agent-ip:19999/netdata.conf ``` @@ -40,4 +47,3 @@ You can save and use this version, using these commands: cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata curl -ksSLo /tmp/netdata.conf.new http://localhost:19999/netdata.conf && sudo mv -i /tmp/netdata.conf.new netdata.conf ``` - diff --git a/docs/netdata-agent/configuration/anonymous-telemetry-events.md b/docs/netdata-agent/configuration/anonymous-telemetry-events.md index b943ea9a3..4d48de4a2 100644 --- a/docs/netdata-agent/configuration/anonymous-telemetry-events.md +++ b/docs/netdata-agent/configuration/anonymous-telemetry-events.md @@ -1,30 +1,22 @@ - - # Anonymous telemetry events -By default, Netdata collects anonymous usage information from the open-source monitoring agent. For agent events like start,stop,crash etc we use our own cloud function in GCP. For frontend telemetry (pageviews etc.) on the agent dashboard itself we use the open-source +By default, Netdata collects anonymous usage information from the open-source monitoring agent. For agent events like start, stop, crash, etc. we use our own cloud function in GCP. For frontend telemetry (page views etc.) on the agent dashboard itself, we use the open-source product analytics platform [PostHog](https://github.com/PostHog/posthog). We are strongly committed to your [data privacy](https://netdata.cloud/privacy/). We use the statistics gathered from this information for two purposes: -1. **Quality assurance**, to help us understand if Netdata behaves as expected, and to help us classify repeated - issues with certain distributions or environments. +1. **Quality assurance**, to help us understand if Netdata behaves as expected, and to help us classify repeated + issues with certain distributions or environments. -2. **Usage statistics**, to help us interpret how people use the Netdata agent in real-world environments, and to help - us identify how our development/design decisions influence the community. +2. **Usage statistics**, to help us interpret how people use the Netdata agent in real-world environments, and to help + us identify how our development/design decisions influence the community. Netdata collects usage information via two different channels: -- **Agent dashboard**: We use the [PostHog JavaScript integration](https://posthog.com/docs/integrations/js-integration) (with sensitive event attributes overwritten to be anonymized) to send product usage events when you access an [Agent's dashboard](/docs/dashboards-and-charts/README.md). -- **Agent backend**: The `netdata` daemon executes the [`anonymous-statistics.sh`](https://github.com/netdata/netdata/blob/6469cf92724644f5facf343e4bdd76ac0551a418/daemon/anonymous-statistics.sh.in) script when Netdata starts, stops cleanly, or fails. +- **Agent dashboard**: We use the [PostHog JavaScript integration](https://posthog.com/docs/integrations/js-integration) (with sensitive event attributes overwritten to be anonymized) to send product usage events when you access an [Agent's dashboard](/docs/dashboards-and-charts/README.md). 
+- **Agent backend**: The `netdata` daemon executes the [`anonymous-statistics.sh`](https://github.com/netdata/netdata/blob/6469cf92724644f5facf343e4bdd76ac0551a418/daemon/anonymous-statistics.sh.in) script when Netdata starts, stops cleanly, or fails. You can opt-out from sending anonymous statistics to Netdata through three different [opt-out mechanisms](#opt-out). @@ -32,7 +24,7 @@ You can opt-out from sending anonymous statistics to Netdata through three diffe When you kick off an Agent dashboard session by visiting `http://NODE:19999`, Netdata initializes a PostHog session and masks various event attributes. -_Note_: You can see the relevant code in the [dashboard repository](https://github.com/netdata/dashboard/blob/master/src/domains/global/sagas.ts#L107) where the `window.posthog.register()` call is made. +_Note_: You can see the relevant code in the [dashboard repository](https://github.com/netdata/dashboard/blob/master/src/domains/global/sagas.ts#L107) where the `window.posthog.register()` call is made. ```JavaScript window.posthog.register({ @@ -52,28 +44,28 @@ variable is controlled via the [opt-out mechanism](#opt-out). ## Agent Backend - Anonymous Statistics Script Every time the daemon is started or stopped and every time a fatal condition is encountered, Netdata uses the anonymous -statistics script to collect system information and send it to the Netdata telemetry cloud function via an http call. The information collected for all +statistics script to collect system information and send it to the Netdata telemetry cloud function via a http call. The information collected for all events is: -- Netdata version -- OS name, version, id, id_like -- Kernel name, version, architecture -- Virtualization technology -- Containerization technology +- Netdata version +- OS name, version, id, id_like +- Kernel name, version, architecture +- Virtualization technology +- Containerization technology -Furthermore, the FATAL event sends the Netdata process & thread name, along with the source code function, source code +Furthermore, the FATAL event sends the Netdata process and thread name, along with the source code function, source code filename and source code line number of the fatal error. Starting with v1.21, we additionally collect information about: -- Failures to build the dependencies required to use Cloud features. -- Unavailability of Cloud features in an agent. -- Failures to connect to the Cloud in case the [connection process](/src/claim/README.md) has been completed. This includes error codes - to inform the Netdata team about the reason why the connection failed. +- Failures to build the dependencies required to use Cloud features. +- Unavailability of Cloud features in an agent. +- Failures to connect to the Cloud in case the [connection process](/src/claim/README.md) has been completed. This includes error codes + to inform the Netdata team about the reason why the connection failed. To see exactly what and how is collected, you can review the script template `daemon/anonymous-statistics.sh.in`. The template is converted to a bash script called `anonymous-statistics.sh`, installed under the Netdata `plugins -directory`, which is usually `/usr/libexec/netdata/plugins.d`. +directory`, which is usually `/usr/libexec/netdata/plugins.d`. ## Opt-out @@ -87,17 +79,15 @@ installation, including manual, offline, and macOS installations. 
Create the file using `touch .opt-out-from-anonymous-statistics` in your Netdata configuration directory.

**Pass the option `--disable-telemetry` to any of the installer scripts in the
[installation docs](/packaging/installer/README.md).** You can append this option during the initial installation or a
manual update. You can also export the environment variable `DISABLE_TELEMETRY` with a non-zero or non-empty value
-(e.g: `export DISABLE_TELEMETRY=1`).
+(e.g., `export DISABLE_TELEMETRY=1`).

When using Docker, **set your `DISABLE_TELEMETRY` environment variable to `1`.** You can set this variable with the
following command: `export DISABLE_TELEMETRY=1`. When creating a container using Netdata's
[Docker image](/packaging/docker/README.md#create-a-new-netdata-agent-container) for the first time, this variable will disable
-the anonymous statistics script inside of the container.
+the anonymous statistics script inside the container.

Each of these opt-out processes does the following:

-- Prevents the daemon from executing the anonymous statistics script.
-- Forces the anonymous statistics script to exit immediately.
-- Stops the PostHog JavaScript snippet, which remains on the dashboard, from firing and sending any data to the Netdata PostHog.
-
-
+- Prevents the daemon from executing the anonymous statistics script.
+- Forces the anonymous statistics script to exit immediately.
+- Stops the PostHog JavaScript snippet, which remains on the dashboard, from firing and sending any data to the Netdata PostHog.
diff --git a/docs/netdata-agent/configuration/cheatsheet.md b/docs/netdata-agent/configuration/cheatsheet.md
index 3e1428694..ecd8e8a84 100644
--- a/docs/netdata-agent/configuration/cheatsheet.md
+++ b/docs/netdata-agent/configuration/cheatsheet.md
@@ -1,8 +1,8 @@
# Useful management and configuration actions

-Below you will find some of the most common actions that one can take while using Netdata. You can use this page as a quick reference for installing Netdata, connecting a node to the Cloud, properly editing the configuration, accessing Netdata's API, and more!
+Below are some of the most common actions one can take while using Netdata. You can use this page as a quick reference for installing Netdata, connecting a node to the Cloud, properly editing the configuration, accessing Netdata's API, and more!

-### Install Netdata
+## Install Netdata

```bash
wget -O /tmp/netdata-kickstart.sh https://get.netdata.cloud/kickstart.sh && sh /tmp/netdata-kickstart.sh
```
@@ -11,12 +11,12 @@ wget -O /tmp/netdata-kickstart.sh https://get.netdata.cloud/kickstart.sh && sh /
curl https://get.netdata.cloud/kickstart.sh > /tmp/netdata-kickstart.sh && sh /tmp/netdata-kickstart.sh
```

-#### Connect a node to Netdata Cloud
+### Connect a node to Netdata Cloud

To do so, sign in to Netdata Cloud, on your Space under the Nodes tab, click `Add Nodes` and paste the provided command into your node’s terminal and run it.

You can also copy the Claim token and pass it to the installation script with `--claim-token` and re-run it.

-### Configuration
+## Configuration

**Netdata's config directory** is `/etc/netdata/` but in some operating systems it might be `/opt/netdata/etc/netdata/`. Look for the `# config directory =` line over at `http://NODE_IP:19999/netdata.conf` to find your config directory.

From within that directory you can run `sudo ./edit-config netdata.conf` **to edit `netdata.conf`**.

You can edit other config files too, by specifying their filename after `./edit-config`. You are expected to use this method in all following configuration changes.
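+As a quick worked example (a sketch, assuming a default installation and a shell on the node itself), you can confirm the config directory and open `netdata.conf` for editing in one short sequence:
+
+```bash
+# Ask the running agent where its config directory is.
+curl -s http://localhost:19999/netdata.conf | grep '# config directory'
+
+# Change into it (one of the two common locations) and edit safely.
+cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata
+sudo ./edit-config netdata.conf
+```
+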
- - ---- - -#### Enable/disable plugins (groups of collectors) +### Enable/disable plugins (groups of collectors) ```bash sudo ./edit-config netdata.conf ``` -```conf +```text [plugins] go.d = yes # enabled node.d = no # disabled ``` -#### Enable/disable specific collectors +### Enable/disable specific collectors ```bash sudo ./edit-config go.d.conf # edit a plugin's config @@ -89,24 +45,18 @@ sudo ./edit-config go.d.conf # edit a plugin's config ```yaml modules: - activemq: no # disabled - cockroachdb: yes # enabled + activemq: no # disabled + cockroachdb: yes # enabled ``` -#### Edit a collector's config +### Edit a collector's config ```bash sudo ./edit-config go.d/mysql.conf ``` -### Alerts & notifications - - After any change, reload the Netdata health configuration: ```bash @@ -115,32 +65,23 @@ netdatacli reload-health killall -USR2 netdata ``` -#### Configure a specific alert +### Configure a specific alert ```bash sudo ./edit-config health.d/example-alert.conf ``` -#### Silence a specific alert +### Silence a specific alert ```bash sudo ./edit-config health.d/example-alert.conf ``` -``` +```text to: silent ``` - - ---- - -### Manage the daemon +## Manage the daemon | Intent | Action | |:----------------------------|------------------------------------------------------------:| @@ -151,65 +92,22 @@ sudo ./edit-config health.d/example-alert.conf | View error logs | `less /var/log/netdata/error.log` | | View collectors logs | `less /var/log/netdata/collector.log` | -#### Change the port Netdata listens to (example, set it to port 39999) +### Change the port Netdata listens to (example, set it to port 39999) -```conf +```text [web] default port = 39999 ``` -### See metrics and dashboards +## See metrics and dashboards -#### Netdata Cloud: `https://app.netdata.cloud` +### Netdata Cloud: `https://app.netdata.cloud` -#### Local dashboard: `https://NODE:19999` +### Local dashboard: `https://NODE:19999` > Replace `NODE` with the IP address or hostname of your node. Often `localhost`. -### Access the Netdata API +## Access the Netdata API You can access the API like this: `http://NODE:19999/api/VERSION/REQUEST`. If you want to take a look at all the API requests, check our API page at - - - - - - - diff --git a/docs/netdata-agent/configuration/common-configuration-changes.md b/docs/netdata-agent/configuration/common-configuration-changes.md index e9d8abadc..0eda7dd86 100644 --- a/docs/netdata-agent/configuration/common-configuration-changes.md +++ b/docs/netdata-agent/configuration/common-configuration-changes.md @@ -19,11 +19,7 @@ changes reflected in those visualizations due to the way Netdata Cloud proxies m ### Increase the long-term metrics retention period -Read our doc -on [increasing long-term metrics storage](/docs/netdata-agent/configuration/optimizing-metrics-database/change-metrics-storage.md) -for details, including a -[calculator](/docs/netdata-agent/configuration/optimizing-metrics-database/change-metrics-storage.md#calculate-the-system-resources-ram-disk-space-needed-to-store-metrics) -to help you determine the exact settings for your desired retention period. +Read our doc on [increasing long-term metrics storage](/docs/netdata-agent/configuration/optimizing-metrics-database/change-metrics-storage.md) for details. ### Reduce the data collection frequency @@ -33,7 +29,7 @@ of `netdata.conf` so that it is greater than `1`. An `update every` of `5` means the Netdata Agent enforces a _minimum_ collection frequency of 5 seconds. 
-```conf +```text [global] update every = 5 ``` @@ -56,7 +52,7 @@ for that specific module. Uncomment the line and change its value to `no`. ## Modify alerts and notifications -Netdata's health monitoring watchdog uses hundreds of preconfigured health entities, with intelligent thresholds, to +Netdata's health monitoring watchdog uses hundreds of pre-configured health entities, with intelligent thresholds, to generate warning and critical alerts for most production systems and their applications without configuration. However, each alert and notification method is completely customizable. @@ -94,7 +90,7 @@ Because the source path contains `health.d/cpu.conf`, run `sudo edit-config heal Open the configuration file for that alert and set the `to` line to `silent`. -```conf +```text template: disk_fill_rate on: disk.space lookup: max -1s at -30m unaligned of avail @@ -111,7 +107,7 @@ section of `netdata.conf`. ### Enable alert notifications -Open `health_alarm_notify.conf` for editing. First, read the [enabling notifications](/docs/alerts-and-notifications/notifications/README.md#netdata-agent) doc +Open `health_alarm_notify.conf` for editing. First, read the [enabling notifications](/src/health/notifications/README.md) doc for an example of the process using Slack, then click on the link to your preferred notification method to find documentation for that specific endpoint. @@ -143,6 +139,6 @@ The following restrictions apply to host label names: - Names cannot start with `_`, but it can be present in other parts of the name. - Names only accept alphabet letters, numbers, dots, and dashes. -The policy for values is more flexible, but you can not use exclamation marks (`!`), whitespaces (` `), single quotes +The policy for values is more flexible, but you cannot use exclamation marks (`!`), whitespaces (` `), single quotes (`'`), double quotes (`"`), or asterisks (`*`), because they are used to compare label values in health alerts and templates. diff --git a/docs/netdata-agent/configuration/dynamic-configuration.md b/docs/netdata-agent/configuration/dynamic-configuration.md index 7064abf9a..c419a82d9 100644 --- a/docs/netdata-agent/configuration/dynamic-configuration.md +++ b/docs/netdata-agent/configuration/dynamic-configuration.md @@ -1,6 +1,8 @@ # Dynamic Configuration Manager -**Netdata Cloud paid subscription required.** +> **Info** +> +> Netdata Cloud paid subscription is required. The Dynamic Configuration Manager allows direct configuration of collectors and alerts through the Netdata UI. This feature allows users to: @@ -11,7 +13,7 @@ The Dynamic Configuration Manager allows direct configuration of collectors and > **Info** > -> To understand what actions users can perform based on their role, refer to the [Role Based Access documentation](/docs/netdata-cloud/authentication-and-authorization/role-based-access-model.md#dynamic-configuration-manager). +> To understand what actions users can perform based on their role, refer to the [Role-Based Access documentation](/docs/netdata-cloud/authentication-and-authorization/role-based-access-model.md#dynamic-configuration-manager). ## Collectors @@ -35,9 +37,9 @@ A job represents a running instance of a module with a specific configuration. T Every job has a designated "source type" indicating its origin: - **Stock**: Pre-installed with Netdata and provides basic data collection for common services. -- **User**: Originates from user-created files on the node. +- **User**: Created from user-defined configuration files on the node. 
- **Discovered**: Automatically generated by Netdata upon discovering a service running on the node.
-- **Dynamic Configuration**: Created and managed using the Dynamic Configuration Manager.
+- **Dynamic Configuration**: Created and managed through the Dynamic Configuration Manager.

You can manage individual jobs using the following actions:

@@ -51,7 +53,7 @@ You can manage individual jobs using the following actions:

## Health

-Each entry in the Health tab contains an Alert template, that then is used to create Alerts.
+Each entry in the Health tab contains an Alert template that is then used to create Alerts.

The functionality in the main view is the same as with the [Collectors tab](#collectors).

diff --git a/docs/netdata-agent/configuration/optimize-the-netdata-agents-performance.md b/docs/netdata-agent/configuration/optimize-the-netdata-agents-performance.md
index 6acbd4977..ff51fbf78 100644
--- a/docs/netdata-agent/configuration/optimize-the-netdata-agents-performance.md
+++ b/docs/netdata-agent/configuration/optimize-the-netdata-agents-performance.md
@@ -1,9 +1,9 @@
# How to optimize the Netdata Agent's performance

We designed the Netdata Agent to be incredibly lightweight, even when it's collecting a few thousand dimensions every
-second and visualizing that data into hundreds of charts. However, the default settings of the Netdata Agent are not
-optimized for performance, but for a simple, standalone setup. We want the first install to give you something you can
-run without any configuration. Most of the settings and options are enabled, since we want you to experience the full
+second and visualizing that data into hundreds of charts. However, the default settings of the Netdata Agent aren’t
+optimized for performance, but for a simple, standalone setup. We want the first installation to give you something you can
+run without any configuration. Most of the settings and options are enabled since we want you to experience the full
thing.

By default, Netdata will automatically detect applications running on the node it is installed to start collecting
@@ -17,16 +17,16 @@ Netdata for production use.

The following table summarizes the effect of each optimization on the CPU, RAM and Disk IO utilization in production.
-| Optimization | CPU | RAM | Disk IO | -|-------------------------------------------------------------------------------------------------------------------------------|--------------------|--------------------|--------------------| -| [Use streaming and replication](#use-streaming-and-replication) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | -| [Disable unneeded plugins or collectors](#disable-unneeded-plugins-or-collectors) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | -| [Reduce data collection frequency](#reduce-collection-frequency) | :heavy_check_mark: | | :heavy_check_mark: | +| Optimization | CPU | RAM | Disk IO | +|-----------------------------------------------------------------------------------------------------------------------------------|--------------------|--------------------|--------------------| +| [Use streaming and replication](#use-streaming-and-replication) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | +| [Disable unneeded plugins or collectors](#disable-unneeded-plugins-or-collectors) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | +| [Reduce data collection frequency](#reduce-collection-frequency) | :heavy_check_mark: | | :heavy_check_mark: | | [Change how long Netdata stores metrics](/docs/netdata-agent/configuration/optimizing-metrics-database/change-metrics-storage.md) | | :heavy_check_mark: | :heavy_check_mark: | -| [Use a different metric storage database](/src/database/README.md) | | :heavy_check_mark: | :heavy_check_mark: | -| [Disable machine learning](#disable-machine-learning) | :heavy_check_mark: | | | -| [Use a reverse proxy](#run-netdata-behind-a-proxy) | :heavy_check_mark: | | | -| [Disable/lower gzip compression for the agent dashboard](#disablelower-gzip-compression-for-the-dashboard) | :heavy_check_mark: | | | +| [Use a different metric storage database](/src/database/README.md) | | :heavy_check_mark: | :heavy_check_mark: | +| [Disable machine learning](#disable-machine-learning) | :heavy_check_mark: | | | +| [Use a reverse proxy](#run-netdata-behind-a-proxy) | :heavy_check_mark: | | | +| [Disable/lower gzip compression for the agent dashboard](#disablelower-gzip-compression-for-the-dashboard) | :heavy_check_mark: | | | ## Resources required by a default Netdata installation @@ -39,15 +39,15 @@ You can configure almost all aspects of data collection/retention, and certain a Expect about: - 1-3% of a single core for the netdata core -- 1-3% of a single core for the various collectors (e.g. go.d.plugin, apps.plugin) +- 1-3% of a single core for the various collectors (e.g., go.d.plugin, apps.plugin) - 5-10% of a single core, when ML training runs Your experience may vary depending on the number of metrics collected, the collectors enabled and the specific -environment they run on, i.e. the work they have to do to collect these metrics. +environment they run on, i.e., the work they have to do to collect these metrics. As a general rule, for modern hardware and VMs, the total CPU consumption of a standalone Netdata installation, including all its components, should be below 5 - 15% of a single core. For example, on 8 core server it will use only -0.6% - 1.8% of a total CPU capacity, depending on the CPU characteristics. +0.6% - 1.8% of the total CPU capacity, depending on the CPU characteristics. 
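+To verify what Netdata actually consumes on a given node, you can inspect the daemon and its collector processes directly (a sketch, assuming a Linux system with the `procps` toolset; plugin process names such as `go.d.plugin` or `apps.plugin` may vary between versions):
+
+```bash
+# List CPU and resident memory usage for the Netdata daemon and its plugins,
+# sorted by CPU usage in descending order.
+ps -eo pid,%cpu,%mem,rss,comm | grep -E 'netdata|\.plugin' | sort -k2 -rn
+```
+
+Netdata also monitors its own resource usage: the dashboard's Netdata Monitoring section charts the CPU, memory and disk activity of the Agent itself.
+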
The Netdata Agent runs with the lowest possible [process scheduling policy](/src/daemon/README.md#netdata-process-scheduling-policy),
which is `nice 19`, and uses the `idle` process scheduler. Together, these settings ensure the Agent only gets CPU
resources when the node has CPU resources to spare. If the node reaches 100% CPU utilization, the Agent is stopped first to ensure your applications get any available resources.

-To reduce CPU usage you can (either one or a combination of the following actions):
+To reduce CPU usage, you can (either one or a combination of the following actions):

1. [Disable machine learning](#disable-machine-learning),
2. [Use streaming and replication](#use-streaming-and-replication),
@@ -77,19 +77,18 @@ To estimate and control memory consumption, you can (either one or a combination

### Disk footprint and I/O

-By default, Netdata should not use more than 1GB of disk space, most of which is dedicated for storing metric data and
-metadata. For typical installations collecting 2000 - 3000 metrics, this storage should provide a few days of
+By default, Netdata shouldn’t use more than 1GB of disk space, most of which is dedicated to storing metric data and
+metadata. For typical installations collecting 2000–3000 metrics, this storage should provide a few days of
high-resolution retention (per second), about a month of mid-resolution retention (per minute) and more than a year of
low-resolution retention (per hour).

-Netdata spreads I/O operations across time. For typical standalone installations there should be a few write operations
-every 5-10 seconds of a few kilobytes each, occasionally up to 1MB. In addition, under heavy load, collectors that
+Netdata spreads I/O operations across time. For typical standalone installations, there should be a few write operations
+every 5–10 seconds of a few kilobytes each, occasionally up to 1MB. In addition, under a heavy load, collectors that
require disk I/O may stop and show gaps in charts.

-To optimize your disk footprint in any aspect described below you can:
+To optimize your disk footprint in any aspect described below, you can:

-
-To configure retention, you can:
+To configure retention, you can:

1. [Change how long Netdata stores metrics](/docs/netdata-agent/configuration/optimizing-metrics-database/change-metrics-storage.md).

To control disk I/O:

1. [Use a different metric storage database](/src/database/README.md),

-
Minimize deployment impact on the production system by optimizing disk footprint:

1. [Using streaming and replication](#use-streaming-and-replication)
@@ -118,7 +116,7 @@ and makes it easier to configure or disable alerts and agent notifications.

The parents by default run health checks for each child, as long as the child is connected (the details are in
`stream.conf`). On the child nodes you should add to `netdata.conf` the following:

-```conf
+```text
[health]
enabled = no
```

@@ -131,19 +129,18 @@ See [using a different metric storage database](/src/database/README.md).

If you know that you don't need an [entire plugin or a specific collector](/src/collectors/README.md#collector-architecture-and-terminology),
you can disable any of them. Keep in mind that if a plugin/collector has nothing to do, it simply shuts down and doesn’t consume system resources.
You will only improve the Agent's performance by disabling plugins/collectors that are
actively collecting metrics.
 
Open `netdata.conf` and scroll down to the `[plugins]` section. To disable any plugin, uncomment it and set the value
to `no`. For example, to explicitly keep the `proc` and `go.d` plugins enabled while disabling `python.d` and `charts.d`:
 
-```conf
+```text
[plugins]
    proc = yes
-   python.d = no
-   charts.d = no
-   go.d = yes
+   python.d = no
+   charts.d = no
+   go.d = yes
```
 
Disable specific collectors by opening their respective plugin configuration files, uncommenting the line for the
@@ -157,11 +154,11 @@ sudo ./edit-config charts.d.conf
 
For example, to disable a few Python collectors:
 
-```conf
+```text
modules:
-  apache: no
-  dockerd: no
-  fail2ban: no
+  apache: no
+  dockerd: no
+  fail2ban: no
```
 
## Reduce collection frequency
 
@@ -181,7 +178,7 @@ If you change this to `2`, Netdata enforces a minimum `update every` setting of
other second, which will effectively halve CPU utilization. Set this to `5` or `10` to collect metrics every 5 or 10
seconds, respectively.
 
-```conf
+```text
[global]
    update every = 5
```
 
@@ -199,7 +196,7 @@ an [internal_plugin/collector](/src/collectors/README.md#collector-architecture-
open `netdata.conf` and find the appropriate section. For example, to reduce the frequency of the `apps` plugin, which
collects and visualizes metrics on application resource utilization:
 
-```conf
+```text
[plugin:apps]
    update every = 5
```
 
@@ -208,7 +205,7 @@ To [configure an individual collector](/src/collectors/REFERENCE.md#configure-a-
open its specific configuration file with `edit-config` and look for the `update_every` setting. For example, to reduce
the frequency of the `nginx` collector, run `sudo ./edit-config go.d/nginx.conf`:
 
-```conf
+```text
# [ GLOBAL ]
update_every: 10
```
 
@@ -229,7 +226,7 @@ on [streaming and replication](/docs/observability-centralization-points/README.
 
Automated anomaly detection may be a powerful tool, but we recommend enabling it only on Netdata parents that sit
outside your production infrastructure, or if you have CPU and memory to spare. You can disable ML with the following:
 
-```conf
+```text
[ml]
    enabled = no
```
 
@@ -251,16 +248,15 @@ looking at the local Agent dashboard.
 
To disable gzip compression, open `netdata.conf` and find the `[web]` section:
 
-```conf
+```text
[web]
    enable gzip compression = no
```
 
Or to lower the default compression level:
 
-```conf
+```text
[web]
    enable gzip compression = yes
    gzip compression level = 1
```
-
diff --git a/docs/netdata-agent/configuration/optimizing-metrics-database/README.md b/docs/netdata-agent/configuration/optimizing-metrics-database/README.md
index fdbd3b690..c5769ccd4 100644
--- a/docs/netdata-agent/configuration/optimizing-metrics-database/README.md
+++ b/docs/netdata-agent/configuration/optimizing-metrics-database/README.md
@@ -1,3 +1,3 @@
# Optimizing Metrics Database Overview
 
-This section contains documentation to help you understand how the metrics DB works, understand the key features and configure them to suit your needs. \ No newline at end of file
+This section contains documentation to help you understand how the metrics DB works, understand the key features and configure them to suit your needs.
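+
+The retention and performance options discussed in this section all live in `netdata.conf`. As a minimal sketch, assuming a default installation with its config directory at `/etc/netdata`, you can open the file safely with the bundled helper:
+
+```sh
+cd /etc/netdata    # or wherever your Netdata config directory is
+sudo ./edit-config netdata.conf
+```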
diff --git a/docs/netdata-agent/configuration/optimizing-metrics-database/change-metrics-storage.md b/docs/netdata-agent/configuration/optimizing-metrics-database/change-metrics-storage.md index 8a8659eff..2282cbc44 100644 --- a/docs/netdata-agent/configuration/optimizing-metrics-database/change-metrics-storage.md +++ b/docs/netdata-agent/configuration/optimizing-metrics-database/change-metrics-storage.md @@ -7,9 +7,9 @@ space**. This provides greater control and helps you optimize storage usage for | Tier | Resolution | Time Limit | Size Limit (min 256 MB) | |:----:|:-------------------:|:----------:|:-----------------------:| -| 0 | high (per second) | 14 days | 1 GiB | -| 1 | middle (per minute) | 3 months | 1 GiB | -| 2 | low (per hour) | 2 years | 1 GiB | +| 0 | high (per second) | 14d | 1 GiB | +| 1 | middle (per minute) | 3mo | 1 GiB | +| 2 | low (per hour) | 2y | 1 GiB | > **Note**: If a user sets a disk space size less than 256 MB for a tier, Netdata will automatically adjust it to 256 MB. @@ -17,7 +17,7 @@ With these defaults, Netdata requires approximately 4 GiB of storage space (incl ## Retention Settings -> **In a parent-child setup**, these settings manage the shared storage space utilized by the Netdata parent agent for +> **In a parent-child setup**, these settings manage the shared storage space used by the Netdata parent agent for > storing metrics collected by both the parent and its child nodes. You can fine-tune retention for each tier by setting a time limit or size limit. Setting a limit to 0 disables it, @@ -32,22 +32,22 @@ retention strategies as shown in the table below: You can change these limits in `netdata.conf`: -``` +```text [db] - mode = dbengine + mode = dbengine storage tiers = 3 # Tier 0, per second data. Set to 0 for no limit. - dbengine tier 0 disk space MB = 1024 - dbengine tier 0 retention days = 14 + dbengine tier 0 retention size = 1GiB + dbengine tier 0 retention time = 14d # Tier 1, per minute data. Set to 0 for no limit. - dbengine tier 1 disk space MB = 1024 - dbengine tier 1 retention days = 90 + dbengine tier 1 retention size = 1GiB + dbengine tier 1 retention time = 3mo # Tier 2, per hour data. Set to 0 for no limit. - dbengine tier 2 disk space MB = 1024 - dbengine tier 2 retention days = 730 + dbengine tier 2 retention size = 1GiB + dbengine tier 2 retention time = 2y ``` ## Monitoring Retention Utilization @@ -58,6 +58,24 @@ your storage space (disk space limits) and time (time limits) are used for metri ## Legacy configuration +### v1.99.0 and prior + +Netdata prior to v2 supports the following configuration options in `netdata.conf`. +They have the same defaults as the latest v2, but the unit of each value is given in the option name, not at the value. + +```text +storage tiers = 3 +# Tier 0, per second data. Set to 0 for no limit. +dbengine tier 0 disk space MB = 1024 +dbengine tier 0 retention days = 14 +# Tier 1, per minute data. Set to 0 for no limit. +dbengine tier 1 disk space MB = 1024 +dbengine tier 1 retention days = 90 +# Tier 2, per hour data. Set to 0 for no limit. +dbengine tier 2 disk space MB = 1024 +dbengine tier 2 retention days = 730 +``` + ### v1.45.6 and prior Netdata versions prior to v1.46.0 relied on a disk space-based retention. @@ -72,17 +90,14 @@ Netdata versions prior to v1.46.0 relied on a disk space-based retention. 
You can change these limits in `netdata.conf`: -``` +```text [db] - mode = dbengine + mode = dbengine storage tiers = 3 - # Tier 0, per second data dbengine multihost disk space MB = 256 - # Tier 1, per minute data dbengine tier 1 multihost disk space MB = 1024 - # Tier 2, per hour data dbengine tier 2 multihost disk space MB = 1024 ``` @@ -96,7 +111,7 @@ for the parent node and all of its children. To configure the database engine, look for the `page cache size MB` and `dbengine multihost disk space MB` settings in the `[db]` section of your `netdata.conf`. -```conf +```text [db] dbengine page cache size MB = 32 dbengine multihost disk space MB = 256 diff --git a/docs/netdata-agent/configuration/organize-systems-metrics-and-alerts.md b/docs/netdata-agent/configuration/organize-systems-metrics-and-alerts.md index b0094a60f..f7f56279b 100644 --- a/docs/netdata-agent/configuration/organize-systems-metrics-and-alerts.md +++ b/docs/netdata-agent/configuration/organize-systems-metrics-and-alerts.md @@ -1,49 +1,51 @@ # Organize systems, metrics, and alerts When you use Netdata to monitor and troubleshoot an entire infrastructure, you need sophisticated ways of keeping everything organized. -Netdata allows to organize your observability infrastructure with Spaces, Rooms, virtual nodes, host labels, and metric labels. +Netdata allows organizing your observability infrastructure with Spaces, Rooms, virtual nodes, host labels, and metric labels. ## Spaces and Rooms -[Spaces](/docs/netdata-cloud/organize-your-infrastructure-invite-your-team.md#netdata-cloud-spaces) are used for organization-level or infrastructure-level +[Spaces](/docs/netdata-cloud/organize-your-infrastructure-invite-your-team.md#netdata-cloud-spaces) are used for organization-level or infrastructure-level grouping of nodes and people. A node can only appear in a single space, while people can have access to multiple spaces. -The [Rooms](/docs/netdata-cloud/organize-your-infrastructure-invite-your-team.md#netdata-cloud-rooms) in a space bring together nodes and people in -collaboration areas. Rooms can also be used for fine-tuned -[role based access control](/docs/netdata-cloud/authentication-and-authorization/role-based-access-model.md). +The [Rooms](/docs/netdata-cloud/organize-your-infrastructure-invite-your-team.md#netdata-cloud-rooms) in a space bring together nodes and people in +collaboration areas. Rooms can also be used for fine-tuned +[role-based access control](/docs/netdata-cloud/authentication-and-authorization/role-based-access-model.md). ## Virtual nodes -Netdata’s virtual nodes functionality allows you to define nodes in configuration files and have them be treated as regular nodes -in all of the UI, dashboards, tabs, filters etc. For example, you can create a virtual node each for all your Windows machines -and monitor them as discrete entities. Virtual nodes can help you simplify your infrastructure monitoring and focus on the +Netdata’s virtual nodes functionality allows you to define nodes in configuration files and have them be treated as regular nodes +in all the UI, dashboards, tabs, filters, etc. For example, you can create a virtual node each for all your Windows machines +and monitor them as discrete entities. Virtual nodes can help you simplify your infrastructure monitoring and focus on the individual node that matters. 
To define your windows server as a virtual node you need to: - * Define virtual nodes in `/etc/netdata/vnodes/vnodes.conf` +* Define virtual nodes in `/etc/netdata/vnodes/vnodes.conf` ```yaml - hostname: win_server1 guid: ``` - Just remember to use a valid guid (On Linux you can use `uuidgen` command to generate one, on Windows just use the `[guid]::NewGuid()` command in PowerShell) - - * Add the vnode config to the data collection job. e.g. in `go.d/windows.conf`: + + Just remember to use a valid guid (On Linux you can use `uuidgen` command to generate one, on Windows just use the `[guid]::NewGuid()` command in PowerShell) + +* Add the vnode config to the data collection job. e.g., in `go.d/windows.conf`: + ```yaml jobs: - name: win_server1 vnode: win_server1 url: http://203.0.113.10:9182/metrics ``` - + ## Host labels Host labels can be extremely useful when: -- You need alerts that adapt to the system's purpose -- You need properly-labeled metrics archiving so you can sort, correlate, and mash-up your data to your heart's content. -- You need to keep tabs on ephemeral Docker containers in a Kubernetes cluster. +* You need alerts that adapt to the system's purpose +* You need properly labeled metrics archiving so you can sort, correlate, and mash-up your data to your heart's content. +* You need to keep tabs on ephemeral Docker containers in a Kubernetes cluster. Let's take a peek into how to create host labels and apply them across a few of Netdata's features to give you more organization power over your infrastructure. @@ -56,16 +58,17 @@ parent-child status, and more. They capture the following: -- Kernel version -- Operating system name and version -- CPU architecture, system cores, CPU frequency, RAM, and disk space -- Whether Netdata is running inside of a container, and if so, the OS and hardware details about the container's host -- Whether Netdata is running inside K8s node -- What virtualization layer the system runs on top of, if any -- Whether the system is a streaming parent or child +* Kernel version +* Operating system name and version +* CPU architecture, system cores, CPU frequency, RAM, and disk space +* Whether Netdata is running inside of a container, and if so, the OS and hardware details about the container's host +* Whether Netdata is running inside K8s node +* What virtualization layer the system runs on top of, if any +* Whether the system is a streaming parent or child If you want to organize your systems without manually creating host labels, try the automatic labels in some of the features below. You can see them under `http://HOST-IP:19999/api/v1/info`, beginning with an underscore `_`. + ```json { ... @@ -87,7 +90,7 @@ sudo ./edit-config netdata.conf Create a new `[host labels]` section defining a new host label and its value for the system in question. Make sure not to violate any of the [host label naming rules](/docs/netdata-agent/configuration/common-configuration-changes.md#organize-nodes-with-host-labels). -```conf +```text [host labels] type = webserver location = us-seattle @@ -126,7 +129,6 @@ read the status of your agent. For example, from a VPS system running Debian 10: } ``` - ### Host labels in streaming You may have noticed the `_is_parent` and `_is_child` automatic labels from above. Host labels are also now @@ -134,12 +136,11 @@ streamed from a child to its parent node, which concentrates an entire infrastru and virtualization information in one place: the parent. 
Now, if you'd like to remind yourself of how much RAM a certain child node has, you can access
-`http://localhost:19999/host/CHILD_HOSTNAME/api/v1/info` and reference the automatically-generated host labels from the
+`http://localhost:19999/host/CHILD_HOSTNAME/api/v1/info` and reference the automatically generated host labels from the
child system. It's a vastly simplified way of accessing critical information about your infrastructure.
 
> ⚠️ Because automatic labels for child nodes are accessible via API calls, and contain sensitive information like
-> kernel and operating system versions, you should secure streaming connections with SSL. See the [streaming
-> documentation](/src/streaming/README.md#securing-streaming-communications) for details. You may also want to use
+> kernel and operating system versions, you should secure streaming connections with SSL. See the [streaming documentation](/src/streaming/README.md#securing-streaming-with-tlsssl) for details. You may also want to use
> [access lists](/src/web/server/README.md#access-lists) or [expose the API only to LAN/localhost
> connections](/docs/netdata-agent/securing-netdata-agents.md#expose-netdata-only-in-a-private-lan).
 
@@ -153,23 +154,23 @@ alerts to them.
 
For example, let's use the configuration example from earlier:
 
-```conf
+```text
[host labels]
    type = webserver
    location = us-seattle
    installed = 20200218
```
 
-You could now create a new health entity (checking if disk space will run out soon) that applies only to any host
+You could now create a new health entity (checking if disk space runs out soon) that applies only to any host
labeled `webserver`:
 
```yaml
 template: disk_fill_rate
-      on: disk.space
-  lookup: max -1s at -30m unaligned of avail
-    calc: ($this - $avail) / (30 * 60)
-   every: 15s
- host labels: type = webserver
+     on: disk.space
+ lookup: max -1s at -30m unaligned of avail
+   calc: ($this - $avail) / (30 * 60)
+  every: 15s
+ host labels: type = webserver
```
 
Or, by using one of the automatic labels, for only webserver systems running a specific OS:
 
@@ -198,9 +199,9 @@ documentation](/src/health/REFERENCE.md#alert-line-host-labels) for more details
 
If you have enabled any metrics exporting via our experimental [exporters](/src/exporting/README.md), any new host
labels you created manually are sent to the destination database alongside metrics. You can change this behavior by
-editing `exporting.conf`, and you can even send automatically-generated labels on with exported metrics.
+editing `exporting.conf`, and you can even send automatically generated labels on with exported metrics.
 
-```conf
+```text
[exporting:global]
    enabled = yes
    send configured labels = yes
@@ -209,7 +210,7 @@ send automatic labels = no
 
You can also change this behavior per exporting connection:
 
-```conf
+```text
[opentsdb:my_instance3]
    enabled = yes
    destination = localhost:4242
@@ -227,27 +228,27 @@ more about exporting, read the [documentation](/src/exporting/README.md).
 
The Netdata aggregate charts allow you to filter and group metrics based on label name-value pairs.
 
-All go.d plugin collectors support the specification of labels at the "collection job" level. Some collectors come with out of the box
-labels (e.g. generic Prometheus collector, Kubernetes, Docker and more). But you can also add your own custom labels, by configuring
-the data collection jobs.
+All go.d plugin collectors support the specification of labels at the "collection job" level. Some collectors come with out-of-the-box
+labels (e.g.
generic Prometheus collector, Kubernetes, Docker and more). But you can also add your own custom labels by configuring
+the data collection jobs.
 
-For example, suppose we have a single Netdata agent, collecting data from two remote Apache web servers, located in different data centers. 
+For example, suppose we have a single Netdata agent, collecting data from two remote Apache web servers, located in different data centers.
The web servers are load balanced and provide access to the service "Payments".
 
You can define the following in `go.d.conf`, to be able to group the web requests by service or location:
 
-```
+```yaml
jobs:
-  - name: mywebserver1
+  - name: my_webserver1
     url: http://host1/server-status?auto
     labels:
       service: "Payments"
       location: "Atlanta"
-  - name: mywebserver2
+  - name: my_webserver2
     url: http://host2/server-status?auto
     labels:
       service: "Payments"
       location: "New York"
```
 
-Of course you may define as many custom label/value pairs as you like, in as many data collection jobs you need.
+Of course, you may define as many custom label/value pairs as you like, in as many data collection jobs as you need.
 
diff --git a/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/README.md b/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/README.md
index 00fe63af1..a0810bb51 100644
--- a/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/README.md
+++ b/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/README.md
@@ -1,7 +1,7 @@
# Running the Netdata Agent behind a reverse proxy
 
-If you need to access a Netdata agent's user interface or API in a production environment we recommend you put Netdata behind
-another web server and secure access to the dashboard via SSL, user authentication and firewall rules. 
+If you need to access a Netdata agent's user interface or API in a production environment we recommend you put Netdata behind
+another web server and secure access to the dashboard via SSL, user authentication and firewall rules.
A dedicated web server also provides more robustness and capabilities than the Agent's
[internal web server](/src/web/README.md).
 
@@ -12,7 +12,7 @@ We have documented running behind
[Lighttpd](/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-lighttpd.md),
[Caddy](/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-caddy.md), and
[H2O](/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-h2o.md).
-If you prefer a different web server, we suggest you follow the documentation for nginx and tell us how you did it 
+If you prefer a different web server, we suggest you follow the documentation for nginx and tell us how you did it
by adding your own "Running behind webserverX" document.
 
When you run Netdata behind a reverse proxy, we recommend you firewall protect all your Netdata servers, so that only the web server IP will be allowed to directly access Netdata. To do this, run this on each of your servers (or use your firewall manager):
 
@@ -26,9 +26,9 @@ The above will prevent anyone except your web server to access a Netdata dashboa
 
You can also use `netdata.conf`:
 
-```
+```text
[web]
-    allow connections from = localhost 1.2.3.4
+    allow connections from = localhost 1.2.3.4
```
 
Of course, you can add more IPs.
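+
+The exact firewall rules depend on your tooling. As an illustrative sketch using `iptables`, with `1.2.3.4` standing in for your web server's IP address:
+
+```sh
+# Allow only the web server to reach Netdata's port, and drop everything else
+iptables -A INPUT -p tcp --dport 19999 -s 1.2.3.4 -j ACCEPT
+iptables -A INPUT -p tcp --dport 19999 -j DROP
+```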
diff --git a/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-apache.md b/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-apache.md index 1f7274d5c..23e4ae233 100644 --- a/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-apache.md +++ b/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-apache.md @@ -1,4 +1,4 @@ -# Netdata via Apache's mod_proxy +# Running Netdata behind Apache's mod_proxy Below you can find instructions for configuring an apache server to: @@ -29,6 +29,7 @@ Also, enable the rewrite module: ```sh sudo a2enmod rewrite ``` + ## Netdata on an existing virtual host On any **existing** and already **working** apache virtual host, you can redirect requests for URL `/netdata/` to one or more Netdata servers. @@ -37,29 +38,29 @@ On any **existing** and already **working** apache virtual host, you can redirec Add the following on top of any existing virtual host. It will allow you to access Netdata as `http://virtual.host/netdata/`. -```conf +```text - RewriteEngine On - ProxyRequests Off - ProxyPreserveHost On + RewriteEngine On + ProxyRequests Off + ProxyPreserveHost On + + + Require all granted + - - Require all granted - + # Local Netdata server accessed with '/netdata/', at localhost:19999 + ProxyPass "/netdata/" "http://localhost:19999/" connectiontimeout=5 timeout=30 keepalive=on + ProxyPassReverse "/netdata/" "http://localhost:19999/" - # Local Netdata server accessed with '/netdata/', at localhost:19999 - ProxyPass "/netdata/" "http://localhost:19999/" connectiontimeout=5 timeout=30 keepalive=on - ProxyPassReverse "/netdata/" "http://localhost:19999/" + # if the user did not give the trailing /, add it + # for HTTP (if the virtualhost is HTTP, use this) + RewriteRule ^/netdata$ http://%{HTTP_HOST}/netdata/ [L,R=301] + # for HTTPS (if the virtualhost is HTTPS, use this) + #RewriteRule ^/netdata$ https://%{HTTP_HOST}/netdata/ [L,R=301] - # if the user did not give the trailing /, add it - # for HTTP (if the virtualhost is HTTP, use this) - RewriteRule ^/netdata$ http://%{HTTP_HOST}/netdata/ [L,R=301] - # for HTTPS (if the virtualhost is HTTPS, use this) - #RewriteRule ^/netdata$ https://%{HTTP_HOST}/netdata/ [L,R=301] + # rest of virtual host config here - # rest of virtual host config here - ``` @@ -67,16 +68,16 @@ Add the following on top of any existing virtual host. It will allow you to acce Add the following on top of any existing virtual host. It will allow you to access multiple Netdata as `http://virtual.host/netdata/HOSTNAME/`, where `HOSTNAME` is the hostname of any other Netdata server you have (to access the `localhost` Netdata, use `http://virtual.host/netdata/localhost/`). -```conf +```text - RewriteEngine On - ProxyRequests Off - ProxyPreserveHost On + RewriteEngine On + ProxyRequests Off + ProxyPreserveHost On - - Require all granted - + + Require all granted + # proxy any host, on port 19999 ProxyPassMatch "^/netdata/([A-Za-z0-9\._-]+)/(.*)" "http://$1:19999/$2" connectiontimeout=5 timeout=30 keepalive=on @@ -87,8 +88,8 @@ Add the following on top of any existing virtual host. 
It will allow you to acce # for HTTPS (if the virtualhost is HTTPS, use this) RewriteRule "^/netdata/([A-Za-z0-9\._-]+)$" https://%{HTTP_HOST}/netdata/$1/ [L,R=301] - # rest of virtual host config here - + # rest of virtual host config here + ``` @@ -97,7 +98,7 @@ Add the following on top of any existing virtual host. It will allow you to acce If you want to control the servers your users can connect to, replace the `ProxyPassMatch` line with the following. This allows only `server1`, `server2`, `server3` and `server4`. -``` +```text ProxyPassMatch "^/netdata/(server1|server2|server3|server4)/(.*)" "http://$1:19999/$2" connectiontimeout=5 timeout=30 keepalive=on ``` @@ -113,26 +114,28 @@ nano /etc/apache2/sites-available/netdata.conf with this content: -```conf +```text - ProxyRequests Off - ProxyPreserveHost On - - ServerName netdata.domain.tld - - Require all granted - + ProxyRequests Off + ProxyPreserveHost On + + ServerName netdata.domain.tld + + + Require all granted + - ProxyPass "/" "http://localhost:19999/" connectiontimeout=5 timeout=30 keepalive=on - ProxyPassReverse "/" "http://localhost:19999/" + ProxyPass "/" "http://localhost:19999/" connectiontimeout=5 timeout=30 keepalive=on + ProxyPassReverse "/" "http://localhost:19999/" + + ErrorLog ${APACHE_LOG_DIR}/netdata-error.log + CustomLog ${APACHE_LOG_DIR}/netdata-access.log combined - ErrorLog ${APACHE_LOG_DIR}/netdata-error.log - CustomLog ${APACHE_LOG_DIR}/netdata-access.log combined ``` -Enable the VirtualHost: +Enable the VirtualHost: ```sh sudo a2ensite netdata.conf && service apache2 reload @@ -142,15 +145,15 @@ sudo a2ensite netdata.conf && service apache2 reload _Assuming the main goal is to make Netdata running in HTTPS._ -1. Make a subdomain for Netdata on which you enable and force HTTPS - You can use a free Let's Encrypt certificate -2. Go to "Apache & nginx Settings", and in the following section, add: - -```conf -RewriteEngine on -RewriteRule (.*) http://localhost:19999/$1 [P,L] -``` +1. Make a subdomain for Netdata on which you enable and force HTTPS - You can use a free Let's Encrypt certificate +2. Go to "Apache & nginx Settings", and in the following section, add: -3. Optional: If your server is remote, then just replace "localhost" with your actual hostname or IP, it just works. + ```text + RewriteEngine on + RewriteRule (.*) http://localhost:19999/$1 [P,L] + ``` + +3. Optional: If your server is remote, then just replace "localhost" with your actual hostname or IP, it just works. Repeat the operation for as many servers as you need. @@ -165,49 +168,49 @@ Then, generate password for user `netdata`, using `htpasswd -c /etc/apache2/.htp **Apache 2.2 Example:**\ Modify the virtual host with these: -```conf - # replace the section - - Order deny,allow - Allow from all - - - # add a section - - AuthType Basic - AuthName "Protected site" - AuthUserFile /etc/apache2/.htpasswd - Require valid-user - Order deny,allow - Allow from all - +```text + # replace the section + + Order deny,allow + Allow from all + + + # add a section + + AuthType Basic + AuthName "Protected site" + AuthUserFile /etc/apache2/.htpasswd + Require valid-user + Order deny,allow + Allow from all + ``` Specify `Location /` if Netdata is running on dedicated virtual host. 
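+
+If you haven't created the password file yet, a minimal sketch (the `/etc/apache2/.htpasswd` path and the `netdata` username are the examples used on this page; adjust them to your setup):
+
+```sh
+# Create the file and add the first user; omit -c when adding further users
+sudo htpasswd -c /etc/apache2/.htpasswd netdata
+```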
**Apache 2.4 (dedicated virtual host) Example:** -```conf +```text - RewriteEngine On - ProxyRequests Off - ProxyPreserveHost On - - ServerName netdata.domain.tld - - - AllowOverride None - AuthType Basic - AuthName "Protected site" - AuthUserFile /etc/apache2/.htpasswd - Require valid-user - - - ProxyPass "/" "http://localhost:19999/" connectiontimeout=5 timeout=30 keepalive=on - ProxyPassReverse "/" "http://localhost:19999/" - - ErrorLog ${APACHE_LOG_DIR}/netdata-error.log - CustomLog ${APACHE_LOG_DIR}/netdata-access.log combined + RewriteEngine On + ProxyRequests Off + ProxyPreserveHost On + + ServerName netdata.domain.tld + + + AllowOverride None + AuthType Basic + AuthName "Protected site" + AuthUserFile /etc/apache2/.htpasswd + Require valid-user + + + ProxyPass "/" "http://localhost:19999/" connectiontimeout=5 timeout=30 keepalive=on + ProxyPassReverse "/" "http://localhost:19999/" + + ErrorLog ${APACHE_LOG_DIR}/netdata-error.log + CustomLog ${APACHE_LOG_DIR}/netdata-access.log combined ``` @@ -217,8 +220,8 @@ Note: Changes are applied by reloading or restarting Apache. If you want to enable CSP within your Apache, you should consider some special requirements of the headers. Modify your configuration like that: -``` - Header always set Content-Security-Policy "default-src http: 'unsafe-inline' 'self' 'unsafe-eval'; script-src http: 'unsafe-inline' 'self' 'unsafe-eval'; style-src http: 'self' 'unsafe-inline'" +```text + Header always set Content-Security-Policy "default-src http: 'unsafe-inline' 'self' 'unsafe-eval'; script-src http: 'unsafe-inline' 'self' 'unsafe-eval'; style-src http: 'self' 'unsafe-inline'" ``` Note: Changes are applied by reloading or restarting Apache. @@ -242,7 +245,7 @@ exceed that threshold, and `mod_evasive` will add your IP address to a blocklist Our users have found success by setting `DOSPageCount` to `30`. Try this, and raise the value if you continue to see 403 errors while accessing the dashboard. -```conf +```text DOSPageCount 30 ``` @@ -255,100 +258,92 @@ To adjust the `DOSPageCount` for a specific virtual host, open your virtual host `/etc/httpd/conf/sites-available/my-domain.conf` or `/etc/apache2/sites-available/my-domain.conf` and add the following: -```conf +```text - ... - # Increase the DOSPageCount to prevent 403 errors and IP addresses being blocked. - - DOSPageCount 30 - + ... + # Increase the DOSPageCount to prevent 403 errors and IP addresses being blocked. + + DOSPageCount 30 + ``` See issues [#2011](https://github.com/netdata/netdata/issues/2011) and [#7658](https://github.com/netdata/netdata/issues/7568) for more information. -# Netdata configuration +## Netdata configuration You might edit `/etc/netdata/netdata.conf` to optimize your setup a bit. For applying these changes you need to restart Netdata. -## Response compression +### Response compression If you plan to use Netdata exclusively via apache, you can gain some performance by preventing double compression of its output (Netdata compresses its response, apache re-compresses it) by editing `/etc/netdata/netdata.conf` and setting: -``` +```text [web] enable gzip compression = no ``` Once you disable compression at Netdata (and restart it), please verify you receive compressed responses from apache (it is important to receive compressed responses - the charts will be more snappy). -## Limit direct access to Netdata +### Limit direct access to Netdata You would also need to instruct Netdata to listen only on `localhost`, `127.0.0.1` or `::1`. 
-``` +```text [web] bind to = localhost ``` or -``` +```text [web] bind to = 127.0.0.1 ``` or -``` +```text [web] bind to = ::1 ``` - - You can also use a unix domain socket. This will also provide a faster route between apache and Netdata: -``` +```text [web] bind to = unix:/tmp/netdata.sock ``` Apache 2.4.24+ can not read from `/tmp` so create your socket in `/var/run/netdata` -``` +```text [web] bind to = unix:/var/run/netdata/netdata.sock ``` -_note: Netdata v1.8+ support unix domain sockets_ - At the apache side, prepend the 2nd argument to `ProxyPass` with `unix:/tmp/netdata.sock|`, like this: -``` +```text ProxyPass "/netdata/" "unix:/tmp/netdata.sock|http://localhost:19999/" connectiontimeout=5 timeout=30 keepalive=on ``` - - If your apache server is not on localhost, you can set: -``` +```text [web] bind to = * allow connections from = IP_OF_APACHE_SERVER ``` -*note: Netdata v1.9+ support `allow connections from`* - `allow connections from` accepts [Netdata simple patterns](/src/libnetdata/simple_pattern/README.md) to match against the connection IP address. ## Prevent the double access.log apache logs accesses and Netdata logs them too. You can prevent Netdata from generating its access log, by setting this in `/etc/netdata/netdata.conf`: -``` +```text [logs] access = off ``` @@ -357,7 +352,5 @@ apache logs accesses and Netdata logs them too. You can prevent Netdata from gen Make sure the requests reach Netdata, by examining `/var/log/netdata/access.log`. -1. if the requests do not reach Netdata, your apache does not forward them. -2. if the requests reach Netdata but the URLs are wrong, you have not re-written them properly. - - +1. if the requests do not reach Netdata, your apache does not forward them. +2. if the requests reach Netdata but the URLs are wrong, you have not re-written them properly. diff --git a/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-caddy.md b/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-caddy.md index b7608b309..f43a7a278 100644 --- a/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-caddy.md +++ b/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-caddy.md @@ -1,15 +1,6 @@ - +# Running Netdata behind Caddy -# Netdata via Caddy - -To run Netdata via [Caddy v2 proxying,](https://caddyserver.com/docs/caddyfile/directives/reverse_proxy) set your Caddyfile up like this: +To run Netdata via [Caddy v2 reverse proxy,](https://caddyserver.com/docs/caddyfile/directives/reverse_proxy) set your Caddyfile up like this: ```caddyfile netdata.domain.tld { @@ -34,5 +25,3 @@ netdata.domain.tld { You would also need to instruct Netdata to listen only to `127.0.0.1` or `::1`. To limit access to Netdata only from localhost, set `bind socket to IP = 127.0.0.1` or `bind socket to IP = ::1` in `/etc/netdata/netdata.conf`. 
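+
+If you'd rather serve Netdata from a subpath of an existing site instead of a dedicated domain, here is a minimal sketch for Caddy v2 (the `/netdata/*` prefix is only an example):
+
+```caddyfile
+example.com {
+    # handle_path strips the /netdata prefix before proxying
+    handle_path /netdata/* {
+        reverse_proxy localhost:19999
+    }
+}
+```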
-
-
diff --git a/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-h2o.md b/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-h2o.md
index 276b72e8b..f2dc45b82 100644
--- a/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-h2o.md
+++ b/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-h2o.md
@@ -1,12 +1,3 @@
-
-
# Running Netdata behind H2O
 
[H2O](https://h2o.examp1e.net/) is a new generation HTTP server that provides quicker response to users with less CPU utilization when compared to older generations of web servers.
 
It is notable for having much simpler configuration than many popular HTTP servers.
 
## Why H2O
 
-- Sane configuration defaults mean that typical configurations are very minimalistic and easy to work with. 
+- Sane configuration defaults mean that typical configurations are very minimalistic and easy to work with.
 
-- Native support for HTTP/2 provides improved performance when accessing the Netdata dashboard remotely. 
+- Native support for HTTP/2 provides improved performance when accessing the Netdata dashboard remotely.
 
-- Password protect access to the Netdata dashboard without requiring Netdata Cloud. 
+- Password protect access to the Netdata dashboard without requiring Netdata Cloud.
 
-## H2O configuration file. 
+## H2O configuration file
 
-On most systems, the H2O configuration is found under `/etc/h2o`. H2O uses [YAML 1.1](https://yaml.org/spec/1.1/), with a few special extensions, for it’s configuration files, with the main configuration file being `/etc/h2o/h2o.conf`. 
+On most systems, the H2O configuration is found under `/etc/h2o`. H2O uses [YAML 1.1](https://yaml.org/spec/1.1/), with a few special extensions, for its configuration files, with the main configuration file being `/etc/h2o/h2o.conf`.
 
You can edit the H2O configuration file with Nano, Vim or any other text editors with which you are
comfortable.
 
After making changes to the configuration files, perform the following:
 
-- Test the configuration with `h2o -m test -c /etc/h2o/h2o.conf` 
+- Test the configuration with `h2o -m test -c /etc/h2o/h2o.conf`
 
-- Restart H2O to apply tha changes with `/etc/init.d/h2o restart` or `service h2o restart` 
+- Restart H2O to apply the changes with `/etc/init.d/h2o restart` or `service h2o restart`
 
## Ways to access Netdata via H2O
 
@@ -52,7 +43,7 @@ hosts:
 
### As a subfolder of an existing virtual host
 
-This method is recommended when Netdata is to be served from a subfolder (or directory). 
+This method is recommended when Netdata is to be served from a subfolder (or directory).
In this case, the virtual host `netdata.example.com` already exists and Netdata has to be accessed via `netdata.example.com/netdata/`.
 
```yaml
hosts:
 
@@ -72,7 +63,7 @@ hosts:
 
### As a subfolder for multiple Netdata servers, via one H2O instance
 
-This is the recommended configuration when one H2O instance will be used to manage multiple Netdata servers via subfolders. 
+This is the recommended configuration when one H2O instance will be used to manage multiple Netdata servers via sub-folders.
 
```yaml
hosts:
 
@@ -100,12 +91,12 @@ Of course you can add as many backend servers as you like.
Using the above, you access Netdata on the backend servers, like this:
 
-- `http://netdata.example.com/netdata/server1/` to reach Netdata on `198.51.100.1:19999` 
-- `http://netdata.example.com/netdata/server2/` to reach Netdata on `198.51.100.2:19999` 
+- `http://netdata.example.com/netdata/server1/` to reach Netdata on `198.51.100.1:19999`
+- `http://netdata.example.com/netdata/server2/` to reach Netdata on `198.51.100.2:19999`
 
### Encrypt the communication between H2O and Netdata
 
-In case Netdata's web server has been [configured to use TLS](/src/web/server/README.md#enabling-tls-support), it is
+In case Netdata's web server has been [configured to use TLS](/src/web/server/README.md#enable-httpstls-support), it is
necessary to specify inside the H2O configuration that the final destination is using TLS. To do this, change the
`http://` on the `proxy.reverse.url` line in your H2O configuration with `https://`
 
@@ -141,31 +132,27 @@ For more information on using basic authentication with H2O, see [their official
 
If your H2O server is on `localhost`, you can use this to ensure external access is only possible through H2O:
 
-```
+```text
[web]
    bind to = 127.0.0.1 ::1
```
 
-
-
You can also use a unix domain socket. This will provide faster communication between H2O and Netdata as well:
 
-```
+```text
[web]
    bind to = unix:/run/netdata/netdata.sock
```
 
In the H2O configuration, use a line like the following to connect to Netdata via the unix socket:
 
-```yaml
+```text
proxy.reverse.url http://[unix:/run/netdata/netdata.sock]
```
 
-
-
If your H2O server is not on localhost, you can set:
 
-```
+```text
[web]
    bind to = *
    allow connections from = IP_OF_H2O_SERVER
 
@@ -181,7 +168,7 @@ the connection IP address.
 
H2O logs accesses and Netdata logs them too. You can prevent Netdata from generating its access log, by setting
this in `/etc/netdata/netdata.conf`:
 
-```
+```text
[logs]
    access = off
```
diff --git a/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-haproxy.md b/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-haproxy.md
index 9d2aff670..04bd32838 100644
--- a/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-haproxy.md
+++ b/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-haproxy.md
@@ -1,16 +1,6 @@
-
-
-# Netdata via HAProxy
-
-> HAProxy is a free, very fast and reliable solution offering high availability, load balancing,
-> and proxying for TCP and HTTP-based applications. It is particularly suited for very high traffic websites
+# Running Netdata behind HAProxy
+
+> HAProxy is a free, very fast and reliable solution offering high availability, load balancing, and proxying for TCP and HTTP-based applications. It is particularly suited for very high traffic websites
> and powers quite a number of the world's most visited ones.
 
If Netdata is running on a host running HAProxy, rather than connecting to Netdata from a port number, a domain name can
be pointed at HAProxy, and HAProxy can redirect connections to the Netdata port. This way, you can access
Netdata at `https://example.com` or `https://example.com/netdata/`, which is a much nicer experience than `http://example.com:19999`.
To proxy requests from [HAProxy](https://github.com/haproxy/haproxy) to Netdata,
the following configuration can be used:
 
## Default Configuration
 
For all examples, set the mode to `http`
 
-```conf
+```text
defaults
    mode    http
```
 
@@ -38,7 +28,7 @@ A simple example where the base URL, say `http://example.com`, is used with no s
 
Create a frontend to receive the request.
 
-```conf
+```text
frontend http_frontend
    ## HTTP ipv4 and ipv6 on all ips ##
    bind :::80 v4v6
 
@@ -50,7 +40,7 @@ frontend http_frontend
 
Create the Netdata backend which will send requests to port `19999`.
 
-```conf
+```text
backend netdata_backend
    option       forwardfor
    server       netdata_local     127.0.0.1:19999
 
@@ -69,7 +59,7 @@ An example where the base URL is used with a subpath `/netdata/`:
 
To use a subpath, create an ACL, which will set a variable based on the subpath.
 
-```conf
+```text
frontend http_frontend
    ## HTTP ipv4 and ipv6 on all ips ##
    bind :::80 v4v6
 
@@ -92,7 +82,7 @@ frontend http_frontend
 
Same as simple example, except remove `/netdata/` with regex.
 
-```conf
+```text
backend netdata_backend
    option forwardfor
    server netdata_local 127.0.0.1:19999
 
@@ -107,14 +97,14 @@ backend netdata_backend
 
## Using TLS communication
 
-TLS can be used by adding port `443` and a cert to the frontend. 
+TLS can be used by adding port `443` and a cert to the frontend.
This example will only use Netdata if host matches example.com (replace with your domain).
 
### Frontend
 
This frontend uses a certificate list.
 
-```conf
+```text
frontend https_frontend
    ## HTTP ##
    bind :::80 v4v6
 
@@ -139,11 +129,11 @@ In the cert list file place a mapping from a certificate file to the domain used
 
`/etc/letsencrypt/certslist.txt`:
 
-```txt
+```text
example.com /etc/letsencrypt/live/example.com/example.com.pem
```
 
-The file `/etc/letsencrypt/live/example.com/example.com.pem` should contain the key and 
+The file `/etc/letsencrypt/live/example.com/example.com.pem` should contain the key and
certificate (in that order) concatenated into a `.pem` file:
 
```sh
cat /etc/letsencrypt/live/example.com/fullchain.pem \
 
@@ -156,7 +146,7 @@ cat /etc/letsencrypt/live/example.com/fullchain.pem \
 
Same as simple, except set protocol `https`.
 
-```conf
+```text
backend netdata_backend
    option forwardfor
    server netdata_local 127.0.0.1:19999
 
@@ -172,30 +162,30 @@ backend netdata_backend
 
To use basic HTTP Authentication, create an authentication list:
 
-```conf
+```text
# HTTP Auth
userlist basic-auth-list
  group is-admin
  # Plaintext password
-  user admin password passwordhere groups is-admin
+  user admin password YOUR_PASSWORD groups is-admin
```
 
You can create a hashed password using the `mkpasswd` utility.
```sh - printf "passwordhere" | mkpasswd --stdin --method=sha-256 + printf "YOUR_PASSWORD" | mkpasswd --stdin --method=sha-256 $5$l7Gk0VPIpKO$f5iEcxvjfdF11khw.utzSKqP7W.0oq8wX9nJwPLwzy1 ``` -Replace `passwordhere` with hash: +Replace `YOUR_PASSWORD` with hash: -```conf +```text user admin password $5$l7Gk0VPIpKO$f5iEcxvjfdF11khw.utzSKqP7W.0oq8wX9nJwPLwzy1 groups is-admin ``` Now add at the top of the backend: -```conf +```text acl devops-auth http_auth_group(basic-auth-list) is-admin http-request auth realm netdata_local unless devops-auth ``` @@ -204,7 +194,7 @@ http-request auth realm netdata_local unless devops-auth Full example configuration with HTTP auth over TLS with subpath: -```conf +```text global maxconn 20000 @@ -293,5 +283,3 @@ backend netdata_backend http-request set-header X-Forwarded-Port %[dst_port] http-request set-header Connection "keep-alive" ``` - - diff --git a/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-lighttpd.md b/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-lighttpd.md index 637bc0642..48b9b2c93 100644 --- a/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-lighttpd.md +++ b/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-lighttpd.md @@ -1,26 +1,17 @@ - - -# Netdata via lighttpd v1.4.x +# Running Netdata behind lighttpd v1.4.x Here is a config for accessing Netdata in a suburl via lighttpd 1.4.46 and newer: -```txt +```text $HTTP["url"] =~ "^/netdata/" { proxy.server = ( "" => ("netdata" => ( "host" => "127.0.0.1", "port" => 19999 ))) proxy.header = ( "map-urlpath" => ( "/netdata/" => "/") ) } ``` -If you have older lighttpd you have to use a chain (such as below), as explained [at this stackoverflow answer](http://stackoverflow.com/questions/14536554/lighttpd-configuration-to-proxy-rewrite-from-one-domain-to-another). +If you have older lighttpd you have to use a chain (such as below), as explained [at this Stack Overflow answer](http://stackoverflow.com/questions/14536554/lighttpd-configuration-to-proxy-rewrite-from-one-domain-to-another). -```txt +```text $HTTP["url"] =~ "^/netdata/" { proxy.server = ( "" => ("" => ( "host" => "127.0.0.1", "port" => 19998 ))) } @@ -31,19 +22,16 @@ $SERVER["socket"] == ":19998" { } ``` - - If the only thing the server is exposing via the web is Netdata (and thus no suburl rewriting required), then you can get away with just -``` +```text proxy.server = ( "" => ( ( "host" => "127.0.0.1", "port" => 19999 ))) ``` -Though if it's public facing you might then want to put some authentication on it. htdigest support -looks like: +Though if it's public facing you might then want to put some authentication on it. `htdigest` support looks like: -``` +```text auth.backend = "htdigest" auth.backend.htdigest.userfile = "/etc/lighttpd/lighttpd.htdigest" auth.require = ( "" => ( "method" => "digest", @@ -55,14 +43,12 @@ auth.require = ( "" => ( "method" => "digest", other auth methods, and more info on htdigest, can be found in lighttpd's [mod_auth docs](http://redmine.lighttpd.net/projects/lighttpd/wiki/Docs_ModAuth). - - It seems that lighttpd (or some versions of it), fail to proxy compressed web responses. To solve this issue, disable web response compression in Netdata. 
-Open `/etc/netdata/netdata.conf` and set in [global]\: +Open `/etc/netdata/netdata.conf` and set in `[global]`: -``` +```text enable web responses gzip compression = no ``` @@ -71,5 +57,3 @@ enable web responses gzip compression = no You would also need to instruct Netdata to listen only to `127.0.0.1` or `::1`. To limit access to Netdata only from localhost, set `bind socket to IP = 127.0.0.1` or `bind socket to IP = ::1` in `/etc/netdata/netdata.conf`. - - diff --git a/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-nginx.md b/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-nginx.md index f2dd137dd..c0364633a 100644 --- a/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-nginx.md +++ b/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-nginx.md @@ -2,19 +2,19 @@ ## Intro -[Nginx](https://nginx.org/en/) is an HTTP and reverse proxy server, a mail proxy server, and a generic TCP/UDP proxy server used to host websites and applications of all sizes. +[Nginx](https://nginx.org/en/) is an HTTP and reverse proxy server, a mail proxy server, and a generic TCP/UDP proxy server used to host websites and applications of all sizes. The software is known for its low impact on memory resources, high scalability, and its modular, event-driven architecture which can offer secure, predictable performance. ## Why Nginx -- By default, Nginx is fast and lightweight out of the box. +- By default, Nginx is fast and lightweight out of the box. -- Nginx is used and useful in cases when you want to access different instances of Netdata from a single server. +- Nginx is used and useful in cases when you want to access different instances of Netdata from a single server. -- Password-protect access to Netdata, until distributed authentication is implemented via the Netdata cloud Sign In mechanism. +- Password-protect access to Netdata, until distributed authentication is implemented via the Netdata cloud Sign In mechanism. -- A proxy was necessary to encrypt the communication to Netdata, until v1.16.0, which provided TLS (HTTPS) support. +- A proxy was necessary to encrypt the communication to Netdata, until v1.16.0, which provided TLS (HTTPS) support. ## Nginx configuration file @@ -22,23 +22,23 @@ All Nginx configurations can be found in the `/etc/nginx/` directory. The main c Configuration options in Nginx are known as directives. Directives are organized into groups known as blocks or contexts. The two terms can be used interchangeably. -Depending on your installation source, you’ll find an example configuration file at `/etc/nginx/conf.d/default.conf` or `etc/nginx/sites-enabled/default`, in some cases you may have to manually create the `sites-available` and `sites-enabled` directories. +Depending on your installation source, you’ll find an example configuration file at `/etc/nginx/conf.d/default.conf` or `etc/nginx/sites-enabled/default`, in some cases you may have to manually create the `sites-available` and `sites-enabled` directories. You can edit the Nginx configuration file with Nano, Vim or any other text editors you are comfortable with. After making changes to the configuration files: -- Test Nginx configuration with `nginx -t`. +- Test Nginx configuration with `nginx -t`. -- Restart Nginx to effect the change with `/etc/init.d/nginx restart` or `service nginx restart`. 
+- Restart Nginx to effect the change with `/etc/init.d/nginx restart` or `service nginx restart`. ## Ways to access Netdata via Nginx ### As a virtual host -With this method instead of `SERVER_IP_ADDRESS:19999`, the Netdata dashboard can be accessed via a human-readable URL such as `netdata.example.com` used in the configuration below. +With this method instead of `SERVER_IP_ADDRESS:19999`, the Netdata dashboard can be accessed via a human-readable URL such as `netdata.example.com` used in the configuration below. -```conf +```text upstream backend { # the Netdata server server 127.0.0.1:19999; @@ -69,10 +69,10 @@ server { ### As a subfolder to an existing virtual host -This method is recommended when Netdata is to be served from a subfolder (or directory). +This method is recommended when Netdata is to be served from a subfolder (or directory). In this case, the virtual host `netdata.example.com` already exists and Netdata has to be accessed via `netdata.example.com/netdata/`. -```conf +```text upstream netdata { server 127.0.0.1:19999; keepalive 64; @@ -112,9 +112,9 @@ server { ### As a subfolder for multiple Netdata servers, via one Nginx -This is the recommended configuration when one Nginx will be used to manage multiple Netdata servers via subfolders. +This is the recommended configuration when one Nginx will be used to manage multiple Netdata servers via sub-folders. -```conf +```text upstream backend-server1 { server 10.1.1.103:19999; keepalive 64; @@ -159,16 +159,16 @@ Of course you can add as many backend servers as you like. Using the above, you access Netdata on the backend servers, like this: -- `http://netdata.example.com/netdata/server1/` to reach `backend-server1` -- `http://netdata.example.com/netdata/server2/` to reach `backend-server2` +- `http://netdata.example.com/netdata/server1/` to reach `backend-server1` +- `http://netdata.example.com/netdata/server2/` to reach `backend-server2` ### Encrypt the communication between Nginx and Netdata -In case Netdata's web server has been [configured to use TLS](/src/web/server/README.md#enabling-tls-support), it is +In case Netdata's web server has been [configured to use TLS](/src/web/server/README.md#enable-httpstls-support), it is necessary to specify inside the Nginx configuration that the final destination is using TLS. To do this, please, append the following parameters in your `nginx.conf` -```conf +```text proxy_set_header X-Forwarded-Proto https; proxy_pass https://localhost:19999; ``` @@ -189,7 +189,7 @@ printf "yourusername:$(openssl passwd -apr1)" > /etc/nginx/passwords And then enable the authentication inside your server directive: -```conf +```text server { # ... auth_basic "Protected"; @@ -202,40 +202,35 @@ server { If your Nginx is on `localhost`, you can use this to protect your Netdata: -``` +```text [web] bind to = 127.0.0.1 ::1 ``` You can also use a unix domain socket. 
This will also provide a faster route between Nginx and Netdata: -``` +```text [web] bind to = unix:/var/run/netdata/netdata.sock ``` -*note: Netdata v1.8+ support unix domain sockets* - At the Nginx side, use something like this to use the same unix domain socket: -```conf +```text upstream backend { server unix:/var/run/netdata/netdata.sock; keepalive 64; } ``` - If your Nginx server is not on localhost, you can set: -``` +```text [web] bind to = * allow connections from = IP_OF_NGINX_SERVER ``` -*note: Netdata v1.9+ support `allow connections from`* - `allow connections from` accepts [Netdata simple patterns](/src/libnetdata/simple_pattern/README.md) to match against the connection IP address. @@ -243,7 +238,7 @@ connection IP address. Nginx logs accesses and Netdata logs them too. You can prevent Netdata from generating its access log, by setting this in `/etc/netdata/netdata.conf`: -``` +```text [logs] access = off ``` @@ -252,18 +247,18 @@ Nginx logs accesses and Netdata logs them too. You can prevent Netdata from gene By default, netdata compresses its responses. You can have nginx do that instead, with the following options in the `location /` block: -```conf - location / { - ... - gzip on; - gzip_proxied any; - gzip_types *; - } +```text +location / { + ... + gzip on; + gzip_proxied any; + gzip_types *; +} ``` To disable Netdata's gzip compression, open `netdata.conf` and in the `[web]` section put: -```conf +```text [web] enable gzip compression = no ``` @@ -278,5 +273,3 @@ If you get an 502 Bad Gateway error you might check your Nginx error log: ``` If you see something like the above, chances are high that SELinux prevents nginx from connecting to the backend server. To fix that, just use this policy: `setsebool -P httpd_can_network_connect true`. - - diff --git a/docs/netdata-agent/securing-netdata-agents.md b/docs/netdata-agent/securing-netdata-agents.md index 5232173fb..91a82c1ae 100644 --- a/docs/netdata-agent/securing-netdata-agents.md +++ b/docs/netdata-agent/securing-netdata-agents.md @@ -1,26 +1,25 @@ # Securing Netdata Agents -Netdata is a monitoring system. It should be protected, the same way you protect all your admin apps. We assume Netdata +Netdata is a monitoring system. It should be protected, the same way you protect all your admin apps. We assume Netdata will be installed privately, for your eyes only. Upon installation, the Netdata Agent serves the **local dashboard** at port `19999`. If the node is accessible to the internet at large, anyone can access the dashboard and your node's metrics at `http://NODE:19999`. We made this decision so that the local dashboard was immediately accessible to users, and so that we don't dictate how professionals set up -and secure their infrastructures. +and secure their infrastructures. -Viewers will be able to get some information about the system Netdata is running. This information is everything the dashboard -provides. The dashboard includes a list of the services each system runs (the legends of the charts under the `Systemd Services` -section), the applications running (the legends of the charts under the `Applications` section), the disks of the system and -their names, the user accounts of the system that are running processes (the `Users` and `User Groups` section of the dashboard), +Viewers will be able to get some information about the system Netdata is running. This information is everything the dashboard +provides. 
The dashboard includes a list of the services each system runs (the legends of the charts under the `Systemd Services` +section), the applications running (the legends of the charts under the `Applications` section), the disks of the system and +their names, the user accounts of the system that are running processes (the `Users` and `User Groups` section of the dashboard), the network interfaces and their names (not the IPs) and detailed information about the performance of the system and its applications. -This information is not sensitive (meaning that it is not your business data), but **it is important for possible attackers**. -It will give them clues on what to check, what to try and in the case of DDoS against your applications, they will know if they -are doing it right or not. +This information is not sensitive (meaning that it is not your business data), but **it is important for possible attackers**. +It will give them clues on what to check, what to try and in the case of DDoS against your applications, they will know if they’re doing it right or not. -Also, viewers could use Netdata itself to stress your servers. Although the Netdata daemon runs unprivileged, with the minimum -process priority (scheduling priority `idle` - lower than nice 19) and adjusts its OutOfMemory (OOM) score to 1000 (so that it -will be first to be killed by the kernel if the system starves for memory), some pressure can be applied on your systems if +Also, viewers could use Netdata itself to stress your servers. Although the Netdata daemon runs unprivileged, with the minimum +process priority (scheduling priority `idle` - lower than nice 19) and adjusts its OutOfMemory (OOM) score to 1000 (so that it +will be first to be killed by the kernel if the system starves for memory), some pressure can be applied on your systems if someone attempts a DDoS against Netdata. Instead of dictating how to secure your infrastructure, we give you many options to establish security best practices @@ -29,12 +28,12 @@ that align with your goals and your organization's standards. - [Disable the local dashboard](#disable-the-local-dashboard): **Simplest and recommended method** for those who have added nodes to Netdata Cloud and view dashboards and metrics there. -- [Expose Netdata only in a private LAN](#expose-netdata-only-in-a-private-lan). Simplest and recommended method for those who do not use Netdata Cloud. +- [Expose Netdata only in a private LAN](#expose-netdata-only-in-a-private-lan). Simplest and recommended method for those who don’t use Netdata Cloud. - [Fine-grained access control](#fine-grained-access-control): Allow local dashboard access from only certain IP addresses, such as a trusted static IP or connections from behind a management LAN. Full support for Netdata Cloud. -- [Use a reverse proxy (authenticating web server in proxy mode)](#use-an-authenticating-web-server-in-proxy-mode): Password-protect +- [Use a reverse proxy (authenticating web server in proxy mode)](#use-an-authenticating-web-server-in-proxy-mode): Password-protect a local dashboard and enable TLS to secure it. Full support for Netdata Cloud. - [Use Netdata parents as Web Application Firewalls](#use-netdata-parents-as-web-application-firewalls) @@ -46,7 +45,7 @@ that align with your goals and your organization's standards. This is the _recommended method for those who have connected their nodes to Netdata Cloud_ and prefer viewing real-time metrics using the Room Overview, Nodes tab, and Cloud dashboards. 
-You can disable the local dashboard (and API) but retain the encrypted Agent-Cloud link
+You can disable the local dashboard (and API) but retain the encrypted Agent-Cloud link
 ([ACLK](/src/aclk/README.md)) that
 allows you to stream metrics on demand from your nodes via the Netdata Cloud interface. This change mitigates all
 concerns about revealing metrics and system design to the internet at large, while keeping all the functionality you
@@ -55,64 +54,61 @@ need to view metrics and troubleshoot issues with Netdata Cloud.
 
 Open `netdata.conf` with `./edit-config netdata.conf`. Scroll down to the `[web]` section, and find the `mode =
 static-threaded` setting, and change it to `none`.
 
-```conf
+```text
 [web]
     mode = none
 ```
 
-Save and close the editor, then [restart your Agent](/packaging/installer/README.md#maintaining-a-netdata-agent-installation)
-using `sudo systemctl
-restart netdata`. If you try to visit the local dashboard to `http://NODE:19999` again, the connection will fail because
+Save and close the editor, then [restart your Agent](/docs/netdata-agent/start-stop-restart.md). If you try to visit the local dashboard at `http://NODE:19999` again, the connection will fail because
 that node no longer serves its local dashboard.
 
-> See the [configuration basics doc](/docs/netdata-agent/configuration/README.md) for details on how to find
-`netdata.conf` and use
+> See the [configuration basics doc](/docs/netdata-agent/configuration/README.md) for details on how to find `netdata.conf` and use
 > `edit-config`.
 
-If you are using Netdata with Docker, make sure to set the `NETDATA_HEALTHCHECK_TARGET` environment variable to `cli`.
-
+If you’re using Netdata with Docker, make sure to set the `NETDATA_HEALTHCHECK_TARGET` environment variable to `cli`.
 
 ## Expose Netdata only in a private LAN
 
-If your organisation has a private administration and management LAN, you can bind Netdata on this network interface on all your servers.
+If your organization has a private administration and management LAN, you can bind Netdata on this network interface on all your servers.
 
 This is done in `netdata.conf` with these settings:
 
-```
+```text
 [web]
-        bind to = 10.1.1.1:19999 localhost:19999
+        bind to = 10.1.1.1:19999 localhost:19999
 ```
 
-You can bind Netdata to multiple IPs and ports. If you use hostnames, Netdata will resolve them and use all the IPs
+You can bind Netdata to multiple IPs and ports. If you use hostnames, Netdata will resolve them and use all the IPs
 (in the above example `localhost` usually resolves to both `127.0.0.1` and `::1`).
 
-**This is the best and the suggested way to protect Netdata**. Your systems **should** have a private administration and management
+**This is the best and the suggested way to protect Netdata**. Your systems **should** have a private administration and management
 LAN, so that all management tasks are performed without any possibility of them being exposed on the internet.
 
-For cloud based installations, if your cloud provider does not provide such a private LAN (or if you use multiple providers),
-you can create a virtual management and administration LAN with tools like `tincd` or `gvpe`. These tools create a mesh VPN
+For cloud-based installations, if your cloud provider doesn’t provide such a private LAN (or if you use multiple providers),
+you can create a virtual management and administration LAN with tools like `tincd` or `gvpe`.
These tools create a mesh VPN +allowing all servers to communicate securely and privately. Your administration stations join this mesh VPN to get access to management and administration tasks on all your cloud servers. -For `gvpe` we have developed a [simple provisioning tool](https://github.com/netdata/netdata-demo-site/tree/master/gvpe) you -may find handy (it includes statically compiled `gvpe` binaries for Linux and FreeBSD, and also a script to compile `gvpe` -on your macOS system). We use this to create a management and administration LAN for all Netdata demo sites (spread all over +For `gvpe` we have developed a [simple provisioning tool](https://github.com/netdata/netdata-demo-site/tree/master/gvpe) you +may find handy (it includes statically compiled `gvpe` binaries for Linux and FreeBSD, and also a script to compile `gvpe` +on your macOS system). We use this to create a management and administration LAN for all Netdata demo sites (spread all over the internet using multiple hosting providers). ## Fine-grained access control If you want to keep using the local dashboard, but don't want it exposed to the internet, you can restrict access with -[access lists](/src/web/server/README.md#access-lists). This method also fully +[access lists](/src/web/server/README.md#access-lists). This method also fully retains the ability to stream metrics on-demand through Netdata Cloud. The `allow connections from` setting helps you allow only certain IP addresses or FQDN/hostnames, such as a trusted -static IP, only `localhost`, or connections from behind a management LAN. +static IP, only `localhost`, or connections from behind a management LAN. By default, this setting is `localhost *`. This setting allows connections from `localhost` in addition to _all_ connections, using the `*` wildcard. You can change this setting using Netdata's [simple patterns](/src/libnetdata/simple_pattern/README.md). -```conf +```text [web] # Allow only localhost connections allow connections from = localhost @@ -125,9 +121,9 @@ patterns](/src/libnetdata/simple_pattern/README.md). ``` The `allow connections from` setting is global and restricts access to the dashboard, badges, streaming, API, and -`netdata.conf`, but you can also set each of those access lists more granularly if you choose: +`netdata.conf`, but you can also set each of those access lists in more detail if you want: -```conf +```text [web] allow connections from = localhost * allow dashboard from = localhost * @@ -137,44 +133,42 @@ The `allow connections from` setting is global and restricts access to the dashb allow management from = localhost ``` -See the [web server](/src/web/server/README.md#access-lists) docs for additional details -about access lists. You can take -access lists one step further by [enabling SSL](/src/web/server/README.md#enabling-tls-support) to encrypt data from local +See the [web server](/src/web/server/README.md#access-lists) docs for additional details about access lists. You can take access lists one step further by [enabling SSL](/src/web/server/README.md#enable-httpstls-support) to encrypt data from local dashboard in transit. The connection to Netdata Cloud is always secured with TLS. ## Use an authenticating web server in proxy mode -Use one web server to provide authentication in front of **all your Netdata servers**. So, you will be accessing all your Netdata with -URLs like `http://{HOST}/netdata/{NETDATA_HOSTNAME}/` and authentication will be shared among all of them (you will sign-in once for all your servers). 
-Instructions are provided on how to set the proxy configuration to have Netdata run behind
-[nginx](/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-nginx.md),
-[HAproxy](/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-haproxy.md),
-[Apache](/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-apache.md),
-[lighthttpd](/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-lighttpd.md),
+Use one web server to provide authentication in front of **all your Netdata servers**. So, you will be accessing all your Netdata with
+URLs like `http://{HOST}/netdata/{NETDATA_HOSTNAME}/` and authentication will be shared among all of them (you will sign in once for all your servers).
+Instructions are provided on how to set the proxy configuration to have Netdata run behind
+[nginx](/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-nginx.md),
+[HAproxy](/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-haproxy.md),
+[Apache](/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-apache.md),
+[lighttpd](/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-lighttpd.md),
 [caddy](/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-caddy.md), and
 [H2O](/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/Running-behind-h2o.md).
 
 ## Use Netdata parents as Web Application Firewalls
 
-The Netdata Agents you install on your production systems do not need direct access to the Internet. Even when you use
-Netdata Cloud, you can appoint one or more Netdata Parents to act as border gateways or application firewalls, isolating
-your production systems from the rest of the world. Netdata
-Parents receive metric data from Netdata Agents or other Netdata Parents on one side, and serve most queries using their own
+The Netdata Agents you install on your production systems don’t need direct access to the Internet. Even when you use
+Netdata Cloud, you can appoint one or more Netdata Parents to act as border gateways or application firewalls, isolating
+your production systems from the rest of the world. Netdata
+Parents receive metric data from Netdata Agents or other Netdata Parents on one side, and serve most queries using their own
 copy of the data to satisfy dashboard requests on the other side.
 
-For more information see [Streaming and replication](/docs/observability-centralization-points/README.md).
+For more information, see [Streaming and replication](/docs/observability-centralization-points/README.md).
 
 ## Other methods
 
 Of course, there are many more methods you could use to protect Netdata:
 
-- Bind Netdata to localhost and use `ssh -L 19998:127.0.0.1:19999 remote.netdata.ip` to forward connections of local port 19998 to remote port 19999.
-This way you can ssh to a Netdata server and then use `http://127.0.0.1:19998/` on your computer to access the remote Netdata dashboard.
+- Bind Netdata to localhost and use `ssh -L 19998:127.0.0.1:19999 remote.netdata.ip` to forward connections of local port 19998 to remote port 19999.
+  This way you can ssh to a Netdata server and then use `http://127.0.0.1:19998/` on your computer to access the remote Netdata dashboard.
-- If you are always under a static IP, you can use the script given above to allow direct access to your Netdata servers without authentication, -from all your static IPs. +- If you’re always under a static IP, you can use the script given above to allow direct access to your Netdata servers without authentication, + from all your static IPs. -- Install all your Netdata in **headless data collector** mode, forwarding all metrics in real-time to a parent - Netdata server, which will be protected with authentication using an nginx server running locally at the parent - Netdata server. This requires more resources (you will need a bigger parent Netdata server), but does not require - any firewall changes, since all the child Netdata servers will not be listening for incoming connections. +- Install all your Netdata in **headless data collector** mode, forwarding all metrics in real-time to a parent + Netdata server, which will be protected with authentication using a nginx server running locally at the parent + Netdata server. This requires more resources (you will need a bigger parent Netdata server), but doesn’t require + any firewall changes, since all the child Netdata servers will not be listening for incoming connections. diff --git a/docs/netdata-agent/sizing-netdata-agents/README.md b/docs/netdata-agent/sizing-netdata-agents/README.md index 3ba346f7a..3880e214c 100644 --- a/docs/netdata-agent/sizing-netdata-agents/README.md +++ b/docs/netdata-agent/sizing-netdata-agents/README.md @@ -1,89 +1,85 @@ -# Sizing Netdata Agents +# Resource utilization -Netdata automatically adjusts its resources utilization based on the workload offered to it. +Netdata is designed to automatically adjust its resource consumption based on the specific workload. -This is a map of how Netdata **features impact resources utilization**: +This table shows the specific system resources affected by different Netdata features: -| Feature | CPU | RAM | Disk I/O | Disk Space | Retention | Bandwidth | -|-----------------------------:|:---:|:---:|:--------:|:----------:|:---------:|:---------:| -| Metrics collected | X | X | X | X | X | - | -| Samples collection frequency | X | - | X | X | X | - | -| Database mode and tiers | - | X | X | X | X | - | -| Machine learning | X | X | - | - | - | - | -| Streaming | X | X | - | - | - | X | +| Feature | CPU | RAM | Disk I/O | Disk Space | Network Traffic | +|------------------------:|:---:|:---:|:--------:|:----------:|:---------------:| +| Collected metrics | ✓ | ✓ | ✓ | ✓ | - | +| Sample frequency | ✓ | - | ✓ | ✓ | - | +| Database mode and tiers | - | ✓ | ✓ | ✓ | - | +| Machine learning | ✓ | ✓ | - | - | - | +| Streaming | ✓ | ✓ | - | - | ✓ | -1. **Metrics collected**: The number of metrics collected affects almost every aspect of resources utilization. +1. **Collected metrics** - When you need to lower the resources used by Netdata, this is an obvious first step. + - **Impact**: More metrics mean higher CPU, RAM, disk I/O, and disk space usage. + - **Optimization**: To reduce resource consumption, consider lowering the number of collected metrics by disabling unnecessary data collectors. -2. **Samples collection frequency**: By default Netdata collects metrics with 1-second granularity, unless the metrics collected are not updated that frequently, in which case Netdata collects them at the frequency they are updated. This is controlled per data collection job. +2. 
**Sample frequency** - Lowering the data collection frequency from every-second to every-2-seconds, will make Netdata use half the CPU utilization. So, CPU utilization is proportional to the data collection frequency. + - **Impact**: Netdata collects most metrics with 1-second granularity. This high frequency impacts CPU usage. + - **Optimization**: Lowering the sampling frequency (e.g., 1-second to 2-second intervals) can halve CPU usage. Balance the need for detailed data with resource efficiency. -3. **Database Mode and Tiers**: By default Netdata stores metrics in 3 database tiers: high-resolution, mid-resolution, low-resolution. All database tiers are updated in parallel during data collection, and depending on the query duration Netdata may consult one or more tiers to optimize the resources required to satisfy it. +3. **Database Mode** - The number of database tiers affects the memory requirements of Netdata. Going from 3-tiers to 1-tier, will make Netdata use half the memory. Of course metrics retention will also be limited to 1 tier. + - **Impact**: The default database mode, `dbengine`, compresses data and writes it to disk. + - **Optimization**: In a Parent-Child setup, switch the Child's database mode to `ram`. This eliminates disk I/O for the Child. -4. **Machine Learning**: Byt default Netdata trains multiple machine learning models for every metric collected, to learn its behavior and detect anomalies. Machine Learning is a CPU intensive process and affects the overall CPU utilization of Netdata. +4. **Database Tiers** -5. **Streaming Compression**: When using Netdata in Parent-Child configurations to create Metrics Centralization Points, the compression algorithm used greatly affects CPU utilization and bandwidth consumption. + - **Impact**: The number of database tiers directly affects memory consumption. More tiers mean higher memory usage. + - **Optimization**: The default number of tiers is 3. Choose the appropriate number of tiers based on data retention requirements. - Netdata supports multiple streaming compressions algorithms, allowing the optimization of either CPU utilization or Network Bandwidth. The default algorithm `zstd` provides the best balance among them. +5. **Machine Learning** -## Minimizing the resources used by Netdata Agents - -To minimize the resources used by Netdata Agents, we suggest to configure Netdata Parents for centralizing metric samples, and disabling most of the features on Netdata Children. This will provide minimal resources utilization at the edge, while all the features of Netdata are available at the Netdata Parents. - -The following guides provide instructions on how to do this. + - **Impact**: Machine learning model training is CPU-intensive, affecting overall CPU usage. + - **Optimization**: Consider disabling machine learning for less critical metrics or adjusting model training frequency. -## Maximizing the scale of Netdata Parents - -Netdata Parents automatically size resource utilization based on the workload they receive. The only possible option for improving query performance is to dedicate more RAM to them, by increasing their caches efficiency. - -Check [RAM Requirements](/docs/netdata-agent/sizing-netdata-agents/ram-requirements.md) for more information. +6. **Streaming Compression** -## Innovations Netdata has for optimal performance and scalability + - **Impact**: Compression algorithm choice affects CPU usage and network traffic. 
+ - **Optimization**: Select an algorithm that balances CPU efficiency with network bandwidth requirements (e.g., zstd for a good balance). -The following are some of the innovations the open-source Netdata agent has, that contribute to its excellent performance, and scalability. - -1. **Minimal disk I/O** - - When Netdata saves data on-disk, it stores them at their final place, eliminating the need to reorganize this data. - - Netdata is organizing its data structures in such a way that samples are committed to disk as evenly as possible across time, without affecting its memory requirements. +## Minimizing the resources used by Netdata Agents - Furthermore, Netdata Agents use direct-I/O for saving and loading metric samples. This prevents Netdata from polluting system caches with metric data. Netdata maintains its own caches for this data. +To optimize resource utilization, consider using a **Parent-Child** setup. - All these features make Netdata an nice partner and a polite citizen for production applications running on the same systems Netdata runs. +This approach involves centralizing the collection and processing of metrics on Parent nodes while running lightweight Children Agents on edge devices. -2. **4 bytes per sample uncompressed** +## Maximizing the scale of Parent Agents - To achieve optimal memory and disk footprint, Netdata uses a custom 32-bit floating point number. This floating point number is used to store the samples collected, together with their anomaly bit. The database of Netdata is fixed-step, so it has predefined slots for every sample, allowing Netdata to store timestamps once every several hundreds samples, minimizing both its memory requirements and the disk footprint. +Parents dynamically adjust their resource usage based on the volume of metrics received. However, for optimal query performance, you may need to dedicate more RAM. - The final disk footprint of Netdata varies due to compression efficiency. It is usually about 0.6 bytes per sample for the high-resolution tier (per-second), 6 bytes per sample for the mid-resolution tier (per-minute) and 18 bytes per sample for the low-resolution tier (per-hour). +Check [RAM Requirements](/docs/netdata-agent/sizing-netdata-agents/ram-requirements.md) for more information. -3. **Query priorities** +## Netdata's performance and scalability optimization techniques - Alerting, Machine Learning, Streaming and Replication, rely on metric queries. When multiple queries are running in parallel, Netdata assigns priorities to all of them, favoring interactive queries over background tasks. This means that queries do not compete equally for resources. Machine learning or replication may slow down when interactive queries are running and the system starves for resources. +1. **Minimal Disk I/O** -4. **A pointer per label** + Netdata directly writes metric data to disk, bypassing system caches and reducing I/O overhead. Additionally, its optimized data structures minimize disk space and memory usage through efficient compression and timestamping. - Apart from metric samples, metric labels and their cardinality is the biggest memory consumer, especially in highly ephemeral environments, like kubernetes. Netdata uses a single pointer for any label key-value pair that is reused. Keys and values are also deduplicated, providing the best possible memory footprint for metric labels. +2. **Compact Storage Engine** -5. 
**Streaming Protocol** + Netdata uses a custom 32-bit floating-point format tailored for efficient storage of time-series data, along with an anomaly bit. This, combined with a fixed-step database design, enables efficient storage and retrieval of data. - The streaming protocol of Netdata allows minimizing the resources consumed on production systems by delegating features of to other Netdata agents (Parents), without compromising monitoring fidelity or responsiveness, enabling the creation of a highly distributed observability platform. + | Tier | Approximate Sample Size (bytes) | + |-----------------------------------|---------------------------------| + | High-resolution tier (per-second) | 0.6 | + | Mid-resolution tier (per-minute) | 6 | + | Low-resolution tier (per-hour) | 18 | -## Netdata vs Prometheus + Timestamp optimization further reduces storage overhead by storing timestamps at regular intervals. -Netdata outperforms Prometheus in every aspect. -35% CPU Utilization, -49% RAM usage, -12% network bandwidth, -98% disk I/O, -75% in disk footprint for high resolution data, while providing more than a year of retention. +3. **Intelligent Query Engine** -Read the [full comparison here](https://blog.netdata.cloud/netdata-vs-prometheus-performance-analysis/). + Netdata prioritizes interactive queries over background tasks like machine learning and replication, ensuring optimal user experience, especially under heavy load. -## Energy Efficiency +4. **Efficient Label Storage** -University of Amsterdam contacted a research on the impact monitoring systems have on docker based systems. + Netdata uses pointers to reference shared label key-value pairs, minimizing memory usage, especially in highly dynamic environments. -The study found that Netdata excels in CPU utilization, RAM usage, Execution Time and concluded that **Netdata is the most energy efficient tool**. +5. **Scalable Streaming Protocol** -Read the [full study here](https://www.ivanomalavolta.com/files/papers/ICSOC_2023.pdf). + Netdata's streaming protocol enables the creation of distributed monitoring setups, where Children offload data processing to Parents, optimizing resource utilization. diff --git a/docs/netdata-agent/sizing-netdata-agents/bandwidth-requirements.md b/docs/netdata-agent/sizing-netdata-agents/bandwidth-requirements.md index 092c8da16..fbbc279d5 100644 --- a/docs/netdata-agent/sizing-netdata-agents/bandwidth-requirements.md +++ b/docs/netdata-agent/sizing-netdata-agents/bandwidth-requirements.md @@ -1,16 +1,16 @@ # Bandwidth Requirements -## On Production Systems, Standalone Netdata +## Production Systems: Standalone Netdata Standalone Netdata may use network bandwidth under the following conditions: -1. You configured data collection jobs that are fetching data from remote systems. There is no such jobs enabled by default. +1. You configured data collection jobs that are fetching data from remote systems. There are no such jobs enabled by default. 2. You use the dashboard of the Netdata. 3. [Netdata Cloud communication](#netdata-cloud-communication) (see below). -## On Metrics Centralization Points, between Netdata Children & Parents +## Metrics Centralization Points: Between Netdata Children & Parents -Netdata supports multiple compression algorithms for streaming communication. Netdata Children offer all their compression algorithms when connecting to a Netdata Parent, and the Netdata Parent decides which one to use based on algorithms availability and user configuration. 
+Netdata supports multiple compression algorithms for streaming communication. Netdata Children offer all their compression algorithms when connecting to a Netdata Parent, and the Netdata Parent decides which one to use based on algorithm availability and user configuration.
 
 | Algorithm | Best for |
 |:---------:|:-----------------------------------------------------------------------------------------------------------------------------------:|
@@ -23,7 +23,7 @@ The expected bandwidth consumption using `zstd` for 1 million samples per second
 
 The order in which compression algorithms are selected is configured in `stream.conf`, per `[API KEY]`, like this:
 
-```
+```text
     compression algorithms order = zstd lz4 brotli gzip
 ```
 
@@ -42,6 +42,6 @@ The information transferred to Netdata Cloud is:
 
 3. Information about the **metrics available and their retention**.
 4. Information about the **configured alerts and their transitions**.
 
-This is not a constant stream of information. Netdata Agents update Netdata Cloud only about status changes on all the above (e.g. an alert being triggered, or a metric stopped being collected). So, there is an initial handshake and exchange of information when Netdata starts, and then there only updates when required.
+This is not a constant stream of information. Netdata Agents update Netdata Cloud only about status changes on all the above (e.g., an alert being triggered, or a metric stopped being collected). So, there is an initial handshake and exchange of information when Netdata starts, and then there are only updates when required.
 
 Of course, when you view Netdata Cloud dashboards that need to query the database a Netdata agent maintains, this query is forwarded to an agent that can satisfy it. This means that Netdata Cloud receives metric samples only when a user is accessing a dashboard and the samples transferred are usually aggregations to allow rendering the dashboards.
diff --git a/docs/netdata-agent/sizing-netdata-agents/cpu-requirements.md b/docs/netdata-agent/sizing-netdata-agents/cpu-requirements.md
index 021a35fb2..76580b1c3 100644
--- a/docs/netdata-agent/sizing-netdata-agents/cpu-requirements.md
+++ b/docs/netdata-agent/sizing-netdata-agents/cpu-requirements.md
@@ -1,65 +1,43 @@
-# CPU Requirements
+# CPU
 
-Netdata's CPU consumption is affected by the following factors:
+Netdata's CPU usage depends on the features you enable. For details, see [resource utilization](/docs/netdata-agent/sizing-netdata-agents/README.md).
 
-1. The number of metrics collected
-2. The frequency metrics are collected
-3. Machine Learning
-4. Streaming compression (streaming of metrics to Netdata Parents)
-5. Database Mode
+## Children
 
-## On Production Systems, Netdata Children
+With default settings on Children, CPU utilization typically falls within the range of 1% to 5% of a single core. This includes the combined resource usage of:
 
-On production systems, where Netdata is running with default settings, monitoring the system it is installed at and its containers and applications, CPU utilization should usually be about 1% to 5% of a single CPU core.
+- Three database tiers for data storage.
+- Machine learning for anomaly detection.
+- Per-second data collection.
+- Alerts.
+- Streaming to a [Parent Agent](/docs/observability-centralization-points/metrics-centralization-points/README.md).
 
-This includes 3 database tiers, machine learning, per-second data collection, alerts, and streaming to a Netdata Parent.
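+As a quick sanity check on a Child, you can ask the Agent itself how much CPU it consumes, through its own REST API. This is a minimal sketch, assuming the Agent listens on `localhost:19999` and that its self-monitoring chart is named `netdata.server_cpu` (the chart name may differ between versions):
+
+```bash
+# average CPU consumed by the Netdata daemon (% of one core) over the last minute
+curl -s "http://localhost:19999/api/v1/data?chart=netdata.server_cpu&after=-60&points=1&group=average"
+```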
+## Parents -## On Metrics Centralization Points, Netdata Parents +For Netdata Parents (Metrics Centralization Points), we estimate the following CPU utilization: -On Metrics Centralization Points, Netdata Parents running on modern server hardware, we **estimate CPU utilization per million of samples collected per second**: +| Feature | Depends On | Expected Utilization (CPU cores per million) | Key Reasons | +|:--------------------:|:---------------------------------------------------:|:--------------------------------------------:|:------------------------------------------------------------------------:| +| Metrics Ingest | Number of samples received per second | 2 | Decompress and decode received messages, update database | +| Metrics re-streaming | Number of samples resent per second | 2 | Encode and compress messages towards another Parent | +| Machine Learning | Number of unique time-series concurrently collected | 2 | Train machine learning models, query existing models to detect anomalies | -| Feature | Depends On | Expected Utilization | Key Reasons | -|:-----------------:|:---------------------------------------------------:|:----------------------------------------------------------------:|:-------------------------------------------------------------------------:| -| Metrics Ingestion | Number of samples received per second | 2 CPU cores per million of samples per second | Decompress and decode received messages, update database. | -| Metrics re-streaming| Number of samples resent per second | 2 CPU cores per million of samples per second | Encode and compress messages towards Netdata Parent. | -| Machine Learning | Number of unique time-series concurrently collected | 2 CPU cores per million of unique metrics concurrently collected | Train machine learning models, query existing models to detect anomalies. | +To ensure optimal performance, keep total CPU utilization below 60% when the Parent is actively processing metrics, training models, and running health checks. -We recommend keeping the total CPU utilization below 60% when a Netdata Parent is steadily ingesting metrics, training machine learning models and running health checks. This will leave enough CPU resources available for queries. +## Increased CPU consumption on Parent startup -## I want to minimize CPU utilization. What should I do? +When a Netdata Parent starts up, it undergoes a series of initialization tasks that can temporarily increase CPU, network, and disk I/O usage: -You can control Netdata's CPU utilization with these parameters: +1. **Backfilling Higher Tiers**: The Parent calculates aggregated metrics for missing data points, ensuring consistency across different time resolutions. +2. **Metadata Synchronization**: The Parent and Children exchange metadata information about collected metrics. +3. **Data Replication**: Missing data is transferred from Children to the Parent. +4. **Normal Streaming**: Regular streaming of new metrics begins. +5. **Machine Learning Initialization**: Machine learning models are loaded and prepared for anomaly detection. +6. **Health Check Initialization**: The health engine starts monitoring metrics and triggering alerts. -1. **Data collection frequency**: Going from per-second metrics to every-2-seconds metrics will half the CPU utilization of Netdata. -2. **Number of metrics collected**: Netdata by default collects every metric available on the systems it runs. Review the metrics collected and disable data collection plugins and modules not needed. -3. 
**Machine Learning**: Disable machine learning to save CPU cycles. -4. **Number of database tiers**: Netdata updates database tiers in parallel, during data collection. This affects both CPU utilization and memory requirements. -5. **Database Mode**: The default database mode is `dbengine`, which compresses and commits data to disk. If you have a Netdata Parent where metrics are aggregated and saved to disk and there is a reliable connection between the Netdata you want to optimize and its Parent, switch to database mode `ram` or `alloc`. This disables saving to disk, so your Netdata will also not use any disk I/O. +Additional considerations: -## I see increased CPU consumption when a busy Netdata Parent starts, why? +- **Compression Optimization**: The compression algorithm learns data patterns to optimize compression ratios. +- **Database Optimization**: The database engine adjusts page sizes for efficient disk I/O. -When a Netdata Parent starts and Netdata children get connected to it, there are several operations that temporarily affect CPU utilization, network bandwidth and disk I/O. - -The general flow looks like this: - -1. **Back-filling of higher tiers**: Usually this means calculating the aggregates of the last hour of `tier2` and of the last minute of `tier1`, ensuring that higher tiers reflect all the information `tier0` has. If Netdata was stopped abnormally (e.g. due to a system failure or crash), higher tiers may have to be back-filled for longer durations. -2. **Metadata synchronization**: The metadata of all metrics each Netdata Child maintains are negotiated between the Child and the Parent and are synchronized. -3. **Replication**: If the Parent is missing samples the Child has, these samples are transferred to the Parent before transferring new samples. -4. Once all these finish, the normal **streaming of new metric samples** starts. -5. At the same time, **machine learning** initializes, loads saved trained models and prepares anomaly detection. -6. After a few moments the **health engine starts checking metrics** for triggering alerts. - -The above process is per metric. So, while one metric back-fills, another replicates and a third one streams. - -At the same time: - -- the compression algorithm learns the patterns of the data exchanged and optimizes its dictionaries for optimal compression and CPU utilization, -- the database engine adjusts the page size of each metric, so that samples are committed to disk as evenly as possible across time. - -So, when looking for the "steady CPU consumption during ingestion" of a busy Netdata Parent, we recommend to let it stabilize for a few hours before checking. - -Keep in mind that Netdata has been designed so that even if during the initialization phase and the connection of hundreds of Netdata Children the system lacks CPU resources, the Netdata Parent will complete all the operations and eventually enter a steady CPU consumption during ingestion, without affecting the quality of the metrics stored. So, it is ok if during initialization of a busy Netdata Parent, CPU consumption spikes to 100%. - -Important: the above initialization process is not such intense when new nodes get connected to a Netdata Parent for the first time (e.g. ephemeral nodes), since several of the steps involved are not required. - -Especially for the cases where children disconnect and reconnect to the Parent due to network related issues (i.e. 
both the Netdata Child and the Netdata Parent have not been restarted and less than 1 hour has passed since the last disconnection), the re-negotiation phase is minimal and metrics are instantly entering the normal streaming phase.
+These initial tasks can temporarily increase resource usage, but the impact typically diminishes as the Parent stabilizes and enters a steady-state operation.
diff --git a/docs/netdata-agent/sizing-netdata-agents/disk-requirements-and-retention.md b/docs/netdata-agent/sizing-netdata-agents/disk-requirements-and-retention.md
index 7cd9a527d..68da44000 100644
--- a/docs/netdata-agent/sizing-netdata-agents/disk-requirements-and-retention.md
+++ b/docs/netdata-agent/sizing-netdata-agents/disk-requirements-and-retention.md
@@ -12,7 +12,7 @@ Netdata offers two database modes to suit your needs for performance and data pe
 ## `dbengine`
 
 Netdata's `dbengine` mode efficiently stores data on disk using compression. The actual disk space used depends on how well the data compresses.
-This mode utilizes a tiered storage approach: data is saved in multiple tiers on disk. Each tier retains data at a different resolution (detail level). Higher tiers store a down-sampled (less detailed) version of the data found in lower tiers.
+This mode uses a tiered storage approach: data is saved in multiple tiers on disk. Each tier retains data at a different resolution (detail level). Higher tiers store a down-sampled (less detailed) version of the data found in lower tiers.
 
 ```mermaid
 gantt
     dateFormat  YYYY-MM-DD
     tickInterval 1week
     axisFormat
     tier0, 14d    :a1, 2023-12-24, 7d
     tier1, 60d    :a2, 2023-12-01, 30d
     tier2, 365d   :a3, 2023-11-02, 59d
 ```
 
-`dbengine` supports up to 5 tiers. By default, 3 tiers are used:
+`dbengine` supports up to five tiers. By default, three tiers are used:
 
 | Tier | Resolution | Uncompressed Sample Size | Usually On Disk |
 |:-------:|:--------------------------------------------------------------------------------------------:|:------------------------:|:---------------:|
@@ -40,11 +40,11 @@ gantt
 
 ## `ram`
 
-`ram` mode can help when Netdata should not introduce any disk I/O at all. In both of these modes, metric samples exist only in memory, and only while they are collected.
+`ram` mode can help when Netdata shouldn’t introduce any disk I/O at all. In this mode, metric samples exist only in memory, and only while they’re collected.
 
-When Netdata is configured to stream its metrics to a Metrics Observability Centralization Point (a Netdata Parent), metric samples are forwarded in real-time to that Netdata Parent. The ring buffers available in these modes is used to cache the collected samples for some time, in case there are network issues, or the Netdata Parent is restarted for maintenance.
+When Netdata is configured to stream its metrics to a Metrics Observability Centralization Point (a Netdata Parent), metric samples are forwarded in real-time to that Netdata Parent. The ring buffers available in this mode are used to cache the collected samples for some time, in case there are network issues, or the Netdata Parent is restarted for maintenance.
 
-The memory required per sample in these modes, is 4 bytes: `ram` mode uses `mmap()` behind the scene, and can be incremented in steps of 1024 samples (4KiB). Mode `ram` allows the use of the Linux kernel memory dedupper (Kernel-Same-Page or KSM) to deduplicate Netdata ring buffers and save memory.
+The memory required per sample in this mode is four bytes: `ram` mode uses `mmap()` behind the scenes, and can be incremented in steps of 1024 samples (4KiB).
Mode `ram` allows the use of the Linux kernel memory dedupper (Kernel-Same-Page or KSM) to deduplicate Netdata ring buffers and save memory.
 
 **Configuring ram mode and retention**:
diff --git a/docs/netdata-agent/sizing-netdata-agents/ram-requirements.md b/docs/netdata-agent/sizing-netdata-agents/ram-requirements.md
index 8d8522517..a4ccf5507 100644
--- a/docs/netdata-agent/sizing-netdata-agents/ram-requirements.md
+++ b/docs/netdata-agent/sizing-netdata-agents/ram-requirements.md
@@ -8,21 +8,21 @@ Netdata supports memory ballooning and automatically sizes and limits the memory
 
 With default settings, Netdata should run with 100MB to 200MB of RAM, depending on the number of metrics being collected.
 
-This number can be lowered by limiting the number of database tier or switching database modes. For more information check [Disk Requirements and Retention](/docs/netdata-agent/sizing-netdata-agents/disk-requirements-and-retention.md).
+This number can be lowered by limiting the number of database tiers or switching database modes. For more information, check [Disk Requirements and Retention](/docs/netdata-agent/sizing-netdata-agents/disk-requirements-and-retention.md).
 
 ## On Metrics Centralization Points, Netdata Parents
 
 The general formula, with the default configuration of database tiers, is:
 
-```
+```text
 memory = UNIQUE_METRICS x 16KiB + CONFIGURED_CACHES
 ```
 
 The default `CONFIGURED_CACHES` is 32MiB.
 
-For 1 million concurrently collected time-series (independently of their data collection frequency), the memory required is:
+For one million concurrently collected time-series (independently of their data collection frequency), the memory required is:
 
-```
+```text
 UNIQUE_METRICS = 1000000
 CONFIGURED_CACHES = 32MiB
@@ -32,16 +32,16 @@ CONFIGURED_CACHES = 32MiB
 
    about 16 GiB
 ```
 
-There are 2 cache sizes that can be configured in `netdata.conf`:
+There are two cache sizes that can be configured in `netdata.conf`:
 
-1. `[db].dbengine page cache size MB`: this is the main cache that keeps metrics data into memory. When data are not found in it, the extent cache is consulted, and if not found in that either, they are loaded from disk.
-2. `[db].dbengine extent cache size MB`: this is the compressed extent cache. It keeps in memory compressed data blocks, as they appear on disk, to avoid reading them again. Data found in the extend cache but not in the main cache have to be uncompressed to be queried.
+1. `[db].dbengine page cache size`: this is the main cache that keeps metrics data in memory. When data is not found in it, the extent cache is consulted, and if not found there either, it is loaded from disk.
+2. `[db].dbengine extent cache size`: this is the compressed extent cache. It keeps in memory compressed data blocks, as they appear on disk, to avoid reading them again. Data found in the extent cache but not in the main cache has to be uncompressed to be queried.
 
 Both of them are dynamically adjusted to use some of the total memory computed above. The configuration in `netdata.conf` allows providing additional memory to them, increasing their caching efficiency.
 
 ## I have a Netdata Parent that is also a systemd-journal logs centralization point, what should I know?
 
-Logs usually require significantly more disk space and I/O bandwidth than metrics. For optimal performance we recommend to store metrics and logs on separate, independent disks.
+Logs usually require significantly more disk space and I/O bandwidth than metrics.
For optimal performance, we recommend storing metrics and logs on separate, independent disks.
 
 Netdata uses direct-I/O for its database, so that it does not pollute the system caches with its own data. We want Netdata to be a nice citizen when it runs side-by-side with production applications, so this was required to guarantee that Netdata does not affect the operation of databases or other sensitive applications running on the same servers.
 
@@ -49,9 +49,9 @@ To optimize disk I/O, Netdata maintains its own private caches. The default sett
 `systemd-journal` on the other hand, relies on operating system caches for improving the query performance of logs. When the system lacks free memory, querying logs leads to increased disk I/O.
 
-If you are experiencing slow responses and increased disk reads when metrics queries run, we suggest to dedicate some more RAM to Netdata.
+If you are experiencing slow responses and increased disk reads when metrics queries run, we suggest dedicating some more RAM to Netdata.
 
-We frequently see that the following strategy gives best results:
+We frequently see that the following strategy gives the best results:
 
 1. Start the Netdata Parent, send all the load you expect it to have and let it stabilize for a few hours. Netdata will now use the minimum memory it believes is required for smooth operation.
 2. Check the available system memory.
diff --git a/docs/netdata-agent/start-stop-restart.md b/docs/netdata-agent/start-stop-restart.md
index 6fbe18d31..21bf443a0 100644
--- a/docs/netdata-agent/start-stop-restart.md
+++ b/docs/netdata-agent/start-stop-restart.md
@@ -1,30 +1,24 @@
-# Start, stop, or restart the Netdata Agent
+# Service Control
 
-When you install the Netdata Agent, the [daemon](/src/daemon/README.md) is
-configured to start at boot and stop and restart/shutdown.
+The Netdata Agent automatically starts at boot after installation.
 
-You will most often need to _restart_ the Agent to load new or editing configuration files.
-[Health configuration](#reload-health-configuration) files are the only exception, as they can be reloaded without restarting
-the entire Agent.
+> In most cases, you need to **restart the Netdata service** to apply changes to configuration files. Health configuration files, which define alerts, are an exception. They can be [reloaded](#reload-health) **without restarting**.
+>
+> Restarting the Netdata Agent will cause temporary gaps in your collected metrics. This occurs while the `netdata` process reinitializes its data collectors and database engine.
 
-Stopping or restarting the Netdata Agent will cause gaps in stored metrics until the `netdata` process initiates
-collectors and the database engine.
+## UNIX
 
-## Using `systemctl`, `service`, or `init.d`
+### Using `systemctl`, `service`, or `init.d`
 
-This is the recommended way to start, stop, or restart the Netdata daemon.
+| Action  | Systemd                          | Non-systemd                    |
+|---------|----------------------------------|--------------------------------|
+| start   | `sudo systemctl start netdata`   | `sudo service netdata start`   |
+| stop    | `sudo systemctl stop netdata`    | `sudo service netdata stop`    |
+| restart | `sudo systemctl restart netdata` | `sudo service netdata restart` |
 
-- To **start** Netdata, run `sudo systemctl start netdata`.
-- To **stop** Netdata, run `sudo systemctl stop netdata`.
-- To **restart** Netdata, run `sudo systemctl restart netdata`.
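+For example, on a systemd system you can verify that the service actually came back after a restart (a quick check using standard systemd tools; nothing here is Netdata-specific):
+
+```bash
+sudo systemctl restart netdata
+sudo systemctl status netdata --no-pager         # should report "active (running)"
+journalctl -u netdata --since "5 minutes ago"    # recent daemon logs, useful if startup fails
+```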
+### Using `netdata` -If the above commands fail, or you know that you're using a non-systemd system, try using the `service` command: - -- **service**: `sudo service netdata start`, `sudo service netdata stop`, `sudo service netdata restart` - -## Using `netdata` - -Use the `netdata` command, typically located at `/usr/sbin/netdata`, to start the Netdata daemon. +Use the `netdata` command, typically located at `/usr/sbin/netdata`, to start the Netdata daemon. ```bash sudo netdata @@ -32,122 +26,30 @@ sudo netdata If you start the daemon this way, close it with `sudo killall netdata`. -## Using `netdatacli` +### Using `netdatacli` -The Netdata Agent also comes with a [CLI tool](/src/cli/README.md) capable of performing shutdowns. Start the Agent back up -using your preferred method listed above. +The Netdata Agent also comes with a [CLI tool](/src/cli/README.md) capable of performing shutdowns. Start the Agent back up using your preferred method listed above. ```bash sudo netdatacli shutdown-agent ``` -## Netdata MSI installations - -Netdata provides an installer for Windows using WSL, on those installations by using a Windows terminal (e.g. the Command prompt or Windows Powershell) you can: - -- Start Netdata, by running `start-netdata` -- Stop Netdata, by running `stop-netdata` -- Restart Netdata, by running `restart-netdata` +### Reload health -## Reload health configuration - -You do not need to restart the Netdata Agent between changes to health configuration files, such as specific health -entities. Instead, use [`netdatacli`](#using-netdatacli) and the `reload-health` option to prevent gaps in metrics -collection. +No need to restart the Netdata Agent after modifying health configuration files (alerts). Use `netdatacli` to avoid metric collection gaps. ```bash sudo netdatacli reload-health ``` -If `netdatacli` doesn't work on your system, send a `SIGUSR2` signal to the daemon, which reloads health configuration -without restarting the entire process. - -```bash -killall -USR2 netdata -``` - -## Force stop stalled or unresponsive `netdata` processes - -In rare cases, the Netdata Agent may stall or not properly close sockets, preventing a new process from starting. In -these cases, try the following three commands: - -```bash -sudo systemctl stop netdata -sudo killall netdata -ps aux| grep netdata -``` - -The output of `ps aux` should show no `netdata` or associated processes running. You can now start the Netdata Agent -again with `service netdata start`, or the appropriate method for your system. - -## Starting Netdata at boot - -In the `system` directory you can find scripts and configurations for the -various distros. - -### systemd - -The installer already installs `netdata.service` if it detects a systemd system. - -To install `netdata.service` by hand, run: - -```sh -# stop Netdata -killall netdata - -# copy netdata.service to systemd -cp system/netdata.service /etc/systemd/system/ - -# let systemd know there is a new service -systemctl daemon-reload - -# enable Netdata at boot -systemctl enable netdata - -# start Netdata -systemctl start netdata -``` - -### init.d - -In the system directory you can find `netdata-lsb`. Copy it to the proper place according to your distribution -documentation. For Ubuntu, this can be done via running the following commands as root. 
-
-```sh
-# copy the Netdata startup file to /etc/init.d
-cp system/netdata-lsb /etc/init.d/netdata
-
-# make sure it is executable
-chmod +x /etc/init.d/netdata
-
-# enable it
-update-rc.d netdata defaults
-```
-
-### openrc (gentoo)
-
-In the `system` directory you can find `netdata-openrc`. Copy it to the proper
-place according to your distribution documentation.
-
-### CentOS / Red Hat Enterprise Linux
-
-For older versions of RHEL/CentOS that don't have systemd, an init script is included in the system directory. This can
-be installed by running the following commands as root.
-
-```sh
-# copy the Netdata startup file to /etc/init.d
-cp system/netdata-init-d /etc/init.d/netdata
-
-# make sure it is executable
-chmod +x /etc/init.d/netdata
-
-# enable it
-chkconfig --add netdata
-```
+## Windows
 
-_There have been some recent work on the init script, see PR
-_
+> **Note**
+>
+> You will need to run PowerShell as administrator.
 
-### other systems
+- To **start** Netdata, run `Start-Service Netdata`.
+- To **stop** Netdata, run `Stop-Service Netdata`.
+- To **restart** Netdata, run `Restart-Service Netdata`.
 
-You can start Netdata by running it from `/etc/rc.local` or equivalent.
+If you prefer to manage the Agent through the GUI, you can start-stop and restart the `Netdata` service from the "Services" tab of Task Manager.
diff --git a/docs/netdata-agent/versions-and-platforms.md b/docs/netdata-agent/versions-and-platforms.md
index 14dc393b5..1f5bf6a97 100644
--- a/docs/netdata-agent/versions-and-platforms.md
+++ b/docs/netdata-agent/versions-and-platforms.md
@@ -1,6 +1,6 @@
 # Netdata Agent Versions & Platforms
 
-Netdata is evolving rapidly and new features are added at a constant pace. Therefore we have a frequent release cadence to deliver all these features to use as soon as possible.
+Netdata is evolving rapidly and new features are added at a constant pace. Therefore, we have a frequent release cadence to deliver all these features to users as soon as possible.
 
 Netdata Agents are available in 2 versions:
 
 | Stable  | At most once per month, usually every 45 days | Receiving bug fixes and security updates between releases | Up to the 2nd stable release after them  | Previous configuration semantics and data are supported by newer releases         |
 | Nightly | Every night at 00:00 UTC                      | Latest pre-released features                              | Up to the 2nd nightly release after them | Configuration and data of unreleased features may change between nightly releases |
 
-> "Support Duration" defines the time we consider the release as actively used by users in production systems, so that all features of Netdata should be working like the day they were released. However, after the latest release, previous releases stop receiving bug fixes and security updates. All users are advised to update to the latest release to get the latest bug fixes.
+> "Support Duration" defines the time we consider the release as actively used by users in production systems, so that all features of Netdata should be working like the day they were released. However, after the latest release, previous releases stop receiving bug fixes and security updates. All users are advised to update to the latest release to get the latest bug fixes.
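+Not sure which release you are currently running? The quickest check is to ask the binary itself (a small sketch; the exact output format varies between releases):
+
+```bash
+netdata -v    # stable builds print a plain tag like "netdata v1.44.0"; nightlies carry a longer pre-release suffix
+```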
## Binary Distribution Packages -Binary distribution packages are provided by Netdata, via CI integration, for the following platforms and architectures: +Binary distribution packages are provided by Netdata, via CI integration, for the following platforms and architectures: | Platform | Platform Versions | Released Packages Architecture | Format | |:-----------------------:|:--------------------------------:|:------------------------------------------------:|:------------:| @@ -30,7 +30,7 @@ Binary distribution packages are provided by Netdata, via CI integration, for th | Redhat Enterprise Linux | 8.x, 9.x | `x86_64`, `AArch64` | RPM | | Ubuntu | 20.04, 22.04, 23.10 | `x86_64`, `i386`, `ARMv7`, `AArch64` | DEB | -> IMPORTANT: Linux distributions frequently provide binary packages of Netdata. However, the packages you will find in the distributions' repositories may be outdated, incomplete, missing significant features or completely broken. We recommend using the packages we provide. +> IMPORTANT: Linux distributions frequently provide binary packages of Netdata. However, the packages you will find in the distributions' repositories may be outdated, incomplete, missing significant features or completely broken. We recommend using the packages we provide. ## Third-party Supported Binary Packages @@ -41,7 +41,6 @@ The following distributions always provide the latest stable version of Netdata: | Arch Linux | Latest | All the Arch supported architectures | | MacOS Brew | Latest | All the Brew supported architectures | - ## Builds from Source We guarantee Netdata builds from source for the platforms we provide automated binary packages. These platforms are automatically checked via our CI, and fixes are always applied to allow merging new code into the nightly versions. @@ -59,9 +58,9 @@ The following builds from source should usually work, although we don't regularl ## Static Builds and Unsupported Linux Versions -The static builds of Netdata can be used on any Linux platform of the supported architectures. The only requirement these static builds have is a working Linux kernel, any version. Everything else required for Netdata to run, is inside the package itself. +The static builds of Netdata can be used on any Linux platform of the supported architectures. The only requirement these static builds have is a working Linux kernel, any version. Everything else required for Netdata to run is inside the package itself. -Static builds usually miss certain features that require operating-system support and cannot be provided in a generic way. These features include: +Static builds usually miss certain features that require operating-system support and can’t be provided generically. 
These features include:
 
 - IPMI hardware sensors support
 - systemd-journal features
diff --git a/docs/netdata-assistant.md b/docs/netdata-assistant.md
index afa13f6e9..e01aa2774 100644
--- a/docs/netdata-assistant.md
+++ b/docs/netdata-assistant.md
@@ -7,14 +7,14 @@ The Netdata Assistant is a feature that uses large language models and the Netda
 - Navigate to the alerts tab
 - If there are active alerts, the `Actions` column will have an Assistant button
 
-  ![](https://github-production-user-asset-6210df.s3.amazonaws.com/24860547/253559075-815ca123-e2b6-4d44-a780-eeee64cca420.png)
+  ![actions column](https://github-production-user-asset-6210df.s3.amazonaws.com/24860547/253559075-815ca123-e2b6-4d44-a780-eeee64cca420.png)
 
 - Clicking on the Assistant button opens up a floating window with customized information and troubleshooting tips for this alert (note that the window can follow you through your troubleshooting journey on Netdata dashboards)
 
-  ![](https://github-production-user-asset-6210df.s3.amazonaws.com/24860547/253559645-62850c7b-cd1d-45f2-b2dd-474ecbf2b713.png)
+  ![Netdata Assistant popup](https://github-production-user-asset-6210df.s3.amazonaws.com/24860547/253559645-62850c7b-cd1d-45f2-b2dd-474ecbf2b713.png)
 
-- In case you need more information, or want to understand deeper, Netdata Assistant also provides useful web links to resources that can help.
+- In case you need more information or want to dig deeper, Netdata Assistant also provides useful web links to resources that can help.
 
-  ![](https://github-production-user-asset-6210df.s3.amazonaws.com/24860547/253560071-e768fa6d-6c9a-4504-bb1f-17d5f4707627.png)
+  ![useful resources](https://github-production-user-asset-6210df.s3.amazonaws.com/24860547/253560071-e768fa6d-6c9a-4504-bb1f-17d5f4707627.png)
 
 - If there are no active alerts, you can still use Netdata Assistant by clicking the Assistant button on the Alert Configuration view.
diff --git a/docs/netdata-cloud/authentication-and-authorization/api-tokens.md b/docs/netdata-cloud/authentication-and-authorization/api-tokens.md
index 88b73ee68..a8f304ffb 100644
--- a/docs/netdata-cloud/authentication-and-authorization/api-tokens.md
+++ b/docs/netdata-cloud/authentication-and-authorization/api-tokens.md
@@ -30,5 +30,5 @@ Currently, the Netdata Cloud is not exposing stable API.
 * get the cloud space list
 
 ```console
-$ curl -H 'Accept: application/json' -H "Authorization: Bearer " https://app.netdata.cloud/api/v2/spaces
+curl -H 'Accept: application/json' -H "Authorization: Bearer " https://app.netdata.cloud/api/v2/spaces
 ```
diff --git a/docs/netdata-cloud/authentication-and-authorization/enterprise-sso-authentication.md b/docs/netdata-cloud/authentication-and-authorization/enterprise-sso-authentication.md
index 7657e8bcf..184ff5db9 100644
--- a/docs/netdata-cloud/authentication-and-authorization/enterprise-sso-authentication.md
+++ b/docs/netdata-cloud/authentication-and-authorization/enterprise-sso-authentication.md
@@ -1,36 +1,47 @@
 # Enterprise SSO Authentication
 
 Netdata provides you with means to streamline and control how your team connects and authenticates to Netdata Cloud. We provide
- diferent Single Sign-On (SSO) integrations that allow you to connect with the tool that your organization is using to manage your
+ different Single Sign-On (SSO) integrations that allow you to connect with the tool that your organization is using to manage your
 user accounts.
- > ❗ This feature focus is on the Authentication flow, it doesn't support the Authorization with managing Users and Roles.
-
+ > **Note** This feature focuses on the authentication flow; it doesn't cover authorization, such as managing users and roles.

## How to set it up?

If you want to set up your Netdata Space to allow user Authentication through an Enterprise SSO tool, you need to:
-* Confirm the integration to the tool you want is available ([Authentication integations](https://learn.netdata.cloud/docs/netdata-cloud/authentication-&-authorization/cloud-authentication-&-authorization-integrations))
+
+* Confirm the integration to the tool you want is available ([Authentication integrations](https://learn.netdata.cloud/docs/netdata-cloud/authentication-&-authorization/cloud-authentication-&-authorization-integrations))
* Have a Netdata Cloud account
* Have Access to the Space as an administrator
* Your Space needs to be on the Business plan or higher

Once you ensure the above prerequisites, you need to:
+
1. Click on the Space settings cog (located above your profile icon)
2. Click on the Authentication tab
3. Select the card for the integration you are looking for and click on Configure
4. Fill in the required attributes needed to establish the integration with the tool

-
## How to authenticate to Netdata?

### From Netdata Sign-up page

-If you're starting your flow from Netdata sign-in page you need to:
-1. Click on the link `Sign-in with an Enterprise Signle Sign-On (SSO)`
-2. Enter your email address
-3. Go to your mailbox and check the `Sign In to Nedata` email that you have received
-4. Click on the **Sign In** button
+#### Requirements
+
+You have to update your DNS settings by adding a TXT record with the Netdata verification code as its **Value**.
+The **Value** can be found by clicking the **DNS TXT record** button in your space settings under **User Management**, in the **Authentication & Authorization** tab.
+
+Log into your domain provider’s website and navigate to the DNS records section.
+Create a new TXT record with the following specifications:
+- Value/Answer/Description: `"netdata-verification=[VERIFICATION CODE]"`
+- Name/Host/Alias: Leave this blank or type @ to include a subdomain.
+- Time to live (TTL): "86400" (this can also be inherited from the default configuration).
+
+#### Starting the flow from Netdata sign-in page
+
+1. Click on the link `Sign-in with an Enterprise Single Sign-On (SSO)`
+2. Enter your email address
+3. Complete the SSO flow

Note: If you're not authenticated on the Enterprise SSO tool, you'll be prompted to authenticate there first before being allowed to proceed to Netdata Cloud. 
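Before starting the sign-in flow, it can help to confirm that the TXT record described above has propagated. A minimal check, assuming `example.com` stands in for your own domain:

```bash
# List the domain's TXT records and look for the Netdata verification string.
# Propagation can take up to the configured TTL (86400 seconds).
dig +short TXT example.com | grep netdata-verification
```

If nothing is printed, wait for propagation or re-check the record's Name/Host field with your DNS provider.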
diff --git a/docs/netdata-cloud/authentication-and-authorization/role-based-access-model.md b/docs/netdata-cloud/authentication-and-authorization/role-based-access-model.md index d2a3ea4f2..2226a1a0d 100644 --- a/docs/netdata-cloud/authentication-and-authorization/role-based-access-model.md +++ b/docs/netdata-cloud/authentication-and-authorization/role-based-access-model.md @@ -108,9 +108,9 @@ In more detail, you can find on the following tables which functionalities are a | **Functionality** | **Admin** | **Manager** | **Troubleshooter** | **Observer** | **Billing** | **Member** | Notes | |:-------------------------------|:------------------:|:------------------:|:------------------:|:------------------:|:-----------:|:------------------:|:---------------------------------------------------------------------| -| See all functions in Room | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | - | :heavy_check_mark: | -| Run any function in Room | :heavy_check_mark: | :heavy_check_mark: | - | - | - | - | -| Run read-only function in Room | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | - | :heavy_check_mark: | | +| See all functions in Room | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | - | :heavy_check_mark: | :: | +| Run any function in Room | :heavy_check_mark: | :heavy_check_mark: | - | - | - | - | :: | +| Run read-only function in Room | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | - | :heavy_check_mark: | :: | | Run sensitive function in Room | :heavy_check_mark: | :heavy_check_mark: | - | - | - | - | There isn't any function on this category yet, so subject to change. | ### Events feed diff --git a/docs/netdata-cloud/netdata-cloud-on-prem/installation.md b/docs/netdata-cloud/netdata-cloud-on-prem/installation.md index 259ddb5ce..a23baa99c 100644 --- a/docs/netdata-cloud/netdata-cloud-on-prem/installation.md +++ b/docs/netdata-cloud/netdata-cloud-on-prem/installation.md @@ -10,6 +10,20 @@ The following components are required to install Netdata Cloud On-Prem: - **Helm** version 3.12+ with OCI Configuration (explained in the installation section) - **Kubectl** +The minimum requirements for Netdata-Cloud are: + +- 4 CPU cores +- 15GiB of memory +- Cloud services are ephemeral + +The requirements for the non-production Dependencies helm chart: + +- 8 CPU cores +- 14GiB of memory +- 160GiB for PVCs (SSD) + +> **_NOTE:_** Values for each component may vary depending on the type of load. The most compute-intensive task that the On-Prem needs to perform is the initial sync of directly connected Agents. The testing for these requirements was conducted with 1,000 nodes directly connected to the On-Prem. If you plan on spawning hundreds of new nodes within a few minutes, Postgres will be the first bottleneck. For example, a 2 vCPU / 8 GiB memory / 1k IOPS database can handle 1,000 nodes without any problems if your environment is fairly steady, adding nodes in batches of 10-30 (directly connected). + ## Preparations for Installation ### Configure AWS CLI @@ -103,39 +117,40 @@ helm upgrade --wait --install netdata-cloud-onprem -n netdata-cloud --create-nam ## Short description of Netdata Cloud microservices -#### cloud-accounts-service +### cloud-accounts-service Responsible for user registration & authentication. Manages user account information. 
-#### cloud-agent-data-ctrl-service +### cloud-agent-data-ctrl-service Forwards request from the cloud to the relevant agents. The requests include: + - Fetching chart metadata from the agent - Fetching chart data from the agent - Fetching function data from the agent -#### cloud-agent-mqtt-input-service +### cloud-agent-mqtt-input-service Forwards MQTT messages emitted by the agent related to the agent entities to the internal Pulsar broker. These include agent connection state updates. -#### cloud-agent-mqtt-output-service +### cloud-agent-mqtt-output-service Forwards Pulsar messages emitted in the cloud related to the agent entities to the MQTT broker. From there, the messages reach the relevant agent. -#### cloud-alarm-config-mqtt-input-service +### cloud-alarm-config-mqtt-input-service Forwards MQTT messages emitted by the agent related to the alarm-config entities to the internal Pulsar broker. These include the data for the alarm configuration as seen by the agent. -#### cloud-alarm-log-mqtt-input-service +### cloud-alarm-log-mqtt-input-service Forwards MQTT messages emitted by the agent related to the alarm-log entities to the internal Pulsar broker. These contain data about the alarm transitions that occurred in an agent. -#### cloud-alarm-mqtt-output-service +### cloud-alarm-mqtt-output-service Forwards Pulsar messages emitted in the cloud related to the alarm entities to the MQTT broker. From there, the messages reach the relevant agent. -#### cloud-alarm-processor-service +### cloud-alarm-processor-service Persists latest alert statuses received from the agent in the cloud. Aggregates alert statuses from relevant node instances. @@ -143,69 +158,69 @@ Exposes API endpoints to fetch alert data for visualization on the cloud. Determines if notifications need to be sent when alert statuses change and emits relevant messages to Pulsar. Exposes API endpoints to store and return notification-silencing data. -#### cloud-alarm-streaming-service +### cloud-alarm-streaming-service Responsible for starting the alert stream between the agent and the cloud. Ensures that messages are processed in the correct order, and starts a reconciliation process between the cloud and the agent if out-of-order processing occurs. -#### cloud-charts-mqtt-input-service +### cloud-charts-mqtt-input-service Forwards MQTT messages emitted by the agent related to the chart entities to the internal Pulsar broker. These include the chart metadata that is used to display relevant charts on the cloud. -#### cloud-charts-mqtt-output-service +### cloud-charts-mqtt-output-service Forwards Pulsar messages emitted in the cloud related to the charts entities to the MQTT broker. From there, the messages reach the relevant agent. -#### cloud-charts-service +### cloud-charts-service Exposes API endpoints to fetch the chart metadata. Forwards data requests via the `cloud-agent-data-ctrl-service` to the relevant agents to fetch chart data points. Exposes API endpoints to call various other endpoints on the agent, for instance, functions. -#### cloud-custom-dashboard-service +### cloud-custom-dashboard-service Exposes API endpoints to fetch and store custom dashboard data. -#### cloud-environment-service +### cloud-environment-service Serves as the first contact point between the agent and the cloud. Returns authentication and MQTT endpoints to connecting agents. -#### cloud-feed-service +### cloud-feed-service Processes incoming feed events and stores them in Elasticsearch. 
Exposes API endpoints to fetch feed events from Elasticsearch.

-#### cloud-frontend
+### cloud-frontend

Contains the on-prem cloud website. Serves static content.

-#### cloud-iam-user-service
+### cloud-iam-user-service

Acts as middleware for authentication on most of the API endpoints. Validates incoming token headers, injects the
relevant ones, and forwards the requests.

-#### cloud-metrics-exporter
+### cloud-metrics-exporter

Exports various metrics from an On-Prem Cloud installation. Uses the Prometheus metric exposition format.

-#### cloud-netdata-assistant
+### cloud-netdata-assistant

Exposes API endpoints to fetch a human-friendly explanation of various netdata configuration options, most notably the
alerts.

-#### cloud-node-mqtt-input-service
+### cloud-node-mqtt-input-service

Forwards MQTT messages emitted by the agent related to the node entities to the internal Pulsar broker. These include
the node metadata as well as their connectivity state, either direct or via parents.

-#### cloud-node-mqtt-output-service
+### cloud-node-mqtt-output-service

Forwards Pulsar messages emitted in the cloud related to the charts entities to the MQTT broker. From there, the
messages reach the relevant agent.

-#### cloud-notifications-dispatcher-service
+### cloud-notifications-dispatcher-service

Exposes API endpoints to handle integrations.
Handles incoming notification messages and uses the relevant channels (email, Slack, ...) to notify relevant users.

-#### cloud-spaceroom-service
+### cloud-spaceroom-service

Exposes API endpoints to fetch and store relations between agents, nodes, spaces, users, and rooms.
Acts as a provider of authorization for other cloud endpoints.

diff --git a/docs/netdata-cloud/versions.md b/docs/netdata-cloud/versions.md
index 06a8f706a..1bfd363d6 100644
--- a/docs/netdata-cloud/versions.md
+++ b/docs/netdata-cloud/versions.md
@@ -4,7 +4,7 @@ Netdata Cloud is provided in two versions:

- **SaaS**, we run and maintain Netdata Cloud and users use it to complement their observability with the additional features it provides.

-- **On Prem**, we provide a licensed copy of the Netdata Cloud software, that users can install and run at their premises.
+- **On Prem**, we provide a licensed copy of the Netdata Cloud software that users can install and run on their premises.

The pricing of both versions is similar, with the On-Prem version introducing a monthly fixed fee for the extra support and packaging required when users are running Netdata Cloud by themselves. 
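For the On-Prem version, once the `helm upgrade --install` step shown earlier completes, it is worth confirming that the microservices described above are healthy before pointing agents at the installation. A quick sketch, assuming the `netdata-cloud` namespace used in that command:

```bash
# All Netdata Cloud On-Prem microservices should eventually report Running/Ready.
kubectl get pods -n netdata-cloud

# When a pod stays in Pending or CrashLoopBackOff, recent events usually explain why.
kubectl get events -n netdata-cloud --sort-by=.metadata.creationTimestamp | tail -n 20
```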
diff --git a/docs/observability-centralization-points/logs-centralization-points-with-systemd-journald/active-journal-source-without-encryption.md b/docs/observability-centralization-points/logs-centralization-points-with-systemd-journald/active-journal-source-without-encryption.md index cbed1e81e..8abccad01 100644 --- a/docs/observability-centralization-points/logs-centralization-points-with-systemd-journald/active-journal-source-without-encryption.md +++ b/docs/observability-centralization-points/logs-centralization-points-with-systemd-journald/active-journal-source-without-encryption.md @@ -47,7 +47,7 @@ sudo systemctl enable --now systemd-journal-gatewayd.socket To use it, open your web browser and navigate to: -``` +```url http://server.ip:19531/browse ``` diff --git a/docs/observability-centralization-points/logs-centralization-points-with-systemd-journald/passive-journal-centralization-with-encryption-using-self-signed-certificates.md b/docs/observability-centralization-points/logs-centralization-points-with-systemd-journald/passive-journal-centralization-with-encryption-using-self-signed-certificates.md index 7f0b7152e..8509a33da 100644 --- a/docs/observability-centralization-points/logs-centralization-points-with-systemd-journald/passive-journal-centralization-with-encryption-using-self-signed-certificates.md +++ b/docs/observability-centralization-points/logs-centralization-points-with-systemd-journald/passive-journal-centralization-with-encryption-using-self-signed-certificates.md @@ -26,7 +26,7 @@ This helps to also automate the distribution of the certificates to your servers We suggest to keep this script and all the involved certificates at the journals centralization server, in the directory `/etc/ssl/systemd-journal`, so that you can make future changes as required. If you prefer to keep the certificate authority and all the certificates at a more secure location, just use the script on that location. 
-On the server that will issue the certificates (usually the centralizaton server), do the following: +On the server that will issue the certificates (usually the centralization server), do the following: ```bash # install systemd-journal-remote to add the users and groups required and openssl for the certs @@ -150,7 +150,7 @@ sudo apt-get install systemd-journal-remote Edit `/etc/systemd/journal-upload.conf` and set the IP address and the port of the server, like so: -```conf +```text [Upload] URL=https://centralization.server.ip:19532 ``` @@ -165,7 +165,7 @@ sudo systemctl edit systemd-journal-upload.service At the top, add: -```conf +```text [Service] Restart=always ``` diff --git a/docs/observability-centralization-points/logs-centralization-points-with-systemd-journald/passive-journal-centralization-without-encryption.md b/docs/observability-centralization-points/logs-centralization-points-with-systemd-journald/passive-journal-centralization-without-encryption.md index b70c22033..a89379e4b 100644 --- a/docs/observability-centralization-points/logs-centralization-points-with-systemd-journald/passive-journal-centralization-without-encryption.md +++ b/docs/observability-centralization-points/logs-centralization-points-with-systemd-journald/passive-journal-centralization-without-encryption.md @@ -74,7 +74,7 @@ sudo apt-get install systemd-journal-remote Edit `/etc/systemd/journal-upload.conf` and set the IP address and the port of the server, like so: -```conf +```text [Upload] URL=http://centralization.server.ip:19532 ``` @@ -87,7 +87,7 @@ sudo systemctl edit systemd-journal-upload At the top, add: -```conf +```text [Service] Restart=always ``` diff --git a/docs/observability-centralization-points/metrics-centralization-points/clustering-and-high-availability-of-netdata-parents.md b/docs/observability-centralization-points/metrics-centralization-points/clustering-and-high-availability-of-netdata-parents.md index 17a10b02e..412263beb 100644 --- a/docs/observability-centralization-points/metrics-centralization-points/clustering-and-high-availability-of-netdata-parents.md +++ b/docs/observability-centralization-points/metrics-centralization-points/clustering-and-high-availability-of-netdata-parents.md @@ -45,6 +45,6 @@ The easiest way is to `rsync` the directory `/var/cache/netdata` from the existi To configure retention at the new Netdata Parent, set in `netdata.conf` the following to at least the values the old Netdata Parent has: -- `[db].dbengine multihost disk space MB`, this is the max disk size for `tier0`. The default is 256MiB. -- `[db].dbengine tier 1 multihost disk space MB`, this is the max disk space for `tier1`. The default is 50% of `tier0`. -- `[db].dbengine tier 2 multihost disk space MB`, this is the max disk space for `tier2`. The default is 50% of `tier1`. +- `[db].dbengine tier 0 retention size`, this is the max disk size for `tier0`. The default is 1GiB. +- `[db].dbengine tier 1 retention size`, this is the max disk space for `tier1`. The default is 1GiB. +- `[db].dbengine tier 2 retention size`, this is the max disk space for `tier2`. The default is 1GiB. 
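To make the retention settings above concrete, the `[db]` section of `netdata.conf` on the new Netdata Parent could look like the sketch below. The sizes are illustrative placeholders; use at least the values of the old parent:

```text
[db]
    # match or exceed the old parent's retention, per tier
    dbengine tier 0 retention size = 10GiB
    dbengine tier 1 retention size = 5GiB
    dbengine tier 2 retention size = 2GiB
```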
diff --git a/docs/observability-centralization-points/metrics-centralization-points/configuration.md b/docs/observability-centralization-points/metrics-centralization-points/configuration.md
index bf2aa98db..d1f13f050 100644
--- a/docs/observability-centralization-points/metrics-centralization-points/configuration.md
+++ b/docs/observability-centralization-points/metrics-centralization-points/configuration.md
@@ -58,7 +58,7 @@ Save the file and restart Netdata.

While encrypting the connection between your parent and child nodes is recommended for security, it's not required to get started.

-This example uses self-signed certificates. 
+This example uses self-signed certificates.

> **Note**
> This section assumes you have read the documentation on [how to edit the Netdata configuration files](/docs/netdata-agent/configuration/README.md).

@@ -70,7 +70,7 @@ This example uses self-signed certificates.

 2. **Child node** Update `stream.conf` to enable SSL/TLS and allow self-signed certificates. Append ':SSL' to the destination and uncomment 'ssl skip certificate verification'.

-   ```conf
+   ```text
    [stream]
      enabled = yes
      destination = 203.0.113.0:SSL
@@ -80,8 +80,6 @@ This example uses self-signed certificates.

 3. Restart the Netdata Agent on both the parent and child nodes, to stream encrypted metrics using TLS/SSL.

-
-
 ## Troubleshooting Streaming Connections

 You can find any issues related to streaming in the Netdata logs.
diff --git a/docs/observability-centralization-points/metrics-centralization-points/faq.md b/docs/observability-centralization-points/metrics-centralization-points/faq.md
index 027dfc748..1ce0d8534 100644
--- a/docs/observability-centralization-points/metrics-centralization-points/faq.md
+++ b/docs/observability-centralization-points/metrics-centralization-points/faq.md
@@ -65,6 +65,14 @@ It depends on the ephemerality setting of each Netdata Child.

2. **Ephemeral nodes**: These are nodes that are ephemeral by nature and they may shut down at any point in time without any impact on the services you run.

-To set the ephemeral flag on a node, edit its netdata.conf and in the `[health]` section set `is ephemeral = yes`. This setting is propagated to parent nodes and Netdata Cloud.
+To set the ephemeral flag on a node, edit its netdata.conf and in the `[global]` section set `is ephemeral node = yes`. This setting is propagated to parent nodes and Netdata Cloud.
+
+A parent node tracks connections and disconnections. When a node is marked as ephemeral and stops connecting for more than 24 hours, the parent will delete it from its memory and local administration, and tell Cloud that it is no longer live nor stale. Data for the node can no longer be accessed, but if the node connects again later, the node will be "revived", and previous data becomes available again.
+
+A node can be forced into this "forgotten" state with the Netdata CLI tool on the parent the node is connected to (if still connected) or one of the parent agents it was previously connected to. The state will be propagated _upwards_ and _sideways_ in case of an HA setup.
+
+```bash
+netdatacli remove-stale-node <node_id | machine_guid | hostname | ALL_NODES>
+```

When using Netdata Cloud (via a parent or directly) and a permanent node gets disconnected, Netdata Cloud sends node disconnection notifications. 
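Tying the ephemerality settings above together, a minimal sketch of the change on a short-lived child (a spot or autoscaled instance, for example):

```text
# netdata.conf on the ephemeral child
[global]
    is ephemeral node = yes
```

With the flag set, its parents and Netdata Cloud treat disconnections as expected behavior rather than sending the disconnection notifications described above for permanent nodes.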
diff --git a/docs/observability-centralization-points/metrics-centralization-points/replication-of-past-samples.md b/docs/observability-centralization-points/metrics-centralization-points/replication-of-past-samples.md
index 5c776b860..e0c60e89f 100644
--- a/docs/observability-centralization-points/metrics-centralization-points/replication-of-past-samples.md
+++ b/docs/observability-centralization-points/metrics-centralization-points/replication-of-past-samples.md
@@ -45,13 +45,13 @@ The following `netdata.conf` configuration parameters affect replication.

On the receiving side (Netdata Parent):

-- `[db].seconds to replicate` limits the maximum time to be replicated. The default is 1 day (86400 seconds). Keep in mind that replication is also limited by the `tier0` retention the sending side has.
+- `[db].replication period` limits the maximum time to be replicated. The default is 1 day. Keep in mind that replication is also limited by the `tier0` retention the sending side has.

On the sending side (Netdata Children, or Netdata Parent when parents are clustered):

- `[db].replication threads` controls how many concurrent threads will be replicating metrics. The default is 1. Usually the performance is about 2 million samples per second per thread, so increasing this number may allow replication to progress faster between Netdata Parents.

-- `[db].cleanup obsolete charts after secs` controls for how much time after metrics stop being collected will not be available for replication. The default is 1 hour (3600 seconds). If you plan to have scheduled maintenance on Netdata Parents of more than 1 hour, we recommend increasing this setting. Keep in mind however, that increasing this duration in highly ephemeral environments can have an impact on RAM utilization, since metrics will be considered as collected for longer durations.
+- `[db].cleanup obsolete charts after` controls how long metrics remain available for replication after they stop being collected. The default is 1 hour (3600 seconds). If you plan to have scheduled maintenance on Netdata Parents of more than 1 hour, we recommend increasing this setting. Keep in mind, however, that increasing this duration in highly ephemeral environments can have an impact on RAM utilization, since metrics will be considered as collected for longer durations.

## Monitoring Replication Progress

diff --git a/docs/security-and-privacy-design/README.md b/docs/security-and-privacy-design/README.md
index c6bfd699e..da484bc0e 100644
--- a/docs/security-and-privacy-design/README.md
+++ b/docs/security-and-privacy-design/README.md
@@ -1,9 +1,8 @@
# Security and Privacy Design

This document serves as the relevant Annex to the [Terms of Service](https://www.netdata.cloud/service-terms/),
-the [Privacy Policy](https://www.netdata.cloud/privacy/) and
-the Data Processing Addendum, when applicable. It provides more information regarding Netdata’s technical and
-organizational security and privacy measures.
+the [Privacy Policy](https://www.netdata.cloud/privacy/) and the Data Processing Addendum, when applicable.
+It provides more information regarding Netdata’s technical and organizational security and privacy measures.

We have given special attention to all aspects of Netdata, ensuring that everything throughout its operation is as
secure as possible. Netdata has been designed with security in mind.

@@ -16,6 +15,13 @@ Netdata, an open-source software widely installed across the globe, prioritizes
commitment to safeguarding user data. 
The entire structure and internal architecture of the software is built to ensure maximum security. We aim to provide a secure environment from the ground up, rather than as an afterthought. +Netdata Cloud ensures a secure, user-centric environment for monitoring and troubleshooting, treating +observability data and observability metadata distinctly to maintain user control over system insights and +personal information. **Observability data**, which includes metric values (time series) and log events, remains +fully under user control, stored locally on the user's premises. **Observability metadata**, including hostnames, +metric names, alert names, and alert transitions, is minimally required by Netdata Cloud and securely managed +for routing and platform usage purposes. + ### Compliance with Open Source Security Foundation Best Practices Netdata is committed to adhering to the best practices laid out by the Open Source Security Foundation (OSSF). @@ -23,7 +29,7 @@ Currently, the Netdata Agent follows the OSSF best practices at the passing leve the [OSSF guidelines](https://bestpractices.coreinfrastructure.org/en/projects/2231) Netdata Cloud boasts of comprehensive end-to-end automated testing, encompassing the UI, back-end, and agents, where -involved. In addition, the Netdata Agent uses an array of third-party services for static code analysis, static code +involved. In addition, the Netdata Agent uses an array of third-party services for static code analysis, security analysis, and CI/CD integrations to ensure code quality on a per pull request basis. Tools like Github's CodeQL, Github's Dependabot, our own unit tests, various types of linters, and [Coverity](https://scan.coverity.com/projects/netdata-netdata?tab=overview) are utilized to this end. @@ -75,16 +81,20 @@ protection laws, including the GDPR and CCPA. ### Data Transfers -While Netdata Agent itself does not engage in any cross-border data transfers, certain personal and infrastructure data -is transferred to Netdata Cloud for the purpose of providing its services. The metric data collected and processed by -Netdata Agents, however, stays strictly within the user's infrastructure, eliminating any concerns about cross-border -data transfer issues. +While Netdata Agent itself does not engage in any cross-border data transfers, certain **observability metadata** (e.g. +hostnames, metric names, alert names, and alert transitions) is transferred to Netdata Cloud solely to provide routing +and alert notifications. **Observability data**, consisting of metric values (time series) and log events, stays +strictly within the user's infrastructure, mitigating cross-border data transfer concerns. + +For users leveraging Netdata Cloud, **observability data** is securely tunneled through Netdata Cloud for real-time +viewing, similar to a VPN, without being stored on Netdata Cloud servers. This approach ensures that Netdata Cloud +maintains only necessary metadata, while full control of observability data remains with the user. -When users utilize Netdata Cloud, the metric data is streamed directly from the Netdata Agent to the users’ web browsers -via Netdata Cloud, without being stored on Netdata Cloud's servers. However, user identification data (such as email -addresses) and infrastructure metadata necessary for Netdata Cloud's operation are stored in data centers in the United -States, using compliant infrastructure providers such as Google Cloud and Amazon Web Services. 
These transfers and
-storage are carried out in full compliance with applicable data protection laws, including GDPR and CCPA.
+Netdata Cloud stores only Netdata Cloud users' identification data (such as observability users' email addresses) and
+infrastructure metadata (such as infrastructure hostnames) necessary for Netdata Cloud's operation. All this metadata
+is stored in data centers in the United States, using compliant infrastructure providers such as Google Cloud and
+Amazon Web Services. These transfers and storage are carried out in full compliance with applicable data protection
+laws, including GDPR and CCPA.

### Privacy Rights

@@ -104,9 +114,11 @@ and reach out with any questions or concerns they may have about data protection

## Anonymous Statistics

-The anonymous statistics collected by the Netdata Agent are related to the installations and not to individual users.
-This data includes community size, types of plugins used, possible crashes, operating systems installed, and the use of
-the registry feature. No IP addresses are collected, but each Netdata installation has a unique ID.
+The anonymous statistics collected by the Netdata Agent pertain to installations rather than individual users,
+capturing general information such as community size, plugin types, crashes, operating systems, and feature usage.
+Importantly, **observability data** — metric values and log events — remain local to the user's infrastructure and
+are not collected in this process. **Observability metadata**, including unique IDs for installations, is anonymized
+and stored solely to support product development and community understanding.

Netdata also collects anonymous telemetry events, which provide information on the usage of various features, errors,
and performance metrics. This data is used to understand how the software is being used and to identify areas for
@@ -130,41 +142,45 @@ improvement, while respecting user privacy and maintaining transparency.

Internal Security Measures at Netdata are designed with an emphasis on data privacy and protection. The measures
include:

-1. **Infrastructure as Code (IaC)** :
+1. **Observability data and metadata distinction** :
+   Netdata Cloud securely handles observability metadata in isolated environments, while observability data remains
+   exclusively within user premises, stored locally and managed by the user. This distinction ensures that only
+   minimal metadata is required for routing and system identification.
+2. **Infrastructure as Code (IaC)** :
   Netdata Cloud follows the IaC model, which means it is a microservices environment that is completely isolated. All
   changes are managed through Terraform, an open-source IaC software tool that provides a consistent CLI workflow for
   managing cloud services.
-2. **TLS Termination and IAM Service** :
+3. **TLS Termination and IAM Service** :
   At the edge of Netdata Cloud, there is a TLS termination, which provides the decryption point for incoming TLS
   connections. Additionally, an Identity Access Management (IAM) service validates JWT tokens included in request
   cookies or denies access to them.
-3. **Session Identification** :
+4. **Session Identification** :
   Once inside the microservices environment, all requests are associated with session IDs that identify the user
   making the request. This approach provides additional layers of security and traceability.
-4. **Data Storage** :
+5. **Data Storage** :
   Data is stored in various NoSQL and SQL databases and message brokers. 
The entire environment is fully isolated, providing a secure space for data management.
-5. **Authentication** :
+6. **Authentication** :
   Netdata Cloud does not store credentials. It offers three types of authentication: GitHub Single Sign-On (SSO),
   Google SSO, and email validation.
-6. **DDoS Protection** :
+7. **DDoS Protection** :
   Netdata Cloud has multiple protection mechanisms against Distributed Denial of Service (DDoS) attacks, including
   rate-limiting and automated blacklisting.
-7. **Security-Focused Development Process** :
+8. **Security-Focused Development Process** :
   To ensure a secure environment, Netdata employs a security-focused development process. This includes the use of
-   static code analysers to identify potential security vulnerabilities in the codebase.
-8. **High Security Standards** :
+   static code analyzers to identify potential security vulnerabilities in the codebase.
+9. **High Security Standards** :
   Netdata Cloud maintains high security standards and can provide additional customization on a per contract basis.
-9. **Employee Security Practices** :
+10. **Employee Security Practices** :
   Netdata ensures its employees follow security best practices, including role-based access, periodic access review,
   and multi-factor authentication. This helps to minimize the risk of unauthorized access to sensitive data.
-10. **Experienced Developers** :
+11. **Experienced Developers** :
   Netdata hires senior developers with vast experience in security-related matters. It enforces two code reviews for
   every Pull Request (PR), ensuring that any potential issues are identified and addressed promptly.
-11. **DevOps Methodologies** :
+12. **DevOps Methodologies** :
   Netdata's DevOps methodologies use the highest standards in access control in all places, utilizing the best
   practices available.
-12. **Risk-Based Security Program** :
+13. **Risk-Based Security Program** :
   Netdata has a risk-based security program that continually assesses and mitigates risks associated with data
   security. This program helps maintain a secure environment for user data.

@@ -243,7 +259,12 @@ Netdata is committed to continuous improvement in security and privacy. While we

## Conclusion

-In conclusion, Netdata Cloud's commitment to data security and user privacy is paramount. From the careful design of the
+Netdata Cloud is designed to secure observability insights for users, maintaining a clear separation between
+observability data and observability metadata. All observability data — metric values and log events — are stored locally,
+entirely under user control, while only essential metadata (hostnames, metric names, alert details) is managed by Netdata
+Cloud for system routing and alerting.
+
+Netdata Cloud's commitment to data security and user privacy is paramount. From the careful design of the
infrastructure and stringent internal security measures to compliance with international regulations and standards like
GDPR and CCPA, Netdata Cloud ensures a secure environment for users to monitor and troubleshoot their systems. 
diff --git a/docs/security-and-privacy-design/netdata-agent-security.md b/docs/security-and-privacy-design/netdata-agent-security.md index f441fe850..d2e2e1429 100644 --- a/docs/security-and-privacy-design/netdata-agent-security.md +++ b/docs/security-and-privacy-design/netdata-agent-security.md @@ -14,7 +14,6 @@ databases, sent to upstream Netdata servers, or archived to external time-series > > Users are responsible for backing up, recovering, and ensuring their data's availability because Netdata stores data locally on each system due to its decentralized architecture. - The Netdata Agent is programmed to safeguard user data. When collecting data, the raw data does not leave the host. All plugins, even those running with escalated capabilities or privileges, perform a hard-coded data collection job. They do not accept commands from Netdata, and the original application data collected do not leave the process they are @@ -60,7 +59,7 @@ information can be found [here](https://github.com/netdata/netdata/security/poli The Netdata agent is resilient against common security threats such as DDoS attacks and SQL injections. For DDoS, Netdata agent uses a fixed number of threads for processing requests, providing a cap on the resources that can be -consumed. It also automatically manages its memory to prevent overutilization. SQL injections are prevented as nothing +consumed. It also automatically manages its memory to prevent over-utilization. SQL injections are prevented as nothing from the UI is passed back to the data collection plugins accessing databases. Additionally, the Netdata agent is running as a normal, unprivileged, operating system user (a few data collections diff --git a/docs/security-and-privacy-design/netdata-cloud-security.md b/docs/security-and-privacy-design/netdata-cloud-security.md index 1f1bb67d2..1df022860 100644 --- a/docs/security-and-privacy-design/netdata-cloud-security.md +++ b/docs/security-and-privacy-design/netdata-cloud-security.md @@ -44,7 +44,7 @@ Netdata Cloud does not store user credentials. Netdata Cloud offers a variety of security features, including infrastructure-level dashboards, centralized alerts notifications, auditing logs, and role-based access to different segments of the infrastructure. The cloud service employs several protection mechanisms against DDoS attacks, such as rate-limiting and automated blacklisting. It also -uses static code analysers to prevent other types of attacks. +uses static code analyzers to prevent other types of attacks. In the event of potential security vulnerabilities or incidents, Netdata Cloud follows the same process as the Netdata agent. Every report is acknowledged and analyzed by the Netdata team within three working days, and the team keeps the diff --git a/docs/top-monitoring-netdata-functions.md b/docs/top-monitoring-netdata-functions.md index ee76d40ff..a9caea781 100644 --- a/docs/top-monitoring-netdata-functions.md +++ b/docs/top-monitoring-netdata-functions.md @@ -7,7 +7,7 @@ executed on the node/host where the function is made available. Collectors besides the metric collection, storing, and/or streaming work are capable of executing specific routines on request. These routines will bring additional information to help you troubleshoot or even trigger some action to happen on the node itself. -For more details please check out documentation on how we use our internal collector to get this from the first collector that exposes functions - [plugins.d](/src/collectors/plugins.d/README.md#function). 
+For more details, please check out the documentation on how we use our internal collector to get this from the first collector that exposes functions - [plugins.d](/src/plugins.d/README.md#function).

## Prerequisites
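As a closing aside, the functions a node exposes can also be listed straight from the agent's API. This is a sketch, assuming a locally reachable agent on the default port; the endpoint and its output shape may vary between agent versions:

```bash
# Ask the local agent which functions its collectors expose.
curl -s "http://localhost:19999/api/v1/functions"
```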