From bb8713bbc1c4594366fc735c04910edbf4c61aab Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 31 Mar 2021 14:59:21 +0200 Subject: Merging upstream version 1.30.0. Signed-off-by: Daniel Baumann --- .devcontainer/devcontainer.json | 14 + .github/ISSUE_TEMPLATE/bug_report.md | 19 +- .github/scripts/build-dist.sh | 70 ++ .github/scripts/check-updater.sh | 38 + .github/scripts/gen-docker-tags.py | 13 + .github/scripts/old_package_purging.sh | 88 ++ .github/scripts/package_cloud_wrapper.sh | 48 + .github/workflows/build-and-install.yml | 13 + .github/workflows/checks.yml | 10 + .github/workflows/coverity.yml | 16 + .github/workflows/docker.yml | 28 +- .github/workflows/docs.yml | 2 + .github/workflows/packaging.yml | 149 +++ .github/workflows/review.yml | 3 + .github/workflows/tests.yml | 6 +- .github/workflows/updater.yml | 84 ++ .gitmodules | 3 + .travis.yml | 253 +---- .travis/package_management/build.sh | 32 - .travis/package_management/build_judy.sh | 36 - .travis/package_management/build_libuv.sh | 36 - .../build_package_in_container.sh | 82 -- .travis/package_management/common.py | 182 ---- .../configure_deb_lxc_environment.py | 90 -- .../configure_rpm_lxc_environment.py | 102 -- .travis/package_management/create_lxc_for_build.sh | 101 -- .travis/package_management/functions.sh | 33 - .travis/package_management/old_package_purging.sh | 93 -- .../package_management/package_cloud_wrapper.sh | 48 - .travis/package_management/prepare_packages.sh | 63 -- .../package_management/trigger_deb_lxc_build.py | 86 -- .../package_management/trigger_rpm_lxc_build.py | 55 - .travis/package_management/yank_stale_pkg.sh | 35 - .travis/trigger_package_build.sh | 20 + .travis/trigger_package_generation.sh | 57 - CHANGELOG.md | 186 +++- CMakeLists.txt | 4 + Makefile.am | 62 +- aclk/aclk.c | 821 ++++++++++++++ aclk/aclk.h | 100 ++ aclk/aclk_collector_list.c | 193 ++++ aclk/aclk_collector_list.h | 39 + aclk/aclk_otp.c | 261 +++++ aclk/aclk_otp.h | 10 + aclk/aclk_query.c | 295 +++++ aclk/aclk_query.h | 32 + aclk/aclk_query_queue.c | 128 +++ aclk/aclk_query_queue.h | 71 ++ aclk/aclk_rx_msgs.c | 343 ++++++ aclk/aclk_rx_msgs.h | 14 + aclk/aclk_stats.c | 274 +++++ aclk/aclk_stats.h | 64 ++ aclk/aclk_tx_msgs.c | 395 +++++++ aclk/aclk_tx_msgs.h | 24 + aclk/aclk_util.c | 347 ++++++ aclk/aclk_util.h | 52 + aclk/https_client.c | 246 +++++ aclk/https_client.h | 11 + aclk/legacy/aclk_common.c | 1 + aclk/legacy/aclk_lws_https_client.c | 4 + aclk/legacy/aclk_lws_wss_client.c | 4 +- aclk/legacy/aclk_query.c | 60 +- aclk/legacy/aclk_query.h | 4 + aclk/legacy/aclk_rx_msgs.c | 36 +- aclk/legacy/aclk_stats.c | 125 ++- aclk/legacy/aclk_stats.h | 13 +- aclk/legacy/agent_cloud_link.c | 3 +- backends/backends.c | 2 +- backends/prometheus/backend_prometheus.c | 4 +- build/subst.inc | 2 +- claim/claim.c | 4 + claim/netdata-claim.sh.in | 50 +- collectors/QUICKSTART.md | 4 +- collectors/all.h | 9 +- collectors/apps.plugin/apps_groups.conf | 8 +- collectors/apps.plugin/apps_plugin.c | 22 +- collectors/cgroups.plugin/sys_fs_cgroup.c | 97 +- collectors/cups.plugin/cups_plugin.c | 14 +- collectors/ebpf.plugin/Makefile.am | 18 +- collectors/ebpf.plugin/README.md | 51 +- collectors/ebpf.plugin/ebpf.c | 1122 +++----------------- collectors/ebpf.plugin/ebpf.conf | 45 - collectors/ebpf.plugin/ebpf.d.conf | 36 + collectors/ebpf.plugin/ebpf.d/cachestat.conf | 14 + .../ebpf.plugin/ebpf.d/ebpf_kernel_reject_list.txt | 1 + collectors/ebpf.plugin/ebpf.d/network.conf | 30 + collectors/ebpf.plugin/ebpf.d/process.conf | 14 + collectors/ebpf.plugin/ebpf.d/sync.conf | 23 + collectors/ebpf.plugin/ebpf.h | 38 +- collectors/ebpf.plugin/ebpf_apps.c | 32 +- collectors/ebpf.plugin/ebpf_apps.h | 15 +- collectors/ebpf.plugin/ebpf_cachestat.c | 655 ++++++++++++ collectors/ebpf.plugin/ebpf_cachestat.h | 64 ++ collectors/ebpf.plugin/ebpf_kernel_reject_list.txt | 1 - collectors/ebpf.plugin/ebpf_process.c | 68 +- collectors/ebpf.plugin/ebpf_process.h | 4 +- collectors/ebpf.plugin/ebpf_socket.c | 1029 +++++++++++++++++- collectors/ebpf.plugin/ebpf_socket.h | 29 +- collectors/ebpf.plugin/ebpf_sync.c | 389 +++++++ collectors/ebpf.plugin/ebpf_sync.h | 54 + collectors/freeipmi.plugin/README.md | 4 +- collectors/freeipmi.plugin/freeipmi_plugin.c | 37 +- collectors/nfacct.plugin/plugin_nfacct.c | 57 +- collectors/proc.plugin/proc_diskstats.c | 64 +- collectors/proc.plugin/proc_net_dev.c | 430 ++++++-- collectors/proc.plugin/proc_self_mountinfo.c | 17 +- collectors/proc.plugin/proc_self_mountinfo.h | 7 +- collectors/python.d.plugin/README.md | 7 +- collectors/python.d.plugin/adaptec_raid/README.md | 4 +- .../adaptec_raid/adaptec_raid.chart.py | 10 +- collectors/python.d.plugin/alarms/README.md | 4 +- collectors/python.d.plugin/alarms/alarms.chart.py | 43 +- collectors/python.d.plugin/alarms/alarms.conf | 4 + collectors/python.d.plugin/anomalies/README.md | 3 +- collectors/python.d.plugin/chrony/README.md | 4 +- .../python.d.plugin/example/example.chart.py | 7 +- collectors/python.d.plugin/example/example.conf | 25 +- collectors/python.d.plugin/hpssa/README.md | 7 +- collectors/python.d.plugin/megacli/README.md | 3 +- collectors/python.d.plugin/samba/README.md | 4 +- .../python.d.plugin/smartd_log/smartd_log.chart.py | 9 + collectors/statsd.plugin/Makefile.am | 1 + collectors/statsd.plugin/README.md | 38 +- collectors/statsd.plugin/k6.conf | 104 ++ collectors/statsd.plugin/statsd.c | 6 +- collectors/tc.plugin/plugin_tc.c | 16 +- collectors/xenstat.plugin/xenstat_plugin.c | 20 +- configure.ac | 51 +- contrib/debian/netdata.postinst | 4 +- contrib/debian/rules | 6 +- coverity-scan.sh | 2 +- daemon/README.md | 4 +- daemon/anonymous-statistics.sh.in | 133 ++- daemon/buildinfo.c | 255 +++-- daemon/buildinfo.h | 2 + daemon/common.h | 4 + daemon/config/README.md | 5 +- daemon/main.c | 7 +- daemon/main.h | 2 +- daemon/unit_test.c | 2 +- database/engine/pagecache.c | 68 ++ database/engine/pagecache.h | 2 + database/engine/rrdengine.c | 53 +- database/engine/rrdengine.h | 20 +- database/engine/rrdengineapi.c | 30 + database/engine/rrdengineapi.h | 1 + database/rrd.h | 64 +- database/rrdcalc.c | 14 +- database/rrdcalc.h | 2 +- database/rrddim.c | 32 +- database/rrdfamily.c | 6 +- database/rrdhost.c | 40 +- database/rrdset.c | 52 +- database/rrdvar.c | 6 +- database/rrdvar.h | 2 +- database/sqlite/Makefile.am | 4 + database/sqlite/sqlite_functions.c | 380 ++++++- database/sqlite/sqlite_functions.h | 3 +- docs/Running-behind-apache.md | 3 +- docs/Running-behind-caddy.md | 8 +- docs/anonymous-statistics.md | 66 +- docs/collect/enable-configure.md | 6 +- docs/collect/system-metrics.md | 5 +- docs/configure/nodes.md | 2 +- docs/export/enable-connector.md | 14 +- docs/get/README.md | 5 +- docs/getting-started.md | 14 +- docs/guides/collect-apache-nginx-web-logs.md | 9 +- docs/guides/collect-unbound-metrics.md | 8 +- .../export/export-netdata-metrics-graphite.md | 4 +- docs/guides/monitor-cockroachdb.md | 6 +- docs/guides/monitor-hadoop-cluster.md | 4 +- docs/guides/monitor/anomaly-detection.md | 26 +- docs/guides/monitor/kubernetes-k8s-netdata.md | 374 +++---- docs/guides/monitor/lamp-stack.md | 249 +++++ docs/guides/monitor/pi-hole-raspberry-pi.md | 8 +- docs/guides/monitor/process.md | 10 +- .../monitor/raspberry-pi-anomaly-detection.md | 127 +++ docs/guides/monitor/statsd.md | 297 ++++++ docs/guides/monitor/visualize-monitor-anomalies.md | 3 - docs/guides/python-collector.md | 486 +++++++++ docs/guides/step-by-step/step-04.md | 4 +- docs/guides/step-by-step/step-05.md | 4 +- docs/guides/step-by-step/step-06.md | 10 +- docs/guides/step-by-step/step-09.md | 10 +- .../monitor-debug-applications-ebpf.md | 8 +- docs/monitor/view-active-alarms.md | 19 +- docs/quickstart/infrastructure.md | 8 +- docs/store/change-metrics-storage.md | 3 +- docs/visualize/overview-infrastructure.md | 30 +- exporting/check_filters.c | 2 +- exporting/prometheus/prometheus.c | 5 +- health/Makefile.am | 1 - health/health.c | 6 +- health/health.d/adaptec_raid.conf | 16 +- health/health.d/anomalies.conf | 8 +- health/health.d/apcupsd.conf | 8 +- health/health.d/apps_plugin.conf | 15 - health/health.d/backend.conf | 25 +- health/health.d/bcache.conf | 14 +- health/health.d/beanstalkd.conf | 10 +- health/health.d/bind_rndc.conf | 4 +- health/health.d/boinc.conf | 8 +- health/health.d/btrfs.conf | 9 +- health/health.d/ceph.conf | 13 +- health/health.d/cgroups.conf | 17 +- health/health.d/cockroachdb.conf | 6 +- health/health.d/cpu.conf | 8 +- health/health.d/dbengine.conf | 10 +- health/health.d/disks.conf | 104 +- health/health.d/dns_query.conf | 2 +- health/health.d/dnsmasq_dhcp.conf | 4 +- health/health.d/dockerd.conf | 2 +- health/health.d/entropy.conf | 4 +- health/health.d/exporting.conf | 11 - health/health.d/fping.conf | 13 +- health/health.d/gearman.conf | 4 +- health/health.d/haproxy.conf | 4 +- health/health.d/hdfs.conf | 10 +- health/health.d/httpcheck.conf | 42 +- health/health.d/ioping.conf | 4 +- health/health.d/ipc.conf | 4 +- health/health.d/ipfs.conf | 2 +- health/health.d/ipmi.conf | 4 +- health/health.d/isc_dhcpd.conf | 20 +- health/health.d/kubelet.conf | 43 +- health/health.d/linux_power_supply.conf | 2 +- health/health.d/load.conf | 20 +- health/health.d/mdstat.conf | 7 +- health/health.d/megacli.conf | 68 +- health/health.d/memcached.conf | 13 +- health/health.d/memory.conf | 4 +- health/health.d/mysql.conf | 26 +- health/health.d/net.conf | 97 +- health/health.d/netfilter.conf | 19 +- health/health.d/pihole.conf | 10 +- health/health.d/portcheck.conf | 12 +- health/health.d/processes.conf | 6 +- health/health.d/ram.conf | 11 +- health/health.d/redis.conf | 4 +- health/health.d/retroshare.conf | 2 +- health/health.d/riakkv.conf | 38 +- health/health.d/scaleio.conf | 4 +- health/health.d/softnet.conf | 13 +- health/health.d/swap.conf | 22 +- health/health.d/synchronization.conf | 12 + health/health.d/tcp_conn.conf | 4 +- health/health.d/tcp_listen.conf | 9 +- health/health.d/tcp_mem.conf | 4 +- health/health.d/tcp_orphans.conf | 4 +- health/health.d/tcp_resets.conf | 28 +- health/health.d/udp_errors.conf | 29 +- health/health.d/unbound.conf | 4 +- health/health.d/vcsa.conf | 24 +- health/health.d/vernemq.conf | 319 ++---- health/health.d/vsphere.conf | 62 +- health/health.d/web_log.conf | 50 +- health/health.d/whoisquery.conf | 2 +- health/health.d/wmi.conf | 24 +- health/health.d/x509check.conf | 4 +- health/health.d/zfs.conf | 2 +- health/health.h | 2 +- health/health_config.c | 2 +- health/health_json.c | 19 +- health/health_log.c | 6 +- health/notifications/alarm-notify.sh.in | 35 +- health/notifications/email/README.md | 2 +- health/notifications/health_alarm_notify.conf | 4 + health/notifications/stackpulse/README.md | 5 +- libnetdata/avl/avl.c | 56 +- libnetdata/avl/avl.h | 20 +- libnetdata/config/appconfig.c | 12 +- libnetdata/config/appconfig.h | 4 +- libnetdata/dictionary/dictionary.c | 10 +- libnetdata/dictionary/dictionary.h | 2 +- libnetdata/ebpf/ebpf.c | 99 +- libnetdata/ebpf/ebpf.h | 25 + libnetdata/eval/eval.c | 2 +- libnetdata/libnetdata.c | 59 +- libnetdata/libnetdata.h | 11 +- libnetdata/tests/test_str2ld.c | 2 +- netdata-installer.sh | 42 +- netdata.spec.in | 10 +- packaging/Dockerfile.packager | 15 +- packaging/dashboard.checksums | 2 +- packaging/dashboard.version | 2 +- packaging/docker/README.md | 16 + packaging/docker/run.sh | 11 +- packaging/ebpf.checksums | 6 +- packaging/ebpf.version | 2 +- packaging/go.d.checksums | 32 +- packaging/go.d.version | 2 +- packaging/installer/README.md | 4 +- packaging/installer/kickstart-static64.sh | 127 ++- packaging/installer/kickstart.sh | 158 ++- packaging/installer/methods/kickstart-64.md | 2 +- packaging/installer/methods/kickstart.md | 2 +- packaging/installer/methods/kubernetes.md | 231 ++-- packaging/installer/netdata-updater.sh | 8 +- packaging/scripts/install.sh | 35 +- packaging/scripts/test.sh | 35 +- packaging/version | 2 +- registry/README.md | 11 + registry/registry.c | 19 +- registry/registry_init.c | 1 + registry/registry_internals.h | 1 + registry/registry_person.c | 4 +- registry/registry_person.h | 2 +- registry/registry_url.c | 6 +- registry/registry_url.h | 2 +- spawn/spawn.c | 14 +- spawn/spawn.h | 2 +- spawn/spawn_server.c | 12 +- streaming/receiver.c | 12 +- web/api/exporters/shell/README.md | 8 +- web/api/formatters/json/json.c | 10 +- web/api/formatters/json/json.h | 2 +- web/api/formatters/json_wrapper.c | 28 +- web/api/formatters/json_wrapper.h | 2 +- web/api/formatters/rrd2json.c | 89 +- web/api/formatters/value/value.c | 3 +- web/api/queries/query.c | 35 +- web/api/queries/rrdr.c | 7 +- web/api/queries/rrdr.h | 1 + web/api/web_api_v1.c | 26 +- web/gui/dashboard_info.js | 81 +- web/gui/main.js | 14 +- web/server/web_client.c | 27 +- web/server/web_client.h | 1 + 329 files changed, 12256 insertions(+), 4905 deletions(-) create mode 100644 .devcontainer/devcontainer.json create mode 100755 .github/scripts/build-dist.sh create mode 100755 .github/scripts/check-updater.sh create mode 100755 .github/scripts/gen-docker-tags.py create mode 100755 .github/scripts/old_package_purging.sh create mode 100755 .github/scripts/package_cloud_wrapper.sh create mode 100644 .github/workflows/packaging.yml create mode 100644 .github/workflows/updater.yml create mode 100644 .gitmodules delete mode 100644 .travis/package_management/build.sh delete mode 100755 .travis/package_management/build_judy.sh delete mode 100755 .travis/package_management/build_libuv.sh delete mode 100755 .travis/package_management/build_package_in_container.sh delete mode 100755 .travis/package_management/common.py delete mode 100755 .travis/package_management/configure_deb_lxc_environment.py delete mode 100755 .travis/package_management/configure_rpm_lxc_environment.py delete mode 100755 .travis/package_management/create_lxc_for_build.sh delete mode 100644 .travis/package_management/functions.sh delete mode 100755 .travis/package_management/old_package_purging.sh delete mode 100755 .travis/package_management/package_cloud_wrapper.sh delete mode 100755 .travis/package_management/prepare_packages.sh delete mode 100755 .travis/package_management/trigger_deb_lxc_build.py delete mode 100755 .travis/package_management/trigger_rpm_lxc_build.py delete mode 100755 .travis/package_management/yank_stale_pkg.sh create mode 100755 .travis/trigger_package_build.sh delete mode 100755 .travis/trigger_package_generation.sh create mode 100644 aclk/aclk.c create mode 100644 aclk/aclk.h create mode 100644 aclk/aclk_collector_list.c create mode 100644 aclk/aclk_collector_list.h create mode 100644 aclk/aclk_otp.c create mode 100644 aclk/aclk_otp.h create mode 100644 aclk/aclk_query.c create mode 100644 aclk/aclk_query.h create mode 100644 aclk/aclk_query_queue.c create mode 100644 aclk/aclk_query_queue.h create mode 100644 aclk/aclk_rx_msgs.c create mode 100644 aclk/aclk_rx_msgs.h create mode 100644 aclk/aclk_stats.c create mode 100644 aclk/aclk_stats.h create mode 100644 aclk/aclk_tx_msgs.c create mode 100644 aclk/aclk_tx_msgs.h create mode 100644 aclk/aclk_util.c create mode 100644 aclk/aclk_util.h create mode 100644 aclk/https_client.c create mode 100644 aclk/https_client.h delete mode 100644 collectors/ebpf.plugin/ebpf.conf create mode 100644 collectors/ebpf.plugin/ebpf.d.conf create mode 100644 collectors/ebpf.plugin/ebpf.d/cachestat.conf create mode 100644 collectors/ebpf.plugin/ebpf.d/ebpf_kernel_reject_list.txt create mode 100644 collectors/ebpf.plugin/ebpf.d/network.conf create mode 100644 collectors/ebpf.plugin/ebpf.d/process.conf create mode 100644 collectors/ebpf.plugin/ebpf.d/sync.conf create mode 100644 collectors/ebpf.plugin/ebpf_cachestat.c create mode 100644 collectors/ebpf.plugin/ebpf_cachestat.h delete mode 100644 collectors/ebpf.plugin/ebpf_kernel_reject_list.txt create mode 100644 collectors/ebpf.plugin/ebpf_sync.c create mode 100644 collectors/ebpf.plugin/ebpf_sync.h create mode 100644 collectors/statsd.plugin/k6.conf create mode 100644 database/sqlite/Makefile.am create mode 100644 docs/guides/monitor/lamp-stack.md create mode 100644 docs/guides/monitor/raspberry-pi-anomaly-detection.md create mode 100644 docs/guides/monitor/statsd.md create mode 100644 docs/guides/python-collector.md delete mode 100644 health/health.d/apps_plugin.conf create mode 100644 health/health.d/synchronization.conf diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 000000000..0f1be69da --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,14 @@ +{ + "image": "netdata/devenv", + "extensions":[ + "golang.go", + "exiasr.hadolint", + "ms-python.python", + "timonwong.shellcheck", + "redhat.vscode-yaml", + "dbaeumer.vscode-eslint", + "jasonnutter.search-node-modules", + "mgmcdermott.vscode-language-babel", + ], + "forwardPorts": [19999] +} \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index ecfd9fc47..799fba7a0 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -1,6 +1,6 @@ --- name: "Bug report: Netdata Agent" -about: Submit a report and help us improve our free and open-source Netdata Agent +about: "Submit a report and help us improve our free and open-source Netdata Agent" labels: bug, needs triage --- @@ -11,9 +11,11 @@ When creating a bug report please: --> ##### Bug report summary + ##### OS / Environment + + ``` ``` ##### Netdata version + - + +##### Installation method + + ##### Component Name + ##### Steps To Reproduce + diff --git a/.github/scripts/build-dist.sh b/.github/scripts/build-dist.sh new file mode 100755 index 000000000..f7e27324c --- /dev/null +++ b/.github/scripts/build-dist.sh @@ -0,0 +1,70 @@ +#!/bin/sh +# +# Builds the netdata-vX.y.Z-xxxx.tar.gz source tarball (dist) + +set -e + +# shellcheck source=.github/scripts/functions.sh +. "$(dirname "$0")/functions.sh" + +NAME="${NAME:-netdata}" +VERSION="${VERSION:-"$(git describe --always)"}" +BASENAME="$NAME-$VERSION" + +prepare_build() { + progress "Preparing build" + ( + test -d artifacts || mkdir -p artifacts + echo "${VERSION}" > packaging/version + ) >&2 +} + +build_dist() { + progress "Building dist" + ( + command -v git > /dev/null && [ -d .git ] && git clean -d -f + autoreconf -ivf + ./configure \ + --prefix=/usr \ + --sysconfdir=/etc \ + --localstatedir=/var \ + --libexecdir=/usr/libexec \ + --with-zlib \ + --with-math \ + --with-user=netdata \ + CFLAGS=-O2 + make dist + mv "${BASENAME}.tar.gz" artifacts/ + ) >&2 +} + +prepare_assets() { + progress "Preparing assets" + ( + cp packaging/version artifacts/latest-version.txt + cd artifacts || exit 1 + ln -f "${BASENAME}.tar.gz" netdata-latest.tar.gz + ln -f "${BASENAME}.gz.run" netdata-latest.gz.run + sha256sum -b ./* > "sha256sums.txt" + ) >&2 +} + +steps="prepare_build build_dist prepare_assets" + +_main() { + for step in $steps; do + if ! run "$step"; then + if [ -t 1 ]; then + debug + else + fail "Build failed" + fi + fi + done + + echo "๐ŸŽ‰ All Done!" +} + +if [ -n "$0" ] && [ x"$0" != x"-bash" ]; then + _main "$@" +fi diff --git a/.github/scripts/check-updater.sh b/.github/scripts/check-updater.sh new file mode 100755 index 000000000..3ef4857f9 --- /dev/null +++ b/.github/scripts/check-updater.sh @@ -0,0 +1,38 @@ +#!/bin/sh +# +set -e +# shellcheck source=.github/scripts/functions.sh +. "$(dirname "$0")/functions.sh" + +check_successfull_update() { + progress "Check netdata version after update" + ( + netdata_version=$(netdata -v | awk '{print $2}') + updater_version=$(cat packaging/version) + if [ "$netdata_version" = "$updater_version" ]; then + echo "Update successfull!" + else + exit 1 + fi + ) >&2 +} + +steps="check_successfull_update" + +_main() { + for step in $steps; do + if ! run "$step"; then + if [ -t 1 ]; then + debug + else + fail "Build failed" + fi + fi + done + + echo "๐ŸŽ‰ All Done!" +} + +if [ -n "$0" ] && [ x"$0" != x"-bash" ]; then + _main "$@" +fi diff --git a/.github/scripts/gen-docker-tags.py b/.github/scripts/gen-docker-tags.py new file mode 100755 index 000000000..6c6251155 --- /dev/null +++ b/.github/scripts/gen-docker-tags.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python3 + +import sys + +REPO = 'netdata/netdata' + +version = sys.argv[1].split('.') + +MAJOR = ':'.join([REPO, version[0]]) +MINOR = ':'.join([REPO, '.'.join(version[0:2])]) +PATCH = ':'.join([REPO, '.'.join(version[0:3])]) + +print(','.join([MAJOR, MINOR, PATCH])) diff --git a/.github/scripts/old_package_purging.sh b/.github/scripts/old_package_purging.sh new file mode 100755 index 000000000..c90c4b780 --- /dev/null +++ b/.github/scripts/old_package_purging.sh @@ -0,0 +1,88 @@ +#!/usr/bin/env bash +# +# Script to handle package cloud retention policy +# Our open source subscription is limited, +# so we use this sript to control the number of packages maintained historically +# +# Dependencies: +# - PACKAGE_CLOUD_RETENTION_DAYS +# This is to indicate for how many days back we want to maintain the various RPM and DEB packages on package cloud +# +# Copyright : SPDX-License-Identifier: GPL-3.0-or-later +# +# Author : Pavlos Emm. Katsoulakis +# +set -e + +delete_files_for_version() { + local v="$1" + + # Delete the selected filenames in version + FILES_IN_VERSION=$(jq --sort-keys --arg v "${v}" '.[] | select ( .version | contains($v))' "${PKG_LIST_FILE}" | grep filename | cut -d':' -f 2) + + # Iterate through the files and delete them + for pkg in ${FILES_IN_VERSION/\\n/}; do + pkg=${pkg/,/} + pkg=${pkg/\"/} + pkg=${pkg/\"/} + echo "Attempting yank on ${pkg}.." + .github/scripts/package_cloud_wrapper.sh yank "${REPO}" "${pkg}" || echo "Nothing to yank or error on ${pkg}" + done +} + +# If we are not in netdata git repo, at the top level directory, fail +TOP_LEVEL=$(basename "$(git rev-parse --show-toplevel)") +CWD=$(git rev-parse --show-cdup) +if [ -n "$CWD" ] || [ ! "${TOP_LEVEL}" == "netdata" ]; then + echo "Run as .github/scripts/$(basename "$0") from top level directory of netdata git repository" + echo "Old packages yanking cancelled" + exit 1 +fi + +if [ -z "${REPO}" ]; then + echo "No REPO variable found" + exit 1 +fi + +if [ -z ${PKG_CLOUD_TOKEN} ]; then + echo "No PKG_CLOUD_TOKEN variable found" + exit 1 +fi + +if [ -z ${PACKAGE_CLOUD_RETENTION_DAYS} ]; then + echo "No PACKAGE_CLOUD_RETENTION_DAYS variable found" + exit 1 +fi + +TMP_DIR="$(mktemp -d /tmp/netdata-old-package-yanking-XXXXXX)" +PKG_LIST_FILE="${TMP_DIR}/complete_package_list.json" +DATE_EPOCH="1970-01-01" +DATE_UNTIL_TO_DELETE=$(date --date="${PACKAGE_CLOUD_RETENTION_DAYS} day ago" +%Y-%m-%d) + + +echo "Created temp directory: ${TMP_DIR}" +echo "We will be purging contents up until ${DATE_UNTIL_TO_DELETE}" + +echo "Calling package could to retrieve all available packages on ${REPO}" +curl -sS "https://${PKG_CLOUD_TOKEN}:@packagecloud.io/api/v1/repos/${REPO}/packages.json" > "${PKG_LIST_FILE}" + +# Get versions within the desired duration +# +VERSIONS_TO_PURGE=$(jq --arg s "${DATE_EPOCH}" --arg e "${DATE_UNTIL_TO_DELETE}" ' +[($s, $e) | strptime("%Y-%m-%d")[0:3]] as $r + | map(select( + (.created_at[:19] | strptime("%Y-%m-%dT%H:%M:%S")[0:3]) as $d + | $d >= $r[0] and $d <= $r[1] +))' "${PKG_LIST_FILE}" | grep '"version":' | sort -u | sed -e 's/ //g' | cut -d':' -f2) + +echo "We will be deleting the following versions: ${VERSIONS_TO_PURGE}" +for v in ${VERSIONS_TO_PURGE/\n//}; do + v=${v/\"/} + v=${v/\"/} + v=${v/,/} + echo "Remove all files for version $v" + delete_files_for_version "${v}" +done + +# Done, clean up +[ -d "${TMP_DIR}" ] && rm -rf "${TMP_DIR}" diff --git a/.github/scripts/package_cloud_wrapper.sh b/.github/scripts/package_cloud_wrapper.sh new file mode 100755 index 000000000..7640ef484 --- /dev/null +++ b/.github/scripts/package_cloud_wrapper.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash +# +# This is a tool to help removal of packages from packagecloud.io +# It utilizes the package_cloud utility provided from packagecloud.io +# +# Depends on: +# 1) package cloud gem (detects absence and installs it) +# +# Requires: +# 1) PKG_CLOUD_TOKEN variable exported +# 2) To properly install package_cloud when not found, it requires: ruby gcc gcc-c++ ruby-devel +# +# Copyright: SPDX-License-Identifier: GPL-3.0-or-later +# +# Author : Pavlos Emm. Katsoulakis (paul@netdata.cloud) +#shellcheck disable=SC2068,SC2145 +set -e +PKG_CLOUD_CONFIG="$HOME/.package_cloud_configuration.cfg" + +# If we are not in netdata git repo, at the top level directory, fail +TOP_LEVEL=$(basename "$(git rev-parse --show-toplevel)") +CWD=$(git rev-parse --show-cdup) +if [ -n "$CWD" ] || [ ! "${TOP_LEVEL}" == "netdata" ]; then + echo "Run as .github/scripts/$(basename "$0") from top level directory of netdata git repository" + echo "Docker build process aborted" + exit 1 +fi + +# Install dependency if not there +if ! command -v package_cloud > /dev/null 2>&1; then + echo "No package cloud gem found, installing" + sudo gem install -V package_cloud || (echo "Package cloud installation failed. you might want to check if required dependencies are there (ruby gcc gcc-c++ ruby-devel)" && exit 1) +else + echo "Found package_cloud gem, continuing" +fi + +# Check for required token and prepare config +if [ -z "${PKG_CLOUD_TOKEN}" ]; then + echo "Please set PKG_CLOUD_TOKEN to be able to use ${0}" + exit 1 +fi +echo "{\"url\":\"https://packagecloud.io\",\"token\":\"${PKG_CLOUD_TOKEN}\"}" > "${PKG_CLOUD_CONFIG}" + +echo "Executing package_cloud with config ${PKG_CLOUD_CONFIG} and parameters $@" +package_cloud $@ --config="${PKG_CLOUD_CONFIG}" + +rm -rf "${PKG_CLOUD_CONFIG}" +echo "Done!" diff --git a/.github/workflows/build-and-install.yml b/.github/workflows/build-and-install.yml index 26a144acd..4c4693601 100644 --- a/.github/workflows/build-and-install.yml +++ b/.github/workflows/build-and-install.yml @@ -12,6 +12,8 @@ jobs: steps: - name: Git clone repository uses: actions/checkout@v2 + with: + submodules: recursive - run: | git fetch --prune --unshallow --tags - name: Build @@ -34,10 +36,12 @@ jobs: - 'clearlinux:latest' - 'debian:10' - 'debian:9' + - 'fedora:34' - 'fedora:33' - 'fedora:32' - 'opensuse/leap:15.2' - 'opensuse/tumbleweed:latest' + - 'ubuntu:21.04' - 'ubuntu:20.10' - 'ubuntu:20.04' - 'ubuntu:18.04' @@ -72,6 +76,8 @@ jobs: pre: 'apt-get update' rmjsonc: 'apt-get remove -y libjson-c-dev' + - distro: 'fedora:34' + rmjsonc: 'dnf remove -y json-c-devel' - distro: 'fedora:33' rmjsonc: 'dnf remove -y json-c-devel' - distro: 'fedora:32' @@ -82,6 +88,9 @@ jobs: - distro: 'opensuse/tumbleweed:latest' rmjsonc: 'zypper rm -y libjson-c-devel' + - distro: 'ubuntu:21.04' + pre: 'apt-get update' + rmjsonc: 'apt-get remove -y libjson-c-dev' - distro: 'ubuntu:20.10' pre: 'apt-get update' rmjsonc: 'apt-get remove -y libjson-c-dev' @@ -98,6 +107,8 @@ jobs: steps: - name: Git clone repository uses: actions/checkout@v2 + with: + submodules: recursive - name: install-required-packages.sh on ${{ matrix.distro }} env: PRE: ${{ matrix.pre }} @@ -183,6 +194,8 @@ jobs: steps: - name: Git clone repository uses: actions/checkout@v2 + with: + submodules: recursive - name: install-required-packages.sh on ${{ matrix.distro }} env: PRE: ${{ matrix.pre }} diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index cf494e95c..7225d3dbe 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -12,6 +12,8 @@ jobs: steps: - name: Git clone repository uses: actions/checkout@v2 + with: + submodules: recursive - name: Run checksum checks on kickstart files env: LOCAL_ONLY: "true" @@ -23,6 +25,8 @@ jobs: steps: - name: Git clone repository uses: actions/checkout@v2 + with: + submodules: recursive - name: Install required packages run: | ./packaging/installer/install-required-packages.sh --dont-wait --non-interactive netdata @@ -43,6 +47,8 @@ jobs: steps: - name: Checkout uses: actions/checkout@v2 + with: + submodules: recursive - name: Build run: > docker run -v "$PWD":/netdata -w /netdata alpine:latest /bin/sh -c @@ -68,6 +74,8 @@ jobs: steps: - name: Checkout uses: actions/checkout@v2 + with: + submodules: recursive - name: Prepare environment run: | ./packaging/installer/install-required-packages.sh --dont-wait --non-interactive netdata @@ -98,6 +106,8 @@ jobs: steps: - name: Checkout uses: actions/checkout@v2 + with: + submodules: recursive - name: Prepare environment run: ./packaging/installer/install-required-packages.sh --dont-wait --non-interactive netdata - name: Build netdata diff --git a/.github/workflows/coverity.yml b/.github/workflows/coverity.yml index 926257dc0..766275ed9 100644 --- a/.github/workflows/coverity.yml +++ b/.github/workflows/coverity.yml @@ -15,6 +15,8 @@ jobs: steps: - name: Checkout uses: actions/checkout@v2 + with: + submodules: recursive - name: Prepare environment env: DEBIAN_FRONTEND: 'noninteractive' @@ -32,3 +34,17 @@ jobs: COVERITY_SCAN_SUBMIT_MAIL: ${{ secrets.COVERITY_SCAN_SUBMIT_MAIL }} run: | ./coverity-scan.sh --with-install + - name: Failure Notification + uses: rtCamp/action-slack-notify@v2 + env: + SLACK_COLOR: 'danger' + SLACK_FOOTER: + SLACK_ICON_EMOJI: ':github-actions:' + SLACK_TITLE: 'Coverity run failed:' + SLACK_USERNAME: 'GitHub Actions' + SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK_URL }} + if: ${{ + failure() + && github.event_name != 'pull_request' + && startsWith(github.ref, 'refs/heads/master') + }} diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 04f91bff5..a5648d2a9 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -4,15 +4,7 @@ on: push: branches: - master - paths: - - '.github/workflows/docker.yml' - - 'netdata-installer.sh' - - 'packaging/**' pull_request: - paths: - - '.github/workflows/docker.yml' - - 'netdata-installer.sh' - - 'packaging/**' workflow_dispatch: inputs: version: @@ -26,11 +18,13 @@ jobs: steps: - name: Checkout uses: actions/checkout@v2 + with: + submodules: recursive - name: Determine if we should push changes and which tags to use if: github.event_name == 'workflow_dispatch' && github.event.inputs.version != 'nightly' run: | echo "publish=true" >> $GITHUB_ENV - echo "tags=netdata/netdata:latest,netdata/netdata:stable,netdata/netdata:${{ github.event.inputs.version }}" >> $GITHUB_ENV + echo "tags=netdata/netdata:latest,netdata/netdata:stable,$(.github/scripts/gen-docker-tags.py ${{ github.event.inputs.version }})" >> $GITHUB_ENV - name: Determine if we should push changes and which tags to use if: github.event_name == 'workflow_dispatch' && github.event.inputs.version == 'nightly' run: | @@ -57,3 +51,19 @@ jobs: platforms: linux/amd64,linux/i386,linux/arm/v7,linux/arm64 push: ${{ env.publish }} tags: ${{ env.tags }} + - name: Failure Notification + uses: rtCamp/action-slack-notify@v2 + env: + SLACK_COLOR: 'danger' + SLACK_FOOTER: + SLACK_ICON_EMOJI: ':github-actions:' + SLACK_TITLE: 'Docker Build failed:' + SLACK_USERNAME: 'GitHub Actions' + SLACK_MESSAGE: "Docker image build failed." + SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK_URL }} + if: >- + ${{ + failure() + && github.event_name != 'pull_request' + && startsWith(github.ref, 'refs/heads/master') + }} diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 2a4fe87e4..9f7234f92 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -16,6 +16,8 @@ jobs: steps: - name: Checkout uses: actions/checkout@v2 + with: + submodules: recursive - name: Run link check uses: gaurav-nelson/github-action-markdown-link-check@v1 with: diff --git a/.github/workflows/packaging.yml b/.github/workflows/packaging.yml new file mode 100644 index 000000000..7340c27bb --- /dev/null +++ b/.github/workflows/packaging.yml @@ -0,0 +1,149 @@ +--- +# Handles building of binary packages for the agent. +name: Packages +on: + pull_request: + branches: + - master + - develop + workflow_dispatch: + inputs: + type: + name: Package build type + default: devel + required: true + version: + name: Package version + required: false +jobs: + build: + name: Build + runs-on: ubuntu-latest + env: + DOCKER_CLI_EXPERIMENTAL: enabled + strategy: + matrix: + include: + - {distro: debian, version: "9", pkgclouddistro: debian/stretch, format: deb, base_image: debian, platform: linux/amd64, arch: amd64} + - {distro: debian, version: "9", pkgclouddistro: debian/stretch, format: deb, base_image: debian, platform: linux/i386, arch: i386} + - {distro: debian, version: "10", pkgclouddistro: debian/buster, format: deb, base_image: debian, platform: linux/amd64, arch: amd64} + - {distro: debian, version: "10", pkgclouddistro: debian/buster, format: deb, base_image: debian, platform: linux/i386, arch: i386} + - {distro: ubuntu, version: "16.04", pkgclouddistro: ubuntu/xenial, format: deb, base_image: ubuntu, platform: linux/amd64, arch: amd64} + - {distro: ubuntu, version: "16.04", pkgclouddistro: ubuntu/xenial, format: deb, base_image: ubuntu, platform: linux/i386, arch: i386} + - {distro: ubuntu, version: "18.04", pkgclouddistro: ubuntu/bionic, format: deb, base_image: ubuntu, platform: linux/amd64, arch: amd64} + - {distro: ubuntu, version: "18.04", pkgclouddistro: ubuntu/bionic, format: deb, base_image: ubuntu, platform: linux/i386, arch: i386} + - {distro: ubuntu, version: "20.04", pkgclouddistro: ubuntu/focal, format: deb, base_image: ubuntu, platform: linux/amd64, arch: amd64} + - {distro: ubuntu, version: "20.10", pkgclouddistro: ubuntu/groovy, format: deb, base_image: ubuntu, platform: linux/amd64, arch: amd64} + - {distro: ubuntu, version: "21.04", pkgclouddistro: ubuntu/hirsute, format: deb, base_image: ubuntu, platform: linux/amd64, arch: amd64} + - {distro: centos, version: "7", pkgclouddistro: el/7, format: rpm, base_image: centos, platform: linux/amd64, arch: amd64} + - {distro: centos, version: "8", pkgclouddistro: el/8, format: rpm, base_image: centos, platform: linux/amd64, arch: amd64} + - {distro: fedora, version: "32", pkgclouddistro: fedora/32, format: rpm, base_image: fedora, platform: linux/amd64, arch: amd64} + - {distro: fedora, version: "33", pkgclouddistro: fedora/33, format: rpm, base_image: fedora, platform: linux/amd64, arch: amd64} + - {distro: fedora, version: "34", pkgclouddistro: fedora/34, format: rpm, base_image: fedora, platform: linux/amd64, arch: amd64} + - {distro: opensuse, version: "15.2", pkgclouddistro: opensuse/15.2, format: rpm, base_image: opensuse/leap, platform: linux/amd64, arch: amd64} + # We intentiaonally disable the fail-fast behavior so that a + # build failure for one version doesn't prevent us from publishing + # successfully built and tested packages for another version. + fail-fast: false + steps: + - name: Checkout PR # Checkout the PR if it's a PR. + if: github.event_name == 'pull_request' + uses: actions/checkout@v2 + with: + fetch-depth: 0 # We need full history for versioning + submodules: true + - name: Checkout Tag # Otherwise check out the tag that triggered this. + if: github.event_name == 'workflow_dispatch' + uses: actions/checkout@v2 + with: + ref: ${{ github.event.ref }} + fetch-depth: 0 # We need full history for versioning + submodules: true + - name: Check Base Branch + run: | + if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then + echo "runtype=${{ github.event.inputs.type }}" >> $GITHUB_ENV + case "${{ github.event.inputs.type }}" in + "release") + echo "repo=${{ secrets.PACKAGE_CLOUD_REPO }}" >> $GITHUB_ENV + echo "pkg_version=${{ github.event.inputs.version }}" >> $GITHUB_ENV + echo "pkg_retention_days=365" >> $GITHUB_ENV + ;; + "nightly") + echo "repo=${{ secrets.PACKAGE_CLOUD_REPO }}-edge" >> $GITHUB_ENV + echo "pkg_version=${{ github.event.inputs.version }}" >> $GITHUB_ENV + echo "pkg_retention_days=30" >> $GITHUB_ENV + ;; + *) + echo "repo=${{ secrets.PACKAGE_CLOUD_REPO }}-devel" >> $GITHUB_ENV + echo "pkg_version=0.${GITHUB_SHA}" >> $GITHUB_ENV + echo "pkg_retention_days=30" >> $GITHUB_ENV + ;; + esac + else + echo "runtype=test" >> $GITHUB_ENV + echo "pkg_version=$(cut -d'-' -f 1 packaging/version | sed -e 's/^v//')" >> $GITHUB_ENV + fi + - name: Setup QEMU + if: matrix.platform != 'linux/amd64' + uses: docker/setup-qemu-action@v1 + - name: Setup Buildx + uses: docker/setup-buildx-action@v1 + - name: Prepare Docker Environment + shell: bash + run: | + echo '{"cgroup-parent": "/actions_job", "experimental": true}' | sudo tee /etc/docker/daemon.json 2>/dev/null + sudo service docker restart + - name: Build Packages + uses: docker/build-push-action@v2 + with: + platforms: ${{ matrix.platform }} + file: packaging/Dockerfile.packager + tags: local/package-builder:${{ matrix.distro}}${{ matrix.version }} + push: false + load: true + build-args: | + ARCH=${{ matrix.arch }} + DISTRO=${{ matrix.distro }} + TEST_BASE=${{ matrix.base_image }} + DISTRO_VERSION=${{ matrix.version }} + PKG_VERSION=${{ env.pkg_version }} + - name: Extract Packages + shell: bash + run: | + mkdir -p artifacts + docker run --platform ${{ matrix.platform }} -v $PWD/artifacts:/artifacts local/package-builder:${{ matrix.distro }}${{ matrix.version }} + - name: Upload + if: github.event_name == 'workflow_dispatch' + shell: bash + env: + PKG_CLOUD_TOKEN: ${{ secrets.PACKAGE_CLOUD_API_KEY }} + run: | + echo "Packages to upload:\n$(ls artifacts/*.${{ matrix.format }})" + for pkgfile in artifacts/*.${{ matrix.format }} ; do + .github/scripts/package_cloud_wrapper.sh push ${{ env.repo }}/${{ matrix.pkgclouddistro }} ${pkgfile} + done + - name: Clean + if: github.event_name == 'workflow_dispatch' + shell: bash + env: + REPO: ${{ env.repo }} + PKG_CLOUD_TOKEN: ${{ secrets.PACKAGE_CLOUD_API_KEY }} + PACKAGE_CLOUD_RETENTION_DAYS: ${{ env.pkg_retention_days }} + run: .github/scripts/old_package_purging.sh + - name: Failure Notification + uses: rtCamp/action-slack-notify@v2 + env: + SLACK_COLOR: 'danger' + SLACK_FOOTER: + SLACK_ICON_EMOJI: ':github-actions:' + SLACK_TITLE: 'Package Build failed:' + SLACK_USERNAME: 'GitHub Actions' + SLACK_MESSAGE: "${{ matrix.pkgclouddistro }} ${{ matrix.version }} package build for ${{ matrix.arch }} failed." + SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK_URL }} + if: >- + ${{ + failure() + && github.event_name != 'pull_request' + && startsWith(github.ref, 'refs/heads/master') + }} diff --git a/.github/workflows/review.yml b/.github/workflows/review.yml index ca8f6de13..a267fea3f 100644 --- a/.github/workflows/review.yml +++ b/.github/workflows/review.yml @@ -16,6 +16,7 @@ jobs: - name: Git clone repository uses: actions/checkout@v2 with: + submodules: recursive fetch-depth: 0 - name: Check files run: | @@ -57,6 +58,7 @@ jobs: - name: Git clone repository uses: actions/checkout@v2 with: + submodules: recursive fetch-depth: 0 - name: Check files run: | @@ -80,6 +82,7 @@ jobs: - name: Git clone repository uses: actions/checkout@v2 with: + submodules: recursive fetch-depth: 0 - name: Check files run: | diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index ef6bfbc2a..c166c7442 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -21,6 +21,8 @@ jobs: steps: - name: Checkout uses: actions/checkout@v2 + with: + submodules: recursive - name: Prepare environment run: | ./packaging/installer/install-required-packages.sh --dont-wait --non-interactive netdata-all @@ -39,6 +41,8 @@ jobs: steps: - name: Checkout uses: actions/checkout@v2 + with: + submodules: recursive - name: Prepare environment run: | ./packaging/installer/install-required-packages.sh --dont-wait --non-interactive netdata-all @@ -48,7 +52,7 @@ jobs: - name: Configure run: | autoreconf -ivf - ./configure + ./configure --without-aclk-ng # XXX: Work-around for bug with libbson-1.0 in Ubuntu 18.04 # See: https://bugs.launchpad.net/ubuntu/+source/libmongoc/+bug/1790771 # https://jira.mongodb.org/browse/CDRIVER-2818 diff --git a/.github/workflows/updater.yml b/.github/workflows/updater.yml new file mode 100644 index 000000000..76cb2fdbe --- /dev/null +++ b/.github/workflows/updater.yml @@ -0,0 +1,84 @@ +--- +name: Updater +on: + push: + branches: + - master + pull_request: + branches: + - master + +jobs: + source-build: + name: Install, Build & Update + strategy: + fail-fast: false + matrix: + distro: + - 'alpine:3.12' + - 'alpine:3.13' + - 'archlinux:latest' + - 'centos:7' + - 'centos:8' + - 'clearlinux:latest' + - 'debian:9' + - 'debian:10' + - 'fedora:33' + - 'fedora:34' + - 'ubuntu:16.04' + - 'ubuntu:18.04' + - 'ubuntu:20.04' + - 'ubuntu:20.10' + - 'ubuntu:21.04' + include: + - distro: 'alpine:3.12' + pre: 'apk add -U bash' + - distro: 'alpine:3.13' + pre: 'apk add -U bash' + - distro: 'debian:9' + pre: 'apt-get update' + - distro: 'debian:10' + pre: 'apt-get update' + - distro: 'ubuntu:16.04' + pre: 'apt-get update' + - distro: 'ubuntu:18.04' + pre: 'apt-get update' + - distro: 'ubuntu:20.04' + pre: 'apt-get update' + - distro: 'ubuntu:20.10' + pre: 'apt-get update' + - distro: 'ubuntu:21.04' + pre: 'apt-get update' + runs-on: ubuntu-latest + steps: + - name: Git clone repository + uses: actions/checkout@v2 + with: + submodules: recursive + - name: Install required packages & build tarball + run: | + ./packaging/installer/install-required-packages.sh --dont-wait --non-interactive netdata-all + .github/scripts/build-dist.sh + - name: Run a dockerised web server to serve files used by the custom update script + run: | + docker run -dit --name my-apache-app -p 8080:80 -v "$PWD":/usr/local/apache2/htdocs/ httpd:2.4 + - name: Replace URLs in updater script to point at the local web server + run: | + ORIG_TARBALL="export NETDATA_TARBALL_URL=.*" + ORIG_CHECKSUM="export NETDATA_TARBALL_CHECKSUM_URL=.*" + CURRENT_VERSION="current_version=.*" + NEW_TARBALL="export NETDATA_TARBALL_URL=http://localhost:8080/artifacts/netdata-latest.tar.gz" + NEW_CHECKSUM="export NETDATA_TARBALL_CHECKSUM_URL=http://localhost:8080/artifacts/sha256sums.txt" + sed -i "s|${ORIG_TARBALL}|${NEW_TARBALL}|g" packaging/installer/netdata-updater.sh + sed -i "s|${ORIG_CHECKSUM}|${NEW_CHECKSUM}|g" packaging/installer/netdata-updater.sh + sed -i "s|"current_version=.*"|"current_version=1"|g" packaging/installer/netdata-updater.sh + - name: Install netdata and run the updater on ${{ matrix.distro }} + env: + PRE: ${{ matrix.pre }} + run: | + echo $PRE > ./prep-cmd.sh + docker build . -f .github/dockerfiles/Dockerfile.build_test -t test --build-arg BASE=${{ matrix.distro }} + docker run --network host -w /netdata test \ + /bin/sh -c '/netdata/packaging/installer/kickstart.sh --dont-wait \ + && /netdata/packaging/installer/netdata-updater.sh --not-running-from-cron --no-updater-self-update \ + && bash /netdata/.github/scripts/check-updater.sh' diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..ef9349b38 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "mqtt_websockets"] + path = mqtt_websockets + url = https://github.com/underhood/mqtt_websockets.git diff --git a/.travis.yml b/.travis.yml index 8c95f5d1d..d5cf8ea10 100644 --- a/.travis.yml +++ b/.travis.yml @@ -56,8 +56,6 @@ stages: - name: Nightly release if: branch = master AND type = cron AND env(RUN_NIGHTLY) = yes - - name: Trigger deb and rpm package build (nightly release) - if: branch = master AND type = cron AND env(RUN_NIGHTLY) = yes # Scheduled releases - name: Support activities on main branch @@ -71,21 +69,6 @@ stages: AND type != cron AND tag !~ /(-rc)/ AND commit_message =~ /\[netdata (release candidate|(major|minor|patch) release)\]/ - - name: Trigger deb and rpm package build (release) - if: >- - branch = master - AND type != pull_request - AND type != cron - AND tag !~ /(-rc)/ - AND commit_message =~ /\[netdata (release candidate|(major|minor|patch) release)\]/ - - # Build DEB packages under special conditions - - name: Package ubuntu/* and debian/* - if: type != cron AND type != pull_request AND branch = master - - # Build RPM packages under special conditions - - name: Package centos, fedora and opensuse - if: type != cron AND type != pull_request AND branch = master # Define stage implementation details @@ -103,57 +86,6 @@ jobs: env: CFLAGS='-O1 -Wall -Wextra -Wformat-signedness -fstack-protector-all -fno-common -DNETDATA_INTERNAL_CHECKS=1 -D_FORTIFY_SOURCE=2 -DNETDATA_VERIFY_LOCKS=1' after_failure: post_message "TRAVIS_MESSAGE" " standard netdata build is failing (Still dont know which one, will improve soon)" - - name: Build/Install for ubuntu 20.04 (not containerized) - script: fakeroot ./netdata-installer.sh --dont-wait --dont-start-it --install $HOME - after_failure: post_message "TRAVIS_MESSAGE" "Build/Install failed on ubuntu 18.04" - - - name: Build/install for CentOS 7 (Containerized) - script: docker run -it -v "${PWD}:/netdata:rw" -w /netdata "netdata/os-test:centos7" ./netdata-installer.sh --dont-wait --dont-start-it --install /tmp - after_failure: post_message "TRAVIS_MESSAGE" "Build/Install failed on CentOS 7" - - - name: DEB package test - git: - depth: false - before_install: - - sudo apt-get install -y wget lxc python3-lxc lxc-templates dh-make git-buildpackage build-essential libdistro-info-perl - before_script: - - export PACKAGES_DIRECTORY="$(mktemp -d -t netdata-packaging-contents-dir-XXXXXX)" && echo "Created packaging directory ${PACKAGES_DIRECTORY}" - script: - - echo "GIT Branch:" && git branch - - echo "Last commit:" && git log -1 - - echo "GIT Describe:" && git describe - - echo "packaging/version:" && cat packaging/version - - echo "Creating LXC environment for the build" && sudo -E .travis/package_management/create_lxc_for_build.sh - - echo "Building package in container" && sudo -E .travis/package_management/build_package_in_container.sh - - sudo chown -R root:travis "/var/lib/lxc" - - sudo chmod -R 750 "/var/lib/lxc" - - echo "Preparing DEB packaging contents for upload" && sudo -E .travis/package_management/prepare_packages.sh - env: - - BUILDER_NAME="builder" BUILD_DISTRO="debian" BUILD_RELEASE="buster" BUILD_STRING="debian/buster" - - PACKAGE_TYPE="deb" REPO_TOOL="apt-get" - after_failure: post_message "TRAVIS_MESSAGE" "Netdata DEB package build check failed." - - - name: RPM package test - git: - depth: false - before_install: - - sudo apt-get install -y wget lxc lxc-templates python3-lxc - before_script: - - export PACKAGES_DIRECTORY="$(mktemp -d -t netdata-packaging-contents-dir-XXXXXX)" && echo "Created packaging directory ${PACKAGES_DIRECTORY}" - script: - - echo "GIT Branch:" && git branch - - echo "Last commit:" && git log -1 - - echo "GIT Describe:" && git describe - - echo "packaging/version:" && cat packaging/version - - echo "Creating LXC environment for the build" && sudo -E .travis/package_management/create_lxc_for_build.sh - - echo "Building package in container" && sudo -E .travis/package_management/build_package_in_container.sh - - sudo chmod -R 755 "/var/lib/lxc" - - echo "Preparing RPM packaging contents for upload" && sudo -E .travis/package_management/prepare_packages.sh - env: - - BUILDER_NAME="builder" BUILD_DISTRO="fedora" BUILD_RELEASE="32" BUILD_STRING="fedora/32" - - PACKAGE_TYPE="rpm" REPO_TOOL="dnf" - after_failure: post_message "TRAVIS_MESSAGE" "Netdata RPM package build check failed." - - stage: Support activities on main branch name: Generate changelog for release (only on special and tagged commit msg) before_script: post_message "TRAVIS_MESSAGE" "Support activities on main branch initiated" "${NOTIF_CHANNEL}" @@ -169,172 +101,6 @@ jobs: depth: false if: commit_message =~ /\[netdata (release candidate|(major|minor|patch) release)\]/ AND tag !~ /(-rc)/ OR (env(GIT_TAG) IS present AND NOT env(GIT_TAG) IS blank) - # ###### Packaging workflow section ###### - # References: - # https://us.images.linuxcontainers.org - # https://packagecloud.io/docs#install_repo - - # TODO: This section is stale, will be aligned with the RPM implementation when we get to DEB packaging - - stage: Package ubuntu/* and debian/* - _template: &DEB_TEMPLATE - git: - depth: false - before_install: - - sudo apt-get install -y wget lxc python3-lxc lxc-templates dh-make git-buildpackage build-essential libdistro-info-perl - - source tests/installer/slack.sh - before_script: - - export PACKAGES_DIRECTORY="$(mktemp -d -t netdata-packaging-contents-dir-XXXXXX)" && echo "Created packaging directory ${PACKAGES_DIRECTORY}" - script: - - echo "GIT Branch:" && git branch - - echo "Last commit:" && git log -1 - - echo "GIT Describe:" && git describe - - echo "packaging/version:" && cat packaging/version - - echo "Creating LXC environment for the build" && sudo -E .travis/package_management/create_lxc_for_build.sh - - echo "Building package in container" && sudo -E .travis/package_management/build_package_in_container.sh - - sudo chown -R root:travis "/var/lib/lxc" - - sudo chmod -R 750 "/var/lib/lxc" - - echo "Preparing DEB packaging contents for upload" && sudo -E .travis/package_management/prepare_packages.sh - after_failure: post_message "TRAVIS_MESSAGE" "Failed to build DEB for ${BUILD_STRING}.${BUILD_ARCH}" - before_deploy: - - .travis/package_management/yank_stale_pkg.sh "${PACKAGES_DIRECTORY}" "${BUILD_STRING}" || echo "No stale DEB found" - deploy: - - provider: packagecloud - repository: "${DEPLOY_REPO}" - username: "${PACKAGING_USER}" - token: "${PKG_CLOUD_TOKEN}" - dist: "${BUILD_STRING}" - local_dir: "${PACKAGES_DIRECTORY}" - skip_cleanup: true - on: - # Only deploy on ${USER}/netdata, master branch, when build-area directory is created - repo: ${TRAVIS_REPO_SLUG} - branch: "master" - condition: -d "${PACKAGES_DIRECTORY}" - after_deploy: - - if [ -n "${BUILDER_NAME}" ]; then rm -rf /home/${BUILDER_NAME}/* && echo "Cleared /home/${BUILDER_NAME} directory" || echo "Failed to clean /home/${BUILDER_NAME} directory"; fi; - - if [ -d "${PACKAGES_DIRECTORY}" ]; then rm -rf "${PACKAGES_DIRECTORY}"; fi; - - name: "Build & Publish DEB package for ubuntu/bionic" - <<: *DEB_TEMPLATE - if: commit_message =~ /\[Package (amd64|arm64|i386) DEB( Ubuntu)?\]/ - env: - - BUILDER_NAME="builder" BUILD_DISTRO="ubuntu" BUILD_RELEASE="bionic" BUILD_STRING="ubuntu/bionic" - - PACKAGE_TYPE="deb" REPO_TOOL="apt-get" - - ALLOW_SOFT_FAILURE_HERE=true - - - name: "Build & Publish DEB package for ubuntu/xenial" - <<: *DEB_TEMPLATE - if: commit_message =~ /\[Package (amd64|arm64|i386) DEB( Ubuntu)?\]/ - env: - - BUILDER_NAME="builder" BUILD_DISTRO="ubuntu" BUILD_RELEASE="xenial" BUILD_STRING="ubuntu/xenial" - - PACKAGE_TYPE="deb" REPO_TOOL="apt-get" - - ALLOW_SOFT_FAILURE_HERE=true - - - name: "Build & Publish DEB package for ubuntu/focal" - <<: *DEB_TEMPLATE - if: commit_message =~ /\[Package (amd64|arm64) DEB( Ubuntu)?\]/ - env: - - BUILDER_NAME="builder" BUILD_DISTRO="ubuntu" BUILD_RELEASE="focal" BUILD_STRING="ubuntu/focal" - - PACKAGE_TYPE="deb" REPO_TOOL="apt-get" - - ALLOW_SOFT_FAILURE_HERE=true - - - name: "Build & Publish DEB package for ubuntu/groovy" - <<: *DEB_TEMPLATE - if: commit_message =~ /\[Package (amd64|arm64) DEB( Ubuntu)?\]/ - env: - - BUILDER_NAME="builder" BUILD_DISTRO="ubuntu" BUILD_RELEASE="groovy" BUILD_STRING="ubuntu/groovy" - - PACKAGE_TYPE="deb" REPO_TOOL="apt-get" - - ALLOW_SOFT_FAILURE_HERE=true - - - name: "Build & Publish DEB package for debian/buster" - <<: *DEB_TEMPLATE - if: commit_message =~ /\[Package (amd64|arm64|i386) DEB( Debian)?\]/ - env: - - BUILDER_NAME="builder" BUILD_DISTRO="debian" BUILD_RELEASE="buster" BUILD_STRING="debian/buster" - - PACKAGE_TYPE="deb" REPO_TOOL="apt-get" - - ALLOW_SOFT_FAILURE_HERE=true - - - name: "Build & Publish DEB package for debian/stretch" - <<: *DEB_TEMPLATE - if: commit_message =~ /\[Package (amd64|arm64|i386) DEB( Debian)?\]/ - env: - - BUILDER_NAME="builder" BUILD_DISTRO="debian" BUILD_RELEASE="stretch" BUILD_STRING="debian/stretch" - - PACKAGE_TYPE="deb" REPO_TOOL="apt-get" - - ALLOW_SOFT_FAILURE_HERE=true - - - stage: Package centos, fedora and opensuse - _template: &RPM_TEMPLATE - git: - depth: false - before_install: - - sudo apt-get install -y wget lxc lxc-templates python3-lxc - - source tests/installer/slack.sh - before_script: - - export PACKAGES_DIRECTORY="$(mktemp -d -t netdata-packaging-contents-dir-XXXXXX)" && echo "Created packaging directory ${PACKAGES_DIRECTORY}" - script: - - echo "GIT Branch:" && git branch - - echo "Last commit:" && git log -1 - - echo "GIT Describe:" && git describe - - echo "packaging/version:" && cat packaging/version - - echo "Creating LXC environment for the build" && sudo -E .travis/package_management/create_lxc_for_build.sh - - echo "Building package in container" && sudo -E .travis/package_management/build_package_in_container.sh - - sudo chmod -R 755 "/var/lib/lxc" - - echo "Preparing RPM packaging contents for upload" && sudo -E .travis/package_management/prepare_packages.sh - after_failure: post_message "TRAVIS_MESSAGE" "Failed to build RPM for ${BUILD_STRING}.${BUILD_ARCH}" - before_deploy: - - .travis/package_management/yank_stale_pkg.sh "${PACKAGES_DIRECTORY}" "${BUILD_STRING}" || echo "No stale RPM found" - deploy: - - provider: packagecloud - repository: "${DEPLOY_REPO}" - username: "${PACKAGING_USER}" - token: "${PKG_CLOUD_TOKEN}" - dist: "${BUILD_STRING}" - local_dir: "${PACKAGES_DIRECTORY}" - skip_cleanup: true - on: - # Only deploy on ${USER}/netdata, master branch, when packages directory is created - repo: ${TRAVIS_REPO_SLUG} - branch: "master" - condition: -d "${PACKAGES_DIRECTORY}" - after_deploy: - - if [ -n "${BUILDER_NAME}" ]; then rm -rf /home/${BUILDER_NAME}/* && echo "Cleared /home/${BUILDER_NAME} directory" || echo "Failed to clean /home/${BUILDER_NAME} directory"; fi; - - if [ -d "${PACKAGES_DIRECTORY}" ]; then rm -rf "${PACKAGES_DIRECTORY}"; fi; - - name: "Build & Publish RPM package for Enterprise Linux 7" - <<: *RPM_TEMPLATE - if: commit_message =~ /\[Package (amd64|arm64) RPM( Enterprise Linux)?\]/ - env: - - BUILDER_NAME="builder" BUILD_DISTRO="centos" BUILD_RELEASE="7" BUILD_STRING="el/7" - - PACKAGE_TYPE="rpm" REPO_TOOL="yum" - - ALLOW_SOFT_FAILURE_HERE=true - - - name: "Build & Publish RPM package for Fedora 32" - <<: *RPM_TEMPLATE - if: commit_message =~ /\[Package (amd64|arm64) RPM( Fedora)?\]/ - env: - - BUILDER_NAME="builder" BUILD_DISTRO="fedora" BUILD_RELEASE="32" BUILD_STRING="fedora/32" - - PACKAGE_TYPE="rpm" REPO_TOOL="dnf" - - ALLOW_SOFT_FAILURE_HERE=true - - - name: "Build & Publish RPM package for Fedora 33" - <<: *RPM_TEMPLATE - if: commit_message =~ /\[Package (amd64|arm64) RPM( Fedora)?\]/ - env: - - BUILDER_NAME="builder" BUILD_DISTRO="fedora" BUILD_RELEASE="33" BUILD_STRING="fedora/33" - - PACKAGE_TYPE="rpm" REPO_TOOL="dnf" - - ALLOW_SOFT_FAILURE_HERE=true - - - name: "Build & Publish RPM package for openSUSE 15.2" - <<: *RPM_TEMPLATE - if: commit_message =~ /\[Package (amd64|arm64) RPM( openSUSE)?\]/ - env: - - BUILDER_NAME="builder" BUILD_DISTRO="opensuse" BUILD_RELEASE="15.2" BUILD_STRING="opensuse/15.2" - - PACKAGE_TYPE="rpm" REPO_TOOL="zypper" - - ALLOW_SOFT_FAILURE_HERE=true - # ###### End of packaging workflow section ###### # - # ############################################### # - - # We only publish if a TAG has been set during packaging - stage: Publish for release name: Create release draft @@ -358,9 +124,10 @@ jobs: - .travis/trigger_docker_build.sh "${GITHUB_TOKEN}" "${BUILD_VERSION}" after_failure: post_message "TRAVIS_MESSAGE" " Failed to trigger docker build during release" "${NOTIF_CHANNEL}" - - stage: Trigger deb and rpm package build (release) - name: Trigger deb and rpm package build - script: .travis/trigger_package_generation.sh + - name: Trigger DEB and RPM package build + script: + - git checkout "${TRAVIS_BRANCH}" && export BUILD_VERSION="$(cat packaging/version | sed 's/^v//' | cut -d'-' -f1)" + - .travis/trigger_package_build.sh "${GITHUB_TOKEN}" "${BUILD_VERSION}" "release" after_failure: post_message "TRAVIS_MESSAGE" " Failed to trigger deb and rpm package build during release" "${NOTIF_CHANNEL}" @@ -378,11 +145,6 @@ jobs: git: depth: false - - name: Clean up package cloud nightly repository from old versions - script: - - DEPLOY_REPO="netdata-edge" .travis/package_management/old_package_purging.sh - - DEPLOY_REPO="netdata-devel" .travis/package_management/old_package_purging.sh - # This is the nightly execution step # - stage: Nightly release @@ -454,7 +216,8 @@ jobs: script: .travis/trigger_docker_build.sh "${GITHUB_TOKEN}" "nightly" after_failure: post_message "TRAVIS_MESSAGE" " Failed to trigger docker build during nightly release" "${NOTIF_CHANNEL}" - - stage: Trigger deb and rpm package build (nightly release) - name: Trigger deb and rpm package build - script: .travis/trigger_package_generation.sh "[Build latest]" + - name: Trigger DEB and RPM package build + script: + - git checkout "${TRAVIS_BRANCH}" && export BUILD_VERSION="$(cat packaging/version | sed 's/^v//')" + - .travis/trigger_package_build.sh "${GITHUB_TOKEN}" "${BUILD_VERSION}" "nightly" after_failure: post_message "TRAVIS_MESSAGE" " Failed to trigger deb and rpm package build during nightly release" "${NOTIF_CHANNEL}" diff --git a/.travis/package_management/build.sh b/.travis/package_management/build.sh deleted file mode 100644 index bafaecc5a..000000000 --- a/.travis/package_management/build.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env bash - -UNPACKAGED_NETDATA_PATH="$1" -LATEST_RELEASE_VERSION="$2" - -if [ -z "${LATEST_RELEASE_VERSION}" ]; then - echo "Parameter 'LATEST_RELEASE_VERSION' not defined" - exit 1 -fi - -if [ -z "${UNPACKAGED_NETDATA_PATH}" ]; then - echo "Parameter 'UNPACKAGED_NETDATA_PATH' not defined" - exit 1 -fi - -echo "Running changelog generation mechanism since ${LATEST_RELEASE_VERSION}" - -echo "Entering ${UNPACKAGED_NETDATA_PATH}" -cd "${UNPACKAGED_NETDATA_PATH}" - -echo "Linking debian -> contrib/debian" -ln -sf contrib/debian debian - -echo "Executing dpkg-buildpackage" -# pre/post options are after 1.18.8, is simpler to just check help for their existence than parsing version -if dpkg-buildpackage --help | grep "\-\-post\-clean" 2> /dev/null > /dev/null; then - dpkg-buildpackage --post-clean --pre-clean --build=binary -us -uc -else - dpkg-buildpackage -b -us -uc -fi - -echo "DEB build script completed!" diff --git a/.travis/package_management/build_judy.sh b/.travis/package_management/build_judy.sh deleted file mode 100755 index 202ea0449..000000000 --- a/.travis/package_management/build_judy.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env bash -# -# Build Judy from source, you need to run this script as root. -# -# Copyright: SPDX-License-Identifier: GPL-3.0-or-later -# -# Author : Pavlos Emm. Katsoulakis (paul@netdata.cloud) -set -e -JUDY_VER="1.0.5" -JUDY_DIR="/opt/judy-${JUDY_VER}" - -# If we are not in netdata git repo, at the top level directory, fail -TOP_LEVEL=$(basename "$(git rev-parse --show-toplevel)") -CWD=$(git rev-parse --show-cdup) -if [ -n "$CWD" ] || [ ! "${TOP_LEVEL}" == "netdata" ]; then - echo "Run as .travis/package_management/$(basename "$0") from top level directory of netdata git repository" - echo "Build Judy package from source code failed" - exit 1 -fi - -echo "Fetching judy source tarball" -wget -O /opt/judy.tar.gz http://downloads.sourceforge.net/project/judy/judy/Judy-${JUDY_VER}/Judy-${JUDY_VER}.tar.gz - -echo "Entering /opt directory and extracting tarball" -cd /opt && tar -xf judy.tar.gz && rm judy.tar.gz - -echo "Entering ${JUDY_DIR}" -cd "${JUDY_DIR}" - -echo "Running configure" -CFLAGS="-O2 -s" CXXFLAGS="-O2 -s" ./configure - -echo "Compiling and installing" -make && make install - -echo "Done, enjoy Judy!" diff --git a/.travis/package_management/build_libuv.sh b/.travis/package_management/build_libuv.sh deleted file mode 100755 index c30eede64..000000000 --- a/.travis/package_management/build_libuv.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env bash -# -# Build libuv from source, you need to run this script as root. -# -# Copyright: SPDX-License-Identifier: GPL-3.0-or-later -# -# Author : Pavlos Emm. Katsoulakis -set -e -LIBUV_VERSION="v1.32.0" -# Their folder is libuv-1.32.0 while the tarball version is v1.32.0, so fix that until they fix it... -LIBUV_DIR="/opt/libuv-${LIBUV_VERSION/v/}" - -# If we are not in netdata git repo, at the top level directory, fail -TOP_LEVEL=$(basename "$(git rev-parse --show-toplevel)") -CWD=$(git rev-parse --show-cdup) -if [ -n "$CWD" ] || [ ! "${TOP_LEVEL}" == "netdata" ]; then - echo "Run as .travis/package_management/$(basename "$0") from top level directory of netdata git repository" - echo "Build libuv package from source code failed" - exit 1 -fi - -echo "Fetching libuv from github" -wget -O /opt/libuv.tar.gz "https://github.com/libuv/libuv/archive/${LIBUV_VERSION}.tar.gz" - -echo "Entering /opt and extracting source" -cd /opt && tar -xf libuv.tar.gz && rm libuv.tar.gz - -echo "Entering ${LIBUV_DIR}" -cd "${LIBUV_DIR}" - -echo "Compiling and installing" -sh autogen.sh -./configure -make && make check && make install - -echo "Done, enjoy libuv!" diff --git a/.travis/package_management/build_package_in_container.sh b/.travis/package_management/build_package_in_container.sh deleted file mode 100755 index 95a68e7a8..000000000 --- a/.travis/package_management/build_package_in_container.sh +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env bash -# -# Entry point for package build process -# -# Copyright: SPDX-License-Identifier: GPL-3.0-or-later -# -# Author : Pavlos Emm. Katsoulakis (paul@netdata.cloud) -#shellcheck disable=SC1091 -set -e - -# If we are not in netdata git repo, at the top level directory, fail -TOP_LEVEL=$(basename "$(git rev-parse --show-toplevel)") -CWD=$(git rev-parse --show-cdup) -if [ -n "$CWD" ] || [ ! "${TOP_LEVEL}" == "netdata" ]; then - echo "Run as .travis/package_management/$(basename "$0") from top level directory of netdata git repository" - echo "Docker build process aborted" - exit 1 -fi - -source .travis/package_management/functions.sh || (echo "Failed to load packaging library" && exit 1) - -# Check for presence of mandatory environment variables -if [ -z "${BUILD_STRING}" ]; then - echo "No Distribution was defined. Make sure BUILD_STRING is set on the environment before running this script" - exit 1 -fi - -if [ -z "${BUILDER_NAME}" ]; then - echo "No builder account and container name defined. Make sure BUILDER_NAME is set on the environment before running this script" - exit 1 -fi - -if [ -z "${BUILD_DISTRO}" ]; then - echo "No build distro information defined. Make sure BUILD_DISTRO is set on the environment before running this script" - exit 1 -fi - -if [ -z "${BUILD_RELEASE}" ]; then - echo "No build release information defined. Make sure BUILD_RELEASE is set on the environment before running this script" - exit 1 -fi - -if [ -z "${PACKAGE_TYPE}" ]; then - echo "No build release information defined. Make sure PACKAGE_TYPE is set on the environment before running this script" - exit 1 -fi - -# Detect architecture and load extra variables needed -detect_arch_from_commit - -case "${BUILD_ARCH}" in -"all") - echo "* * * Building all architectures, amd64 and i386 * * *" - echo "Building for amd64.." - export CONTAINER_NAME="${BUILDER_NAME}-${BUILD_DISTRO}${BUILD_RELEASE}-amd64" - export LXC_CONTAINER_ROOT="/var/lib/lxc/${CONTAINER_NAME}/rootfs" - .travis/package_management/trigger_${PACKAGE_TYPE}_lxc_build.py "${CONTAINER_NAME}" - - echo "Building for arm64.." - export CONTAINER_NAME="${BUILDER_NAME}-${BUILD_DISTRO}${BUILD_RELEASE}-arm64" - export LXC_CONTAINER_ROOT="/var/lib/lxc/${CONTAINER_NAME}/rootfs" - .travis/package_management/trigger_${PACKAGE_TYPE}_lxc_build.py "${CONTAINER_NAME}" - - echo "Building for i386.." - export CONTAINER_NAME="${BUILDER_NAME}-${BUILD_DISTRO}${BUILD_RELEASE}-i386" - export LXC_CONTAINER_ROOT="/var/lib/lxc/${CONTAINER_NAME}/rootfs" - .travis/package_management/trigger_${PACKAGE_TYPE}_lxc_build.py "${CONTAINER_NAME}" - - ;; -"amd64"|"arm64"|"i386") - echo "Building for ${BUILD_ARCH}.." - export CONTAINER_NAME="${BUILDER_NAME}-${BUILD_DISTRO}${BUILD_RELEASE}-${BUILD_ARCH}" - export LXC_CONTAINER_ROOT="/var/lib/lxc/${CONTAINER_NAME}/rootfs" - .travis/package_management/trigger_${PACKAGE_TYPE}_lxc_build.py "${CONTAINER_NAME}" - ;; -*) - echo "Unknown build architecture '${BUILD_ARCH}', nothing to do for build" - exit 1 - ;; -esac - -echo "Build process completed!" diff --git a/.travis/package_management/common.py b/.travis/package_management/common.py deleted file mode 100755 index 4cc04b93f..000000000 --- a/.travis/package_management/common.py +++ /dev/null @@ -1,182 +0,0 @@ -# -# -# Python library with commonly used functions within the package management scope -# -# Author : Pavlos Emm. Katsoulakis - -import lxc -import subprocess -import os -import sys -import tempfile -import shutil - -def fetch_version(orig_build_version): - tag = None - friendly_version = "" - - # TODO: Checksum validations - if str(orig_build_version).count(".latest") == 1: - version_list=str(orig_build_version).replace('v', '').split('.') - minor = version_list[3] if int(version_list[2]) == 0 else (version_list[2] + version_list[3]) - friendly_version='.'.join(version_list[0:2]) + "." + minor - else: - friendly_version = orig_build_version.replace('v', '') - tag = friendly_version # Go to stable tag - print("Version set to %s from %s" % (friendly_version, orig_build_version)) - - return friendly_version, tag - -def replace_tag(tag_name, spec, new_tag_content): - print("Fixing tag %s in %s" % (tag_name, spec)) - - ifp = open(spec, "r") - config = ifp.readlines() - ifp.close() - - source_line = -1 - for line in config: - if str(line).count(tag_name + ":") > 0: - source_line = config.index(line) - print("Found line: %s in item %d" % (line, source_line)) - break - - if source_line >= 0: - print("Replacing line %s with %s in spec file" %(config[source_line], new_tag_content)) - config[source_line] = "%s: %s\n" % (tag_name, new_tag_content) - config_str = ''.join(config) - ofp = open(spec, 'w') - ofp.write(config_str) - ofp.close() - -def run_command(container, command): - print("Running command: %s" % command) - command_result = container.attach_wait(lxc.attach_run_command, command, stdout=sys.stdout.buffer, stderr=sys.stdout.buffer) - - if command_result != 0: - raise Exception("Command failed with exit code %d" % command_result) - -def run_command_in_host(cmd, cwd=None): - print("Issue command in host: %s, cwd:%s" % (str(cmd), str(cwd))) - - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=cwd) - o, e = proc.communicate() - print('Output: ' + o.decode('ascii')) - print('Error: ' + e.decode('ascii')) - print('code: ' + str(proc.returncode)) - -def prepare_repo(container): - if str(os.environ["REPO_TOOL"]).count("zypper") == 1: - run_command(container, [os.environ["REPO_TOOL"], "clean", "-a"]) - run_command(container, [os.environ["REPO_TOOL"], "--no-gpg-checks", "update", "-y"]) - - elif str(os.environ["REPO_TOOL"]).count("yum") == 1: - run_command(container, [os.environ["REPO_TOOL"], "clean", "all"]) - run_command(container, [os.environ["REPO_TOOL"], "update", "-y"]) - - if os.environ["BUILD_STRING"].count("el/7") == 1 and os.environ["BUILD_ARCH"].count("i386") == 1: - print ("Skipping epel-release install for %s-%s" % (os.environ["BUILD_STRING"], os.environ["BUILD_ARCH"])) - else: - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "epel-release"]) - - elif str(os.environ["REPO_TOOL"]).count("apt-get") == 1: - if str(os.environ["BUILD_STRING"]).count("debian/jessie") == 1: - run_command(container, ["bash", "-c", "echo deb http://archive.debian.org/debian/ jessie-backports main contrib non-free >> /etc/apt/sources.list.d/99-archived.list"]) - run_command(container, [os.environ["REPO_TOOL"], "update", "-y", '-o', 'Acquire::Check-Valid-Until=false']) - else: - run_command(container, [os.environ["REPO_TOOL"], "update", "-y"]) - else: - run_command(container, [os.environ["REPO_TOOL"], "update", "-y"]) - - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "sudo"]) - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "wget"]) - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "bash"]) - -def install_common_dependendencies(container): - if str(os.environ["REPO_TOOL"]).count("zypper") == 1: - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "gcc-c++"]) - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "json-glib-devel"]) - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "freeipmi-devel"]) - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "cups-devel"]) - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "snappy-devel"]) - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "protobuf-devel"]) - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "protobuf-c"]) - - elif str(os.environ["REPO_TOOL"]).count("yum") == 1: - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "gcc-c++"]) - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "json-c-devel"]) - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "freeipmi-devel"]) - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "cups-devel"]) - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "snappy-devel"]) - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "protobuf-devel"]) - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "protobuf-c-devel"]) - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "protobuf-compiler"]) - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "libwebsockets-devel"]) - - elif str(os.environ["REPO_TOOL"]).count("apt-get") == 1: - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "g++"]) - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "libipmimonitoring-dev"]) - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "libjson-c-dev"]) - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "libcups2-dev"]) - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "libsnappy-dev"]) - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "libprotobuf-dev"]) - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "libprotoc-dev"]) - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "protobuf-compiler"]) - if os.environ["BUILD_STRING"].count("debian/jessie") == 1: - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "snappy"]) - else: - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "gcc-c++"]) - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "cups-devel"]) - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "freeipmi-devel"]) - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "json-c-devel"]) - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "snappy-devel"]) - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "protobuf-devel"]) - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "protobuf-c-devel"]) - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "protobuf-compiler"]) - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "libwebsockets-devel"]) - - if os.environ["BUILD_STRING"].count("el/6") <= 0: - run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "autogen"]) - -def prepare_version_source(dest_archive, pkg_friendly_version, tag=None): - print(".0 Preparing local implementation tarball for version %s" % pkg_friendly_version) - tar_file = os.environ['LXC_CONTAINER_ROOT'] + dest_archive - - print(".0 Copy repo to prepare it for tarball generation") - tmp_src = tempfile.mkdtemp(prefix='netdata-source-') - run_command_in_host(['cp', '-r', '.', tmp_src]) - - if tag is not None: - print(".1 Checking out tag %s" % tag) - run_command_in_host(['git', 'fetch', '--all'], tmp_src) - - # TODO: Keep in mind that tricky 'v' there, needs to be removed once we clear our versioning scheme - run_command_in_host(['git', 'checkout', 'v%s' % pkg_friendly_version], tmp_src) - - print(".2 Tagging the code with version: %s" % pkg_friendly_version) - run_command_in_host(['git', 'tag', '-a', pkg_friendly_version, '-m', 'Tagging while packaging on %s' % os.environ["CONTAINER_NAME"]], tmp_src) - - print(".3 Run autoreconf -ivf") - run_command_in_host(['autoreconf', '-ivf'], tmp_src) - - print(".4 Run configure") - run_command_in_host(['./configure', '--prefix=/usr', '--sysconfdir=/etc', '--localstatedir=/var', '--libdir=/usr/lib', '--libexecdir=/usr/libexec', '--with-math', '--with-zlib', '--with-user=netdata'], tmp_src) - - print(".5 Run make dist") - run_command_in_host(['make', 'dist'], tmp_src) - - print(".6 Copy generated tarbal to desired path") - generated_tarball = '%s/netdata-%s.tar.gz' % (tmp_src, pkg_friendly_version) - - if os.path.exists(generated_tarball): - run_command_in_host(['sudo', 'cp', generated_tarball, tar_file]) - - print(".7 Fixing permissions on tarball") - run_command_in_host(['sudo', 'chmod', '777', tar_file]) - - print(".8 Bring over netdata.spec, Remove temp directory"); - run_command_in_host(['cp', '%s/netdata.spec' % tmp_src, 'netdata.spec']) - shutil.rmtree(tmp_src) - else: - print("I could not find (%s) on the disk, stopping the build. Kindly check the logs and try again" % generated_tarball) - sys.exit(1) diff --git a/.travis/package_management/configure_deb_lxc_environment.py b/.travis/package_management/configure_deb_lxc_environment.py deleted file mode 100755 index 627493bf5..000000000 --- a/.travis/package_management/configure_deb_lxc_environment.py +++ /dev/null @@ -1,90 +0,0 @@ -#!/usr/bin/env python3 -# -# Prepare the build environment within the container -# The script attaches to the running container and does the following: -# 1) Create the container -# 2) Start the container up -# 3) Create the builder user -# 4) Prepare the environment for DEB build -# -# Copyright: SPDX-License-Identifier: GPL-3.0-or-later -# -# Author : Pavlos Emm. Katsoulakis - -import common -import os -import sys -import lxc - -if len(sys.argv) != 2: - print('You need to provide a container name to get things started') - sys.exit(1) -container_name=sys.argv[1] - -# Setup the container object -print("Defining container %s" % container_name) -container = lxc.Container(container_name) -if not container.defined: - raise Exception("Container %s not defined!" % container_name) - -# Start the container -if not container.start(): - raise Exception("Failed to start the container") - -if not container.running or not container.state == "RUNNING": - raise Exception('Container %s is not running, configuration process aborted ' % container_name) - -# Wait for connectivity -print("Waiting for container connectivity to start configuration sequence") -if not container.get_ips(timeout=30): - raise Exception("Timeout while waiting for container") - -build_path = "/home/%s" % os.environ['BUILDER_NAME'] - -# Run the required activities now -# 1. Create the builder user -print("1. Adding user %s" % os.environ['BUILDER_NAME']) -common.run_command(container, ["useradd", "-m", os.environ['BUILDER_NAME']]) - -# Fetch package dependencies for the build -print("2. Preparing repo on LXC container") -common.prepare_repo(container) - -print("2.1 Install .DEB build support packages") -common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "dpkg-dev"]) -common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "libdistro-info-perl"]) -common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "dh-make"]) -common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "dh-systemd"]) -common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "dh-autoreconf"]) -common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "git-buildpackage"]) - -print("2.2 Add more dependencies") -common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "libnetfilter-acct-dev"]) -common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "libcups2-dev"]) - -print ("3.1 Run install-required-packages scriptlet") -common.run_command(container, ["wget", "-T", "15", "-O", "%s/.install-required-packages.sh" % build_path, "https://raw.githubusercontent.com/netdata/netdata/master/packaging/installer/install-required-packages.sh"]) -common.run_command(container, ["bash", "%s/.install-required-packages.sh" % build_path, "netdata", "--dont-wait", "--non-interactive"]) - -print("3.2 Installing package dependencies within LXC container") -common.install_common_dependendencies(container) - -friendly_version="" -dest_archive="" -download_url="" -tag = None -friendly_version, tag = common.fetch_version(os.environ['BUILD_VERSION']) - -tar_file="%s/netdata-%s.tar.gz" % (os.path.dirname(dest_archive), friendly_version) - -print("5. I will be building version '%s' of netdata." % os.environ['BUILD_VERSION']) -dest_archive="%s/netdata-%s.tar.gz" % (build_path, friendly_version) - -common.prepare_version_source(dest_archive, friendly_version, tag=tag) - -print("6. Installing build.sh script to build path") -common.run_command_in_host(['sudo', 'cp', '.travis/package_management/build.sh', "%s/%s/build.sh" % (os.environ['LXC_CONTAINER_ROOT'], build_path)]) -common.run_command_in_host(['sudo', 'chmod', '777', "%s/%s/build.sh" % (os.environ['LXC_CONTAINER_ROOT'], build_path)]) -common.run_command_in_host(['sudo', 'ln', '-sf', 'contrib/debian', 'debian']) - -print("Done!") diff --git a/.travis/package_management/configure_rpm_lxc_environment.py b/.travis/package_management/configure_rpm_lxc_environment.py deleted file mode 100755 index 4dca0bf38..000000000 --- a/.travis/package_management/configure_rpm_lxc_environment.py +++ /dev/null @@ -1,102 +0,0 @@ -#!/usr/bin/env python3 -# -# -# Prepare the build environment within the container -# The script attaches to the running container and does the following: -# 1) Create the container -# 2) Start the container up -# 3) Create the builder user -# 4) Prepare the environment for RPM build -# -# Copyright: SPDX-License-Identifier: GPL-3.0-or-later -# -# Author : Pavlos Emm. Katsoulakis - -import common -import os -import sys -import lxc - -if len(sys.argv) != 2: - print('You need to provide a container name to get things started') - sys.exit(1) -container_name=sys.argv[1] - -# Setup the container object -print("Defining container %s" % container_name) -container = lxc.Container(container_name) -if not container.defined: - raise Exception("Container %s not defined!" % container_name) - -# Start the container -if not container.start(): - raise Exception("Failed to start the container") - -if not container.running or not container.state == "RUNNING": - raise Exception('Container %s is not running, configuration process aborted ' % container_name) - -# Wait for connectivity -print("Waiting for container connectivity to start configuration sequence") -if not container.get_ips(timeout=30): - raise Exception("Timeout while waiting for container") - -# Run the required activities now -# Create the builder user -print("1. Adding user %s" % os.environ['BUILDER_NAME']) -common.run_command(container, ["useradd", "-m", os.environ['BUILDER_NAME']]) - -# Fetch package dependencies for the build -print("2.1 Preparing repo on LXC container") -common.prepare_repo(container) - -common.run_command(container, ["wget", "-T", "15", "-O", "/home/%s/.install-required-packages.sh" % (os.environ['BUILDER_NAME']), "https://raw.githubusercontent.com/netdata/netdata/master/packaging/installer/install-required-packages.sh"]) -common.run_command(container, ["bash", "/home/%s/.install-required-packages.sh" % (os.environ['BUILDER_NAME']), "netdata", "--dont-wait", "--non-interactive"]) - -# Exceptional cases, not available everywhere -# -print("2.2 Running uncommon dependencies and preparing LXC environment") -# Not on Centos-7 -if os.environ["BUILD_STRING"].count("el/7") <= 0: - common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "libnetfilter_acct-devel"]) - -# Not on Centos-6 -if os.environ["BUILD_STRING"].count("el/6") <= 0: - common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "autoconf-archive"]) - -print("2.3 Installing common dependencies") -common.install_common_dependendencies(container) - -print("3. Setting up macros") -common.run_command(container, ["sudo", "-u", os.environ['BUILDER_NAME'], "/bin/echo", "'%_topdir %(echo /home/" + os.environ['BUILDER_NAME'] + ")/rpmbuild' > /home/" + os.environ['BUILDER_NAME'] + "/.rpmmacros"]) - -print("4. Create rpmbuild directory") -common.run_command(container, ["sudo", "-u", os.environ['BUILDER_NAME'], "mkdir", "-p", "/home/" + os.environ['BUILDER_NAME'] + "/rpmbuild/BUILD"]) -common.run_command(container, ["sudo", "-u", os.environ['BUILDER_NAME'], "mkdir", "-p", "/home/" + os.environ['BUILDER_NAME'] + "/rpmbuild/RPMS"]) -common.run_command(container, ["sudo", "-u", os.environ['BUILDER_NAME'], "mkdir", "-p", "/home/" + os.environ['BUILDER_NAME'] + "/rpmbuild/SOURCES"]) -common.run_command(container, ["sudo", "-u", os.environ['BUILDER_NAME'], "mkdir", "-p", "/home/" + os.environ['BUILDER_NAME'] + "/rpmbuild/SPECS"]) -common.run_command(container, ["sudo", "-u", os.environ['BUILDER_NAME'], "mkdir", "-p", "/home/" + os.environ['BUILDER_NAME'] + "/rpmbuild/SRPMS"]) -common.run_command(container, ["sudo", "-u", os.environ['BUILDER_NAME'], "ls", "-ltrR", "/home/" + os.environ['BUILDER_NAME'] + "/rpmbuild"]) - -# Download the source -rpm_friendly_version="" -dest_archive="" -download_url="" -spec_file="/home/%s/rpmbuild/SPECS/netdata.spec" % os.environ['BUILDER_NAME'] -tag = None -rpm_friendly_version, tag = common.fetch_version(os.environ['BUILD_VERSION']) -tar_file="%s/netdata-%s.tar.gz" % (os.path.dirname(dest_archive), rpm_friendly_version) - -print("5. I will be building version '%s' of netdata." % os.environ['BUILD_VERSION']) -dest_archive="/home/%s/rpmbuild/SOURCES/netdata-%s.tar.gz" % (os.environ['BUILDER_NAME'], rpm_friendly_version) - -common.prepare_version_source(dest_archive, rpm_friendly_version, tag=tag) - -# Extract the spec file in place -print("6. Extract spec file from the source") -common.run_command_in_host(['sudo', 'cp', 'netdata.spec', os.environ['LXC_CONTAINER_ROOT'] + spec_file]) -common.run_command_in_host(['sudo', 'chmod', '777', os.environ['LXC_CONTAINER_ROOT'] + spec_file]) - -print("7. Temporary hack: Change Source0 to %s on spec file %s" % (dest_archive, spec_file)) -common.replace_tag("Source0", os.environ['LXC_CONTAINER_ROOT'] + spec_file, tar_file) - -print('Done!') diff --git a/.travis/package_management/create_lxc_for_build.sh b/.travis/package_management/create_lxc_for_build.sh deleted file mode 100755 index d733687a8..000000000 --- a/.travis/package_management/create_lxc_for_build.sh +++ /dev/null @@ -1,101 +0,0 @@ -#!/usr/bin/env bash -# -# This script generates an LXC container and starts it up -# Once the script completes successfully, a container has become available for usage -# The container image to be used and the container name to be set, are part of variables -# that must be present for the script to work -# -# Copyright: SPDX-License-Identifier: GPL-3.0-or-later -# -# Author : Pavlos Emm. Katsoulakis (paul@netdata.cloud) -# shellcheck disable=SC1091 -set -e - -source .travis/package_management/functions.sh || (echo "Failed to load packaging library" && exit 1) - -# If we are not in netdata git repo, at the top level directory, fail -TOP_LEVEL=$(basename "$(git rev-parse --show-toplevel)") -CWD=$(git rev-parse --show-cdup) -if [ -n "$CWD" ] || [ ! "${TOP_LEVEL}" == "netdata" ]; then - echo "Run as .travis/package_management/$(basename "$0") from top level directory of netdata git repository" - echo "LXC Container creation aborted" - exit 1 -fi - -# Check for presence of mandatory environment variables -if [ -z "${BUILD_STRING}" ]; then - echo "No Distribution was defined. Make sure BUILD_STRING is set on the environment before running this script" - exit 1 -fi - -if [ -z "${BUILDER_NAME}" ]; then - echo "No builder account and container name defined. Make sure BUILDER_NAME is set on the environment before running this script" - exit 1 -fi - -if [ -z "${BUILD_DISTRO}" ]; then - echo "No build distro information defined. Make sure BUILD_DISTRO is set on the environment before running this script" - exit 1 -fi - -if [ -z "${BUILD_RELEASE}" ]; then - echo "No build release information defined. Make sure BUILD_RELEASE is set on the environment before running this script" - exit 1 -fi - -if [ -z "${PACKAGE_TYPE}" ]; then - echo "No build release information defined. Make sure PACKAGE_TYPE is set on the environment before running this script" - exit 1 -fi - -# Detect architecture and load extra variables needed -detect_arch_from_commit - -echo "Creating LXC container ${BUILDER_NAME}/${BUILD_STRING}/${BUILD_ARCH}...." - -case "${BUILD_ARCH}" in -"all") - # i386 - echo "Creating LXC Container for i386.." - export CONTAINER_NAME="${BUILDER_NAME}-${BUILD_DISTRO}${BUILD_RELEASE}-i386" - export LXC_CONTAINER_ROOT="/var/lib/lxc/${CONTAINER_NAME}/rootfs" - lxc-create -n "${CONTAINER_NAME}" -t "download" -- --dist "${BUILD_DISTRO}" --release "${BUILD_RELEASE}" --arch "i386" --no-validate - - echo "Container(s) ready. Configuring container(s).." - .travis/package_management/configure_${PACKAGE_TYPE}_lxc_environment.py "${CONTAINER_NAME}" - - # amd64 - echo "Creating LXC Container for amd64.." - export CONTAINER_NAME="${BUILDER_NAME}-${BUILD_DISTRO}${BUILD_RELEASE}-amd64" - export LXC_CONTAINER_ROOT="/var/lib/lxc/${CONTAINER_NAME}/rootfs" - lxc-create -n "${CONTAINER_NAME}" -t "download" -- --dist "${BUILD_DISTRO}" --release "${BUILD_RELEASE}" --arch "amd64" --no-validate - - echo "Container(s) ready. Configuring container(s).." - .travis/package_management/configure_${PACKAGE_TYPE}_lxc_environment.py "${CONTAINER_NAME}" - - # arm64 - echo "Creating LXC Container for arm64.." - export CONTAINER_NAME="${BUILDER_NAME}-${BUILD_DISTRO}${BUILD_RELEASE}-arm64" - export LXC_CONTAINER_ROOT="/var/lib/lxc/${CONTAINER_NAME}/rootfs" - lxc-create -n "${CONTAINER_NAME}" -t "download" -- --dist "${BUILD_DISTRO}" --release "${BUILD_RELEASE}" --arch "arm64" --no-validate - - echo "Container(s) ready. Configuring container(s).." - .travis/package_management/configure_${PACKAGE_TYPE}_lxc_environment.py "${CONTAINER_NAME}" - ;; -"i386"|"amd64"|"arm64") - # amd64 or i386 - echo "Creating LXC Container for ${BUILD_ARCH}.." - export CONTAINER_NAME="${BUILDER_NAME}-${BUILD_DISTRO}${BUILD_RELEASE}-${BUILD_ARCH}" - export LXC_CONTAINER_ROOT="/var/lib/lxc/${CONTAINER_NAME}/rootfs" - lxc-create -n "${CONTAINER_NAME}" -t "download" -- --dist "${BUILD_DISTRO}" --release "${BUILD_RELEASE}" --arch "${BUILD_ARCH}" --no-validate - - echo "Container(s) ready. Configuring container(s).." - .travis/package_management/configure_${PACKAGE_TYPE}_lxc_environment.py "${CONTAINER_NAME}" - ;; -*) - echo "Unknown BUILD_ARCH value '${BUILD_ARCH}' given, process failed" - exit 1 - ;; -esac - -echo "..LXC creation complete!" diff --git a/.travis/package_management/functions.sh b/.travis/package_management/functions.sh deleted file mode 100644 index 0c00d2fcb..000000000 --- a/.travis/package_management/functions.sh +++ /dev/null @@ -1,33 +0,0 @@ -# no-shebang-needed-its-a-library -# -# Utility functions for packaging in travis CI -# -# Copyright: SPDX-License-Identifier: GPL-3.0-or-later -# -# Author : Pavlos Emm. Katsoulakis (paul@netdata.cloud) -#shellcheck disable=SC2148 -set -e - -function detect_arch_from_commit { - case "${TRAVIS_COMMIT_MESSAGE}" in - "[Package amd64"*) - export BUILD_ARCH="amd64" - ;; - "[Package i386"*) - export BUILD_ARCH="i386" - ;; - "[Package ALL"*) - export BUILD_ARCH="all" - ;; - "[Package arm64"*) - export BUILD_ARCH="arm64" - ;; - - *) - echo "Unknown build architecture in '${TRAVIS_COMMIT_MESSAGE}'. Assuming amd64" - export BUILD_ARCH="amd64" - ;; - esac - - echo "Detected build architecture ${BUILD_ARCH}" -} diff --git a/.travis/package_management/old_package_purging.sh b/.travis/package_management/old_package_purging.sh deleted file mode 100755 index 89ecd753f..000000000 --- a/.travis/package_management/old_package_purging.sh +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/env bash -# -# Script to handle package cloud retention policy -# Our open source subscription is limited, -# so we use this sript to control the number of packages maintained historically -# -# Dependencies: -# - PACKAGE_CLOUD_RETENTION_DAYS -# This is to indicate for how many days back we want to maintain the various RPM and DEB packages on package cloud -# -# Copyright : SPDX-License-Identifier: GPL-3.0-or-later -# -# Author : Pavlos Emm. Katsoulakis -# -set -e - -delete_files_for_version() { - local v="$1" - - # Delete the selected filenames in version - FILES_IN_VERSION=$(jq --sort-keys --arg v "${v}" '.[] | select ( .version | contains($v))' "${PKG_LIST_FILE}" | grep filename | cut -d':' -f 2) - - # Iterate through the files and delete them - for pkg in ${FILES_IN_VERSION/\\n/}; do - pkg=${pkg/,/} - pkg=${pkg/\"/} - pkg=${pkg/\"/} - echo "Attempting yank on ${pkg}.." - .travis/package_management/package_cloud_wrapper.sh yank "${PACKAGING_USER}/${DEPLOY_REPO}" "${pkg}" || echo "Nothing to yank or error on ${pkg}" - done -} - -# If we are not in netdata git repo, at the top level directory, fail -TOP_LEVEL=$(basename "$(git rev-parse --show-toplevel)") -CWD=$(git rev-parse --show-cdup) -if [ -n "$CWD" ] || [ ! "${TOP_LEVEL}" == "netdata" ]; then - echo "Run as .travis/package_management/$(basename "$0") from top level directory of netdata git repository" - echo "Old packages yanking cancelled" - exit 1 -fi - -if [ -z "${PACKAGING_USER}" ]; then - echo "No PACKAGING_USER variable found" - exit 1 -fi - -if [ -z "${DEPLOY_REPO}" ]; then - echo "No DEPLOY_REPO variable found" - exit 1 -fi - -if [ -z ${PKG_CLOUD_TOKEN} ]; then - echo "No PKG_CLOUD_TOKEN variable found" - exit 1 -fi - -if [ -z ${PACKAGE_CLOUD_RETENTION_DAYS} ]; then - echo "No PACKAGE_CLOUD_RETENTION_DAYS variable found" - exit 1 -fi - -TMP_DIR="$(mktemp -d /tmp/netdata-old-package-yanking-XXXXXX)" -PKG_LIST_FILE="${TMP_DIR}/complete_package_list.json" -DATE_EPOCH="1970-01-01" -DATE_UNTIL_TO_DELETE=$(date --date="${PACKAGE_CLOUD_RETENTION_DAYS} day ago" +%Y-%m-%d) - - -echo "Created temp directory: ${TMP_DIR}" -echo "We will be purging contents up until ${DATE_UNTIL_TO_DELETE}" - -echo "Calling package could to retrieve all available packages on ${PACKAGING_USER}/${DEPLOY_REPO}" -curl -sS "https://${PKG_CLOUD_TOKEN}:@packagecloud.io/api/v1/repos/${PACKAGING_USER}/${DEPLOY_REPO}/packages.json" > "${PKG_LIST_FILE}" - -# Get versions within the desired duration -# -VERSIONS_TO_PURGE=$(jq --arg s "${DATE_EPOCH}" --arg e "${DATE_UNTIL_TO_DELETE}" ' -[($s, $e) | strptime("%Y-%m-%d")[0:3]] as $r - | map(select( - (.created_at[:19] | strptime("%Y-%m-%dT%H:%M:%S")[0:3]) as $d - | $d >= $r[0] and $d <= $r[1] -))' "${PKG_LIST_FILE}" | grep '"version":' | sort -u | sed -e 's/ //g' | cut -d':' -f2) - -echo "We will be deleting the following versions: ${VERSIONS_TO_PURGE}" -for v in ${VERSIONS_TO_PURGE/\n//}; do - v=${v/\"/} - v=${v/\"/} - v=${v/,/} - echo "Remove all files for version $v" - delete_files_for_version "${v}" -done - -# Done, clean up -[ -d "${TMP_DIR}" ] && rm -rf "${TMP_DIR}" diff --git a/.travis/package_management/package_cloud_wrapper.sh b/.travis/package_management/package_cloud_wrapper.sh deleted file mode 100755 index 48a372d37..000000000 --- a/.travis/package_management/package_cloud_wrapper.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env bash -# -# This is a tool to help removal of packages from packagecloud.io -# It utilizes the package_cloud utility provided from packagecloud.io -# -# Depends on: -# 1) package cloud gem (detects absence and installs it) -# -# Requires: -# 1) PKG_CLOUD_TOKEN variable exported -# 2) To properly install package_cloud when not found, it requires: ruby gcc gcc-c++ ruby-devel -# -# Copyright: SPDX-License-Identifier: GPL-3.0-or-later -# -# Author : Pavlos Emm. Katsoulakis (paul@netdata.cloud) -#shellcheck disable=SC2068,SC2145 -set -e -PKG_CLOUD_CONFIG="$HOME/.package_cloud_configuration.cfg" - -# If we are not in netdata git repo, at the top level directory, fail -TOP_LEVEL=$(basename "$(git rev-parse --show-toplevel)") -CWD=$(git rev-parse --show-cdup) -if [ -n "$CWD" ] || [ ! "${TOP_LEVEL}" == "netdata" ]; then - echo "Run as .travis/package_management/$(basename "$0") from top level directory of netdata git repository" - echo "Docker build process aborted" - exit 1 -fi - -# Install dependency if not there -if ! command -v package_cloud > /dev/null 2>&1; then - echo "No package cloud gem found, installing" - gem install -V package_cloud || (echo "Package cloud installation failed. you might want to check if required dependencies are there (ruby gcc gcc-c++ ruby-devel)" && exit 1) -else - echo "Found package_cloud gem, continuing" -fi - -# Check for required token and prepare config -if [ -z "${PKG_CLOUD_TOKEN}" ]; then - echo "Please set PKG_CLOUD_TOKEN to be able to use ${0}" - exit 1 -fi -echo "{\"url\":\"https://packagecloud.io\",\"token\":\"${PKG_CLOUD_TOKEN}\"}" > "${PKG_CLOUD_CONFIG}" - -echo "Executing package_cloud with config ${PKG_CLOUD_CONFIG} and parameters $@" -package_cloud $@ --config="${PKG_CLOUD_CONFIG}" - -rm -rf "${PKG_CLOUD_CONFIG}" -echo "Done!" diff --git a/.travis/package_management/prepare_packages.sh b/.travis/package_management/prepare_packages.sh deleted file mode 100755 index 12ed07cc7..000000000 --- a/.travis/package_management/prepare_packages.sh +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/env bash -# -# Utility that gathers generated packages, -# puts them together in a local folder for deploy facility to pick up -# -# Copyright: SPDX-License-Identifier: GPL-3.0-or-later -# -# Author : Pavlos Emm. Katsoulakis (paul@netdata.cloud) -#shellcheck disable=SC2068 -set -e - -# If we are not in netdata git repo, at the top level directory, fail -TOP_LEVEL=$(basename "$(git rev-parse --show-toplevel)") -CWD=$(git rev-parse --show-cdup) -if [ -n "$CWD" ] || [ ! "${TOP_LEVEL}" == "netdata" ]; then - echo "Run as .travis/package_management/$(basename "$0") from top level directory of netdata git repository" - echo "Package preparation aborted" - exit 1 -fi - -export LXC_ROOT="/var/lib/lxc" - -# Go through the containers created for packaging and pick up all generated packages -CREATED_CONTAINERS=$(ls -A "${LXC_ROOT}") -for d in ${CREATED_CONTAINERS[@]}; do - echo "Picking up packaging contents from ${d}" - - # Pick up any RPMS from builder - RPM_BUILD_PATH="${LXC_ROOT}/${d}/rootfs/home/${BUILDER_NAME}/rpmbuild" - if [ -d "${RPM_BUILD_PATH}" ]; then - echo "Checking folder ${RPM_BUILD_PATH} for RPMS and SRPMS" - - if [ -d "${RPM_BUILD_PATH}/RPMS" ]; then - echo "Copying any RPMS in '${RPM_BUILD_PATH}', copying over the following:" - ls -ltrR "${RPM_BUILD_PATH}/RPMS" - [[ -d "${RPM_BUILD_PATH}/RPMS/x86_64" ]] && cp -r "${RPM_BUILD_PATH}"/RPMS/x86_64/* "${PACKAGES_DIRECTORY}" - [[ -d "${RPM_BUILD_PATH}/RPMS/i386" ]] && cp -r "${RPM_BUILD_PATH}"/RPMS/i386/* "${PACKAGES_DIRECTORY}" - [[ -d "${RPM_BUILD_PATH}/RPMS/i686" ]] && cp -r "${RPM_BUILD_PATH}"/RPMS/i686/* "${PACKAGES_DIRECTORY}" - fi - - if [ -d "${RPM_BUILD_PATH}/SRPMS" ]; then - echo "Copying any SRPMS in '${RPM_BUILD_PATH}', copying over the following:" - ls -ltrR "${RPM_BUILD_PATH}/SRPMS" - [[ -d "${RPM_BUILD_PATH}/SRPMS/x86_64" ]] && cp -r "${RPM_BUILD_PATH}"/SRPMS/x86_64/* "${PACKAGES_DIRECTORY}" - [[ -d "${RPM_BUILD_PATH}/SRPMS/i386" ]] && cp -r "${RPM_BUILD_PATH}"/SRPMS/i386/* "${PACKAGES_DIRECTORY}" - [[ -d "${RPM_BUILD_PATH}/SRPMS/i686" ]] && cp -r "${RPM_BUILD_PATH}"/SRPMS/i686/* "${PACKAGES_DIRECTORY}" - fi - else - DEB_BUILD_PATH="${LXC_ROOT}/${d}/rootfs/home/${BUILDER_NAME}" - echo "Checking folder ${DEB_BUILD_PATH} for DEB packages" - if [ -d "${DEB_BUILD_PATH}" ]; then - cp "${DEB_BUILD_PATH}"/netdata*.ddeb "${PACKAGES_DIRECTORY}" || echo "Could not copy any .ddeb files" - cp "${DEB_BUILD_PATH}"/netdata*.deb "${PACKAGES_DIRECTORY}" || echo "Could not copy any .deb files" - cp "${DEB_BUILD_PATH}"/netdata*.buildinfo "${PACKAGES_DIRECTORY}" || echo "Could not copy any .buildinfo files" - cp "${DEB_BUILD_PATH}"/netdata*.changes "${PACKAGES_DIRECTORY}" || echo "Could not copy any .changes files" - else - echo "Folder ${DEB_BUILD_PATH} does not exist or not a directory, nothing to do for package preparation" - fi - fi -done - -chmod -R 777 "${PACKAGES_DIRECTORY}" -echo "Packaging contents ready to ship!" diff --git a/.travis/package_management/trigger_deb_lxc_build.py b/.travis/package_management/trigger_deb_lxc_build.py deleted file mode 100755 index 464a7715f..000000000 --- a/.travis/package_management/trigger_deb_lxc_build.py +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/env python3 -# -# This script is responsible for running the RPM build on the running container -# -# Copyright: SPDX-License-Identifier: GPL-3.0-or-later -# -# Author : Pavlos Emm. Katsoulakis - -import common -import os -import sys -import lxc - -if len(sys.argv) != 2: - print('You need to provide a container name to get things started') - sys.exit(1) -container_name=sys.argv[1] - -# Load the container, break if its not there -print("Starting up container %s" % container_name) -container = lxc.Container(container_name) -if not container.defined: - raise Exception("Container %s does not exist!" % container_name) - -# Check if the container is running, attempt to start it up in case its not running -if not container.running or not container.state == "RUNNING": - print('Container %s is not running, attempt to start it up' % container_name) - - # Start the container - if not container.start(): - raise Exception("Failed to start the container") - - if not container.running or not container.state == "RUNNING": - raise Exception('Container %s is not running, configuration process aborted ' % container_name) - -# Wait for connectivity -if not container.get_ips(timeout=30): - raise Exception("Timeout while waiting for container") - -build_path = "/home/%s" % os.environ['BUILDER_NAME'] - -print("Setting up EMAIL and DEBFULLNAME variables required by the build tools") -os.environ["EMAIL"] = "bot@netdata.cloud" -os.environ["DEBFULLNAME"] = "Netdata builder" - -# Run the build process on the container -new_version, tag = common.fetch_version(os.environ['BUILD_VERSION']) -print("Starting DEB build process for version %s" % new_version) - -netdata_tarball = "%s/netdata-%s.tar.gz" % (build_path, new_version) -unpacked_netdata = netdata_tarball.replace(".tar.gz", "") - -print("Extracting tarball %s" % netdata_tarball) -common.run_command(container, ["sudo", "-u", os.environ['BUILDER_NAME'], "tar", "xf", netdata_tarball, "-C", build_path]) - -print("Checking version consistency") -since_version = os.environ["LATEST_RELEASE_VERSION"] -if str(since_version).replace('v', '') == str(new_version) and str(new_version).count('.') == 2: - s = since_version.split('.') - if int(s[2]) > 0: - patch_prev = str(int(s[2]) - 1) - since_version = s[0] + '.' + s[1] + '.' + patch_prev - else: - prev = str(int(s[1]) - 1) - since_version = s[0] + '.' + prev + '.' + s[2] - - print("We seem to be building a new stable release, reduce by one since_version option. New since_version:%s" % since_version) - -print("Fixing changelog tags") -changelog_in_host = "contrib/debian/changelog" -common.run_command_in_host(['sed', '-i', 's/PREVIOUS_PACKAGE_VERSION/%s-1/g' % since_version.replace("v", ""), changelog_in_host]) -common.run_command_in_host(['sed', '-i', 's/PREVIOUS_PACKAGE_DATE/%s/g' % os.environ["LATEST_RELEASE_DATE"], changelog_in_host]) - -print("Executing gbp dch command..") -common.run_command_in_host(['gbp', 'dch', '--release', '--ignore-branch', '--spawn-editor=snapshot', '--since=%s' % since_version, '--new-version=%s' % new_version]) - -print("Copying over changelog to the destination machine") -common.run_command_in_host(['sudo', 'cp', 'debian/changelog', "%s/%s/netdata-%s/contrib/debian/" % (os.environ['LXC_CONTAINER_ROOT'], build_path, new_version)]) - -print("Running debian build script since %s" % since_version) -common.run_command(container, ["sudo", "-u", os.environ['BUILDER_NAME'], "%s/build.sh" % build_path, unpacked_netdata, new_version]) - -print("Listing contents on build path") -common.run_command(container, ["sudo", "-u", os.environ['BUILDER_NAME'], "ls", "-ltr", build_path]) - -print('Done!') diff --git a/.travis/package_management/trigger_rpm_lxc_build.py b/.travis/package_management/trigger_rpm_lxc_build.py deleted file mode 100755 index f9e109c72..000000000 --- a/.travis/package_management/trigger_rpm_lxc_build.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python3 -# -# This script is responsible for running the RPM build on the running container -# -# Copyright: SPDX-License-Identifier: GPL-3.0-or-later -# -# Author : Pavlos Emm. Katsoulakis - -import common -import os -import sys -import lxc - -if len(sys.argv) != 2: - print('You need to provide a container name to get things started') - sys.exit(1) -container_name=sys.argv[1] - -# Load the container, break if its not there -print("Starting up container %s" % container_name) -container = lxc.Container(container_name) -if not container.defined: - raise Exception("Container %s does not exist!" % container_name) - -# Check if the container is running, attempt to start it up in case its not running -if not container.running or not container.state == "RUNNING": - print('Container %s is not running, attempt to start it up' % container_name) - - # Start the container - if not container.start(): - raise Exception("Failed to start the container") - - if not container.running or not container.state == "RUNNING": - raise Exception('Container %s is not running, configuration process aborted ' % container_name) - -# Wait for connectivity -if not container.get_ips(timeout=30): - raise Exception("Timeout while waiting for container") - -print("Adding builder specific dependencies to the LXC container") -common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "rpm-build"]) -common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "rpm-devel"]) -common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "rpmlint"]) -common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "make"]) -common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "python"]) -common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "bash"]) -common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "diffutils"]) -common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "patch"]) -common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "rpmdevtools"]) - -# Run the build process on the container -print("Starting RPM build process") -common.run_command(container, ["sudo", "-u", os.environ['BUILDER_NAME'], "rpmbuild", "-ba", "--rebuild", "/home/%s/rpmbuild/SPECS/netdata.spec" % os.environ['BUILDER_NAME']]) - -print('Done!') diff --git a/.travis/package_management/yank_stale_pkg.sh b/.travis/package_management/yank_stale_pkg.sh deleted file mode 100755 index 3f7669712..000000000 --- a/.travis/package_management/yank_stale_pkg.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env bash -# -# This script is responsible for the removal of stale RPM/DEB files. -# It runs on the pre-deploy step and takes care of the removal of the files -# prior to the upload of the freshly built ones -# -# Copyright: SPDX-License-Identifier: GPL-3.0-or-later -# -# Author : Pavlos Emm. Katsoulakis (paul@netdata.cloud) -#shellcheck disable=SC2010,SC2068 -set -e - -# If we are not in netdata git repo, at the top level directory, fail -TOP_LEVEL=$(basename "$(git rev-parse --show-toplevel)") -CWD=$(git rev-parse --show-cdup) -if [ -n "$CWD" ] || [ ! "${TOP_LEVEL}" == "netdata" ]; then - echo "Run as .travis/package_management/$(basename "$0") from top level directory of netdata git repository" - echo "Package yanking cancelled" - exit 1 -fi - -PACKAGES_DIR="$1" -DISTRO="$2" -PACKAGES_LIST="$(ls -AR "${PACKAGES_DIR}" | grep -e '\.rpm' -e '\.deb' -e '\.ddeb' )" - -if [ ! -d "${PACKAGES_DIR}" ] || [ -z "${PACKAGES_LIST}" ]; then - echo "Folder ${PACKAGES_DIR} does not seem to be a valid directory or is empty. No packages to check for yanking" - exit 1 -fi - -for pkg in ${PACKAGES_LIST[@]}; do - echo "Attempting yank on ${pkg}.." - .travis/package_management/package_cloud_wrapper.sh yank "${PACKAGING_USER}/${DEPLOY_REPO}/${DISTRO}" "${pkg}" || echo "Nothing to yank or error on ${pkg}" -done - diff --git a/.travis/trigger_package_build.sh b/.travis/trigger_package_build.sh new file mode 100755 index 000000000..d1ecd48ca --- /dev/null +++ b/.travis/trigger_package_build.sh @@ -0,0 +1,20 @@ +#!/bin/sh + +token="${1}" +version="${2}" +pkgtype="${3}" + +resp="$(curl -X POST \ + -H 'Accept: application/vnd.github.v3+json' \ + -H "Authorization: Bearer ${token}" \ + "https://api.github.com/repos/netdata/netdata/actions/workflows/packaging.yml/dispatches" \ + -d "{\"ref\": \"master\", \"inputs\": {\"version\": \"${version}\", \"type\": \"${pkgtype}\"}}")" + +if [ -z "${resp}" ]; then + echo "Successfully triggered binary package builds." + exit 0 +else + echo "Failed to trigger binary package builds. Output:" + echo "${resp}" + exit 1 +fi diff --git a/.travis/trigger_package_generation.sh b/.travis/trigger_package_generation.sh deleted file mode 100755 index b58832cf4..000000000 --- a/.travis/trigger_package_generation.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env bash -# -# Trigger .RPM and .DEB package generation processes -# -# Copyright: SPDX-License-Identifier: GPL-3.0-or-later -# -# Author: Pavlos Emm. Katsoulakis -set -e -WAIT_TIME=15 -BUILD_NIGHTLY="$1" - -commit_change() { - local ARCH="$1" - local PKG="$2" - local GIT_MAIL="bot@netdata.cloud" - local GIT_USER="netdatabot" - - echo "---- Committing ${ARCH} .${PKG} package generation ----" - git commit --allow-empty --author "${GIT_USER} <${GIT_MAIL}>" -m "[Package ${ARCH} ${PKG}]${BUILD_NIGHTLY} Package build process trigger" -} - -push_change() { - - echo "---- Push changes to repository ----" - git push "https://${GITHUB_TOKEN}:@$(git config --get remote.origin.url | sed -e 's/^https:\/\///')" -} - -# If we are not in netdata git repo, at the top level directory, fail -TOP_LEVEL=$(basename "$(git rev-parse --show-toplevel)") -CWD=$(git rev-parse --show-cdup || echo "") -if [ -n "${CWD}" ] || [ ! "${TOP_LEVEL}" == "netdata" ]; then - echo "Run as .travis/$(basename "$0") from top level directory of netdata git repository" - echo "Changelog generation process aborted" - exit 1 -fi - -echo "--- Initialize git configuration ---" -git checkout master -git fetch --all -git pull - -commit_change "amd64" "DEB" -push_change - -echo "---- Waiting for ${WAIT_TIME} seconds before triggering next process ----" -sleep "${WAIT_TIME}" - -commit_change "i386" "DEB" -push_change - -echo "---- Waiting for ${WAIT_TIME} seconds before triggering next process ----" -sleep "${WAIT_TIME}" - -commit_change "amd64" "RPM" -push_change - -echo "---- Done! ----" diff --git a/CHANGELOG.md b/CHANGELOG.md index 1caf3ed72..c574ec6d3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,126 @@ # Changelog +## [v1.30.0](https://github.com/netdata/netdata/tree/v1.30.0) (2021-03-31) + +[Full Changelog](https://github.com/netdata/netdata/compare/v1.29.3...v1.30.0) + +**Merged pull requests:** + +- Properly handle different netcat command names in binary package test code. [\#10883](https://github.com/netdata/netdata/pull/10883) ([Ferroin](https://github.com/Ferroin)) +- Add carrier and mtu charts for network interfaces [\#10866](https://github.com/netdata/netdata/pull/10866) ([vlvkobal](https://github.com/vlvkobal)) +- Fix typo in main.h [\#10858](https://github.com/netdata/netdata/pull/10858) ([eltociear](https://github.com/eltociear)) +- health: improve alarms infos [\#10853](https://github.com/netdata/netdata/pull/10853) ([ilyam8](https://github.com/ilyam8)) +- minor - add info about --aclk-ng into netdata-installer [\#10852](https://github.com/netdata/netdata/pull/10852) ([underhood](https://github.com/underhood)) +- mqtt-c coverity fix [\#10851](https://github.com/netdata/netdata/pull/10851) ([underhood](https://github.com/underhood)) +- web/gui: make network state map sytanx consistent in the dashboard info [\#10849](https://github.com/netdata/netdata/pull/10849) ([ilyam8](https://github.com/ilyam8)) +- fix\_repeat: Update repeat\_every and avoid unecessary test [\#10846](https://github.com/netdata/netdata/pull/10846) ([thiagoftsm](https://github.com/thiagoftsm)) +- Fix agent crash when executing data query with context and non-existing chart\_label\_key [\#10844](https://github.com/netdata/netdata/pull/10844) ([stelfrag](https://github.com/stelfrag)) +- Check device names in diskstats plugin [\#10843](https://github.com/netdata/netdata/pull/10843) ([vlvkobal](https://github.com/vlvkobal)) +- Fix memory leak when archived data is requested [\#10837](https://github.com/netdata/netdata/pull/10837) ([stelfrag](https://github.com/stelfrag)) +- add Installation method to the bug template [\#10836](https://github.com/netdata/netdata/pull/10836) ([ilyam8](https://github.com/ilyam8)) +- Add lock check to avoid shutdown when compiled with internal and locking checks [\#10835](https://github.com/netdata/netdata/pull/10835) ([stelfrag](https://github.com/stelfrag)) +- health: apply megacli alarms for all adapters/physical disks [\#10834](https://github.com/netdata/netdata/pull/10834) ([ilyam8](https://github.com/ilyam8)) +- Fix broken link in StatsD guide [\#10831](https://github.com/netdata/netdata/pull/10831) ([joelhans](https://github.com/joelhans)) +- health: add collector prefix to the external collectors alarms/templates [\#10830](https://github.com/netdata/netdata/pull/10830) ([ilyam8](https://github.com/ilyam8)) +- health: remove exporting\_metrics\_lost template [\#10829](https://github.com/netdata/netdata/pull/10829) ([ilyam8](https://github.com/ilyam8)) +- Fix name of PackageCLoud API token secret in workflows. [\#10828](https://github.com/netdata/netdata/pull/10828) ([Ferroin](https://github.com/Ferroin)) +- installer: update go.d.plugin version to v0.28.1 [\#10826](https://github.com/netdata/netdata/pull/10826) ([ilyam8](https://github.com/ilyam8)) +- alarm\(irc\): add support to change IRC\_PORT [\#10824](https://github.com/netdata/netdata/pull/10824) ([RaitoBezarius](https://github.com/RaitoBezarius)) +- Update syntax for Caddy v2 [\#10823](https://github.com/netdata/netdata/pull/10823) ([salazarp](https://github.com/salazarp)) +- health: apply adapter\_raid alarms for every logical/physical device [\#10820](https://github.com/netdata/netdata/pull/10820) ([ilyam8](https://github.com/ilyam8)) +- Fix handling of nightly and release packages in GHA workflows. [\#10819](https://github.com/netdata/netdata/pull/10819) ([Ferroin](https://github.com/Ferroin)) +- health: log an error if any when send email notification [\#10818](https://github.com/netdata/netdata/pull/10818) ([ilyam8](https://github.com/ilyam8)) +- Ebpf extend sync [\#10814](https://github.com/netdata/netdata/pull/10814) ([thiagoftsm](https://github.com/thiagoftsm)) +- Fix coverity issue \(CID 367566\) [\#10813](https://github.com/netdata/netdata/pull/10813) ([stelfrag](https://github.com/stelfrag)) +- fix claiming via env vars in docker container [\#10811](https://github.com/netdata/netdata/pull/10811) ([ilyam8](https://github.com/ilyam8)) +- Fix eBPF compilation [\#10810](https://github.com/netdata/netdata/pull/10810) ([thiagoftsm](https://github.com/thiagoftsm)) +- update bug report template [\#10807](https://github.com/netdata/netdata/pull/10807) ([underhood](https://github.com/underhood)) +- health: exclude cgroups net ifaces from packets dropped alarms [\#10806](https://github.com/netdata/netdata/pull/10806) ([ilyam8](https://github.com/ilyam8)) +- Don't show alarms for charts without data [\#10804](https://github.com/netdata/netdata/pull/10804) ([vlvkobal](https://github.com/vlvkobal)) +- claiming: increase curl connect-timeout and decrease number of claim attempts [\#10800](https://github.com/netdata/netdata/pull/10800) ([ilyam8](https://github.com/ilyam8)) +- Added Ubuntu 21.04 and Fedora 34 to our CI checks and binary package builds. [\#10791](https://github.com/netdata/netdata/pull/10791) ([Ferroin](https://github.com/Ferroin)) +- health: remove ram\_in\_swap alarm [\#10789](https://github.com/netdata/netdata/pull/10789) ([ilyam8](https://github.com/ilyam8)) +- Add a new parameter 'chart' to the /api/v1/alarm\_log. [\#10788](https://github.com/netdata/netdata/pull/10788) ([MrZammler](https://github.com/MrZammler)) +- Add check for children connecting to a parent agent with unsupported memory mode [\#10787](https://github.com/netdata/netdata/pull/10787) ([stelfrag](https://github.com/stelfrag)) +- health: use separate packets\_dropped\_ratio alarms for wifi network interfaces [\#10785](https://github.com/netdata/netdata/pull/10785) ([ilyam8](https://github.com/ilyam8)) +- ACLK separate https client [\#10784](https://github.com/netdata/netdata/pull/10784) ([underhood](https://github.com/underhood)) +- health: add `wmi\_` prefix to the wmi collector network alarms [\#10782](https://github.com/netdata/netdata/pull/10782) ([ilyam8](https://github.com/ilyam8)) +- web/gui: add max value to the nvidia\_smi.fan\_speed gauge [\#10780](https://github.com/netdata/netdata/pull/10780) ([ilyam8](https://github.com/ilyam8)) +- health/: fix various alarms critical and warning thresholds hysteresis [\#10779](https://github.com/netdata/netdata/pull/10779) ([ilyam8](https://github.com/ilyam8)) +- Adds \_aclk\_impl label [\#10778](https://github.com/netdata/netdata/pull/10778) ([underhood](https://github.com/underhood)) +- adding a default job with some params and example of additional job. [\#10777](https://github.com/netdata/netdata/pull/10777) ([andrewm4894](https://github.com/andrewm4894)) +- Fix typo in dashboard\_info.js [\#10775](https://github.com/netdata/netdata/pull/10775) ([eltociear](https://github.com/eltociear)) +- Fixed Travis config issues related to new packaging workflows. [\#10774](https://github.com/netdata/netdata/pull/10774) ([Ferroin](https://github.com/Ferroin)) +- add a dump\_methods parameter to alarm-notify.sh.in [\#10772](https://github.com/netdata/netdata/pull/10772) ([MrZammler](https://github.com/MrZammler)) +- Add data query support for archived charts [\#10771](https://github.com/netdata/netdata/pull/10771) ([stelfrag](https://github.com/stelfrag)) +- health: make vernemq alarms less sensitive [\#10770](https://github.com/netdata/netdata/pull/10770) ([ilyam8](https://github.com/ilyam8)) +- Fixed handling of perf.plugin capabilities. [\#10766](https://github.com/netdata/netdata/pull/10766) ([Ferroin](https://github.com/Ferroin)) +- dashboard@v2.13.28 [\#10761](https://github.com/netdata/netdata/pull/10761) ([jacekkolasa](https://github.com/jacekkolasa)) +- collectors/cgroups: fix cpuset.cpus count [\#10757](https://github.com/netdata/netdata/pull/10757) ([ilyam8](https://github.com/ilyam8)) +- eBPF plugin \(fixes 10727\) [\#10756](https://github.com/netdata/netdata/pull/10756) ([thiagoftsm](https://github.com/thiagoftsm)) +- web/gui: add supervisord to the dashboard\_info.js [\#10754](https://github.com/netdata/netdata/pull/10754) ([ilyam8](https://github.com/ilyam8)) +- Add state map to duplex and operstate charts [\#10752](https://github.com/netdata/netdata/pull/10752) ([vlvkobal](https://github.com/vlvkobal)) +- comment out memory mode mention in example [\#10751](https://github.com/netdata/netdata/pull/10751) ([OdysLam](https://github.com/OdysLam)) +- collectors/apps.plugin: Add wireguard to vpn [\#10743](https://github.com/netdata/netdata/pull/10743) ([liepumartins](https://github.com/liepumartins)) +- Enable metadata persistence in all memory modes [\#10742](https://github.com/netdata/netdata/pull/10742) ([stelfrag](https://github.com/stelfrag)) +- Move network interface speed, duplex, and operstate variables to charts [\#10740](https://github.com/netdata/netdata/pull/10740) ([vlvkobal](https://github.com/vlvkobal)) +- Use of out-of-line struct definitions. [\#10739](https://github.com/netdata/netdata/pull/10739) ([vkalintiris](https://github.com/vkalintiris)) +- Use a parameter name that is not a reserved keyword in C++ [\#10738](https://github.com/netdata/netdata/pull/10738) ([vkalintiris](https://github.com/vkalintiris)) +- Skip C++ incompatible header in main libnetdata header [\#10737](https://github.com/netdata/netdata/pull/10737) ([vkalintiris](https://github.com/vkalintiris)) +- Rename struct avl to avl\_element and the typedef to avl\_t [\#10735](https://github.com/netdata/netdata/pull/10735) ([vkalintiris](https://github.com/vkalintiris)) +- Fix claim behind squid proxy [\#10734](https://github.com/netdata/netdata/pull/10734) ([underhood](https://github.com/underhood)) +- add k6.conf [\#10733](https://github.com/netdata/netdata/pull/10733) ([OdysLam](https://github.com/OdysLam)) +- Always configure multihost database context [\#10732](https://github.com/netdata/netdata/pull/10732) ([stelfrag](https://github.com/stelfrag)) +- Removes unused fnc warning in ACLK Legacy [\#10731](https://github.com/netdata/netdata/pull/10731) ([underhood](https://github.com/underhood)) +- Update chart's metadata in database when it already exists during creation [\#10728](https://github.com/netdata/netdata/pull/10728) ([stelfrag](https://github.com/stelfrag)) +- New thread for ebpf.plugin [\#10726](https://github.com/netdata/netdata/pull/10726) ([thiagoftsm](https://github.com/thiagoftsm)) +- Support VS Code container devenv [\#10723](https://github.com/netdata/netdata/pull/10723) ([OdysLam](https://github.com/OdysLam)) +- Fixed detection of already claimed node in Docker images. [\#10720](https://github.com/netdata/netdata/pull/10720) ([Ferroin](https://github.com/Ferroin)) +- Add statsd guide [\#10719](https://github.com/netdata/netdata/pull/10719) ([OdysLam](https://github.com/OdysLam)) +- Add the ability to store chart labels in the database [\#10718](https://github.com/netdata/netdata/pull/10718) ([stelfrag](https://github.com/stelfrag)) +- Fix a parameter binding issue when storing chart names in the database [\#10717](https://github.com/netdata/netdata/pull/10717) ([stelfrag](https://github.com/stelfrag)) +- Fix typo in backend\_prometheus.c [\#10716](https://github.com/netdata/netdata/pull/10716) ([eltociear](https://github.com/eltociear)) +- Add guide: Unsupervised anomaly detection for Raspberry Pi monitoring [\#10713](https://github.com/netdata/netdata/pull/10713) ([joelhans](https://github.com/joelhans)) +- Add Working Set charts to the cgroups plugin [\#10712](https://github.com/netdata/netdata/pull/10712) ([vlvkobal](https://github.com/vlvkobal)) +- python.d/smartd\_log: collect attribute 233 \(Media Wearout Indicator \(SSD\)\). [\#10711](https://github.com/netdata/netdata/pull/10711) ([aazedo](https://github.com/aazedo)) +- Add guide: Develop a custom data collector for Netdata in Python [\#10710](https://github.com/netdata/netdata/pull/10710) ([joelhans](https://github.com/joelhans)) +- New version eBPF programs. [\#10707](https://github.com/netdata/netdata/pull/10707) ([thiagoftsm](https://github.com/thiagoftsm)) +- Add JSON output option for buildinfo. [\#10706](https://github.com/netdata/netdata/pull/10706) ([Ferroin](https://github.com/Ferroin)) +- Fix disk utilization and backlog charts [\#10705](https://github.com/netdata/netdata/pull/10705) ([vlvkobal](https://github.com/vlvkobal)) +- update\_kernel\_version: Fix overflow on Centos and probably Ubuntu [\#10704](https://github.com/netdata/netdata/pull/10704) ([thiagoftsm](https://github.com/thiagoftsm)) +- Docs: Convert references to `service` to `systemctl` [\#10703](https://github.com/netdata/netdata/pull/10703) ([joelhans](https://github.com/joelhans)) +- Add noauthcodecheck workaround flag to the freeipmi plugin [\#10701](https://github.com/netdata/netdata/pull/10701) ([vlvkobal](https://github.com/vlvkobal)) +- Add guide: LAMP stack monitoring [\#10698](https://github.com/netdata/netdata/pull/10698) ([joelhans](https://github.com/joelhans)) +- Log ACLK cloud commands to access.log [\#10697](https://github.com/netdata/netdata/pull/10697) ([stelfrag](https://github.com/stelfrag)) +- Add Linux page cache metrics to eBPF [\#10693](https://github.com/netdata/netdata/pull/10693) ([thiagoftsm](https://github.com/thiagoftsm)) +- Update guide: Kubernetes monitoring with Netdata: Overview and visualizations [\#10691](https://github.com/netdata/netdata/pull/10691) ([joelhans](https://github.com/joelhans)) +- health: make alarms less sensitive [\#10688](https://github.com/netdata/netdata/pull/10688) ([ilyam8](https://github.com/ilyam8)) +- Ebpf support new collectors [\#10680](https://github.com/netdata/netdata/pull/10680) ([thiagoftsm](https://github.com/thiagoftsm)) +- Fix broken links in active alarms doc [\#10678](https://github.com/netdata/netdata/pull/10678) ([joelhans](https://github.com/joelhans)) +- Add new cookie to fix 8094 [\#10676](https://github.com/netdata/netdata/pull/10676) ([thiagoftsm](https://github.com/thiagoftsm)) +- Alarms collector add alarm values [\#10675](https://github.com/netdata/netdata/pull/10675) ([andrewm4894](https://github.com/andrewm4894)) +- Don't add duplicate \_total suffixes for the prometheus go.d module [\#10674](https://github.com/netdata/netdata/pull/10674) ([vlvkobal](https://github.com/vlvkobal)) +- fix a typo in the email notifications readme [\#10668](https://github.com/netdata/netdata/pull/10668) ([ossimantylahti](https://github.com/ossimantylahti)) +- Update screenshots and text for new Cloud nav [\#10664](https://github.com/netdata/netdata/pull/10664) ([joelhans](https://github.com/joelhans)) +- Improve the Kubernetes deployment documentation [\#10662](https://github.com/netdata/netdata/pull/10662) ([joelhans](https://github.com/joelhans)) +- installer: update go.d.plugin version to v0.28.0 [\#10660](https://github.com/netdata/netdata/pull/10660) ([ilyam8](https://github.com/ilyam8)) +- Changed Docker image tagging to use semver tags for releases. [\#10648](https://github.com/netdata/netdata/pull/10648) ([Ferroin](https://github.com/Ferroin)) +- Revamp statsd docs [\#10637](https://github.com/netdata/netdata/pull/10637) ([OdysLam](https://github.com/OdysLam)) +- replace GA with PostHog for backend telemetry events. [\#10636](https://github.com/netdata/netdata/pull/10636) ([andrewm4894](https://github.com/andrewm4894)) +- cpu stats per query thread [\#10634](https://github.com/netdata/netdata/pull/10634) ([MrZammler](https://github.com/MrZammler)) +- Assorted updater fixes. [\#10613](https://github.com/netdata/netdata/pull/10613) ([Ferroin](https://github.com/Ferroin)) +- add stats per cloud query type [\#10602](https://github.com/netdata/netdata/pull/10602) ([underhood](https://github.com/underhood)) +- Add a new workflow to test that updater works as expected [\#10599](https://github.com/netdata/netdata/pull/10599) ([kaskavel](https://github.com/kaskavel)) +- Add support for changing the number of pages per extent [\#10593](https://github.com/netdata/netdata/pull/10593) ([mfundul](https://github.com/mfundul)) +- web/gui: Fix broken external links [\#10586](https://github.com/netdata/netdata/pull/10586) ([Habetdin](https://github.com/Habetdin)) +- Fix wrong count for entries [\#10564](https://github.com/netdata/netdata/pull/10564) ([thiagoftsm](https://github.com/thiagoftsm)) +- Try to keep all pages from extents read from disk in the cache. [\#10558](https://github.com/netdata/netdata/pull/10558) ([mfundul](https://github.com/mfundul)) +- Remove unreachable \#else directives in plugins. [\#10523](https://github.com/netdata/netdata/pull/10523) ([vkalintiris](https://github.com/vkalintiris)) +- Fixed handling of permissions for some plugins. [\#10490](https://github.com/netdata/netdata/pull/10490) ([Ferroin](https://github.com/Ferroin)) +- increases ACLK TBEB randomness [\#10373](https://github.com/netdata/netdata/pull/10373) ([underhood](https://github.com/underhood)) +- Rename abs to ABS to avoid clash with standard definitions. Fixes \#10353. [\#10354](https://github.com/netdata/netdata/pull/10354) ([KickerTom](https://github.com/KickerTom)) +- ACLK-NG [\#10315](https://github.com/netdata/netdata/pull/10315) ([underhood](https://github.com/underhood)) + ## [v1.29.3](https://github.com/netdata/netdata/tree/v1.29.3) (2021-02-23) [Full Changelog](https://github.com/netdata/netdata/compare/v1.29.2...v1.29.3) @@ -51,7 +172,7 @@ ## [v1.29.0](https://github.com/netdata/netdata/tree/v1.29.0) (2021-02-03) -[Full Changelog](https://github.com/netdata/netdata/compare/v1.28.0...v1.29.0) +[Full Changelog](https://github.com/netdata/netdata/compare/v1.27.0_0104103941...v1.29.0) **Merged pull requests:** @@ -108,25 +229,32 @@ - New eBPF kernel [\#10434](https://github.com/netdata/netdata/pull/10434) ([thiagoftsm](https://github.com/thiagoftsm)) - Update and improve the Netdata style guide [\#10433](https://github.com/netdata/netdata/pull/10433) ([joelhans](https://github.com/joelhans)) - Change HDDtemp to report None instead of 0 [\#10429](https://github.com/netdata/netdata/pull/10429) ([slavox](https://github.com/slavox)) -- Use bash shell as user netdata for debug [\#10425](https://github.com/netdata/netdata/pull/10425) ([Steve8291](https://github.com/Steve8291)) - Qick and dirty fix for \#10420 [\#10424](https://github.com/netdata/netdata/pull/10424) ([skibbipl](https://github.com/skibbipl)) - Add instructions on enabling explicitly disabled collectors [\#10418](https://github.com/netdata/netdata/pull/10418) ([joelhans](https://github.com/joelhans)) - Change links at bottom of all install docs [\#10416](https://github.com/netdata/netdata/pull/10416) ([joelhans](https://github.com/joelhans)) - Improve configuration docs with common changes and start/stop/restart directions [\#10415](https://github.com/netdata/netdata/pull/10415) ([joelhans](https://github.com/joelhans)) -- Add Realtek network cards to the list of physical interfaces on FreeBSD [\#10414](https://github.com/netdata/netdata/pull/10414) ([vlvkobal](https://github.com/vlvkobal)) -- Update main README with release news [\#10412](https://github.com/netdata/netdata/pull/10412) ([joelhans](https://github.com/joelhans)) - Small updates, improvements, and housekeeping to docs [\#10405](https://github.com/netdata/netdata/pull/10405) ([joelhans](https://github.com/joelhans)) - python.d/fail2ban: Add handling "yes" and "no" as bool, match flexible spaces [\#10400](https://github.com/netdata/netdata/pull/10400) ([grinapo](https://github.com/grinapo)) - Dispatch cgroup discovery into another thread [\#10399](https://github.com/netdata/netdata/pull/10399) ([vlvkobal](https://github.com/vlvkobal)) -- Added instructions on which file to edit. [\#10398](https://github.com/netdata/netdata/pull/10398) ([kdvlr](https://github.com/kdvlr)) - Fix data source option for Prometheus web API in exporting configuration [\#10397](https://github.com/netdata/netdata/pull/10397) ([vlvkobal](https://github.com/vlvkobal)) -- ACLK collector list use mguid instead of hostname [\#10394](https://github.com/netdata/netdata/pull/10394) ([underhood](https://github.com/underhood)) - Docs housekeeping for SEO and syntax, part 1 [\#10388](https://github.com/netdata/netdata/pull/10388) ([joelhans](https://github.com/joelhans)) - Persist `$TMPDIR` from installer to updater. [\#10384](https://github.com/netdata/netdata/pull/10384) ([Ferroin](https://github.com/Ferroin)) -- Add centralized Cloud notifications to core docs [\#10374](https://github.com/netdata/netdata/pull/10374) ([joelhans](https://github.com/joelhans)) - Change linting standard for Markdown lists [\#10371](https://github.com/netdata/netdata/pull/10371) ([joelhans](https://github.com/joelhans)) - Move ACLK Legacy into subfolder [\#10265](https://github.com/netdata/netdata/pull/10265) ([underhood](https://github.com/underhood)) +## [v1.27.0_0104103941](https://github.com/netdata/netdata/tree/v1.27.0_0104103941) (2021-01-04) + +[Full Changelog](https://github.com/netdata/netdata/compare/v1.28.0...v1.27.0_0104103941) + +**Merged pull requests:** + +- Use bash shell as user netdata for debug [\#10425](https://github.com/netdata/netdata/pull/10425) ([Steve8291](https://github.com/Steve8291)) +- Add Realtek network cards to the list of physical interfaces on FreeBSD [\#10414](https://github.com/netdata/netdata/pull/10414) ([vlvkobal](https://github.com/vlvkobal)) +- Update main README with release news [\#10412](https://github.com/netdata/netdata/pull/10412) ([joelhans](https://github.com/joelhans)) +- Added instructions on which file to edit. [\#10398](https://github.com/netdata/netdata/pull/10398) ([kdvlr](https://github.com/kdvlr)) +- ACLK collector list use mguid instead of hostname [\#10394](https://github.com/netdata/netdata/pull/10394) ([underhood](https://github.com/underhood)) +- Add centralized Cloud notifications to core docs [\#10374](https://github.com/netdata/netdata/pull/10374) ([joelhans](https://github.com/joelhans)) + ## [v1.28.0](https://github.com/netdata/netdata/tree/v1.28.0) (2020-12-18) [Full Changelog](https://github.com/netdata/netdata/compare/v1.27.0...v1.28.0) @@ -202,55 +330,11 @@ - Add kernel to blacklist [\#10262](https://github.com/netdata/netdata/pull/10262) ([thiagoftsm](https://github.com/thiagoftsm)) - Made the update script significantly more robust and user friendly. [\#10261](https://github.com/netdata/netdata/pull/10261) ([Ferroin](https://github.com/Ferroin)) - new issue templates [\#10259](https://github.com/netdata/netdata/pull/10259) ([OdysLam](https://github.com/OdysLam)) -- Docs: Point users to proper configure doc [\#10254](https://github.com/netdata/netdata/pull/10254) ([joelhans](https://github.com/joelhans)) -- Docs: Cleanup and fix broken links [\#10253](https://github.com/netdata/netdata/pull/10253) ([joelhans](https://github.com/joelhans)) -- Update CONTRIBUTING.md [\#10252](https://github.com/netdata/netdata/pull/10252) ([joelhans](https://github.com/joelhans)) -- updated 3rd party static dependencies and use alpine 3.12 [\#10241](https://github.com/netdata/netdata/pull/10241) ([ktsaou](https://github.com/ktsaou)) -- Fix streaming buffer size [\#10240](https://github.com/netdata/netdata/pull/10240) ([vlvkobal](https://github.com/vlvkobal)) -- dashboard v2.9.2 [\#10239](https://github.com/netdata/netdata/pull/10239) ([jacekkolasa](https://github.com/jacekkolasa)) -- database: avoid endless loop when cleaning obsolete charts [\#10236](https://github.com/netdata/netdata/pull/10236) ([hexchain](https://github.com/hexchain)) -- Update ansible.md [\#10232](https://github.com/netdata/netdata/pull/10232) ([voriol](https://github.com/voriol)) -- Disable chart obsoletion code for archived chart creation. [\#10231](https://github.com/netdata/netdata/pull/10231) ([mfundul](https://github.com/mfundul)) -- add `nvidia\_smi` collector data to the dashboard\_info.js [\#10230](https://github.com/netdata/netdata/pull/10230) ([ilyam8](https://github.com/ilyam8)) -- health: convert `elasticsearch\_last\_collected` alarm to template [\#10226](https://github.com/netdata/netdata/pull/10226) ([ilyam8](https://github.com/ilyam8)) -- streaming: fix a typo in the README.md [\#10225](https://github.com/netdata/netdata/pull/10225) ([ilyam8](https://github.com/ilyam8)) -- collectors/xenstat.plugin: recieved =\> received [\#10224](https://github.com/netdata/netdata/pull/10224) ([ilyam8](https://github.com/ilyam8)) -- dashboard\_info.js: fix a typo \(vernemq\) [\#10223](https://github.com/netdata/netdata/pull/10223) ([ilyam8](https://github.com/ilyam8)) -- Fix chart filtering [\#10218](https://github.com/netdata/netdata/pull/10218) ([vlvkobal](https://github.com/vlvkobal)) -- Don't stop Prometheus remote write collector when data is not available for dimension formatting [\#10217](https://github.com/netdata/netdata/pull/10217) ([vlvkobal](https://github.com/vlvkobal)) -- Fix coverity issues [\#10216](https://github.com/netdata/netdata/pull/10216) ([vlvkobal](https://github.com/vlvkobal)) -- Fixed bug in auto-updater for FreeBSD \(\#10198\) [\#10204](https://github.com/netdata/netdata/pull/10204) ([abrbon](https://github.com/abrbon)) -- New ebpf release [\#10202](https://github.com/netdata/netdata/pull/10202) ([thiagoftsm](https://github.com/thiagoftsm)) -- Add guide: Deploy Netdata with Ansible [\#10199](https://github.com/netdata/netdata/pull/10199) ([joelhans](https://github.com/joelhans)) -- Add allocated space metrics to oracledb charts [\#10197](https://github.com/netdata/netdata/pull/10197) ([jurgenhaas](https://github.com/jurgenhaas)) -- File descr alarm v01 [\#10192](https://github.com/netdata/netdata/pull/10192) ([Ancairon](https://github.com/Ancairon)) -- Update CoC and widen scope to community [\#10186](https://github.com/netdata/netdata/pull/10186) ([OdysLam](https://github.com/OdysLam)) -- Migrate metadata log to SQLite [\#10139](https://github.com/netdata/netdata/pull/10139) ([stelfrag](https://github.com/stelfrag)) -- Kubernetes labels [\#10107](https://github.com/netdata/netdata/pull/10107) ([ilyam8](https://github.com/ilyam8)) -- Remove Docker example from update docs and add section to claim troubleshooting [\#10103](https://github.com/netdata/netdata/pull/10103) ([joelhans](https://github.com/joelhans)) -- Anomalies collector [\#10060](https://github.com/netdata/netdata/pull/10060) ([andrewm4894](https://github.com/andrewm4894)) -- Alarms collector [\#10042](https://github.com/netdata/netdata/pull/10042) ([andrewm4894](https://github.com/andrewm4894)) -- ACLK allow child query [\#10030](https://github.com/netdata/netdata/pull/10030) ([underhood](https://github.com/underhood)) ## [v1.26.0](https://github.com/netdata/netdata/tree/v1.26.0) (2020-10-14) [Full Changelog](https://github.com/netdata/netdata/compare/before_rebase...v1.26.0) -**Merged pull requests:** - -- Fix systemd comment syntax [\#10066](https://github.com/netdata/netdata/pull/10066) ([HolgerHees](https://github.com/HolgerHees)) -- health/portcheck: add `failed` dim to the `connection\_fails` alarm [\#10048](https://github.com/netdata/netdata/pull/10048) ([ilyam8](https://github.com/ilyam8)) -- installer: update go.d.plugin version to v0.23.0 [\#10046](https://github.com/netdata/netdata/pull/10046) ([ilyam8](https://github.com/ilyam8)) -- Rename NETDATA\_PORT to NETDATA\_LISTENER\_PORT [\#10045](https://github.com/netdata/netdata/pull/10045) ([knatsakis](https://github.com/knatsakis)) -- small docs update - adding note about using `nolock` when debugging [\#10036](https://github.com/netdata/netdata/pull/10036) ([andrewm4894](https://github.com/andrewm4894)) -- Fixed the data endpoint to prioritize chart over context if both are present [\#10032](https://github.com/netdata/netdata/pull/10032) ([stelfrag](https://github.com/stelfrag)) -- python.d/rabbitmq: Add chart for churn rates [\#10031](https://github.com/netdata/netdata/pull/10031) ([chadknutson](https://github.com/chadknutson)) -- Fixed gauges for go web\_log module [\#10029](https://github.com/netdata/netdata/pull/10029) ([hamedbrd](https://github.com/hamedbrd)) -- Fixed incorrect condition in updater type detection. [\#10028](https://github.com/netdata/netdata/pull/10028) ([Ferroin](https://github.com/Ferroin)) -- Fix README exporting link [\#10020](https://github.com/netdata/netdata/pull/10020) ([Dim-P](https://github.com/Dim-P)) -- Clean up and better cross-link new docsv2 documents [\#10015](https://github.com/netdata/netdata/pull/10015) ([joelhans](https://github.com/joelhans)) -- collector infiniband: fix file descriptor leak [\#10013](https://github.com/netdata/netdata/pull/10013) ([Saruspete](https://github.com/Saruspete)) - ## [before_rebase](https://github.com/netdata/netdata/tree/before_rebase) (2020-09-24) [Full Changelog](https://github.com/netdata/netdata/compare/v1.25.0...before_rebase) diff --git a/CMakeLists.txt b/CMakeLists.txt index 415c673d8..5088d1381 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -481,10 +481,14 @@ set(SLABINFO_PLUGIN_FILES set(EBPF_PROCESS_PLUGIN_FILES collectors/ebpf.plugin/ebpf.c collectors/ebpf.plugin/ebpf.h + collectors/ebpf.plugin/ebpf_cachestat.c + collectors/ebpf.plugin/ebpf_cachestat.h collectors/ebpf.plugin/ebpf_process.c collectors/ebpf.plugin/ebpf_process.h collectors/ebpf.plugin/ebpf_socket.c collectors/ebpf.plugin/ebpf_socket.h + collectors/ebpf.plugin/ebpf_sync.c + collectors/ebpf.plugin/ebpf_sync.h collectors/ebpf.plugin/ebpf_apps.c collectors/ebpf.plugin/ebpf_apps.h ) diff --git a/Makefile.am b/Makefile.am index 1846d1e1c..180d337ce 100644 --- a/Makefile.am +++ b/Makefile.am @@ -116,10 +116,18 @@ SUBDIRS += \ web \ claim \ parser \ - aclk/legacy \ spawn \ $(NULL) +if ACLK_NG +SUBDIRS += \ + mqtt_websockets \ + $(NULL) +else +SUBDIRS += \ + aclk/legacy \ + $(NULL) +endif AM_CFLAGS = \ $(OPTIONAL_MATH_CFLAGS) \ @@ -281,10 +289,14 @@ PERF_PLUGIN_FILES = \ EBPF_PLUGIN_FILES = \ collectors/ebpf.plugin/ebpf.c \ + collectors/ebpf.plugin/ebpf_cachestat.c \ + collectors/ebpf.plugin/ebpf_cachestat.h \ collectors/ebpf.plugin/ebpf_process.c \ collectors/ebpf.plugin/ebpf_process.h \ collectors/ebpf.plugin/ebpf_socket.c \ collectors/ebpf.plugin/ebpf_socket.h \ + collectors/ebpf.plugin/ebpf_sync.c \ + collectors/ebpf.plugin/ebpf_sync.h \ collectors/ebpf.plugin/ebpf.h \ collectors/ebpf.plugin/ebpf_apps.c \ collectors/ebpf.plugin/ebpf_apps.h \ @@ -374,14 +386,14 @@ RRD_PLUGIN_FILES = \ database/rrdsetvar.h \ database/rrdvar.c \ database/rrdvar.h \ + database/sqlite/sqlite_functions.c \ + database/sqlite/sqlite_functions.h \ + database/sqlite/sqlite3.c \ + database/sqlite/sqlite3.h \ $(NULL) if ENABLE_DBENGINE RRD_PLUGIN_FILES += \ - database/sqlite/sqlite_functions.c \ - database/sqlite/sqlite_functions.h \ - database/sqlite/sqlite3.c \ - database/sqlite/sqlite3.h \ database/engine/rrdengine.c \ database/engine/rrdengine.h \ database/engine/rrddiskprotocol.h \ @@ -523,6 +535,30 @@ PARSER_FILES = \ parser/parser.h \ $(NULL) +if ACLK_NG +ACLK_FILES = \ + aclk/aclk.c \ + aclk/aclk.h \ + aclk/aclk_util.c \ + aclk/aclk_util.h \ + aclk/aclk_stats.c \ + aclk/aclk_stats.h \ + aclk/aclk_query.c \ + aclk/aclk_query.h \ + aclk/aclk_query_queue.c \ + aclk/aclk_query_queue.h \ + aclk/aclk_collector_list.c \ + aclk/aclk_collector_list.h \ + aclk/aclk_otp.c \ + aclk/aclk_otp.h \ + aclk/aclk_tx_msgs.c \ + aclk/aclk_tx_msgs.h \ + aclk/aclk_rx_msgs.c \ + aclk/aclk_rx_msgs.h \ + aclk/https_client.c \ + aclk/https_client.h \ + $(NULL) +else #ACLK_NG ACLK_FILES = \ aclk/legacy/aclk_rrdhost_state.h \ aclk/legacy/aclk_common.c \ @@ -546,9 +582,8 @@ ACLK_FILES += \ aclk/legacy/aclk_lws_https_client.c \ aclk/legacy/aclk_lws_https_client.h \ $(NULL) -endif - - +endif #ENABLE_ACLK +endif #ACLK_NG SPAWN_PLUGIN_FILES = \ spawn/spawn.c \ @@ -712,6 +747,12 @@ NETDATACLI_FILES = \ sbin_PROGRAMS += netdata netdata_SOURCES = $(NETDATA_FILES) +if ACLK_NG +netdata_LDADD = \ + mqtt_websockets/libmqttwebsockets.a \ + $(NETDATA_COMMON_LIBS) \ + $(NULL) +else #ACLK_NG if ENABLE_ACLK netdata_LDADD = \ externaldeps/mosquitto/libmosquitto.a \ @@ -719,11 +760,12 @@ netdata_LDADD = \ $(OPTIONAL_LWS_LIBS) \ $(NETDATA_COMMON_LIBS) \ $(NULL) -else +else #ENABLE_ACLK netdata_LDADD = \ $(NETDATA_COMMON_LIBS) \ $(NULL) -endif +endif #ENABLE_ACLK +endif #ACLK_NG if ENABLE_CXX_LINKER netdata_LINK = $(CXXLD) $(CXXFLAGS) $(LDFLAGS) -o $@ diff --git a/aclk/aclk.c b/aclk/aclk.c new file mode 100644 index 000000000..889fa1e4d --- /dev/null +++ b/aclk/aclk.c @@ -0,0 +1,821 @@ +#include "aclk.h" + +#include "aclk_stats.h" +#include "mqtt_wss_client.h" +#include "aclk_otp.h" +#include "aclk_tx_msgs.h" +#include "aclk_query.h" +#include "aclk_query_queue.h" +#include "aclk_util.h" +#include "aclk_rx_msgs.h" +#include "aclk_collector_list.h" + +#ifdef ACLK_LOG_CONVERSATION_DIR +#include +#include +#include +#endif + +#define ACLK_STABLE_TIMEOUT 3 // Minimum delay to mark AGENT as stable + +//TODO remove most (as in 99.999999999%) of this crap +int aclk_connected = 0; +int aclk_disable_runtime = 0; +int aclk_disable_single_updates = 0; +int aclk_kill_link = 0; + +int aclk_pubacks_per_conn = 0; // How many PubAcks we got since MQTT conn est. + +usec_t aclk_session_us = 0; // Used by the mqtt layer +time_t aclk_session_sec = 0; // Used by the mqtt layer + +mqtt_wss_client mqttwss_client; + +netdata_mutex_t aclk_shared_state_mutex = NETDATA_MUTEX_INITIALIZER; +#define ACLK_SHARED_STATE_LOCK netdata_mutex_lock(&aclk_shared_state_mutex) +#define ACLK_SHARED_STATE_UNLOCK netdata_mutex_unlock(&aclk_shared_state_mutex) + +struct aclk_shared_state aclk_shared_state = { + .agent_state = AGENT_INITIALIZING, + .last_popcorn_interrupt = 0, + .version_neg = 0, + .version_neg_wait_till = 0, + .mqtt_shutdown_msg_id = -1, + .mqtt_shutdown_msg_rcvd = 0 +}; + +void aclk_single_update_disable() +{ + aclk_disable_single_updates = 1; +} + +void aclk_single_update_enable() +{ + aclk_disable_single_updates = 0; +} + +//ENDTODO + +static RSA *aclk_private_key = NULL; +static int load_private_key() +{ + if (aclk_private_key != NULL) + RSA_free(aclk_private_key); + aclk_private_key = NULL; + char filename[FILENAME_MAX + 1]; + snprintfz(filename, FILENAME_MAX, "%s/cloud.d/private.pem", netdata_configured_varlib_dir); + + long bytes_read; + char *private_key = read_by_filename(filename, &bytes_read); + if (!private_key) { + error("Claimed agent cannot establish ACLK - unable to load private key '%s' failed.", filename); + return 1; + } + debug(D_ACLK, "Claimed agent loaded private key len=%ld bytes", bytes_read); + + BIO *key_bio = BIO_new_mem_buf(private_key, -1); + if (key_bio==NULL) { + error("Claimed agent cannot establish ACLK - failed to create BIO for key"); + goto biofailed; + } + + aclk_private_key = PEM_read_bio_RSAPrivateKey(key_bio, NULL, NULL, NULL); + BIO_free(key_bio); + if (aclk_private_key!=NULL) + { + freez(private_key); + return 0; + } + char err[512]; + ERR_error_string_n(ERR_get_error(), err, sizeof(err)); + error("Claimed agent cannot establish ACLK - cannot create private key: %s", err); + +biofailed: + freez(private_key); + return 1; +} + +static int wait_till_cloud_enabled() +{ + info("Waiting for Cloud to be enabled"); + while (!netdata_cloud_setting) { + sleep_usec(USEC_PER_SEC * 1); + if (netdata_exit) + return 1; + } + return 0; +} + +/** + * Will block until agent is claimed. Returns only if agent claimed + * or if agent needs to shutdown. + * + * @return `0` if agent has been claimed, + * `1` if interrupted due to agent shutting down + */ +static int wait_till_agent_claimed(void) +{ + //TODO prevent malloc and freez + char *agent_id = is_agent_claimed(); + while (likely(!agent_id)) { + sleep_usec(USEC_PER_SEC * 1); + if (netdata_exit) + return 1; + agent_id = is_agent_claimed(); + } + freez(agent_id); + return 0; +} + +/** + * Checks everything is ready for connection + * agent claimed, cloud url set and private key available + * + * @param aclk_hostname points to location where string pointer to hostname will be set + * @param ackl_port port to int where port will be saved + * + * @return If non 0 returned irrecoverable error happened and ACLK should be terminated + */ +static int wait_till_agent_claim_ready() +{ + int port; + char *hostname = NULL; + while (!netdata_exit) { + if (wait_till_agent_claimed()) + return 1; + + // The NULL return means the value was never initialised, but this value has been initialized in post_conf_load. + // We trap the impossible NULL here to keep the linter happy without using a fatal() in the code. + char *cloud_base_url = appconfig_get(&cloud_config, CONFIG_SECTION_GLOBAL, "cloud base url", NULL); + if (cloud_base_url == NULL) { + error("Do not move the cloud base url out of post_conf_load!!"); + return 1; + } + + // We just check configuration is valid here + // TODO make it without malloc/free + if (aclk_decode_base_url(cloud_base_url, &hostname, &port)) { + error("Agent is claimed but the configuration is invalid, please fix"); + freez(hostname); + hostname = NULL; + sleep(5); + continue; + } + freez(hostname); + hostname = NULL; + + if (!load_private_key()) { + sleep(5); + break; + } + } + + return 0; +} + +void aclk_mqtt_wss_log_cb(mqtt_wss_log_type_t log_type, const char* str) +{ + switch(log_type) { + case MQTT_WSS_LOG_ERROR: + case MQTT_WSS_LOG_FATAL: + case MQTT_WSS_LOG_WARN: + error("%s", str); + return; + case MQTT_WSS_LOG_INFO: + info("%s", str); + return; + case MQTT_WSS_LOG_DEBUG: + debug(D_ACLK, "%s", str); + return; + default: + error("Unknown log type from mqtt_wss"); + } +} + +//TODO prevent big buffer on stack +#define RX_MSGLEN_MAX 4096 +static void msg_callback(const char *topic, const void *msg, size_t msglen, int qos) +{ + char cmsg[RX_MSGLEN_MAX]; + size_t len = (msglen < RX_MSGLEN_MAX - 1) ? msglen : (RX_MSGLEN_MAX - 1); + + if (msglen > RX_MSGLEN_MAX - 1) + error("Incoming ACLK message was bigger than MAX of %d and got truncated.", RX_MSGLEN_MAX); + + memcpy(cmsg, + msg, + len); + cmsg[len] = 0; + +#ifdef ACLK_LOG_CONVERSATION_DIR +#define FN_MAX_LEN 512 + char filename[FN_MAX_LEN]; + int logfd; + snprintf(filename, FN_MAX_LEN, ACLK_LOG_CONVERSATION_DIR "/%010d-rx.json", ACLK_GET_CONV_LOG_NEXT()); + logfd = open(filename, O_CREAT | O_TRUNC | O_WRONLY, S_IRUSR | S_IWUSR ); + if(logfd < 0) + error("Error opening ACLK Conversation logfile \"%s\" for RX message.", filename); + write(logfd, msg, msglen); + close(logfd); +#endif + + debug(D_ACLK, "Got Message From Broker Topic \"%s\" QOS %d MSG: \"%s\"", topic, qos, cmsg); + + if (strcmp(aclk_get_topic(ACLK_TOPICID_COMMAND), topic)) + error("Received message on unexpected topic %s", topic); + + if (aclk_shared_state.mqtt_shutdown_msg_id > 0) { + error("Link is shutting down. Ignoring message."); + return; + } + + aclk_handle_cloud_message(cmsg); +} + +static void puback_callback(uint16_t packet_id) +{ + if (++aclk_pubacks_per_conn == ACLK_PUBACKS_CONN_STABLE) + aclk_reconnect_delay(0); + +#ifdef NETDATA_INTERNAL_CHECKS + aclk_stats_msg_puback(packet_id); +#endif + + if (aclk_shared_state.mqtt_shutdown_msg_id == (int)packet_id) { + error("Got PUBACK for shutdown message. Can exit gracefully."); + aclk_shared_state.mqtt_shutdown_msg_rcvd = 1; + } +} + +static int read_query_thread_count() +{ + int threads = MIN(processors/2, 6); + threads = MAX(threads, 2); + threads = config_get_number(CONFIG_SECTION_CLOUD, "query thread count", threads); + if(threads < 1) { + error("You need at least one query thread. Overriding configured setting of \"%d\"", threads); + threads = 1; + config_set_number(CONFIG_SECTION_CLOUD, "query thread count", threads); + } + return threads; +} + +/* Keeps connection alive and handles all network comms. + * Returns on error or when netdata is shutting down. + * @param client instance of mqtt_wss_client + * @returns 0 - Netdata Exits + * >0 - Error happened. Reconnect and start over. + */ +static int handle_connection(mqtt_wss_client client) +{ + time_t last_periodic_query_wakeup = now_monotonic_sec(); + while (!netdata_exit) { + // timeout 1000 to check at least once a second + // for netdata_exit + if (mqtt_wss_service(client, 1000) < 0){ + error("Connection Error or Dropped"); + return 1; + } + + // mqtt_wss_service will return faster than in one second + // if there is enough work to do + time_t now = now_monotonic_sec(); + if (last_periodic_query_wakeup < now) { + // wake up at least one Query Thread at least + // once per second + last_periodic_query_wakeup = now; + QUERY_THREAD_WAKEUP; + } + } + return 0; +} + +inline static int aclk_popcorn_check_bump() +{ + ACLK_SHARED_STATE_LOCK; + if (unlikely(aclk_shared_state.agent_state == AGENT_INITIALIZING)) { + aclk_shared_state.last_popcorn_interrupt = now_realtime_sec(); + ACLK_SHARED_STATE_UNLOCK; + return 1; + } + ACLK_SHARED_STATE_UNLOCK; + return 0; +} + +static inline void queue_connect_payloads(void) +{ + aclk_query_t query = aclk_query_new(METADATA_INFO); + query->data.metadata_info.host = localhost; + query->data.metadata_info.initial_on_connect = 1; + aclk_queue_query(query); + query = aclk_query_new(METADATA_ALARMS); + query->data.metadata_alarms.initial_on_connect = 1; + aclk_queue_query(query); +} + +static inline void mqtt_connected_actions(mqtt_wss_client client) +{ + // TODO global vars? + usec_t now = now_realtime_usec(); + aclk_session_sec = now / USEC_PER_SEC; + aclk_session_us = now % USEC_PER_SEC; + + mqtt_wss_subscribe(client, aclk_get_topic(ACLK_TOPICID_COMMAND), 1); + + aclk_stats_upd_online(1); + aclk_connected = 1; + aclk_pubacks_per_conn = 0; + aclk_hello_msg(client); + ACLK_SHARED_STATE_LOCK; + if (aclk_shared_state.agent_state != AGENT_INITIALIZING) { + error("Sending `connect` payload immediatelly as popcorning was finished already."); + queue_connect_payloads(); + } + ACLK_SHARED_STATE_UNLOCK; +} + +/* Waits until agent is ready or needs to exit + * @param client instance of mqtt_wss_client + * @param query_threads pointer to aclk_query_threads + * structure where to store data about started query threads + * @return 0 - Popcorning Finished - Agent STABLE, + * !0 - netdata_exit + */ +static int wait_popcorning_finishes(mqtt_wss_client client, struct aclk_query_threads *query_threads) +{ + time_t elapsed; + int need_wait; + while (!netdata_exit) { + ACLK_SHARED_STATE_LOCK; + if (likely(aclk_shared_state.agent_state != AGENT_INITIALIZING)) { + ACLK_SHARED_STATE_UNLOCK; + return 0; + } + elapsed = now_realtime_sec() - aclk_shared_state.last_popcorn_interrupt; + if (elapsed >= ACLK_STABLE_TIMEOUT) { + aclk_shared_state.agent_state = AGENT_STABLE; + ACLK_SHARED_STATE_UNLOCK; + error("ACLK localhost popocorn finished"); + if (unlikely(!query_threads->thread_list)) + aclk_query_threads_start(query_threads, client); + queue_connect_payloads(); + return 0; + } + ACLK_SHARED_STATE_UNLOCK; + need_wait = ACLK_STABLE_TIMEOUT - elapsed; + error("ACLK localhost popocorn wait %d seconds longer", need_wait); + sleep(need_wait); + } + return 1; +} + +void aclk_graceful_disconnect(mqtt_wss_client client) +{ + error("Preparing to Gracefully Shutdown the ACLK"); + aclk_queue_lock(); + aclk_queue_flush(); + aclk_shared_state.mqtt_shutdown_msg_id = aclk_send_app_layer_disconnect(client, "graceful"); + time_t t = now_monotonic_sec(); + while (!mqtt_wss_service(client, 100)) { + if (now_monotonic_sec() - t >= 2) { + error("Wasn't able to gracefully shutdown ACLK in time!"); + break; + } + if (aclk_shared_state.mqtt_shutdown_msg_rcvd) { + error("MQTT App Layer `disconnect` message sent successfully"); + break; + } + } + aclk_stats_upd_online(0); + aclk_connected = 0; + + error("Attempting to Gracefully Shutdown MQTT/WSS connection"); + mqtt_wss_disconnect(client, 1000); +} + +/* Block till aclk_reconnect_delay is satisifed or netdata_exit is signalled + * @return 0 - Go ahead and connect (delay expired) + * 1 - netdata_exit + */ +#define NETDATA_EXIT_POLL_MS (MSEC_PER_SEC/4) +static int aclk_block_till_recon_allowed() { + // Handle reconnect exponential backoff + // fnc aclk_reconnect_delay comes from ACLK Legacy @amoss + // but has been modifed slightly (more randomness) + unsigned long recon_delay = aclk_reconnect_delay(1); + info("Wait before attempting to reconnect in %.3f seconds\n", recon_delay / (float)MSEC_PER_SEC); + // we want to wake up from time to time to check netdata_exit + while (recon_delay) + { + if (netdata_exit) + return 1; + if (recon_delay > NETDATA_EXIT_POLL_MS) { + sleep_usec(NETDATA_EXIT_POLL_MS * USEC_PER_MS); + recon_delay -= NETDATA_EXIT_POLL_MS; + continue; + } + sleep_usec(recon_delay * USEC_PER_MS); + recon_delay = 0; + } + return 0; +} + +#define HTTP_PROXY_PREFIX "http://" +static void set_proxy(struct mqtt_wss_proxy *out) +{ + ACLK_PROXY_TYPE pt; + const char *ptr = aclk_get_proxy(&pt); + char *tmp; + char *host; + if (pt != PROXY_TYPE_HTTP) + return; + + out->port = 0; + + if (!strncmp(ptr, HTTP_PROXY_PREFIX, strlen(HTTP_PROXY_PREFIX))) + ptr += strlen(HTTP_PROXY_PREFIX); + + if ((tmp = strchr(ptr, '@'))) + ptr = tmp; + + if ((tmp = strchr(ptr, '/'))) { + host = mallocz((tmp - ptr) + 1); + memcpy(host, ptr, (tmp - ptr)); + host[tmp - ptr] = 0; + } else + host = strdupz(ptr); + + if ((tmp = strchr(host, ':'))) { + *tmp = 0; + tmp++; + out->port = atoi(tmp); + } + + if (out->port <= 0 || out->port > 65535) + out->port = 8080; + + out->host = host; + + out->type = MQTT_WSS_PROXY_HTTP; +} + +/* Attempts to make a connection to MQTT broker over WSS + * @param client instance of mqtt_wss_client + * @return 0 - Successfull Connection, + * <0 - Irrecoverable Error -> Kill ACLK, + * >0 - netdata_exit + */ +#define CLOUD_BASE_URL_READ_RETRY 30 +#ifdef ACLK_SSL_ALLOW_SELF_SIGNED +#define ACLK_SSL_FLAGS MQTT_WSS_SSL_ALLOW_SELF_SIGNED +#else +#define ACLK_SSL_FLAGS MQTT_WSS_SSL_CERT_CHECK_FULL +#endif +static int aclk_attempt_to_connect(mqtt_wss_client client) +{ + char *aclk_hostname = NULL; + int aclk_port; + +#ifndef ACLK_DISABLE_CHALLENGE + char *mqtt_otp_user = NULL; + char *mqtt_otp_pass = NULL; +#endif + + json_object *lwt; + + while (!netdata_exit) { + char *cloud_base_url = appconfig_get(&cloud_config, CONFIG_SECTION_GLOBAL, "cloud base url", NULL); + if (cloud_base_url == NULL) { + error("Do not move the cloud base url out of post_conf_load!!"); + return -1; + } + + if (aclk_block_till_recon_allowed()) + return 1; + + info("Attempting connection now"); + if (aclk_decode_base_url(cloud_base_url, &aclk_hostname, &aclk_port)) { + error("ACLK base URL configuration key could not be parsed. Will retry in %d seconds.", CLOUD_BASE_URL_READ_RETRY); + sleep(CLOUD_BASE_URL_READ_RETRY); + continue; + } + + struct mqtt_wss_proxy proxy_conf; + proxy_conf.type = MQTT_WSS_DIRECT; + set_proxy(&proxy_conf); + + struct mqtt_connect_params mqtt_conn_params = { + .clientid = "anon", + .username = "anon", + .password = "anon", + .will_topic = aclk_get_topic(ACLK_TOPICID_METADATA), + .will_msg = NULL, + .will_flags = MQTT_WSS_PUB_QOS2, + .keep_alive = 60 + }; +#ifndef ACLK_DISABLE_CHALLENGE + aclk_get_mqtt_otp(aclk_private_key, aclk_hostname, aclk_port, &mqtt_otp_user, &mqtt_otp_pass); + mqtt_conn_params.clientid = mqtt_otp_user; + mqtt_conn_params.username = mqtt_otp_user; + mqtt_conn_params.password = mqtt_otp_pass; +#endif + + lwt = aclk_generate_disconnect(NULL); + mqtt_conn_params.will_msg = json_object_to_json_string_ext(lwt, JSON_C_TO_STRING_PLAIN); + + mqtt_conn_params.will_msg_len = strlen(mqtt_conn_params.will_msg); + if (!mqtt_wss_connect(client, aclk_hostname, aclk_port, &mqtt_conn_params, ACLK_SSL_FLAGS, &proxy_conf)) { + json_object_put(lwt); + freez(aclk_hostname); + aclk_hostname = NULL; + info("MQTTWSS connection succeeded"); + mqtt_connected_actions(client); + return 0; + } + + freez(aclk_hostname); + aclk_hostname = NULL; + json_object_put(lwt); + error("Connect failed\n"); + } + + return 1; +} + +/** + * Main agent cloud link thread + * + * This thread will simply call the main event loop that handles + * pending requests - both inbound and outbound + * + * @param ptr is a pointer to the netdata_static_thread structure. + * + * @return It always returns NULL + */ +void *aclk_main(void *ptr) +{ + struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr; + + struct aclk_stats_thread *stats_thread = NULL; + + struct aclk_query_threads query_threads; + query_threads.thread_list = NULL; + + ACLK_PROXY_TYPE proxy_type; + aclk_get_proxy(&proxy_type); + if (proxy_type == PROXY_TYPE_SOCKS5) { + error("SOCKS5 proxy is not supported by ACLK-NG yet."); + static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; + return NULL; + } + + // This thread is unusual in that it cannot be cancelled by cancel_main_threads() + // as it must notify the far end that it shutdown gracefully and avoid the LWT. + netdata_thread_disable_cancelability(); + +#if defined( DISABLE_CLOUD ) || !defined( ENABLE_ACLK ) + info("Killing ACLK thread -> cloud functionality has been disabled"); + static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; + return NULL; +#endif + aclk_popcorn_check_bump(); // start localhost popcorn timer + query_threads.count = read_query_thread_count(); + + if (wait_till_cloud_enabled()) + goto exit; + + if (wait_till_agent_claim_ready()) + goto exit; + + if (!(mqttwss_client = mqtt_wss_new("mqtt_wss", aclk_mqtt_wss_log_cb, msg_callback, puback_callback))) { + error("Couldn't initialize MQTT_WSS network library"); + goto exit; + } + + aclk_stats_enabled = config_get_boolean(CONFIG_SECTION_CLOUD, "statistics", CONFIG_BOOLEAN_YES); + if (aclk_stats_enabled) { + stats_thread = callocz(1, sizeof(struct aclk_stats_thread)); + stats_thread->thread = mallocz(sizeof(netdata_thread_t)); + stats_thread->query_thread_count = query_threads.count; + netdata_thread_create( + stats_thread->thread, ACLK_STATS_THREAD_NAME, NETDATA_THREAD_OPTION_JOINABLE, aclk_stats_main_thread, + stats_thread); + } + + // Keep reconnecting and talking until our time has come + // and the Grim Reaper (netdata_exit) calls + do { + if (aclk_attempt_to_connect(mqttwss_client)) + goto exit_full; + + // warning this assumes the popcorning is relative short (3s) + // if that changes call mqtt_wss_service from within + // to keep OpenSSL, WSS and MQTT connection alive + if (wait_popcorning_finishes(mqttwss_client, &query_threads)) + goto exit_full; + + if (!handle_connection(mqttwss_client)) { + aclk_stats_upd_online(0); + aclk_connected = 0; + } + } while (!netdata_exit); + + aclk_graceful_disconnect(mqttwss_client); + +exit_full: +// Tear Down + QUERY_THREAD_WAKEUP_ALL; + + aclk_query_threads_cleanup(&query_threads); + + if (aclk_stats_enabled) { + netdata_thread_join(*stats_thread->thread, NULL); + aclk_stats_thread_cleanup(); + freez(stats_thread->thread); + freez(stats_thread); + } + free_topic_cache(); + mqtt_wss_destroy(mqttwss_client); +exit: + static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; + return NULL; +} + +// TODO this is taken over as workaround from old ACLK +// fix this in both old and new ACLK +extern void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host); + +void aclk_alarm_reload(void) +{ + ACLK_SHARED_STATE_LOCK; + if (unlikely(aclk_shared_state.agent_state == AGENT_INITIALIZING)) { + ACLK_SHARED_STATE_UNLOCK; + return; + } + ACLK_SHARED_STATE_UNLOCK; + + aclk_queue_query(aclk_query_new(METADATA_ALARMS)); +} + +int aclk_update_alarm(RRDHOST *host, ALARM_ENTRY *ae) +{ + BUFFER *local_buffer; + json_object *msg; + + if (host != localhost) + return 0; + + ACLK_SHARED_STATE_LOCK; + if (unlikely(aclk_shared_state.agent_state == AGENT_INITIALIZING)) { + ACLK_SHARED_STATE_UNLOCK; + return 0; + } + ACLK_SHARED_STATE_UNLOCK; + + local_buffer = buffer_create(NETDATA_WEB_RESPONSE_HEADER_SIZE); + + netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock); + health_alarm_entry2json_nolock(local_buffer, ae, host); + netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); + + msg = json_tokener_parse(local_buffer->buffer); + + struct aclk_query *query = aclk_query_new(ALARM_STATE_UPDATE); + query->data.alarm_update = msg; + aclk_queue_query(query); + + buffer_free(local_buffer); + return 0; +} + +int aclk_update_chart(RRDHOST *host, char *chart_name, int create) +{ + struct aclk_query *query; + + if (aclk_popcorn_check_bump()) + return 0; + + query = aclk_query_new(create ? CHART_NEW : CHART_DEL); + if(create) { + query->data.chart_add_del.host = host; + query->data.chart_add_del.chart_name = strdupz(chart_name); + } else { + query->data.metadata_info.host = host; + query->data.metadata_info.initial_on_connect = 0; + } + + aclk_queue_query(query); + return 0; +} + +/* + * Add a new collector to the list + * If it exists, update the chart count + */ +void aclk_add_collector(RRDHOST *host, const char *plugin_name, const char *module_name) +{ + struct aclk_query *query; + struct _collector *tmp_collector; + if (unlikely(!netdata_ready)) { + return; + } + + COLLECTOR_LOCK; + + tmp_collector = _add_collector(host->machine_guid, plugin_name, module_name); + + if (unlikely(tmp_collector->count != 1)) { + COLLECTOR_UNLOCK; + return; + } + + COLLECTOR_UNLOCK; + + if (aclk_popcorn_check_bump()) + return; + + if (host != localhost) + return; + + query = aclk_query_new(METADATA_INFO); + query->data.metadata_info.host = localhost; //TODO + query->data.metadata_info.initial_on_connect = 0; + aclk_queue_query(query); + + query = aclk_query_new(METADATA_ALARMS); + query->data.metadata_alarms.initial_on_connect = 0; + aclk_queue_query(query); +} + +/* + * Delete a collector from the list + * If the chart count reaches zero the collector will be removed + * from the list by calling del_collector. + * + * This function will release the memory used and schedule + * a cloud update + */ +void aclk_del_collector(RRDHOST *host, const char *plugin_name, const char *module_name) +{ + struct aclk_query *query; + struct _collector *tmp_collector; + if (unlikely(!netdata_ready)) { + return; + } + + COLLECTOR_LOCK; + + tmp_collector = _del_collector(host->machine_guid, plugin_name, module_name); + + if (unlikely(!tmp_collector || tmp_collector->count)) { + COLLECTOR_UNLOCK; + return; + } + + debug( + D_ACLK, "DEL COLLECTOR [%s:%s] -- charts %u", plugin_name ? plugin_name : "*", module_name ? module_name : "*", + tmp_collector->count); + + COLLECTOR_UNLOCK; + + _free_collector(tmp_collector); + + if (aclk_popcorn_check_bump()) + return; + + if (host != localhost) + return; + + query = aclk_query_new(METADATA_INFO); + query->data.metadata_info.host = localhost; //TODO + query->data.metadata_info.initial_on_connect = 0; + aclk_queue_query(query); + + query = aclk_query_new(METADATA_ALARMS); + query->data.metadata_alarms.initial_on_connect = 0; + aclk_queue_query(query); +} + +struct label *add_aclk_host_labels(struct label *label) { +#ifdef ENABLE_ACLK + ACLK_PROXY_TYPE aclk_proxy; + char *proxy_str; + aclk_get_proxy(&aclk_proxy); + + switch(aclk_proxy) { + case PROXY_TYPE_SOCKS5: + proxy_str = "SOCKS5"; + break; + case PROXY_TYPE_HTTP: + proxy_str = "HTTP"; + break; + default: + proxy_str = "none"; + break; + } + label = add_label_to_list(label, "_aclk_impl", "Next Generation", LABEL_SOURCE_AUTO); + return add_label_to_list(label, "_aclk_proxy", proxy_str, LABEL_SOURCE_AUTO); +#else + return label; +#endif +} diff --git a/aclk/aclk.h b/aclk/aclk.h new file mode 100644 index 000000000..29626c7f4 --- /dev/null +++ b/aclk/aclk.h @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-3.0-or-later +#ifndef ACLK_H +#define ACLK_H + +typedef struct aclk_rrdhost_state { + char *claimed_id; // Claimed ID if host has one otherwise NULL +} aclk_rrdhost_state; + +#include "../daemon/common.h" +#include "aclk_util.h" + +// minimum and maximum supported version of ACLK +// in this version of agent +#define ACLK_VERSION_MIN 2 +#define ACLK_VERSION_MAX 2 + +// Version negotiation messages have they own versioning +// this is also used for LWT message as we set that up +// before version negotiation +#define ACLK_VERSION_NEG_VERSION 1 + +// Maximum time to wait for version negotiation before aborting +// and defaulting to oldest supported version +#define VERSION_NEG_TIMEOUT 3 + +#if ACLK_VERSION_MIN > ACLK_VERSION_MAX +#error "ACLK_VERSION_MAX must be >= than ACLK_VERSION_MIN" +#endif + +// Define ACLK Feature Version Boundaries Here +#define ACLK_V_COMPRESSION 2 + +// How many MQTT PUBACKs we need to get to consider connection +// stable for the purposes of TBEB (truncated binary exponential backoff) +#define ACLK_PUBACKS_CONN_STABLE 3 + +// TODO get rid of this shit +extern int aclk_disable_runtime; +extern int aclk_disable_single_updates; +extern int aclk_kill_link; +extern int aclk_connected; + +extern usec_t aclk_session_us; +extern time_t aclk_session_sec; + +void *aclk_main(void *ptr); +void aclk_single_update_disable(); +void aclk_single_update_enable(); + +#define NETDATA_ACLK_HOOK \ + { .name = "ACLK_Main", \ + .config_section = NULL, \ + .config_name = NULL, \ + .enabled = 1, \ + .thread = NULL, \ + .init_routine = NULL, \ + .start_routine = aclk_main }, + +extern netdata_mutex_t aclk_shared_state_mutex; +#define ACLK_SHARED_STATE_LOCK netdata_mutex_lock(&aclk_shared_state_mutex) +#define ACLK_SHARED_STATE_UNLOCK netdata_mutex_unlock(&aclk_shared_state_mutex) + +typedef enum aclk_agent_state { + AGENT_INITIALIZING, + AGENT_STABLE +} ACLK_AGENT_STATE; +extern struct aclk_shared_state { + ACLK_AGENT_STATE agent_state; + time_t last_popcorn_interrupt; + + // read only while ACLK connected + // protect by lock otherwise + int version_neg; + usec_t version_neg_wait_till; + + // To wait for `disconnect` message PUBACK + // when shuting down + // at the same time if > 0 we know link is + // shutting down + int mqtt_shutdown_msg_id; + int mqtt_shutdown_msg_rcvd; +} aclk_shared_state; + +void aclk_alarm_reload(void); +int aclk_update_alarm(RRDHOST *host, ALARM_ENTRY *ae); + +// TODO this is for bacward compatibility with ACLK legacy +#define ACLK_CMD_CHART 1 +#define ACLK_CMD_CHARTDEL 0 +/* Informs ACLK about created/deleted chart + * @param create 0 - if chart was deleted, other if chart created + */ +int aclk_update_chart(RRDHOST *host, char *chart_name, int create); + +void aclk_add_collector(RRDHOST *host, const char *plugin_name, const char *module_name); +void aclk_del_collector(RRDHOST *host, const char *plugin_name, const char *module_name); + +struct label *add_aclk_host_labels(struct label *label); + +#endif /* ACLK_H */ diff --git a/aclk/aclk_collector_list.c b/aclk/aclk_collector_list.c new file mode 100644 index 000000000..a251a23a8 --- /dev/null +++ b/aclk/aclk_collector_list.c @@ -0,0 +1,193 @@ +// SPDX-License-Identifier: GPL-3.0-or-later +// This is copied from Legacy ACLK, Original Autor: amoss + +// TODO unmess this + +#include "aclk_collector_list.h" + +netdata_mutex_t collector_mutex = NETDATA_MUTEX_INITIALIZER; + +struct _collector *collector_list = NULL; + +/* + * Free a collector structure + */ +void _free_collector(struct _collector *collector) +{ + if (likely(collector->plugin_name)) + freez(collector->plugin_name); + + if (likely(collector->module_name)) + freez(collector->module_name); + + if (likely(collector->hostname)) + freez(collector->hostname); + + freez(collector); +} + +/* + * This will report the collector list + * + */ +#ifdef ACLK_DEBUG +static void _dump_collector_list() +{ + struct _collector *tmp_collector; + + COLLECTOR_LOCK; + + info("DUMPING ALL COLLECTORS"); + + if (unlikely(!collector_list || !collector_list->next)) { + COLLECTOR_UNLOCK; + info("DUMPING ALL COLLECTORS -- nothing found"); + return; + } + + // Note that the first entry is "dummy" + tmp_collector = collector_list->next; + + while (tmp_collector) { + info( + "COLLECTOR %s : [%s:%s] count = %u", tmp_collector->hostname, + tmp_collector->plugin_name ? tmp_collector->plugin_name : "", + tmp_collector->module_name ? tmp_collector->module_name : "", tmp_collector->count); + + tmp_collector = tmp_collector->next; + } + info("DUMPING ALL COLLECTORS DONE"); + COLLECTOR_UNLOCK; +} +#endif + +/* + * This will cleanup the collector list + * + */ +void _reset_collector_list() +{ + struct _collector *tmp_collector, *next_collector; + + COLLECTOR_LOCK; + + if (unlikely(!collector_list || !collector_list->next)) { + COLLECTOR_UNLOCK; + return; + } + + // Note that the first entry is "dummy" + tmp_collector = collector_list->next; + collector_list->count = 0; + collector_list->next = NULL; + + // We broke the link; we can unlock + COLLECTOR_UNLOCK; + + while (tmp_collector) { + next_collector = tmp_collector->next; + _free_collector(tmp_collector); + tmp_collector = next_collector; + } +} + +/* + * Find a collector (if it exists) + * Must lock before calling this + * If last_collector is not null, it will return the previous collector in the linked + * list (used in collector delete) + */ +static struct _collector *_find_collector( + const char *hostname, const char *plugin_name, const char *module_name, struct _collector **last_collector) +{ + struct _collector *tmp_collector, *prev_collector; + uint32_t plugin_hash; + uint32_t module_hash; + uint32_t hostname_hash; + + if (unlikely(!collector_list)) { + collector_list = callocz(1, sizeof(struct _collector)); + return NULL; + } + + if (unlikely(!collector_list->next)) + return NULL; + + plugin_hash = plugin_name ? simple_hash(plugin_name) : 1; + module_hash = module_name ? simple_hash(module_name) : 1; + hostname_hash = simple_hash(hostname); + + // Note that the first entry is "dummy" + tmp_collector = collector_list->next; + prev_collector = collector_list; + while (tmp_collector) { + if (plugin_hash == tmp_collector->plugin_hash && module_hash == tmp_collector->module_hash && + hostname_hash == tmp_collector->hostname_hash && (!strcmp(hostname, tmp_collector->hostname)) && + (!plugin_name || !tmp_collector->plugin_name || !strcmp(plugin_name, tmp_collector->plugin_name)) && + (!module_name || !tmp_collector->module_name || !strcmp(module_name, tmp_collector->module_name))) { + if (unlikely(last_collector)) + *last_collector = prev_collector; + + return tmp_collector; + } + + prev_collector = tmp_collector; + tmp_collector = tmp_collector->next; + } + + return tmp_collector; +} + +/* + * Called to delete a collector + * It will reduce the count (chart_count) and will remove it + * from the linked list if the count reaches zero + * The structure will be returned to the caller to free + * the resources + * + */ +struct _collector *_del_collector(const char *hostname, const char *plugin_name, const char *module_name) +{ + struct _collector *tmp_collector, *prev_collector = NULL; + + tmp_collector = _find_collector(hostname, plugin_name, module_name, &prev_collector); + + if (likely(tmp_collector)) { + --tmp_collector->count; + if (unlikely(!tmp_collector->count)) + prev_collector->next = tmp_collector->next; + } + return tmp_collector; +} + +/* + * Add a new collector (plugin / module) to the list + * If it already exists just update the chart count + * + * Lock before calling + */ +struct _collector *_add_collector(const char *hostname, const char *plugin_name, const char *module_name) +{ + struct _collector *tmp_collector; + + tmp_collector = _find_collector(hostname, plugin_name, module_name, NULL); + + if (unlikely(!tmp_collector)) { + tmp_collector = callocz(1, sizeof(struct _collector)); + tmp_collector->hostname_hash = simple_hash(hostname); + tmp_collector->plugin_hash = plugin_name ? simple_hash(plugin_name) : 1; + tmp_collector->module_hash = module_name ? simple_hash(module_name) : 1; + + tmp_collector->hostname = strdupz(hostname); + tmp_collector->plugin_name = plugin_name ? strdupz(plugin_name) : NULL; + tmp_collector->module_name = module_name ? strdupz(module_name) : NULL; + + tmp_collector->next = collector_list->next; + collector_list->next = tmp_collector; + } + tmp_collector->count++; + debug( + D_ACLK, "ADD COLLECTOR %s [%s:%s] -- chart %u", hostname, plugin_name ? plugin_name : "*", + module_name ? module_name : "*", tmp_collector->count); + return tmp_collector; +} diff --git a/aclk/aclk_collector_list.h b/aclk/aclk_collector_list.h new file mode 100644 index 000000000..98d30ba94 --- /dev/null +++ b/aclk/aclk_collector_list.h @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-3.0-or-later +// This is copied from Legacy ACLK, Original Autor: amoss + +// TODO unmess this + +#ifndef ACLK_COLLECTOR_LIST_H +#define ACLK_COLLECTOR_LIST_H + +#include "libnetdata/libnetdata.h" + +extern netdata_mutex_t collector_mutex; + +#define COLLECTOR_LOCK netdata_mutex_lock(&collector_mutex) +#define COLLECTOR_UNLOCK netdata_mutex_unlock(&collector_mutex) + +/* + * Maintain a list of collectors and chart count + * If all the charts of a collector are deleted + * then a new metadata dataset must be send to the cloud + * + */ +struct _collector { + time_t created; + uint32_t count; //chart count + uint32_t hostname_hash; + uint32_t plugin_hash; + uint32_t module_hash; + char *hostname; + char *plugin_name; + char *module_name; + struct _collector *next; +}; + +struct _collector *_add_collector(const char *hostname, const char *plugin_name, const char *module_name); +struct _collector *_del_collector(const char *hostname, const char *plugin_name, const char *module_name); +void _reset_collector_list(); +void _free_collector(struct _collector *collector); + +#endif /* ACLK_COLLECTOR_LIST_H */ diff --git a/aclk/aclk_otp.c b/aclk/aclk_otp.c new file mode 100644 index 000000000..fcb9d600c --- /dev/null +++ b/aclk/aclk_otp.c @@ -0,0 +1,261 @@ + +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "aclk_otp.h" + +#include "https_client.h" + +#include "../daemon/common.h" + +#include "../mqtt_websockets/c-rbuf/include/ringbuffer.h" + +struct dictionary_singleton { + char *key; + char *result; +}; + +static int json_extract_singleton(JSON_ENTRY *e) +{ + struct dictionary_singleton *data = e->callback_data; + + switch (e->type) { + case JSON_OBJECT: + case JSON_ARRAY: + break; + case JSON_STRING: + if (!strcmp(e->name, data->key)) { + data->result = strdupz(e->data.string); + break; + } + break; + case JSON_NUMBER: + case JSON_BOOLEAN: + case JSON_NULL: + break; + } + return 0; +} + +// Base-64 decoder. +// Note: This is non-validating, invalid input will be decoded without an error. +// Challenges are packed into json strings so we don't skip newlines. +// Size errors (i.e. invalid input size or insufficient output space) are caught. +static size_t base64_decode(unsigned char *input, size_t input_size, unsigned char *output, size_t output_size) +{ + static char lookup[256]; + static int first_time=1; + if (first_time) + { + first_time = 0; + for(int i=0; i<256; i++) + lookup[i] = -1; + for(int i='A'; i<='Z'; i++) + lookup[i] = i-'A'; + for(int i='a'; i<='z'; i++) + lookup[i] = i-'a' + 26; + for(int i='0'; i<='9'; i++) + lookup[i] = i-'0' + 52; + lookup['+'] = 62; + lookup['/'] = 63; + } + if ((input_size & 3) != 0) + { + error("Can't decode base-64 input length %zu", input_size); + return 0; + } + size_t unpadded_size = (input_size/4) * 3; + if ( unpadded_size > output_size ) + { + error("Output buffer size %zu is too small to decode %zu into", output_size, input_size); + return 0; + } + // Don't check padding within full quantums + for (size_t i = 0 ; i < input_size-4 ; i+=4 ) + { + uint32_t value = (lookup[input[0]] << 18) + (lookup[input[1]] << 12) + (lookup[input[2]] << 6) + lookup[input[3]]; + output[0] = value >> 16; + output[1] = value >> 8; + output[2] = value; + //error("Decoded %c %c %c %c -> %02x %02x %02x", input[0], input[1], input[2], input[3], output[0], output[1], output[2]); + output += 3; + input += 4; + } + // Handle padding only in last quantum + if (input[2] == '=') { + uint32_t value = (lookup[input[0]] << 6) + lookup[input[1]]; + output[0] = value >> 4; + //error("Decoded %c %c %c %c -> %02x", input[0], input[1], input[2], input[3], output[0]); + return unpadded_size-2; + } + else if (input[3] == '=') { + uint32_t value = (lookup[input[0]] << 12) + (lookup[input[1]] << 6) + lookup[input[2]]; + output[0] = value >> 10; + output[1] = value >> 2; + //error("Decoded %c %c %c %c -> %02x %02x", input[0], input[1], input[2], input[3], output[0], output[1]); + return unpadded_size-1; + } + else + { + uint32_t value = (input[0] << 18) + (input[1] << 12) + (input[2]<<6) + input[3]; + output[0] = value >> 16; + output[1] = value >> 8; + output[2] = value; + //error("Decoded %c %c %c %c -> %02x %02x %02x", input[0], input[1], input[2], input[3], output[0], output[1], output[2]); + return unpadded_size; + } +} + +static size_t base64_encode(unsigned char *input, size_t input_size, char *output, size_t output_size) +{ + uint32_t value; + static char lookup[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; + if ((input_size/3+1)*4 >= output_size) + { + error("Output buffer for encoding size=%zu is not large enough for %zu-bytes input", output_size, input_size); + return 0; + } + size_t count = 0; + while (input_size>3) + { + value = ((input[0] << 16) + (input[1] << 8) + input[2]) & 0xffffff; + output[0] = lookup[value >> 18]; + output[1] = lookup[(value >> 12) & 0x3f]; + output[2] = lookup[(value >> 6) & 0x3f]; + output[3] = lookup[value & 0x3f]; + //error("Base-64 encode (%04x) -> %c %c %c %c\n", value, output[0], output[1], output[2], output[3]); + output += 4; + input += 3; + input_size -= 3; + count += 4; + } + switch (input_size) + { + case 2: + value = (input[0] << 10) + (input[1] << 2); + output[0] = lookup[(value >> 12) & 0x3f]; + output[1] = lookup[(value >> 6) & 0x3f]; + output[2] = lookup[value & 0x3f]; + output[3] = '='; + //error("Base-64 encode (%06x) -> %c %c %c %c\n", (value>>2)&0xffff, output[0], output[1], output[2], output[3]); + count += 4; + break; + case 1: + value = input[0] << 4; + output[0] = lookup[(value >> 6) & 0x3f]; + output[1] = lookup[value & 0x3f]; + output[2] = '='; + output[3] = '='; + //error("Base-64 encode (%06x) -> %c %c %c %c\n", value, output[0], output[1], output[2], output[3]); + count += 4; + break; + case 0: + break; + } + return count; +} + +static int private_decrypt(RSA *p_key, unsigned char * enc_data, int data_len, unsigned char *decrypted) +{ + int result = RSA_private_decrypt( data_len, enc_data, decrypted, p_key, RSA_PKCS1_OAEP_PADDING); + if (result == -1) { + char err[512]; + ERR_error_string_n(ERR_get_error(), err, sizeof(err)); + error("Decryption of the challenge failed: %s", err); + } + return result; +} + +// aclk_get_mqtt_otp is slightly modified original code from @amoss +void aclk_get_mqtt_otp(RSA *p_key, char *aclk_hostname, int port, char **mqtt_usr, char **mqtt_pass) +{ + char *data_buffer = mallocz(NETDATA_WEB_RESPONSE_INITIAL_SIZE); + debug(D_ACLK, "Performing challenge-response sequence"); + if (*mqtt_pass != NULL) + { + freez(*mqtt_pass); + *mqtt_pass = NULL; + } + // curl http://cloud-iam-agent-service:8080/api/v1/auth/node/00000000-0000-0000-0000-000000000000/challenge + // TODO - target host? + char *agent_id = is_agent_claimed(); + if (agent_id == NULL) + { + error("Agent was not claimed - cannot perform challenge/response"); + goto CLEANUP; + } + char url[1024]; + sprintf(url, "/api/v1/auth/node/%s/challenge", agent_id); + info("Retrieving challenge from cloud: %s %d %s", aclk_hostname, port, url); + if (https_request(HTTP_REQ_GET, aclk_hostname, port, url, data_buffer, NETDATA_WEB_RESPONSE_INITIAL_SIZE, NULL)) + { + error("Challenge failed: %s", data_buffer); + goto CLEANUP; + } + struct dictionary_singleton challenge = { .key = "challenge", .result = NULL }; + + debug(D_ACLK, "Challenge response from cloud: %s", data_buffer); + if (json_parse(data_buffer, &challenge, json_extract_singleton) != JSON_OK) + { + freez(challenge.result); + error("Could not parse the json response with the challenge: %s", data_buffer); + goto CLEANUP; + } + if (challenge.result == NULL) { + error("Could not retrieve challenge from auth response: %s", data_buffer); + goto CLEANUP; + } + + + size_t challenge_len = strlen(challenge.result); + unsigned char decoded[512]; + size_t decoded_len = base64_decode((unsigned char*)challenge.result, challenge_len, decoded, sizeof(decoded)); + + unsigned char plaintext[4096]={}; + int decrypted_length = private_decrypt(p_key, decoded, decoded_len, plaintext); + freez(challenge.result); + char encoded[512]; + size_t encoded_len = base64_encode(plaintext, decrypted_length, encoded, sizeof(encoded)); + encoded[encoded_len] = 0; + debug(D_ACLK, "Encoded len=%zu Decryption len=%d: '%s'", encoded_len, decrypted_length, encoded); + + char response_json[4096]={}; + sprintf(response_json, "{\"response\":\"%s\"}", encoded); + debug(D_ACLK, "Password phase: %s",response_json); + // TODO - host + sprintf(url, "/api/v1/auth/node/%s/password", agent_id); + if (https_request(HTTP_REQ_POST, aclk_hostname, port, url, data_buffer, NETDATA_WEB_RESPONSE_INITIAL_SIZE, response_json)) + { + error("Challenge-response failed: %s", data_buffer); + goto CLEANUP; + } + + debug(D_ACLK, "Password response from cloud: %s", data_buffer); + + struct dictionary_singleton password = { .key = "password", .result = NULL }; + if (json_parse(data_buffer, &password, json_extract_singleton) != JSON_OK) + { + freez(password.result); + error("Could not parse the json response with the password: %s", data_buffer); + goto CLEANUP; + } + + if (password.result == NULL ) { + error("Could not retrieve password from auth response"); + goto CLEANUP; + } + if (*mqtt_pass != NULL ) + freez(*mqtt_pass); + *mqtt_pass = password.result; + if (*mqtt_usr != NULL) + freez(*mqtt_usr); + *mqtt_usr = agent_id; + agent_id = NULL; + +CLEANUP: + if (agent_id != NULL) + freez(agent_id); + freez(data_buffer); + return; +} diff --git a/aclk/aclk_otp.h b/aclk/aclk_otp.h new file mode 100644 index 000000000..31e81c5a1 --- /dev/null +++ b/aclk/aclk_otp.h @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef ACLK_OTP_H +#define ACLK_OTP_H + +#include "../daemon/common.h" + +void aclk_get_mqtt_otp(RSA *p_key, char *aclk_hostname, int port, char **mqtt_usr, char **mqtt_pass); + +#endif /* ACLK_OTP_H */ diff --git a/aclk/aclk_query.c b/aclk/aclk_query.c new file mode 100644 index 000000000..71c63f647 --- /dev/null +++ b/aclk/aclk_query.c @@ -0,0 +1,295 @@ +#include "aclk_query.h" +#include "aclk_stats.h" +#include "aclk_query_queue.h" +#include "aclk_tx_msgs.h" + +#define ACLK_QUERY_THREAD_NAME "ACLK_Query" + +#define WEB_HDR_ACCEPT_ENC "Accept-Encoding:" + +pthread_cond_t query_cond_wait = PTHREAD_COND_INITIALIZER; +pthread_mutex_t query_lock_wait = PTHREAD_MUTEX_INITIALIZER; +#define QUERY_THREAD_LOCK pthread_mutex_lock(&query_lock_wait) +#define QUERY_THREAD_UNLOCK pthread_mutex_unlock(&query_lock_wait) + +typedef struct aclk_query_handler { + aclk_query_type_t type; + char *name; // for logging purposes + int(*fnc)(mqtt_wss_client client, aclk_query_t query); +} aclk_query_handler; + +static int info_metadata(mqtt_wss_client client, aclk_query_t query) +{ + aclk_send_info_metadata(client, + !query->data.metadata_info.initial_on_connect, + query->data.metadata_info.host); + return 0; +} + +static int alarms_metadata(mqtt_wss_client client, aclk_query_t query) +{ + aclk_send_alarm_metadata(client, + !query->data.metadata_info.initial_on_connect); + return 0; +} + +static usec_t aclk_web_api_v1_request(RRDHOST *host, struct web_client *w, char *url) +{ + usec_t t; + + t = now_monotonic_high_precision_usec(); + w->response.code = web_client_api_request_v1(host, w, url); + t = now_monotonic_high_precision_usec() - t; + + if (aclk_stats_enabled) { + ACLK_STATS_LOCK; + aclk_metrics_per_sample.cloud_q_process_total += t; + aclk_metrics_per_sample.cloud_q_process_count++; + if (aclk_metrics_per_sample.cloud_q_process_max < t) + aclk_metrics_per_sample.cloud_q_process_max = t; + ACLK_STATS_UNLOCK; + } + + return t; +} + +static int http_api_v2(mqtt_wss_client client, aclk_query_t query) +{ + int retval = 0; + usec_t t; + BUFFER *local_buffer = NULL; + +#ifdef NETDATA_WITH_ZLIB + int z_ret; + BUFFER *z_buffer = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE); + char *start, *end; +#endif + + struct web_client *w = (struct web_client *)callocz(1, sizeof(struct web_client)); + w->response.data = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE); + w->response.header = buffer_create(NETDATA_WEB_RESPONSE_HEADER_SIZE); + w->response.header_output = buffer_create(NETDATA_WEB_RESPONSE_HEADER_SIZE); + strcpy(w->origin, "*"); // Simulate web_client_create_on_fd() + w->cookie1[0] = 0; // Simulate web_client_create_on_fd() + w->cookie2[0] = 0; // Simulate web_client_create_on_fd() + w->acl = 0x1f; + + char *mysep = strchr(query->data.http_api_v2.query, '?'); + if (mysep) { + url_decode_r(w->decoded_query_string, mysep, NETDATA_WEB_REQUEST_URL_SIZE + 1); + *mysep = '\0'; + } else + url_decode_r(w->decoded_query_string, query->data.http_api_v2.query, NETDATA_WEB_REQUEST_URL_SIZE + 1); + + mysep = strrchr(query->data.http_api_v2.query, '/'); + + // execute the query + t = aclk_web_api_v1_request(localhost, w, mysep ? mysep + 1 : "noop"); + +#ifdef NETDATA_WITH_ZLIB + // check if gzip encoding can and should be used + if ((start = strstr((char *)query->data.http_api_v2.payload, WEB_HDR_ACCEPT_ENC))) { + start += strlen(WEB_HDR_ACCEPT_ENC); + end = strstr(start, "\x0D\x0A"); + start = strstr(start, "gzip"); + + if (start && start < end) { + w->response.zstream.zalloc = Z_NULL; + w->response.zstream.zfree = Z_NULL; + w->response.zstream.opaque = Z_NULL; + if(deflateInit2(&w->response.zstream, web_gzip_level, Z_DEFLATED, 15 + 16, 8, web_gzip_strategy) == Z_OK) { + w->response.zinitialized = 1; + w->response.zoutput = 1; + } else + error("Failed to initialize zlib. Proceeding without compression."); + } + } + + if (w->response.data->len && w->response.zinitialized) { + w->response.zstream.next_in = (Bytef *)w->response.data->buffer; + w->response.zstream.avail_in = w->response.data->len; + do { + w->response.zstream.avail_out = NETDATA_WEB_RESPONSE_ZLIB_CHUNK_SIZE; + w->response.zstream.next_out = w->response.zbuffer; + z_ret = deflate(&w->response.zstream, Z_FINISH); + if(z_ret < 0) { + if(w->response.zstream.msg) + error("Error compressing body. ZLIB error: \"%s\"", w->response.zstream.msg); + else + error("Unknown error during zlib compression."); + retval = 1; + goto cleanup; + } + int bytes_to_cpy = NETDATA_WEB_RESPONSE_ZLIB_CHUNK_SIZE - w->response.zstream.avail_out; + buffer_need_bytes(z_buffer, bytes_to_cpy); + memcpy(&z_buffer->buffer[z_buffer->len], w->response.zbuffer, bytes_to_cpy); + z_buffer->len += bytes_to_cpy; + } while(z_ret != Z_STREAM_END); + // so that web_client_build_http_header + // puts correct content lenght into header + buffer_free(w->response.data); + w->response.data = z_buffer; + z_buffer = NULL; + } +#endif + + now_realtime_timeval(&w->tv_ready); + w->response.data->date = w->tv_ready.tv_sec; + web_client_build_http_header(w); + local_buffer = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE); + local_buffer->contenttype = CT_APPLICATION_JSON; + + buffer_strcat(local_buffer, w->response.header_output->buffer); + + if (w->response.data->len) { +#ifdef NETDATA_WITH_ZLIB + if (w->response.zinitialized) { + buffer_need_bytes(local_buffer, w->response.data->len); + memcpy(&local_buffer->buffer[local_buffer->len], w->response.data->buffer, w->response.data->len); + local_buffer->len += w->response.data->len; + } else { +#endif + buffer_strcat(local_buffer, w->response.data->buffer); +#ifdef NETDATA_WITH_ZLIB + } +#endif + } + + aclk_http_msg_v2(client, query->callback_topic, query->msg_id, t, query->created, w->response.code, local_buffer->buffer, local_buffer->len); + +cleanup: +#ifdef NETDATA_WITH_ZLIB + if(w->response.zinitialized) + deflateEnd(&w->response.zstream); + buffer_free(z_buffer); +#endif + buffer_free(w->response.data); + buffer_free(w->response.header); + buffer_free(w->response.header_output); + freez(w); + buffer_free(local_buffer); + return retval; +} + +static int chart_query(mqtt_wss_client client, aclk_query_t query) +{ + aclk_chart_msg(client, query->data.chart_add_del.host, query->data.chart_add_del.chart_name); + return 0; +} + +static int alarm_state_update_query(mqtt_wss_client client, aclk_query_t query) +{ + aclk_alarm_state_msg(client, query->data.alarm_update); + // aclk_alarm_state_msg frees the json object including the header it generates + query->data.alarm_update = NULL; + return 0; +} + +aclk_query_handler aclk_query_handlers[] = { + { .type = HTTP_API_V2, .name = "http api request v2", .fnc = http_api_v2 }, + { .type = ALARM_STATE_UPDATE, .name = "alarm state update", .fnc = alarm_state_update_query }, + { .type = METADATA_INFO, .name = "info metadata", .fnc = info_metadata }, + { .type = METADATA_ALARMS, .name = "alarms metadata", .fnc = alarms_metadata }, + { .type = CHART_NEW, .name = "chart new", .fnc = chart_query }, + { .type = CHART_DEL, .name = "chart delete", .fnc = info_metadata }, + { .type = UNKNOWN, .name = NULL, .fnc = NULL } +}; + + +static void aclk_query_process_msg(struct aclk_query_thread *info, aclk_query_t query) +{ + for (int i = 0; aclk_query_handlers[i].type != UNKNOWN; i++) { + if (aclk_query_handlers[i].type == query->type) { + debug(D_ACLK, "Processing Queued Message of type: \"%s\"", aclk_query_handlers[i].name); + aclk_query_handlers[i].fnc(info->client, query); + aclk_query_free(query); + if (aclk_stats_enabled) { + ACLK_STATS_LOCK; + aclk_metrics_per_sample.queries_dispatched++; + aclk_queries_per_thread[info->idx]++; + ACLK_STATS_UNLOCK; + } + return; + } + } + fatal("Unknown query in query queue. %u", query->type); +} + +/* Processes messages from queue. Compete for work with other threads + */ +int aclk_query_process_msgs(struct aclk_query_thread *info) +{ + aclk_query_t query; + while ((query = aclk_queue_pop())) + aclk_query_process_msg(info, query); + + return 0; +} + +/** + * Main query processing thread + */ +void *aclk_query_main_thread(void *ptr) +{ + struct aclk_query_thread *info = ptr; + while (!netdata_exit) { + ACLK_SHARED_STATE_LOCK; + if (unlikely(!aclk_shared_state.version_neg)) { + if (!aclk_shared_state.version_neg_wait_till || aclk_shared_state.version_neg_wait_till > now_monotonic_usec()) { + ACLK_SHARED_STATE_UNLOCK; + info("Waiting for ACLK Version Negotiation message from Cloud"); + sleep(1); + continue; + } + errno = 0; + error("ACLK version negotiation failed. No reply to \"hello\" with \"version\" from cloud in time of %ds." + " Reverting to default ACLK version of %d.", VERSION_NEG_TIMEOUT, ACLK_VERSION_MIN); + aclk_shared_state.version_neg = ACLK_VERSION_MIN; +// When ACLK v3 is implemented you will need this +// aclk_set_rx_handlers(aclk_shared_state.version_neg); + } + ACLK_SHARED_STATE_UNLOCK; + + aclk_query_process_msgs(info); + + QUERY_THREAD_LOCK; + + if (unlikely(pthread_cond_wait(&query_cond_wait, &query_lock_wait))) + sleep_usec(USEC_PER_SEC * 1); + + QUERY_THREAD_UNLOCK; + } + return NULL; +} + +#define TASK_LEN_MAX 16 +void aclk_query_threads_start(struct aclk_query_threads *query_threads, mqtt_wss_client client) +{ + info("Starting %d query threads.", query_threads->count); + + char thread_name[TASK_LEN_MAX]; + query_threads->thread_list = callocz(query_threads->count, sizeof(struct aclk_query_thread)); + for (int i = 0; i < query_threads->count; i++) { + query_threads->thread_list[i].idx = i; //thread needs to know its index for statistics + + if(unlikely(snprintf(thread_name, TASK_LEN_MAX, "%s_%d", ACLK_QUERY_THREAD_NAME, i) < 0)) + error("snprintf encoding error"); + netdata_thread_create( + &query_threads->thread_list[i].thread, thread_name, NETDATA_THREAD_OPTION_JOINABLE, aclk_query_main_thread, + &query_threads->thread_list[i]); + + query_threads->thread_list[i].client = client; + } +} + +void aclk_query_threads_cleanup(struct aclk_query_threads *query_threads) +{ + if (query_threads && query_threads->thread_list) { + for (int i = 0; i < query_threads->count; i++) { + netdata_thread_join(query_threads->thread_list[i].thread, NULL); + } + freez(query_threads->thread_list); + } + aclk_queue_lock(); + aclk_queue_flush(); +} diff --git a/aclk/aclk_query.h b/aclk/aclk_query.h new file mode 100644 index 000000000..43741fb32 --- /dev/null +++ b/aclk/aclk_query.h @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_ACLK_QUERY_H +#define NETDATA_ACLK_QUERY_H + +#include "libnetdata/libnetdata.h" + +#include "mqtt_wss_client.h" + +extern pthread_cond_t query_cond_wait; +extern pthread_mutex_t query_lock_wait; +#define QUERY_THREAD_WAKEUP pthread_cond_signal(&query_cond_wait) +#define QUERY_THREAD_WAKEUP_ALL pthread_cond_broadcast(&query_cond_wait) + +// TODO +//extern volatile int aclk_connected; + +struct aclk_query_thread { + netdata_thread_t thread; + int idx; + mqtt_wss_client client; +}; + +struct aclk_query_threads { + struct aclk_query_thread *thread_list; + int count; +}; + +void aclk_query_threads_start(struct aclk_query_threads *query_threads, mqtt_wss_client client); +void aclk_query_threads_cleanup(struct aclk_query_threads *query_threads); + +#endif //NETDATA_AGENT_CLOUD_LINK_H diff --git a/aclk/aclk_query_queue.c b/aclk/aclk_query_queue.c new file mode 100644 index 000000000..c9461b233 --- /dev/null +++ b/aclk/aclk_query_queue.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "aclk_query_queue.h" +#include "aclk_query.h" +#include "aclk_stats.h" + +static netdata_mutex_t aclk_query_queue_mutex = NETDATA_MUTEX_INITIALIZER; +#define ACLK_QUEUE_LOCK netdata_mutex_lock(&aclk_query_queue_mutex) +#define ACLK_QUEUE_UNLOCK netdata_mutex_unlock(&aclk_query_queue_mutex) + +static struct aclk_query_queue { + aclk_query_t head; + aclk_query_t tail; + int block_push; +} aclk_query_queue = { + .head = NULL, + .tail = NULL, + .block_push = 0 +}; + +static inline int _aclk_queue_query(aclk_query_t query) +{ + query->created = now_realtime_usec(); + ACLK_QUEUE_LOCK; + if (aclk_query_queue.block_push) { + ACLK_QUEUE_UNLOCK; + if(!netdata_exit) + error("Query Queue is blocked from accepting new requests. This is normally the case when ACLK prepares to shutdown."); + aclk_query_free(query); + return 1; + } + if (!aclk_query_queue.head) { + aclk_query_queue.head = query; + aclk_query_queue.tail = query; + ACLK_QUEUE_UNLOCK; + return 0; + } + // TODO deduplication + aclk_query_queue.tail->next = query; + aclk_query_queue.tail = query; + ACLK_QUEUE_UNLOCK; + return 0; + +} + +int aclk_queue_query(aclk_query_t query) +{ + int ret = _aclk_queue_query(query); + if (!ret) { + QUERY_THREAD_WAKEUP; + if (aclk_stats_enabled) { + ACLK_STATS_LOCK; + aclk_metrics_per_sample.queries_queued++; + ACLK_STATS_UNLOCK; + } + } + return ret; +} + +aclk_query_t aclk_queue_pop(void) +{ + aclk_query_t ret; + + ACLK_QUEUE_LOCK; + if (aclk_query_queue.block_push) { + ACLK_QUEUE_UNLOCK; + if(!netdata_exit) + error("POP Query Queue is blocked from accepting new requests. This is normally the case when ACLK prepares to shutdown."); + return NULL; + } + + ret = aclk_query_queue.head; + if (!ret) { + ACLK_QUEUE_UNLOCK; + return ret; + } + + aclk_query_queue.head = ret->next; + if (unlikely(!aclk_query_queue.head)) + aclk_query_queue.tail = aclk_query_queue.head; + ACLK_QUEUE_UNLOCK; + + ret->next = NULL; + return ret; +} + +void aclk_queue_flush(void) +{ + aclk_query_t query = aclk_queue_pop(); + while (query) { + aclk_query_free(query); + query = aclk_queue_pop(); + }; +} + +aclk_query_t aclk_query_new(aclk_query_type_t type) +{ + aclk_query_t query = callocz(1, sizeof(struct aclk_query)); + query->type = type; + return query; +} + +void aclk_query_free(aclk_query_t query) +{ + if (query->type == HTTP_API_V2) { + freez(query->data.http_api_v2.payload); + if (query->data.http_api_v2.query != query->dedup_id) + freez(query->data.http_api_v2.query); + } + + if (query->type == CHART_NEW) + freez(query->data.chart_add_del.chart_name); + + if (query->type == ALARM_STATE_UPDATE && query->data.alarm_update) + json_object_put(query->data.alarm_update); + + freez(query->dedup_id); + freez(query->callback_topic); + freez(query->msg_id); + freez(query); +} + +void aclk_queue_lock(void) +{ + ACLK_QUEUE_LOCK; + aclk_query_queue.block_push = 1; + ACLK_QUEUE_UNLOCK; +} diff --git a/aclk/aclk_query_queue.h b/aclk/aclk_query_queue.h new file mode 100644 index 000000000..c46513567 --- /dev/null +++ b/aclk/aclk_query_queue.h @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_ACLK_QUERY_QUEUE_H +#define NETDATA_ACLK_QUERY_QUEUE_H + +#include "libnetdata/libnetdata.h" +#include "../daemon/common.h" + +typedef enum { + UNKNOWN, + METADATA_INFO, + METADATA_ALARMS, + HTTP_API_V2, + CHART_NEW, + CHART_DEL, + ALARM_STATE_UPDATE +} aclk_query_type_t; + +struct aclk_query_metadata { + RRDHOST *host; + int initial_on_connect; +}; + +struct aclk_query_chart_add_del { + RRDHOST *host; + char* chart_name; +}; + +struct aclk_query_http_api_v2 { + char *payload; + char *query; +}; + +typedef struct aclk_query *aclk_query_t; +struct aclk_query { + aclk_query_type_t type; + + // dedup_id is used to deduplicate queries in the list + // if type and dedup_id is the same message is deduplicated + // set dedup_id to NULL to never deduplicate the message + // set dedup_id to constant (e.g. empty string "") to make + // message of this type ever exist only once in the list + char *dedup_id; + char *callback_topic; + char *msg_id; + + usec_t created; + + aclk_query_t next; + + // TODO maybe remove? + int version; + union { + struct aclk_query_metadata metadata_info; + struct aclk_query_metadata metadata_alarms; + struct aclk_query_http_api_v2 http_api_v2; + struct aclk_query_chart_add_del chart_add_del; + json_object *alarm_update; + } data; +}; + +aclk_query_t aclk_query_new(aclk_query_type_t type); +void aclk_query_free(aclk_query_t query); + +int aclk_queue_query(aclk_query_t query); +aclk_query_t aclk_queue_pop(void); +void aclk_queue_flush(void); + +void aclk_queue_lock(void); + +#endif /* NETDATA_ACLK_QUERY_QUEUE_H */ diff --git a/aclk/aclk_rx_msgs.c b/aclk/aclk_rx_msgs.c new file mode 100644 index 000000000..fcb8d9968 --- /dev/null +++ b/aclk/aclk_rx_msgs.c @@ -0,0 +1,343 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "aclk_rx_msgs.h" + +#include "aclk_stats.h" +#include "aclk_query_queue.h" + +#define ACLK_V2_PAYLOAD_SEPARATOR "\x0D\x0A\x0D\x0A" +#define ACLK_CLOUD_REQ_V2_PREFIX "GET /api/v1/" + +struct aclk_request { + char *type_id; + char *msg_id; + char *callback_topic; + char *payload; + int version; + int min_version; + int max_version; +}; + +int cloud_to_agent_parse(JSON_ENTRY *e) +{ + struct aclk_request *data = e->callback_data; + + switch (e->type) { + case JSON_OBJECT: + case JSON_ARRAY: + break; + case JSON_STRING: + if (!strcmp(e->name, "msg-id")) { + data->msg_id = strdupz(e->data.string); + break; + } + if (!strcmp(e->name, "type")) { + data->type_id = strdupz(e->data.string); + break; + } + if (!strcmp(e->name, "callback-topic")) { + data->callback_topic = strdupz(e->data.string); + break; + } + if (!strcmp(e->name, "payload")) { + if (likely(e->data.string)) { + size_t len = strlen(e->data.string); + data->payload = mallocz(len+1); + if (!url_decode_r(data->payload, e->data.string, len + 1)) + strcpy(data->payload, e->data.string); + } + break; + } + break; + case JSON_NUMBER: + if (!strcmp(e->name, "version")) { + data->version = e->data.number; + break; + } + if (!strcmp(e->name, "min-version")) { + data->min_version = e->data.number; + break; + } + if (!strcmp(e->name, "max-version")) { + data->max_version = e->data.number; + break; + } + + break; + + case JSON_BOOLEAN: + break; + + case JSON_NULL: + break; + } + return 0; +} + +static inline int aclk_extract_v2_data(char *payload, char **data) +{ + char* ptr = strstr(payload, ACLK_V2_PAYLOAD_SEPARATOR); + if(!ptr) + return 1; + ptr += strlen(ACLK_V2_PAYLOAD_SEPARATOR); + *data = strdupz(ptr); + return 0; +} + +static inline int aclk_v2_payload_get_query(const char *payload, char **query_url) +{ + const char *start, *end; + + if(strncmp(payload, ACLK_CLOUD_REQ_V2_PREFIX, strlen(ACLK_CLOUD_REQ_V2_PREFIX))) { + errno = 0; + error("Only accepting requests that start with \"%s\" from CLOUD.", ACLK_CLOUD_REQ_V2_PREFIX); + return 1; + } + start = payload + 4; + + if(!(end = strstr(payload, " HTTP/1.1\x0D\x0A"))) { + errno = 0; + error("Doesn't look like HTTP GET request."); + return 1; + } + + *query_url = mallocz((end - start) + 1); + strncpyz(*query_url, start, end - start); + + return 0; +} + +#define HTTP_CHECK_AGENT_INITIALIZED() ACLK_SHARED_STATE_LOCK;\ + if (unlikely(aclk_shared_state.agent_state == AGENT_INITIALIZING)) {\ + debug(D_ACLK, "Ignoring \"http\" cloud request; agent not in stable state");\ + ACLK_SHARED_STATE_UNLOCK;\ + return 1;\ + }\ + ACLK_SHARED_STATE_UNLOCK; + +static int aclk_handle_cloud_request_v2(struct aclk_request *cloud_to_agent, char *raw_payload) +{ + HTTP_CHECK_AGENT_INITIALIZED(); + + aclk_query_t query; + + errno = 0; + if (cloud_to_agent->version < ACLK_V_COMPRESSION) { + error( + "This handler cannot reply to request with version older than %d, received %d.", + ACLK_V_COMPRESSION, + cloud_to_agent->version); + return 1; + } + + query = aclk_query_new(HTTP_API_V2); + + if (unlikely(aclk_extract_v2_data(raw_payload, &query->data.http_api_v2.payload))) { + error("Error extracting payload expected after the JSON dictionary."); + goto error; + } + + if (unlikely(aclk_v2_payload_get_query(query->data.http_api_v2.payload, &query->dedup_id))) { + error("Could not extract payload from query"); + goto error; + } + + if (unlikely(!cloud_to_agent->callback_topic)) { + error("Missing callback_topic"); + goto error; + } + + if (unlikely(!cloud_to_agent->msg_id)) { + error("Missing msg_id"); + goto error; + } + + // aclk_queue_query takes ownership of data pointer + query->callback_topic = cloud_to_agent->callback_topic; + // for clarity and code readability as when we process the request + // it would be strange to get URL from `dedup_id` + query->data.http_api_v2.query = query->dedup_id; + query->msg_id = cloud_to_agent->msg_id; + aclk_queue_query(query); + return 0; + +error: + aclk_query_free(query); + return 1; +} + +// This handles `version` message from cloud used to negotiate +// protocol version we will use +static int aclk_handle_version_response(struct aclk_request *cloud_to_agent, char *raw_payload) +{ + UNUSED(raw_payload); + int version = -1; + errno = 0; + + if (unlikely(cloud_to_agent->version != ACLK_VERSION_NEG_VERSION)) { + error( + "Unsuported version of \"version\" message from cloud. Expected %d, Got %d", + ACLK_VERSION_NEG_VERSION, + cloud_to_agent->version); + return 1; + } + if (unlikely(!cloud_to_agent->min_version)) { + error("Min version missing or 0"); + return 1; + } + if (unlikely(!cloud_to_agent->max_version)) { + error("Max version missing or 0"); + return 1; + } + if (unlikely(cloud_to_agent->max_version < cloud_to_agent->min_version)) { + error( + "Max version (%d) must be >= than min version (%d)", cloud_to_agent->max_version, + cloud_to_agent->min_version); + return 1; + } + + if (unlikely(cloud_to_agent->min_version > ACLK_VERSION_MAX)) { + error( + "Agent too old for this cloud. Minimum version required by cloud %d." + " Maximum version supported by this agent %d.", + cloud_to_agent->min_version, ACLK_VERSION_MAX); + aclk_kill_link = 1; + aclk_disable_runtime = 1; + return 1; + } + if (unlikely(cloud_to_agent->max_version < ACLK_VERSION_MIN)) { + error( + "Cloud version is too old for this agent. Maximum version supported by cloud %d." + " Minimum (oldest) version supported by this agent %d.", + cloud_to_agent->max_version, ACLK_VERSION_MIN); + aclk_kill_link = 1; + return 1; + } + + version = MIN(cloud_to_agent->max_version, ACLK_VERSION_MAX); + + ACLK_SHARED_STATE_LOCK; + if (unlikely(now_monotonic_usec() > aclk_shared_state.version_neg_wait_till)) { + errno = 0; + error("The \"version\" message came too late ignoring."); + goto err_cleanup; + } + if (unlikely(aclk_shared_state.version_neg)) { + errno = 0; + error("Version has already been set to %d", aclk_shared_state.version_neg); + goto err_cleanup; + } + aclk_shared_state.version_neg = version; + ACLK_SHARED_STATE_UNLOCK; + + info("Choosing version %d of ACLK", version); + + aclk_set_rx_handlers(version); + + return 0; + +err_cleanup: + ACLK_SHARED_STATE_UNLOCK; + return 1; +} + +typedef struct aclk_incoming_msg_type{ + char *name; + int(*fnc)(struct aclk_request *, char *); +}aclk_incoming_msg_type; + +aclk_incoming_msg_type aclk_incoming_msg_types_compression[] = { + { .name = "http", .fnc = aclk_handle_cloud_request_v2 }, + { .name = "version", .fnc = aclk_handle_version_response }, + { .name = NULL, .fnc = NULL } +}; + +struct aclk_incoming_msg_type *aclk_incoming_msg_types = aclk_incoming_msg_types_compression; + +void aclk_set_rx_handlers(int version) +{ +// ACLK_NG ACLK version support starts at 2 +// TODO ACLK v3 + UNUSED(version); + aclk_incoming_msg_types = aclk_incoming_msg_types_compression; +} + +int aclk_handle_cloud_message(char *payload) +{ + struct aclk_request cloud_to_agent; + memset(&cloud_to_agent, 0, sizeof(struct aclk_request)); + + if (aclk_stats_enabled) { + ACLK_STATS_LOCK; + aclk_metrics_per_sample.cloud_req_recvd++; + ACLK_STATS_UNLOCK; + } + + if (unlikely(!payload)) { + errno = 0; + error("ACLK incoming message is empty"); + goto err_cleanup_nojson; + } + + debug(D_ACLK, "ACLK incoming message (%s)", payload); + + int rc = json_parse(payload, &cloud_to_agent, cloud_to_agent_parse); + + if (unlikely(rc != JSON_OK)) { + errno = 0; + error("Malformed json request (%s)", payload); + goto err_cleanup; + } + + if (!cloud_to_agent.type_id) { + errno = 0; + error("Cloud message is missing compulsory key \"type\""); + goto err_cleanup; + } + + if (!aclk_shared_state.version_neg && strcmp(cloud_to_agent.type_id, "version")) { + error("Only \"version\" message is allowed before popcorning and version negotiation is finished. Ignoring"); + goto err_cleanup; + } + + for (int i = 0; aclk_incoming_msg_types[i].name; i++) { + if (strcmp(cloud_to_agent.type_id, aclk_incoming_msg_types[i].name) == 0) { + if (likely(!aclk_incoming_msg_types[i].fnc(&cloud_to_agent, payload))) { + // in case of success handler is supposed to clean up after itself + // or as in the case of aclk_handle_cloud_request take + // ownership of the pointers (done to avoid copying) + // see what `aclk_queue_query` parameter `internal` does + + // NEVER CONTINUE THIS LOOP AFTER CALLING FUNCTION!!! + // msg handlers (namely aclk_handle_version_responce) + // can freely change what aclk_incoming_msg_types points to + // so either exit or restart this for loop + freez(cloud_to_agent.type_id); + return 0; + } + goto err_cleanup; + } + } + + errno = 0; + error("Unknown message type from Cloud \"%s\"", cloud_to_agent.type_id); + +err_cleanup: + if (cloud_to_agent.payload) + freez(cloud_to_agent.payload); + if (cloud_to_agent.type_id) + freez(cloud_to_agent.type_id); + if (cloud_to_agent.msg_id) + freez(cloud_to_agent.msg_id); + if (cloud_to_agent.callback_topic) + freez(cloud_to_agent.callback_topic); + +err_cleanup_nojson: + if (aclk_stats_enabled) { + ACLK_STATS_LOCK; + aclk_metrics_per_sample.cloud_req_err++; + ACLK_STATS_UNLOCK; + } + + return 1; +} diff --git a/aclk/aclk_rx_msgs.h b/aclk/aclk_rx_msgs.h new file mode 100644 index 000000000..c9f0bd37a --- /dev/null +++ b/aclk/aclk_rx_msgs.h @@ -0,0 +1,14 @@ + + +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef ACLK_RX_MSGS_H +#define ACLK_RX_MSGS_H + +#include "../daemon/common.h" +#include "libnetdata/libnetdata.h" + +int aclk_handle_cloud_message(char *payload); +void aclk_set_rx_handlers(int version); + +#endif /* ACLK_RX_MSGS_H */ diff --git a/aclk/aclk_stats.c b/aclk/aclk_stats.c new file mode 100644 index 000000000..b61ac05f7 --- /dev/null +++ b/aclk/aclk_stats.c @@ -0,0 +1,274 @@ +#include "aclk_stats.h" + +netdata_mutex_t aclk_stats_mutex = NETDATA_MUTEX_INITIALIZER; + +int aclk_stats_enabled; + +int query_thread_count; + +// data ACLK stats need per query thread +struct aclk_qt_data { + RRDDIM *dim; +} *aclk_qt_data = NULL; + +uint32_t *aclk_queries_per_thread = NULL; +uint32_t *aclk_queries_per_thread_sample = NULL; + +struct aclk_metrics aclk_metrics = { + .online = 0, +}; + +struct aclk_metrics_per_sample aclk_metrics_per_sample; + +static void aclk_stats_collect(struct aclk_metrics_per_sample *per_sample, struct aclk_metrics *permanent) +{ + static RRDSET *st_aclkstats = NULL; + static RRDDIM *rd_online_status = NULL; + + if (unlikely(!st_aclkstats)) { + st_aclkstats = rrdset_create_localhost( + "netdata", "aclk_status", NULL, "aclk", NULL, "ACLK/Cloud connection status", + "connected", "netdata", "stats", 200000, localhost->rrd_update_every, RRDSET_TYPE_LINE); + + rd_online_status = rrddim_add(st_aclkstats, "online", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + } else + rrdset_next(st_aclkstats); + + rrddim_set_by_pointer(st_aclkstats, rd_online_status, per_sample->offline_during_sample ? 0 : permanent->online); + + rrdset_done(st_aclkstats); +} + +static void aclk_stats_query_queue(struct aclk_metrics_per_sample *per_sample) +{ + static RRDSET *st_query_thread = NULL; + static RRDDIM *rd_queued = NULL; + static RRDDIM *rd_dispatched = NULL; + + if (unlikely(!st_query_thread)) { + st_query_thread = rrdset_create_localhost( + "netdata", "aclk_query_per_second", NULL, "aclk", NULL, "ACLK Queries per second", "queries/s", + "netdata", "stats", 200001, localhost->rrd_update_every, RRDSET_TYPE_AREA); + + rd_queued = rrddim_add(st_query_thread, "added", NULL, 1, localhost->rrd_update_every, RRD_ALGORITHM_ABSOLUTE); + rd_dispatched = rrddim_add(st_query_thread, "dispatched", NULL, -1, localhost->rrd_update_every, RRD_ALGORITHM_ABSOLUTE); + } else + rrdset_next(st_query_thread); + + rrddim_set_by_pointer(st_query_thread, rd_queued, per_sample->queries_queued); + rrddim_set_by_pointer(st_query_thread, rd_dispatched, per_sample->queries_dispatched); + + rrdset_done(st_query_thread); +} + +#ifdef NETDATA_INTERNAL_CHECKS +static void aclk_stats_latency(struct aclk_metrics_per_sample *per_sample) +{ + static RRDSET *st = NULL; + static RRDDIM *rd_avg = NULL; + static RRDDIM *rd_max = NULL; + + if (unlikely(!st)) { + st = rrdset_create_localhost( + "netdata", "aclk_latency_mqtt", NULL, "aclk", NULL, "ACLK Message Publish Latency", "ms", + "netdata", "stats", 200002, localhost->rrd_update_every, RRDSET_TYPE_LINE); + + rd_avg = rrddim_add(st, "avg", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + rd_max = rrddim_add(st, "max", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + } else + rrdset_next(st); + if(per_sample->latency_count) + rrddim_set_by_pointer(st, rd_avg, roundf((float)per_sample->latency_total / per_sample->latency_count)); + else + rrddim_set_by_pointer(st, rd_avg, 0); + + rrddim_set_by_pointer(st, rd_max, per_sample->latency_max); + + rrdset_done(st); +} +#endif + +static void aclk_stats_cloud_req(struct aclk_metrics_per_sample *per_sample) +{ + static RRDSET *st = NULL; + static RRDDIM *rd_rq_rcvd = NULL; + static RRDDIM *rd_rq_err = NULL; + + if (unlikely(!st)) { + st = rrdset_create_localhost( + "netdata", "aclk_cloud_req", NULL, "aclk", NULL, "Requests received from cloud", "req/s", + "netdata", "stats", 200005, localhost->rrd_update_every, RRDSET_TYPE_STACKED); + + rd_rq_rcvd = rrddim_add(st, "received", NULL, 1, localhost->rrd_update_every, RRD_ALGORITHM_ABSOLUTE); + rd_rq_err = rrddim_add(st, "malformed", NULL, 1, localhost->rrd_update_every, RRD_ALGORITHM_ABSOLUTE); + } else + rrdset_next(st); + + rrddim_set_by_pointer(st, rd_rq_rcvd, per_sample->cloud_req_recvd - per_sample->cloud_req_err); + rrddim_set_by_pointer(st, rd_rq_err, per_sample->cloud_req_err); + + rrdset_done(st); +} + +#define MAX_DIM_NAME 16 +static void aclk_stats_query_threads(uint32_t *queries_per_thread) +{ + static RRDSET *st = NULL; + + char dim_name[MAX_DIM_NAME]; + + if (unlikely(!st)) { + st = rrdset_create_localhost( + "netdata", "aclk_query_threads", NULL, "aclk", NULL, "Queries Processed Per Thread", "req/s", + "netdata", "stats", 200007, localhost->rrd_update_every, RRDSET_TYPE_STACKED); + + for (int i = 0; i < query_thread_count; i++) { + if (snprintf(dim_name, MAX_DIM_NAME, "Query %d", i) < 0) + error("snprintf encoding error"); + aclk_qt_data[i].dim = rrddim_add(st, dim_name, NULL, 1, localhost->rrd_update_every, RRD_ALGORITHM_ABSOLUTE); + } + } else + rrdset_next(st); + + for (int i = 0; i < query_thread_count; i++) { + rrddim_set_by_pointer(st, aclk_qt_data[i].dim, queries_per_thread[i]); + } + + rrdset_done(st); +} + +static void aclk_stats_query_time(struct aclk_metrics_per_sample *per_sample) +{ + static RRDSET *st = NULL; + static RRDDIM *rd_rq_avg = NULL; + static RRDDIM *rd_rq_max = NULL; + static RRDDIM *rd_rq_total = NULL; + + if (unlikely(!st)) { + st = rrdset_create_localhost( + "netdata", "aclk_query_time", NULL, "aclk", NULL, "Time it took to process cloud requested DB queries", "us", + "netdata", "stats", 200006, localhost->rrd_update_every, RRDSET_TYPE_LINE); + + rd_rq_avg = rrddim_add(st, "avg", NULL, 1, localhost->rrd_update_every, RRD_ALGORITHM_ABSOLUTE); + rd_rq_max = rrddim_add(st, "max", NULL, 1, localhost->rrd_update_every, RRD_ALGORITHM_ABSOLUTE); + rd_rq_total = rrddim_add(st, "total", NULL, 1, localhost->rrd_update_every, RRD_ALGORITHM_ABSOLUTE); + } else + rrdset_next(st); + + if(per_sample->cloud_q_process_count) + rrddim_set_by_pointer(st, rd_rq_avg, roundf((float)per_sample->cloud_q_process_total / per_sample->cloud_q_process_count)); + else + rrddim_set_by_pointer(st, rd_rq_avg, 0); + rrddim_set_by_pointer(st, rd_rq_max, per_sample->cloud_q_process_max); + rrddim_set_by_pointer(st, rd_rq_total, per_sample->cloud_q_process_total); + + rrdset_done(st); +} + +void aclk_stats_thread_cleanup() +{ + freez(aclk_qt_data); + freez(aclk_queries_per_thread); + freez(aclk_queries_per_thread_sample); +} + +void *aclk_stats_main_thread(void *ptr) +{ + struct aclk_stats_thread *args = ptr; + + query_thread_count = args->query_thread_count; + aclk_qt_data = callocz(query_thread_count, sizeof(struct aclk_qt_data)); + aclk_queries_per_thread = callocz(query_thread_count, sizeof(uint32_t)); + aclk_queries_per_thread_sample = callocz(query_thread_count, sizeof(uint32_t)); + + heartbeat_t hb; + heartbeat_init(&hb); + usec_t step_ut = localhost->rrd_update_every * USEC_PER_SEC; + + memset(&aclk_metrics_per_sample, 0, sizeof(struct aclk_metrics_per_sample)); + + struct aclk_metrics_per_sample per_sample; + struct aclk_metrics permanent; + + while (!netdata_exit) { + netdata_thread_testcancel(); + // ------------------------------------------------------------------------ + // Wait for the next iteration point. + + heartbeat_next(&hb, step_ut); + if (netdata_exit) break; + + ACLK_STATS_LOCK; + // to not hold lock longer than necessary, especially not to hold it + // during database rrd* operations + memcpy(&per_sample, &aclk_metrics_per_sample, sizeof(struct aclk_metrics_per_sample)); + memcpy(&permanent, &aclk_metrics, sizeof(struct aclk_metrics)); + memset(&aclk_metrics_per_sample, 0, sizeof(struct aclk_metrics_per_sample)); + + memcpy(aclk_queries_per_thread_sample, aclk_queries_per_thread, sizeof(uint32_t) * query_thread_count); + memset(aclk_queries_per_thread, 0, sizeof(uint32_t) * query_thread_count); + ACLK_STATS_UNLOCK; + + aclk_stats_collect(&per_sample, &permanent); + aclk_stats_query_queue(&per_sample); +#ifdef NETDATA_INTERNAL_CHECKS + aclk_stats_latency(&per_sample); +#endif + + aclk_stats_cloud_req(&per_sample); + aclk_stats_query_threads(aclk_queries_per_thread_sample); + + aclk_stats_query_time(&per_sample); + } + + return 0; +} + +void aclk_stats_upd_online(int online) { + if(!aclk_stats_enabled) + return; + + ACLK_STATS_LOCK; + aclk_metrics.online = online; + + if(!online) + aclk_metrics_per_sample.offline_during_sample = 1; + ACLK_STATS_UNLOCK; +} + +#ifdef NETDATA_INTERNAL_CHECKS +static usec_t pub_time[UINT16_MAX]; +void aclk_stats_msg_published(uint16_t id) +{ + ACLK_STATS_LOCK; + pub_time[id] = now_boottime_usec(); + ACLK_STATS_UNLOCK; +} + +void aclk_stats_msg_puback(uint16_t id) +{ + ACLK_STATS_LOCK; + usec_t t; + + if (!aclk_stats_enabled) { + ACLK_STATS_UNLOCK; + return; + } + + if (unlikely(!pub_time[id])) { + ACLK_STATS_UNLOCK; + error("Received PUBACK for unknown message?!"); + return; + } + + t = now_boottime_usec() - pub_time[id]; + t /= USEC_PER_MS; + pub_time[id] = 0; + if (aclk_metrics_per_sample.latency_max < t) + aclk_metrics_per_sample.latency_max = t; + + aclk_metrics_per_sample.latency_total += t; + aclk_metrics_per_sample.latency_count++; + ACLK_STATS_UNLOCK; +} +#endif /* NETDATA_INTERNAL_CHECKS */ diff --git a/aclk/aclk_stats.h b/aclk/aclk_stats.h new file mode 100644 index 000000000..33d016965 --- /dev/null +++ b/aclk/aclk_stats.h @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_ACLK_STATS_H +#define NETDATA_ACLK_STATS_H + +#include "../daemon/common.h" +#include "libnetdata/libnetdata.h" + +#define ACLK_STATS_THREAD_NAME "ACLK_Stats" + +extern netdata_mutex_t aclk_stats_mutex; + +#define ACLK_STATS_LOCK netdata_mutex_lock(&aclk_stats_mutex) +#define ACLK_STATS_UNLOCK netdata_mutex_unlock(&aclk_stats_mutex) + +extern int aclk_stats_enabled; + +struct aclk_stats_thread { + netdata_thread_t *thread; + int query_thread_count; +}; + +// preserve between samples +struct aclk_metrics { + volatile uint8_t online; +}; + +// reset to 0 on every sample +extern struct aclk_metrics_per_sample { + /* in the unlikely event of ACLK disconnecting + and reconnecting under 1 sampling rate + we want to make sure we record the disconnection + despite it being then seemingly longer in graph */ + volatile uint8_t offline_during_sample; + + volatile uint32_t queries_queued; + volatile uint32_t queries_dispatched; + +#ifdef NETDATA_INTERNAL_CHECKS + volatile uint32_t latency_max; + volatile uint32_t latency_total; + volatile uint32_t latency_count; +#endif + + volatile uint32_t cloud_req_recvd; + volatile uint32_t cloud_req_err; + + volatile uint32_t cloud_q_process_total; + volatile uint32_t cloud_q_process_count; + volatile uint32_t cloud_q_process_max; +} aclk_metrics_per_sample; + +extern uint32_t *aclk_queries_per_thread; + +void *aclk_stats_main_thread(void *ptr); +void aclk_stats_thread_cleanup(); +void aclk_stats_upd_online(int online); + +#ifdef NETDATA_INTERNAL_CHECKS +void aclk_stats_msg_published(uint16_t id); +void aclk_stats_msg_puback(uint16_t id); +#endif /* NETDATA_INTERNAL_CHECKS */ + +#endif /* NETDATA_ACLK_STATS_H */ diff --git a/aclk/aclk_tx_msgs.c b/aclk/aclk_tx_msgs.c new file mode 100644 index 000000000..158fc4e26 --- /dev/null +++ b/aclk/aclk_tx_msgs.c @@ -0,0 +1,395 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "aclk_tx_msgs.h" +#include "../daemon/common.h" +#include "aclk_util.h" +#include "aclk_stats.h" + +#ifndef __GNUC__ +#pragma region aclk_tx_msgs helper functions +#endif + +static void aclk_send_message_subtopic(mqtt_wss_client client, json_object *msg, enum aclk_topics subtopic) +{ + uint16_t packet_id; + const char *str = json_object_to_json_string_ext(msg, JSON_C_TO_STRING_PLAIN); + + mqtt_wss_publish_pid(client, aclk_get_topic(subtopic), str, strlen(str), MQTT_WSS_PUB_QOS1, &packet_id); +#ifdef NETDATA_INTERNAL_CHECKS + aclk_stats_msg_published(packet_id); +#endif +#ifdef ACLK_LOG_CONVERSATION_DIR +#define FN_MAX_LEN 1024 + char filename[FN_MAX_LEN]; + snprintf(filename, FN_MAX_LEN, ACLK_LOG_CONVERSATION_DIR "/%010d-tx.json", ACLK_GET_CONV_LOG_NEXT()); + json_object_to_file_ext(filename, msg, JSON_C_TO_STRING_PRETTY); +#endif +} + +static uint16_t aclk_send_message_subtopic_pid(mqtt_wss_client client, json_object *msg, enum aclk_topics subtopic) +{ + uint16_t packet_id; + const char *str = json_object_to_json_string_ext(msg, JSON_C_TO_STRING_PLAIN); + + mqtt_wss_publish_pid(client, aclk_get_topic(subtopic), str, strlen(str), MQTT_WSS_PUB_QOS1, &packet_id); +#ifdef NETDATA_INTERNAL_CHECKS + aclk_stats_msg_published(packet_id); +#endif +#ifdef ACLK_LOG_CONVERSATION_DIR +#define FN_MAX_LEN 1024 + char filename[FN_MAX_LEN]; + snprintf(filename, FN_MAX_LEN, ACLK_LOG_CONVERSATION_DIR "/%010d-tx.json", ACLK_GET_CONV_LOG_NEXT()); + json_object_to_file_ext(filename, msg, JSON_C_TO_STRING_PRETTY); +#endif + return packet_id; +} + +/* UNUSED now but can be used soon MVP1? +static void aclk_send_message_topic(mqtt_wss_client client, json_object *msg, const char *topic) +{ + if (unlikely(!topic || topic[0] != '/')) { + error ("Full topic required!"); + return; + } + + const char *str = json_object_to_json_string_ext(msg, JSON_C_TO_STRING_PLAIN); + + mqtt_wss_publish(client, topic, str, strlen(str), MQTT_WSS_PUB_QOS1); +#ifdef NETDATA_INTERNAL_CHECKS + aclk_stats_msg_published(); +#endif +#ifdef ACLK_LOG_CONVERSATION_DIR +#define FN_MAX_LEN 1024 + char filename[FN_MAX_LEN]; + snprintf(filename, FN_MAX_LEN, ACLK_LOG_CONVERSATION_DIR "/%010d-tx.json", ACLK_GET_CONV_LOG_NEXT()); + json_object_to_file_ext(filename, msg, JSON_C_TO_STRING_PRETTY); +#endif +} +*/ + +#define TOPIC_MAX_LEN 512 +#define V2_BIN_PAYLOAD_SEPARATOR "\x0D\x0A\x0D\x0A" +static void aclk_send_message_with_bin_payload(mqtt_wss_client client, json_object *msg, const char *topic, const void *payload, size_t payload_len) +{ + uint16_t packet_id; + const char *str; + char *full_msg; + int len; + + if (unlikely(!topic || topic[0] != '/')) { + error ("Full topic required!"); + return; + } + + str = json_object_to_json_string_ext(msg, JSON_C_TO_STRING_PLAIN); + len = strlen(str); + + full_msg = mallocz(len + strlen(V2_BIN_PAYLOAD_SEPARATOR) + payload_len); + + memcpy(full_msg, str, len); + memcpy(&full_msg[len], V2_BIN_PAYLOAD_SEPARATOR, strlen(V2_BIN_PAYLOAD_SEPARATOR)); + len += strlen(V2_BIN_PAYLOAD_SEPARATOR); + memcpy(&full_msg[len], payload, payload_len); + len += payload_len; + +/* TODO +#ifdef ACLK_LOG_CONVERSATION_DIR +#define FN_MAX_LEN 1024 + char filename[FN_MAX_LEN]; + snprintf(filename, FN_MAX_LEN, ACLK_LOG_CONVERSATION_DIR "/%010d-tx.json", ACLK_GET_CONV_LOG_NEXT()); + json_object_to_file_ext(filename, msg, JSON_C_TO_STRING_PRETTY); +#endif */ + + mqtt_wss_publish_pid(client, topic, full_msg, len, MQTT_WSS_PUB_QOS1, &packet_id); +#ifdef NETDATA_INTERNAL_CHECKS + aclk_stats_msg_published(packet_id); +#endif + freez(full_msg); +} + +/* + * Creates universal header common for all ACLK messages. User gets ownership of json object created. + * Usually this is freed by send function after message has been sent. + */ +static struct json_object *create_hdr(const char *type, const char *msg_id, time_t ts_secs, usec_t ts_us, int version) +{ + uuid_t uuid; + char uuid_str[36 + 1]; + json_object *tmp; + json_object *obj = json_object_new_object(); + + tmp = json_object_new_string(type); + json_object_object_add(obj, "type", tmp); + + if (unlikely(!msg_id)) { + uuid_generate(uuid); + uuid_unparse(uuid, uuid_str); + msg_id = uuid_str; + } + + if (ts_secs == 0) { + ts_us = now_realtime_usec(); + ts_secs = ts_us / USEC_PER_SEC; + ts_us = ts_us % USEC_PER_SEC; + } + + tmp = json_object_new_string(msg_id); + json_object_object_add(obj, "msg-id", tmp); + + tmp = json_object_new_int64(ts_secs); + json_object_object_add(obj, "timestamp", tmp); + +// TODO handle this somehow on older json-c +// tmp = json_object_new_uint64(ts_us); +// probably jso->_to_json_strinf -> custom function +// jso->o.c_uint64 -> map this with pointer to signed int +// commit that implements json_object_new_uint64 is 3c3b592 +// between 0.14 and 0.15 + tmp = json_object_new_int64(ts_us); + json_object_object_add(obj, "timestamp-offset-usec", tmp); + + tmp = json_object_new_int64(aclk_session_sec); + json_object_object_add(obj, "connect", tmp); + +// TODO handle this somehow see above +// tmp = json_object_new_uint64(0 /* TODO aclk_session_us */); + tmp = json_object_new_int64(aclk_session_us); + json_object_object_add(obj, "connect-offset-usec", tmp); + + tmp = json_object_new_int(version); + json_object_object_add(obj, "version", tmp); + + return obj; +} + +static char *create_uuid() +{ + uuid_t uuid; + char *uuid_str = mallocz(36 + 1); + + uuid_generate(uuid); + uuid_unparse(uuid, uuid_str); + + return uuid_str; +} + +#ifndef __GNUC__ +#pragma endregion +#endif + +#ifndef __GNUC__ +#pragma region aclk_tx_msgs message generators +#endif + +/* + * This will send the /api/v1/info + */ +#define BUFFER_INITIAL_SIZE (1024 * 16) +void aclk_send_info_metadata(mqtt_wss_client client, int metadata_submitted, RRDHOST *host) +{ + BUFFER *local_buffer = buffer_create(BUFFER_INITIAL_SIZE); + json_object *msg, *payload, *tmp; + + char *msg_id = create_uuid(); + buffer_flush(local_buffer); + local_buffer->contenttype = CT_APPLICATION_JSON; + + // on_connect messages are sent on a health reload, if the on_connect message is real then we + // use the session time as the fake timestamp to indicate that it starts the session. If it is + // a fake on_connect message then use the real timestamp to indicate it is within the existing + // session. + if (metadata_submitted) + msg = create_hdr("update", msg_id, 0, 0, aclk_shared_state.version_neg); + else + msg = create_hdr("connect", msg_id, aclk_session_sec, aclk_session_us, aclk_shared_state.version_neg); + + payload = json_object_new_object(); + json_object_object_add(msg, "payload", payload); + + web_client_api_request_v1_info_fill_buffer(host, local_buffer); + tmp = json_tokener_parse(local_buffer->buffer); + json_object_object_add(payload, "info", tmp); + + buffer_flush(local_buffer); + + charts2json(host, local_buffer, 1, 0); + tmp = json_tokener_parse(local_buffer->buffer); + json_object_object_add(payload, "charts", tmp); + + aclk_send_message_subtopic(client, msg, ACLK_TOPICID_METADATA); + + json_object_put(msg); + freez(msg_id); + buffer_free(local_buffer); +} + +// TODO should include header instead +void health_active_log_alarms_2json(RRDHOST *host, BUFFER *wb); + +void aclk_send_alarm_metadata(mqtt_wss_client client, int metadata_submitted) +{ + BUFFER *local_buffer = buffer_create(BUFFER_INITIAL_SIZE); + json_object *msg, *payload, *tmp; + + char *msg_id = create_uuid(); + buffer_flush(local_buffer); + local_buffer->contenttype = CT_APPLICATION_JSON; + + // on_connect messages are sent on a health reload, if the on_connect message is real then we + // use the session time as the fake timestamp to indicate that it starts the session. If it is + // a fake on_connect message then use the real timestamp to indicate it is within the existing + // session. + + if (metadata_submitted) + msg = create_hdr("connect_alarms", msg_id, 0, 0, aclk_shared_state.version_neg); + else + msg = create_hdr("connect_alarms", msg_id, aclk_session_sec, aclk_session_us, aclk_shared_state.version_neg); + + payload = json_object_new_object(); + json_object_object_add(msg, "payload", payload); + + health_alarms2json(localhost, local_buffer, 1); + tmp = json_tokener_parse(local_buffer->buffer); + json_object_object_add(payload, "configured-alarms", tmp); + + buffer_flush(local_buffer); + + health_active_log_alarms_2json(localhost, local_buffer); + tmp = json_tokener_parse(local_buffer->buffer); + json_object_object_add(payload, "alarms-active", tmp); + + aclk_send_message_subtopic(client, msg, ACLK_TOPICID_ALARMS); + + json_object_put(msg); + freez(msg_id); + buffer_free(local_buffer); +} + +void aclk_hello_msg(mqtt_wss_client client) +{ + json_object *tmp, *msg; + + char *msg_id = create_uuid(); + + ACLK_SHARED_STATE_LOCK; + aclk_shared_state.version_neg = 0; + aclk_shared_state.version_neg_wait_till = now_monotonic_usec() + USEC_PER_SEC * VERSION_NEG_TIMEOUT; + ACLK_SHARED_STATE_UNLOCK; + + //Hello message is versioned separatelly from the rest of the protocol + msg = create_hdr("hello", msg_id, 0, 0, ACLK_VERSION_NEG_VERSION); + + tmp = json_object_new_int(ACLK_VERSION_MIN); + json_object_object_add(msg, "min-version", tmp); + + tmp = json_object_new_int(ACLK_VERSION_MAX); + json_object_object_add(msg, "max-version", tmp); + +#ifdef ACLK_NG + tmp = json_object_new_string("Next Generation"); +#else + tmp = json_object_new_string("Legacy"); +#endif + json_object_object_add(msg, "aclk-implementation", tmp); + + aclk_send_message_subtopic(client, msg, ACLK_TOPICID_METADATA); + + json_object_put(msg); + freez(msg_id); +} + +void aclk_http_msg_v2(mqtt_wss_client client, const char *topic, const char *msg_id, usec_t t_exec, usec_t created, int http_code, const char *payload, size_t payload_len) +{ + json_object *tmp, *msg; + + msg = create_hdr("http", msg_id, 0, 0, 2); + + tmp = json_object_new_int64(t_exec); + json_object_object_add(msg, "t-exec", tmp); + + tmp = json_object_new_int64(created); + json_object_object_add(msg, "t-rx", tmp); + + tmp = json_object_new_int(http_code); + json_object_object_add(msg, "http-code", tmp); + + aclk_send_message_with_bin_payload(client, msg, topic, payload, payload_len); + json_object_put(msg); +} + +void aclk_chart_msg(mqtt_wss_client client, RRDHOST *host, const char *chart) +{ + json_object *msg, *payload; + BUFFER *tmp_buffer; + RRDSET *st; + + st = rrdset_find(host, chart); + if (!st) + st = rrdset_find_byname(host, chart); + if (!st) { + info("FAILED to find chart %s", chart); + return; + } + + tmp_buffer = buffer_create(BUFFER_INITIAL_SIZE); + rrdset2json(st, tmp_buffer, NULL, NULL, 1); + payload = json_tokener_parse(tmp_buffer->buffer); + if (!payload) { + error("Failed to parse JSON from rrdset2json"); + buffer_free(tmp_buffer); + return; + } + + msg = create_hdr("chart", NULL, 0, 0, aclk_shared_state.version_neg); + json_object_object_add(msg, "payload", payload); + + aclk_send_message_subtopic(client, msg, ACLK_TOPICID_CHART); + + buffer_free(tmp_buffer); + json_object_put(msg); +} + +void aclk_alarm_state_msg(mqtt_wss_client client, json_object *msg) +{ + // we create header here on purpose (and not send message with it already as `msg` param) + // one is version_neg is guaranteed to be done here + // other are timestamps etc. which in ACLK legacy would be wrong (because ACLK legacy + // send message with timestamps already to Query Queue they would be incorrect at time + // when query queue would get to send them) + json_object *obj = create_hdr("status-change", NULL, 0, 0, aclk_shared_state.version_neg); + json_object_object_add(obj, "payload", msg); + + aclk_send_message_subtopic(client, obj, ACLK_TOPICID_ALARMS); + json_object_put(obj); +} + +/* + * Will generate disconnect message. + * @param message if NULL it will generate LWT message (unexpected). + * Otherwise string pointed to by this parameter will be used as + * reason. + */ +json_object *aclk_generate_disconnect(const char *message) +{ + json_object *tmp, *msg; + + msg = create_hdr("disconnect", NULL, 0, 0, 2); + + tmp = json_object_new_string(message ? message : "unexpected"); + json_object_object_add(msg, "payload", tmp); + + return msg; +} + +int aclk_send_app_layer_disconnect(mqtt_wss_client client, const char *message) +{ + int pid; + json_object *msg = aclk_generate_disconnect(message); + pid = aclk_send_message_subtopic_pid(client, msg, ACLK_TOPICID_METADATA); + json_object_put(msg); + return pid; +} + +#ifndef __GNUC__ +#pragma endregion +#endif diff --git a/aclk/aclk_tx_msgs.h b/aclk/aclk_tx_msgs.h new file mode 100644 index 000000000..cb4d44c96 --- /dev/null +++ b/aclk/aclk_tx_msgs.h @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-3.0-or-later +#ifndef ACLK_TX_MSGS_H +#define ACLK_TX_MSGS_H + +#include +#include "libnetdata/libnetdata.h" +#include "../daemon/common.h" +#include "mqtt_wss_client.h" + +void aclk_send_info_metadata(mqtt_wss_client client, int metadata_submitted, RRDHOST *host); +void aclk_send_alarm_metadata(mqtt_wss_client client, int metadata_submitted); + +void aclk_hello_msg(mqtt_wss_client client); + +void aclk_http_msg_v2(mqtt_wss_client client, const char *topic, const char *msg_id, usec_t t_exec, usec_t created, int http_code, const char *payload, size_t payload_len); + +void aclk_chart_msg(mqtt_wss_client client, RRDHOST *host, const char *chart); + +void aclk_alarm_state_msg(mqtt_wss_client client, json_object *msg); + +json_object *aclk_generate_disconnect(const char *message); +int aclk_send_app_layer_disconnect(mqtt_wss_client client, const char *message); + +#endif diff --git a/aclk/aclk_util.c b/aclk/aclk_util.c new file mode 100644 index 000000000..a5347c466 --- /dev/null +++ b/aclk/aclk_util.c @@ -0,0 +1,347 @@ +#include "aclk_util.h" + +#include + +#include "../daemon/common.h" + +// CentOS 7 has older version that doesn't define this +// same goes for MacOS +#ifndef UUID_STR_LEN +#define UUID_STR_LEN 37 +#endif + +#ifdef ACLK_LOG_CONVERSATION_DIR +volatile int aclk_conversation_log_counter = 0; +#if !defined(HAVE_C___ATOMIC) || defined(NETDATA_NO_ATOMIC_INSTRUCTIONS) +netdata_mutex_t aclk_conversation_log_mutex = NETDATA_MUTEX_INITIALIZER; +int aclk_get_conv_log_next() +{ + int ret; + netdata_mutex_lock(&aclk_conversation_log_mutex); + ret = aclk_conversation_log_counter++; + netdata_mutex_unlock(&aclk_conversation_log_mutex); + return ret; +} +#endif +#endif + +#define ACLK_TOPIC_PREFIX "/agent/" + +struct aclk_topic { + const char *topic_suffix; + char *topic; +}; + +// This helps to cache finalized topics (assembled with claim_id) +// to not have to alloc or create buffer and construct topic every +// time message is sent as in old ACLK +static struct aclk_topic aclk_topic_cache[] = { + { .topic_suffix = "outbound/meta", .topic = NULL }, // ACLK_TOPICID_CHART + { .topic_suffix = "outbound/alarms", .topic = NULL }, // ACLK_TOPICID_ALARMS + { .topic_suffix = "outbound/meta", .topic = NULL }, // ACLK_TOPICID_METADATA + { .topic_suffix = "inbound/cmd", .topic = NULL }, // ACLK_TOPICID_COMMAND + { .topic_suffix = NULL, .topic = NULL } +}; + +void free_topic_cache(void) +{ + struct aclk_topic *tc = aclk_topic_cache; + while (tc->topic_suffix) { + if (tc->topic) { + freez(tc->topic); + tc->topic = NULL; + } + tc++; + } +} + +static inline void generate_topic_cache(void) +{ + struct aclk_topic *tc = aclk_topic_cache; + char *ptr; + if (unlikely(!tc->topic)) { + rrdhost_aclk_state_lock(localhost); + while(tc->topic_suffix) { + tc->topic = mallocz(strlen(ACLK_TOPIC_PREFIX) + (UUID_STR_LEN - 1) + 2 /* '/' and \0 */ + strlen(tc->topic_suffix)); + ptr = tc->topic; + strcpy(ptr, ACLK_TOPIC_PREFIX); + ptr += strlen(ACLK_TOPIC_PREFIX); + strcpy(ptr, localhost->aclk_state.claimed_id); + ptr += (UUID_STR_LEN - 1); + *ptr++ = '/'; + strcpy(ptr, tc->topic_suffix); + tc++; + } + rrdhost_aclk_state_unlock(localhost); + } +} + +/* + * Build a topic based on sub_topic and final_topic + * if the sub topic starts with / assume that is an absolute topic + * + */ +const char *aclk_get_topic(enum aclk_topics topic) +{ + generate_topic_cache(); + + return aclk_topic_cache[topic].topic; +} + +int aclk_decode_base_url(char *url, char **aclk_hostname, int *aclk_port) +{ + int pos = 0; + if (!strncmp("https://", url, 8)) { + pos = 8; + } else if (!strncmp("http://", url, 7)) { + error("Cannot connect ACLK over %s -> unencrypted link is not supported", url); + return 1; + } + int host_end = pos; + while (url[host_end] != 0 && url[host_end] != '/' && url[host_end] != ':') + host_end++; + if (url[host_end] == 0) { + *aclk_hostname = strdupz(url + pos); + *aclk_port = 443; + info("Setting ACLK target host=%s port=%d from %s", *aclk_hostname, *aclk_port, url); + return 0; + } + if (url[host_end] == ':') { + *aclk_hostname = callocz(host_end - pos + 1, 1); + strncpy(*aclk_hostname, url + pos, host_end - pos); + int port_end = host_end + 1; + while (url[port_end] >= '0' && url[port_end] <= '9') + port_end++; + if (port_end - host_end > 6) { + error("Port specified in %s is invalid", url); + freez(*aclk_hostname); + *aclk_hostname = NULL; + return 1; + } + *aclk_port = atoi(&url[host_end+1]); + } + if (url[host_end] == '/') { + *aclk_port = 443; + *aclk_hostname = callocz(1, host_end - pos + 1); + strncpy(*aclk_hostname, url+pos, host_end - pos); + } + info("Setting ACLK target host=%s port=%d from %s", *aclk_hostname, *aclk_port, url); + return 0; +} + +/* + * TBEB with randomness + * + * @param mode 0 - to reset the delay, + * 1 - to advance a step and calculate sleep time [0 .. ACLK_MAX_BACKOFF_DELAY * 1000] ms + * @returns delay in ms + * + */ +#define ACLK_MAX_BACKOFF_DELAY 1024 +unsigned long int aclk_reconnect_delay(int mode) +{ + static int fail = -1; + unsigned long int delay; + + if (!mode || fail == -1) { + srandom(time(NULL)); + fail = mode - 1; + return 0; + } + + delay = (1 << fail); + + if (delay >= ACLK_MAX_BACKOFF_DELAY) { + delay = ACLK_MAX_BACKOFF_DELAY * 1000; + } else { + fail++; + delay *= 1000; + delay += (random() % (MAX(1000, delay/2))); + } + + return delay; +} + +#define ACLK_PROXY_PROTO_ADDR_SEPARATOR "://" +#define ACLK_PROXY_ENV "env" +#define ACLK_PROXY_CONFIG_VAR "proxy" + +struct { + ACLK_PROXY_TYPE type; + const char *url_str; +} supported_proxy_types[] = { + { .type = PROXY_TYPE_SOCKS5, .url_str = "socks5" ACLK_PROXY_PROTO_ADDR_SEPARATOR }, + { .type = PROXY_TYPE_SOCKS5, .url_str = "socks5h" ACLK_PROXY_PROTO_ADDR_SEPARATOR }, + { .type = PROXY_TYPE_HTTP, .url_str = "http" ACLK_PROXY_PROTO_ADDR_SEPARATOR }, + { .type = PROXY_TYPE_UNKNOWN, .url_str = NULL }, +}; + +const char *aclk_proxy_type_to_s(ACLK_PROXY_TYPE *type) +{ + switch (*type) { + case PROXY_DISABLED: + return "disabled"; + case PROXY_TYPE_HTTP: + return "HTTP"; + case PROXY_TYPE_SOCKS5: + return "SOCKS"; + default: + return "Unknown"; + } +} + +static inline ACLK_PROXY_TYPE aclk_find_proxy(const char *string) +{ + int i = 0; + while (supported_proxy_types[i].url_str) { + if (!strncmp(supported_proxy_types[i].url_str, string, strlen(supported_proxy_types[i].url_str))) + return supported_proxy_types[i].type; + i++; + } + return PROXY_TYPE_UNKNOWN; +} + +ACLK_PROXY_TYPE aclk_verify_proxy(const char *string) +{ + if (!string) + return PROXY_TYPE_UNKNOWN; + + while (*string == 0x20 && *string!=0) // Help coverity (compiler will remove) + string++; + + if (!*string) + return PROXY_TYPE_UNKNOWN; + + return aclk_find_proxy(string); +} + +// helper function to censor user&password +// for logging purposes +void safe_log_proxy_censor(char *proxy) +{ + size_t length = strlen(proxy); + char *auth = proxy + length - 1; + char *cur; + + while ((auth >= proxy) && (*auth != '@')) + auth--; + + //if not found or @ is first char do nothing + if (auth <= proxy) + return; + + cur = strstr(proxy, ACLK_PROXY_PROTO_ADDR_SEPARATOR); + if (!cur) + cur = proxy; + else + cur += strlen(ACLK_PROXY_PROTO_ADDR_SEPARATOR); + + while (cur < auth) { + *cur = 'X'; + cur++; + } +} + +static inline void safe_log_proxy_error(char *str, const char *proxy) +{ + char *log = strdupz(proxy); + safe_log_proxy_censor(log); + error("%s Provided Value:\"%s\"", str, log); + freez(log); +} + +static inline int check_socks_enviroment(const char **proxy) +{ + char *tmp = getenv("socks_proxy"); + + if (!tmp) + return 1; + + if (aclk_verify_proxy(tmp) == PROXY_TYPE_SOCKS5) { + *proxy = tmp; + return 0; + } + + safe_log_proxy_error( + "Environment var \"socks_proxy\" defined but of unknown format. Supported syntax: \"socks5[h]://[user:pass@]host:ip\".", + tmp); + return 1; +} + +static inline int check_http_enviroment(const char **proxy) +{ + char *tmp = getenv("http_proxy"); + + if (!tmp) + return 1; + + if (aclk_verify_proxy(tmp) == PROXY_TYPE_HTTP) { + *proxy = tmp; + return 0; + } + + safe_log_proxy_error( + "Environment var \"http_proxy\" defined but of unknown format. Supported syntax: \"http[s]://[user:pass@]host:ip\".", + tmp); + return 1; +} + +const char *aclk_lws_wss_get_proxy_setting(ACLK_PROXY_TYPE *type) +{ + const char *proxy = config_get(CONFIG_SECTION_CLOUD, ACLK_PROXY_CONFIG_VAR, ACLK_PROXY_ENV); + *type = PROXY_DISABLED; + + if (strcmp(proxy, "none") == 0) + return proxy; + + if (strcmp(proxy, ACLK_PROXY_ENV) == 0) { + if (check_socks_enviroment(&proxy) == 0) { +#ifdef LWS_WITH_SOCKS5 + *type = PROXY_TYPE_SOCKS5; + return proxy; +#else + safe_log_proxy_error("socks_proxy environment variable set to use SOCKS5 proxy " + "but Libwebsockets used doesn't have SOCKS5 support built in. " + "Ignoring and checking for other options.", + proxy); +#endif + } + if (check_http_enviroment(&proxy) == 0) + *type = PROXY_TYPE_HTTP; + return proxy; + } + + *type = aclk_verify_proxy(proxy); +#ifndef LWS_WITH_SOCKS5 + if (*type == PROXY_TYPE_SOCKS5) { + safe_log_proxy_error( + "Config var \"" ACLK_PROXY_CONFIG_VAR + "\" set to use SOCKS5 proxy but Libwebsockets used is built without support for SOCKS proxy. ACLK will be disabled.", + proxy); + } +#endif + if (*type == PROXY_TYPE_UNKNOWN) { + *type = PROXY_DISABLED; + safe_log_proxy_error( + "Config var \"" ACLK_PROXY_CONFIG_VAR + "\" defined but of unknown format. Supported syntax: \"socks5[h]://[user:pass@]host:ip\".", + proxy); + } + + return proxy; +} + +// helper function to read settings only once (static) +// as claiming, challenge/response and ACLK +// read the same thing, no need to parse again +const char *aclk_get_proxy(ACLK_PROXY_TYPE *type) +{ + static const char *proxy = NULL; + static ACLK_PROXY_TYPE proxy_type = PROXY_NOT_SET; + + if (proxy_type == PROXY_NOT_SET) + proxy = aclk_lws_wss_get_proxy_setting(&proxy_type); + + *type = proxy_type; + return proxy; +} diff --git a/aclk/aclk_util.h b/aclk/aclk_util.h new file mode 100644 index 000000000..c72329791 --- /dev/null +++ b/aclk/aclk_util.h @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-3.0-or-later +#ifndef ACLK_UTIL_H +#define ACLK_UTIL_H + +#include "libnetdata/libnetdata.h" + +// Helper stuff which should not have any further inside ACLK dependency +// and are supposed not to be needed outside of ACLK + +int aclk_decode_base_url(char *url, char **aclk_hostname, int *aclk_port); + +enum aclk_topics { + ACLK_TOPICID_CHART = 0, + ACLK_TOPICID_ALARMS = 1, + ACLK_TOPICID_METADATA = 2, + ACLK_TOPICID_COMMAND = 3, +}; + +const char *aclk_get_topic(enum aclk_topics topic); +void free_topic_cache(void); +// TODO +// aclk_topics_reload //when claim id changes + +#ifdef ACLK_LOG_CONVERSATION_DIR +extern volatile int aclk_conversation_log_counter; +#if defined(HAVE_C___ATOMIC) && !defined(NETDATA_NO_ATOMIC_INSTRUCTIONS) +#define ACLK_GET_CONV_LOG_NEXT() __atomic_fetch_add(&aclk_conversation_log_counter, 1, __ATOMIC_SEQ_CST) +#else +extern netdata_mutex_t aclk_conversation_log_mutex; +int aclk_get_conv_log_next(); +#define ACLK_GET_CONV_LOG_NEXT() aclk_get_conv_log_next() +#endif +#endif + +unsigned long int aclk_reconnect_delay(int mode); + +typedef enum aclk_proxy_type { + PROXY_TYPE_UNKNOWN = 0, + PROXY_TYPE_SOCKS5, + PROXY_TYPE_HTTP, + PROXY_DISABLED, + PROXY_NOT_SET, +} ACLK_PROXY_TYPE; + +const char *aclk_proxy_type_to_s(ACLK_PROXY_TYPE *type); +ACLK_PROXY_TYPE aclk_verify_proxy(const char *string); +const char *aclk_lws_wss_get_proxy_setting(ACLK_PROXY_TYPE *type); +void safe_log_proxy_censor(char *proxy); +int aclk_decode_base_url(char *url, char **aclk_hostname, int *aclk_port); +const char *aclk_get_proxy(ACLK_PROXY_TYPE *type); + +#endif /* ACLK_UTIL_H */ diff --git a/aclk/https_client.c b/aclk/https_client.c new file mode 100644 index 000000000..1b9546d77 --- /dev/null +++ b/aclk/https_client.c @@ -0,0 +1,246 @@ +#include "libnetdata/libnetdata.h" + +#include "https_client.h" + +#include "../mqtt_websockets/c-rbuf/include/ringbuffer.h" + +enum http_parse_state { + HTTP_PARSE_INITIAL = 0, + HTTP_PARSE_HEADERS, + HTTP_PARSE_CONTENT +}; + +typedef struct { + enum http_parse_state state; + int content_length; + int http_code; +} http_parse_ctx; + +#define HTTP_PARSE_CTX_INITIALIZER { .state = HTTP_PARSE_INITIAL, .content_length = -1, .http_code = 0 } + +#define NEED_MORE_DATA 0 +#define PARSE_SUCCESS 1 +#define PARSE_ERROR -1 +#define HTTP_LINE_TERM "\x0D\x0A" +#define RESP_PROTO "HTTP/1.1 " +#define HTTP_KEYVAL_SEPARATOR ": " +#define HTTP_HDR_BUFFER_SIZE 256 +#define PORT_STR_MAX_BYTES 7 + +static void process_http_hdr(http_parse_ctx *parse_ctx, const char *key, const char *val) +{ + // currently we care only about content-length + // but in future the way this is written + // it can be extended + if (!strcmp("content-length", key)) { + parse_ctx->content_length = atoi(val); + } +} + +static int parse_http_hdr(rbuf_t buf, http_parse_ctx *parse_ctx) +{ + int idx, idx_end; + char buf_key[HTTP_HDR_BUFFER_SIZE]; + char buf_val[HTTP_HDR_BUFFER_SIZE]; + char *ptr = buf_key; + if (!rbuf_find_bytes(buf, HTTP_LINE_TERM, strlen(HTTP_LINE_TERM), &idx_end)) { + error("CRLF expected"); + return 1; + } + + char *separator = rbuf_find_bytes(buf, HTTP_KEYVAL_SEPARATOR, strlen(HTTP_KEYVAL_SEPARATOR), &idx); + if (!separator) { + error("Missing Key/Value separator"); + return 1; + } + if (idx >= HTTP_HDR_BUFFER_SIZE) { + error("Key name is too long"); + return 1; + } + + rbuf_pop(buf, buf_key, idx); + buf_key[idx] = 0; + + rbuf_bump_tail(buf, strlen(HTTP_KEYVAL_SEPARATOR)); + idx_end -= strlen(HTTP_KEYVAL_SEPARATOR) + idx; + if (idx_end >= HTTP_HDR_BUFFER_SIZE) { + error("Value of key \"%s\" too long", buf_key); + return 1; + } + + rbuf_pop(buf, buf_val, idx_end); + buf_val[idx_end] = 0; + + rbuf_bump_tail(buf, strlen(HTTP_KEYVAL_SEPARATOR)); + + for (ptr = buf_key; *ptr; ptr++) + *ptr = tolower(*ptr); + + process_http_hdr(parse_ctx, buf_key, buf_val); + + return 0; +} + +static int parse_http_response(rbuf_t buf, http_parse_ctx *parse_ctx) +{ + int idx; + char rc[4]; + + do { + if (parse_ctx->state != HTTP_PARSE_CONTENT && !rbuf_find_bytes(buf, HTTP_LINE_TERM, strlen(HTTP_LINE_TERM), &idx)) + return NEED_MORE_DATA; + switch (parse_ctx->state) { + case HTTP_PARSE_INITIAL: + if (rbuf_memcmp_n(buf, RESP_PROTO, strlen(RESP_PROTO))) { + error("Expected response to start with \"%s\"", RESP_PROTO); + return PARSE_ERROR; + } + rbuf_bump_tail(buf, strlen(RESP_PROTO)); + if (rbuf_pop(buf, rc, 4) != 4) { + error("Expected HTTP status code"); + return PARSE_ERROR; + } + if (rc[3] != ' ') { + error("Expected space after HTTP return code"); + return PARSE_ERROR; + } + rc[3] = 0; + parse_ctx->http_code = atoi(rc); + if (parse_ctx->http_code < 100 || parse_ctx->http_code >= 600) { + error("HTTP code not in range 100 to 599"); + return PARSE_ERROR; + } + + rbuf_find_bytes(buf, HTTP_LINE_TERM, strlen(HTTP_LINE_TERM), &idx); + + rbuf_bump_tail(buf, idx + strlen(HTTP_LINE_TERM)); + + parse_ctx->state = HTTP_PARSE_HEADERS; + break; + case HTTP_PARSE_HEADERS: + if (!idx) { + parse_ctx->state = HTTP_PARSE_CONTENT; + rbuf_bump_tail(buf, strlen(HTTP_LINE_TERM)); + break; + } + if (parse_http_hdr(buf, parse_ctx)) + return PARSE_ERROR; + rbuf_find_bytes(buf, HTTP_LINE_TERM, strlen(HTTP_LINE_TERM), &idx); + rbuf_bump_tail(buf, idx + strlen(HTTP_LINE_TERM)); + break; + case HTTP_PARSE_CONTENT: + if (parse_ctx->content_length < 0) { + error("content-length missing and http headers ended"); + return PARSE_ERROR; + } + if (rbuf_bytes_available(buf) >= (size_t)parse_ctx->content_length) + return PARSE_SUCCESS; + return NEED_MORE_DATA; + } + } while(1); +} + +int https_request(http_req_type_t method, char *host, int port, char *url, char *b, size_t b_size, char *payload) +{ + struct timeval timeout = { .tv_sec = 30, .tv_usec = 0 }; + char sport[PORT_STR_MAX_BYTES]; + size_t len = 0; + int rc = 1; + int ret; + char *ptr; + http_parse_ctx parse_ctx = HTTP_PARSE_CTX_INITIALIZER; + + rbuf_t buffer = rbuf_create(b_size); + if (!buffer) + return 1; + + snprintf(sport, PORT_STR_MAX_BYTES, "%d", port); + + if (payload != NULL) + len = strlen(payload); + + snprintf( + b, + b_size, + "%s %s HTTP/1.1\r\nHost: %s\r\nAccept: application/json\r\nContent-length: %zu\r\nAccept-Language: en-us\r\n" + "User-Agent: Netdata/rocks\r\n\r\n", + (method == HTTP_REQ_GET ? "GET" : "POST"), url, host, len); + + if (payload != NULL) + strncat(b, payload, b_size - len); + + len = strlen(b); + + debug(D_ACLK, "Sending HTTPS req (%zu bytes): '%s'", len, b); + int sock = connect_to_this_ip46(IPPROTO_TCP, SOCK_STREAM, host, 0, sport, &timeout); + + if (unlikely(sock == -1)) { + error("Handshake failed"); + goto exit_buf; + } + + SSL_CTX *ctx = security_initialize_openssl_client(); + if (ctx==NULL) { + error("Cannot allocate SSL context"); + goto exit_sock; + } + // Certificate chain: not updating the stores - do we need private CA roots? + // Calls to SSL_CTX_load_verify_locations would go here. + SSL *ssl = SSL_new(ctx); + if (ssl==NULL) { + error("Cannot allocate SSL"); + goto exit_CTX; + } + SSL_set_fd(ssl, sock); + ret = SSL_connect(ssl); + if (ret != 1) { + error("SSL_connect() failed with err=%d", ret); + goto exit_SSL; + } + + ret = SSL_write(ssl, b, len); + if (ret <= 0) + { + error("SSL_write() failed with err=%d", ret); + goto exit_SSL; + } + + b[0] = 0; + + do { + ptr = rbuf_get_linear_insert_range(buffer, &len); + ret = SSL_read(ssl, ptr, len - 1); + if (ret) + rbuf_bump_head(buffer, ret); + if (ret <= 0) + { + error("No response available - SSL_read()=%d", ret); + goto exit_FULL; + } + } while (!(ret = parse_http_response(buffer, &parse_ctx))); + + if (ret != PARSE_SUCCESS) { + error("Error parsing HTTP response"); + goto exit_FULL; + } + + if (parse_ctx.http_code < 200 || parse_ctx.http_code >= 300) { + error("HTTP Response not Success (got %d)", parse_ctx.http_code); + goto exit_FULL; + } + + len = rbuf_pop(buffer, b, b_size); + b[MIN(len, b_size-1)] = 0; + + rc = 0; +exit_FULL: +exit_SSL: + SSL_free(ssl); +exit_CTX: + SSL_CTX_free(ctx); +exit_sock: + close(sock); +exit_buf: + rbuf_free(buffer); + return rc; +} diff --git a/aclk/https_client.h b/aclk/https_client.h new file mode 100644 index 000000000..0d2e0dba7 --- /dev/null +++ b/aclk/https_client.h @@ -0,0 +1,11 @@ +#ifndef NETDATA_HTTPS_CLIENT_H +#define NETDATA_HTTPS_CLIENT_H + +typedef enum http_req_type { + HTTP_REQ_GET, + HTTP_REQ_POST +} http_req_type_t; + +int https_request(http_req_type_t method, char *host, int port, char *url, char *b, size_t b_size, char *payload); + +#endif /* NETDATA_HTTPS_CLIENT_H */ diff --git a/aclk/legacy/aclk_common.c b/aclk/legacy/aclk_common.c index d7188b1f0..43455393a 100644 --- a/aclk/legacy/aclk_common.c +++ b/aclk/legacy/aclk_common.c @@ -252,6 +252,7 @@ struct label *add_aclk_host_labels(struct label *label) { proxy_str = "none"; break; } + label = add_label_to_list(label, "_aclk_impl", "Legacy", LABEL_SOURCE_AUTO); return add_label_to_list(label, "_aclk_proxy", proxy_str, LABEL_SOURCE_AUTO); #else return label; diff --git a/aclk/legacy/aclk_lws_https_client.c b/aclk/legacy/aclk_lws_https_client.c index c1856ed2c..f41a230db 100644 --- a/aclk/legacy/aclk_lws_https_client.c +++ b/aclk/legacy/aclk_lws_https_client.c @@ -3,7 +3,11 @@ #define ACLK_LWS_HTTPS_CLIENT_INTERNAL #include "aclk_lws_https_client.h" +#ifndef ACLK_NG #include "aclk_common.h" +#else +#include "../aclk.h" +#endif #include "aclk_lws_wss_client.h" diff --git a/aclk/legacy/aclk_lws_wss_client.c b/aclk/legacy/aclk_lws_wss_client.c index f06df3f42..df221dd60 100644 --- a/aclk/legacy/aclk_lws_wss_client.c +++ b/aclk/legacy/aclk_lws_wss_client.c @@ -348,6 +348,7 @@ static inline int received_data_to_ringbuff(struct lws_ring *buffer, void *data, return 1; } +#ifdef ACLK_TRP_DEBUG_VERBOSE static const char *aclk_lws_callback_name(enum lws_callback_reasons reason) { switch (reason) { @@ -377,12 +378,11 @@ static const char *aclk_lws_callback_name(enum lws_callback_reasons reason) return "LWS_CALLBACK_EVENT_WAIT_CANCELLED"; default: // Not using an internal buffer here for thread-safety with unknown calling context. -#ifdef ACLK_TRP_DEBUG_VERBOSE error("Unknown LWS callback %u", reason); -#endif return "unknown"; } } +#endif void aclk_lws_wss_fail_report() { diff --git a/aclk/legacy/aclk_query.c b/aclk/legacy/aclk_query.c index 7ab534f16..27ad9ac16 100644 --- a/aclk/legacy/aclk_query.c +++ b/aclk/legacy/aclk_query.c @@ -22,6 +22,7 @@ static netdata_mutex_t queue_mutex = NETDATA_MUTEX_INITIALIZER; struct aclk_query { usec_t created; + struct timeval tv_in; usec_t created_boot_time; time_t run_after; // Delay run until after this time ACLK_CMD cmd; // What command is this @@ -30,6 +31,7 @@ struct aclk_query { char *msg_id; // msg_id generated by the cloud (NULL if internal) char *query; // The actual query u_char deleted; // Mark deleted for garbage collect + int idx; // index of query thread struct aclk_query *next; }; @@ -62,6 +64,7 @@ static void aclk_query_free(struct aclk_query *this_query) freez(this_query->query); if(this_query->data && this_query->cmd == ACLK_CMD_CLOUD_QUERY_2) { struct aclk_cloud_req_v2 *del = (struct aclk_cloud_req_v2 *)this_query->data; + freez(del->query_endpoint); freez(del->data); freez(del); } @@ -236,7 +239,8 @@ int aclk_queue_query(char *topic, void *data, char *msg_id, char *query, int run new_query->data = data; new_query->next = NULL; - new_query->created = now_realtime_usec(); + now_realtime_timeval(&new_query->tv_in); + new_query->created = (new_query->tv_in.tv_sec * USEC_PER_SEC) + new_query->tv_in.tv_usec; new_query->created_boot_time = now_boottime_usec(); new_query->run_after = run_after; @@ -324,6 +328,7 @@ static char *aclk_encode_response(char *src, size_t content_size, int keep_newli #pragma region ACLK_QUERY #endif + static usec_t aclk_web_api_request_v1(RRDHOST *host, struct web_client *w, char *url, usec_t q_created) { usec_t t = now_boottime_usec(); @@ -359,8 +364,11 @@ static int aclk_execute_query(struct aclk_query *this_query) mysep = strrchr(this_query->query, '/'); // TODO: handle bad response perhaps in a different way. For now it does to the payload - aclk_web_api_request_v1(localhost, w, mysep ? mysep + 1 : "noop", this_query->created_boot_time); + w->tv_in = this_query->tv_in; now_realtime_timeval(&w->tv_ready); + aclk_web_api_request_v1(localhost, w, mysep ? mysep + 1 : "noop", this_query->created_boot_time); + size_t size = w->response.data->len; + size_t sent = size; w->response.data->date = w->tv_ready.tv_sec; web_client_build_http_header(w); // TODO: this function should offset from date, not tv_ready BUFFER *local_buffer = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE); @@ -382,6 +390,24 @@ static int aclk_execute_query(struct aclk_query *this_query) aclk_send_message(this_query->topic, local_buffer->buffer, this_query->msg_id); + struct timeval tv; + now_realtime_timeval(&tv); + + log_access("%llu: %d '[ACLK]:%d' '%s' (sent/all = %zu/%zu bytes %0.0f%%, prep/sent/total = %0.2f/%0.2f/%0.2f ms) %d '%s'", + w->id + , gettid() + , this_query->idx + , "DATA" + , sent + , size + , size > sent ? -((size > 0) ? (((size - sent) / (double) size) * 100.0) : 0.0) : ((size > 0) ? (((sent - size ) / (double) size) * 100.0) : 0.0) + , dt_usec(&w->tv_ready, &w->tv_in) / 1000.0 + , dt_usec(&tv, &w->tv_ready) / 1000.0 + , dt_usec(&tv, &w->tv_in) / 1000.0 + , w->response.code + , strip_control_characters(this_query->query) + ); + buffer_free(w->response.data); buffer_free(w->response.header); buffer_free(w->response.header_output); @@ -426,7 +452,11 @@ static int aclk_execute_query_v2(struct aclk_query *this_query) mysep = strrchr(this_query->query, '/'); // execute the query + w->tv_in = this_query->tv_in; + now_realtime_timeval(&w->tv_ready); t = aclk_web_api_request_v1(cloud_req->host, w, mysep ? mysep + 1 : "noop", this_query->created_boot_time); + size_t size = (w->mode == WEB_CLIENT_MODE_FILECOPY)?w->response.rlen:w->response.data->len; + size_t sent = size; #ifdef NETDATA_WITH_ZLIB // check if gzip encoding can and should be used @@ -475,7 +505,6 @@ static int aclk_execute_query_v2(struct aclk_query *this_query) } #endif - now_realtime_timeval(&w->tv_ready); w->response.data->date = w->tv_ready.tv_sec; web_client_build_http_header(w); local_buffer = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE); @@ -492,6 +521,7 @@ static int aclk_execute_query_v2(struct aclk_query *this_query) buffer_need_bytes(local_buffer, w->response.data->len); memcpy(&local_buffer->buffer[local_buffer->len], w->response.data->buffer, w->response.data->len); local_buffer->len += w->response.data->len; + sent = sent - size + w->response.data->len; } else { #endif buffer_strcat(local_buffer, w->response.data->buffer); @@ -502,6 +532,23 @@ static int aclk_execute_query_v2(struct aclk_query *this_query) aclk_send_message_bin(this_query->topic, local_buffer->buffer, local_buffer->len, this_query->msg_id); + struct timeval tv; + now_realtime_timeval(&tv); + + log_access("%llu: %d '[ACLK]:%d' '%s' (sent/all = %zu/%zu bytes %0.0f%%, prep/sent/total = %0.2f/%0.2f/%0.2f ms) %d '%s'", + w->id + , gettid() + , this_query->idx + , "DATA" + , sent + , size + , size > sent ? -((size > 0) ? (((size - sent) / (double) size) * 100.0) : 0.0) : ((size > 0) ? (((sent - size ) / (double) size) * 100.0) : 0.0) + , dt_usec(&w->tv_ready, &w->tv_in) / 1000.0 + , dt_usec(&tv, &w->tv_ready) / 1000.0 + , dt_usec(&tv, &w->tv_in) / 1000.0 + , w->response.code + , strip_control_characters(this_query->query) + ); cleanup: #ifdef NETDATA_WITH_ZLIB if(w->response.zinitialized) @@ -550,6 +597,7 @@ static int aclk_process_query(struct aclk_query_thread *t_info) query_count++; host = (RRDHOST*)this_query->data; + this_query->idx = t_info->idx; debug( D_ACLK, "Query #%ld (%s) size=%zu in queue %llu ms", query_count, this_query->topic, @@ -629,6 +677,12 @@ static int aclk_process_query(struct aclk_query_thread *t_info) aclk_metrics_per_sample.queries_dispatched++; aclk_queries_per_thread[t_info->idx]++; ACLK_STATS_UNLOCK; + + if (likely(getrusage_called_this_tick[t_info->idx] < MAX_GETRUSAGE_CALLS_PER_TICK)) { + getrusage(RUSAGE_THREAD, &rusage_per_thread[t_info->idx]); + getrusage_called_this_tick[t_info->idx]++; + } + } aclk_query_free(this_query); diff --git a/aclk/legacy/aclk_query.h b/aclk/legacy/aclk_query.h index 53eef1392..026985c8d 100644 --- a/aclk/legacy/aclk_query.h +++ b/aclk/legacy/aclk_query.h @@ -8,8 +8,11 @@ #define ACLK_STABLE_TIMEOUT 3 // Minimum delay to mark AGENT as stable +#define MAX_GETRUSAGE_CALLS_PER_TICK 5 // Maximum number of times getrusage can be called per tick, per thread. + extern pthread_cond_t query_cond_wait; extern pthread_mutex_t query_lock_wait; +extern uint8_t *getrusage_called_this_tick; #define QUERY_THREAD_WAKEUP pthread_cond_signal(&query_cond_wait) #define QUERY_THREAD_WAKEUP_ALL pthread_cond_broadcast(&query_cond_wait) @@ -28,6 +31,7 @@ struct aclk_query_threads { struct aclk_cloud_req_v2 { char *data; RRDHOST *host; + char *query_endpoint; }; void *aclk_query_main_thread(void *ptr); diff --git a/aclk/legacy/aclk_rx_msgs.c b/aclk/legacy/aclk_rx_msgs.c index 99fa9d987..2681445b4 100644 --- a/aclk/legacy/aclk_rx_msgs.c +++ b/aclk/legacy/aclk_rx_msgs.c @@ -25,7 +25,7 @@ static inline int aclk_extract_v2_data(char *payload, char **data) #define STRNCMP_CONSTANT_PREFIX(str, const_pref) strncmp(str, const_pref, strlen(const_pref)) static inline int aclk_v2_payload_get_query(struct aclk_cloud_req_v2 *cloud_req, struct aclk_request *req) { - const char *start, *end, *ptr; + const char *start, *end, *ptr, *query_type; char uuid_str[UUID_STR_LEN]; uuid_t uuid; @@ -66,6 +66,8 @@ static inline int aclk_v2_payload_get_query(struct aclk_cloud_req_v2 *cloud_req, error("Only accepting requests that start with \"%s\" from CLOUD.", ACLK_CLOUD_REQ_V2_PREFIX); return 1; } + ptr += strlen(ACLK_CLOUD_REQ_V2_PREFIX); + query_type = ptr; if(!(end = strstr(ptr, " HTTP/1.1\x0D\x0A"))) { errno = 0; @@ -73,6 +75,11 @@ static inline int aclk_v2_payload_get_query(struct aclk_cloud_req_v2 *cloud_req, return 1; } + if(!(ptr = strchr(ptr, '?')) || ptr > end) + ptr = end; + cloud_req->query_endpoint = mallocz((ptr - query_type) + 1); + strncpyz(cloud_req->query_endpoint, query_type, ptr - query_type); + req->payload = mallocz((end - start) + 1); strncpyz(req->payload, start, end - start); @@ -122,6 +129,13 @@ static int aclk_handle_cloud_request_v1(struct aclk_request *cloud_to_agent, cha if (unlikely(aclk_queue_query(cloud_to_agent->callback_topic, NULL, cloud_to_agent->msg_id, cloud_to_agent->payload, 0, 0, ACLK_CMD_CLOUD))) debug(D_ACLK, "ACLK failed to queue incoming \"http\" message"); + if (aclk_stats_enabled) { + ACLK_STATS_LOCK; + aclk_metrics_per_sample.cloud_req_v1++; + aclk_metrics_per_sample.cloud_req_ok++; + ACLK_STATS_UNLOCK; + } + return 0; } @@ -131,6 +145,7 @@ static int aclk_handle_cloud_request_v2(struct aclk_request *cloud_to_agent, cha struct aclk_cloud_req_v2 *cloud_req; char *data; + int stat_idx; errno = 0; if (cloud_to_agent->version < ACLK_V_COMPRESSION) { @@ -165,6 +180,10 @@ static int aclk_handle_cloud_request_v2(struct aclk_request *cloud_to_agent, cha goto cleanup; } + // we do this here due to cloud_req being taken over by query thread + // which if crazy quick can free it after aclk_queue_query + stat_idx = aclk_cloud_req_type_to_idx(cloud_req->query_endpoint); + // aclk_queue_query takes ownership of data pointer if (unlikely(aclk_queue_query( cloud_to_agent->callback_topic, cloud_req, cloud_to_agent->msg_id, cloud_to_agent->payload, 0, 0, @@ -173,8 +192,17 @@ static int aclk_handle_cloud_request_v2(struct aclk_request *cloud_to_agent, cha goto cleanup; } + if (aclk_stats_enabled) { + ACLK_STATS_LOCK; + aclk_metrics_per_sample.cloud_req_v2++; + aclk_metrics_per_sample.cloud_req_ok++; + aclk_metrics_per_sample.cloud_req_by_type[stat_idx]++; + ACLK_STATS_UNLOCK; + } + return 0; cleanup: + freez(cloud_req->query_endpoint); freez(cloud_req->data); freez(cloud_req); return 1; @@ -289,12 +317,6 @@ int aclk_handle_cloud_message(char *payload) struct aclk_request cloud_to_agent; memset(&cloud_to_agent, 0, sizeof(struct aclk_request)); - if (aclk_stats_enabled) { - ACLK_STATS_LOCK; - aclk_metrics_per_sample.cloud_req_recvd++; - ACLK_STATS_UNLOCK; - } - if (unlikely(!payload)) { errno = 0; error("ACLK incoming message is empty"); diff --git a/aclk/legacy/aclk_stats.c b/aclk/legacy/aclk_stats.c index 2a57cd6f0..7124380a2 100644 --- a/aclk/legacy/aclk_stats.c +++ b/aclk/legacy/aclk_stats.c @@ -11,8 +11,17 @@ struct aclk_qt_data { RRDDIM *dim; } *aclk_qt_data = NULL; +// ACLK per query thread cpu stats +struct aclk_cpu_data { + RRDDIM *user; + RRDDIM *system; + RRDSET *st; +} *aclk_cpu_data = NULL; + uint32_t *aclk_queries_per_thread = NULL; uint32_t *aclk_queries_per_thread_sample = NULL; +struct rusage *rusage_per_thread; +uint8_t *getrusage_called_this_tick = NULL; struct aclk_metrics aclk_metrics = { .online = 0, @@ -153,7 +162,7 @@ static void aclk_stats_read_q(struct aclk_metrics_per_sample *per_sample) static void aclk_stats_cloud_req(struct aclk_metrics_per_sample *per_sample) { static RRDSET *st = NULL; - static RRDDIM *rd_rq_rcvd = NULL; + static RRDDIM *rd_rq_ok = NULL; static RRDDIM *rd_rq_err = NULL; if (unlikely(!st)) { @@ -161,17 +170,82 @@ static void aclk_stats_cloud_req(struct aclk_metrics_per_sample *per_sample) "netdata", "aclk_cloud_req", NULL, "aclk", NULL, "Requests received from cloud", "req/s", "netdata", "stats", 200005, localhost->rrd_update_every, RRDSET_TYPE_STACKED); - rd_rq_rcvd = rrddim_add(st, "received", NULL, 1, localhost->rrd_update_every, RRD_ALGORITHM_ABSOLUTE); - rd_rq_err = rrddim_add(st, "malformed", NULL, 1, localhost->rrd_update_every, RRD_ALGORITHM_ABSOLUTE); + rd_rq_ok = rrddim_add(st, "accepted", NULL, 1, localhost->rrd_update_every, RRD_ALGORITHM_ABSOLUTE); + rd_rq_err = rrddim_add(st, "rejected", NULL, 1, localhost->rrd_update_every, RRD_ALGORITHM_ABSOLUTE); } else rrdset_next(st); - rrddim_set_by_pointer(st, rd_rq_rcvd, per_sample->cloud_req_recvd - per_sample->cloud_req_err); + rrddim_set_by_pointer(st, rd_rq_ok, per_sample->cloud_req_ok); rrddim_set_by_pointer(st, rd_rq_err, per_sample->cloud_req_err); rrdset_done(st); } +static void aclk_stats_cloud_req_version(struct aclk_metrics_per_sample *per_sample) +{ + static RRDSET *st = NULL; + static RRDDIM *rd_rq_v1 = NULL; + static RRDDIM *rd_rq_v2 = NULL; + + if (unlikely(!st)) { + st = rrdset_create_localhost( + "netdata", "aclk_cloud_req_version", NULL, "aclk", NULL, "Requests received from cloud by their version", "req/s", + "netdata", "stats", 200006, localhost->rrd_update_every, RRDSET_TYPE_STACKED); + + rd_rq_v1 = rrddim_add(st, "v1", NULL, 1, localhost->rrd_update_every, RRD_ALGORITHM_ABSOLUTE); + rd_rq_v2 = rrddim_add(st, "v2+", NULL, 1, localhost->rrd_update_every, RRD_ALGORITHM_ABSOLUTE); + } else + rrdset_next(st); + + rrddim_set_by_pointer(st, rd_rq_v1, per_sample->cloud_req_v1); + rrddim_set_by_pointer(st, rd_rq_v2, per_sample->cloud_req_v2); + + rrdset_done(st); +} + +static char *cloud_req_type_names[ACLK_STATS_CLOUD_REQ_TYPE_CNT] = { + "other", + "info", + "data", + "alarms", + "alarm_log", + "chart", + "charts" + // if you change update: + // #define ACLK_STATS_CLOUD_REQ_TYPE_CNT 7 +}; + +int aclk_cloud_req_type_to_idx(const char *name) +{ + for (int i = 1; i < ACLK_STATS_CLOUD_REQ_TYPE_CNT; i++) + if (!strcmp(cloud_req_type_names[i], name)) + return i; + return 0; +} + +static void aclk_stats_cloud_req_cmd(struct aclk_metrics_per_sample *per_sample) +{ + static RRDSET *st; + static int initialized = 0; + static RRDDIM *rd_rq_types[ACLK_STATS_CLOUD_REQ_TYPE_CNT]; + + if (unlikely(!initialized)) { + initialized = 1; + st = rrdset_create_localhost( + "netdata", "aclk_cloud_req_cmd", NULL, "aclk", NULL, "Requests received from cloud by their type (api endpoint queried)", "req/s", + "netdata", "stats", 200007, localhost->rrd_update_every, RRDSET_TYPE_STACKED); + + for (int i = 0; i < ACLK_STATS_CLOUD_REQ_TYPE_CNT; i++) + rd_rq_types[i] = rrddim_add(st, cloud_req_type_names[i], NULL, 1, localhost->rrd_update_every, RRD_ALGORITHM_ABSOLUTE); + } else + rrdset_next(st); + + for (int i = 0; i < ACLK_STATS_CLOUD_REQ_TYPE_CNT; i++) + rrddim_set_by_pointer(st, rd_rq_types[i], per_sample->cloud_req_by_type[i]); + + rrdset_done(st); +} + #define MAX_DIM_NAME 16 static void aclk_stats_query_threads(uint32_t *queries_per_thread) { @@ -182,7 +256,7 @@ static void aclk_stats_query_threads(uint32_t *queries_per_thread) if (unlikely(!st)) { st = rrdset_create_localhost( "netdata", "aclk_query_threads", NULL, "aclk", NULL, "Queries Processed Per Thread", "req/s", - "netdata", "stats", 200007, localhost->rrd_update_every, RRDSET_TYPE_STACKED); + "netdata", "stats", 200008, localhost->rrd_update_every, RRDSET_TYPE_STACKED); for (int i = 0; i < query_thread_count; i++) { if (snprintf(dim_name, MAX_DIM_NAME, "Query %d", i) < 0) @@ -222,11 +296,42 @@ static void aclk_stats_mat_metric_process(struct aclk_metric_mat *metric, struct rrdset_done(metric->st); } +static void aclk_stats_cpu_threads(void) +{ + char id[100 + 1]; + char title[100 + 1]; + + for (int i = 0; i < query_thread_count; i++) { + if (unlikely(!aclk_cpu_data[i].st)) { + + snprintfz(id, 100, "aclk_thread%d_cpu", i); + snprintfz(title, 100, "Cpu Usage For Thread No %d", i); + + aclk_cpu_data[i].st = rrdset_create_localhost( + "netdata", id, NULL, "aclk", NULL, title, "milliseconds/s", + "netdata", "stats", 200020 + i, localhost->rrd_update_every, RRDSET_TYPE_STACKED); + + aclk_cpu_data[i].user = rrddim_add(aclk_cpu_data[i].st, "user", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL); + aclk_cpu_data[i].system = rrddim_add(aclk_cpu_data[i].st, "system", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL); + + } else + rrdset_next(aclk_cpu_data[i].st); + } + + for (int i = 0; i < query_thread_count; i++) { + rrddim_set_by_pointer(aclk_cpu_data[i].st, aclk_cpu_data[i].user, rusage_per_thread[i].ru_utime.tv_sec * 1000000ULL + rusage_per_thread[i].ru_utime.tv_usec); + rrddim_set_by_pointer(aclk_cpu_data[i].st, aclk_cpu_data[i].system, rusage_per_thread[i].ru_stime.tv_sec * 1000000ULL + rusage_per_thread[i].ru_stime.tv_usec); + rrdset_done(aclk_cpu_data[i].st); + } +} + void aclk_stats_thread_cleanup() { freez(aclk_qt_data); freez(aclk_queries_per_thread); freez(aclk_queries_per_thread_sample); + freez(aclk_cpu_data); + freez(rusage_per_thread); } void *aclk_stats_main_thread(void *ptr) @@ -235,8 +340,11 @@ void *aclk_stats_main_thread(void *ptr) query_thread_count = args->query_thread_count; aclk_qt_data = callocz(query_thread_count, sizeof(struct aclk_qt_data)); + aclk_cpu_data = callocz(query_thread_count, sizeof(struct aclk_cpu_data)); aclk_queries_per_thread = callocz(query_thread_count, sizeof(uint32_t)); aclk_queries_per_thread_sample = callocz(query_thread_count, sizeof(uint32_t)); + rusage_per_thread = callocz(query_thread_count, sizeof(struct rusage)); + getrusage_called_this_tick = callocz(query_thread_count, sizeof(uint8_t)); heartbeat_t hb; heartbeat_init(&hb); @@ -264,6 +372,7 @@ void *aclk_stats_main_thread(void *ptr) memcpy(aclk_queries_per_thread_sample, aclk_queries_per_thread, sizeof(uint32_t) * query_thread_count); memset(aclk_queries_per_thread, 0, sizeof(uint32_t) * query_thread_count); + memset(getrusage_called_this_tick, 0, sizeof(uint8_t) * query_thread_count); ACLK_STATS_UNLOCK; aclk_stats_collect(&per_sample, &permanent); @@ -273,8 +382,14 @@ void *aclk_stats_main_thread(void *ptr) aclk_stats_read_q(&per_sample); aclk_stats_cloud_req(&per_sample); + aclk_stats_cloud_req_version(&per_sample); + + aclk_stats_cloud_req_cmd(&per_sample); + aclk_stats_query_threads(aclk_queries_per_thread_sample); + aclk_stats_cpu_threads(); + #ifdef NETDATA_INTERNAL_CHECKS aclk_stats_mat_metric_process(&aclk_mat_metrics.latency, &per_sample.latency); #endif diff --git a/aclk/legacy/aclk_stats.h b/aclk/legacy/aclk_stats.h index 7e74fdf88..5e50a2272 100644 --- a/aclk/legacy/aclk_stats.h +++ b/aclk/legacy/aclk_stats.h @@ -55,6 +55,11 @@ extern struct aclk_mat_metrics { void aclk_metric_mat_update(struct aclk_metric_mat_data *metric, usec_t measurement); +#define ACLK_STATS_CLOUD_REQ_TYPE_CNT 7 +// if you change update cloud_req_type_names + +int aclk_cloud_req_type_to_idx(const char *name); + // reset to 0 on every sample extern struct aclk_metrics_per_sample { /* in the unlikely event of ACLK disconnecting @@ -72,9 +77,14 @@ extern struct aclk_metrics_per_sample { volatile uint32_t read_q_added; volatile uint32_t read_q_consumed; - volatile uint32_t cloud_req_recvd; + volatile uint32_t cloud_req_ok; volatile uint32_t cloud_req_err; + volatile uint16_t cloud_req_v1; + volatile uint16_t cloud_req_v2; + + volatile uint16_t cloud_req_by_type[ACLK_STATS_CLOUD_REQ_TYPE_CNT]; + #ifdef NETDATA_INTERNAL_CHECKS struct aclk_metric_mat_data latency; #endif @@ -83,6 +93,7 @@ extern struct aclk_metrics_per_sample { } aclk_metrics_per_sample; extern uint32_t *aclk_queries_per_thread; +extern struct rusage *rusage_per_thread; void *aclk_stats_main_thread(void *ptr); void aclk_stats_thread_cleanup(); diff --git a/aclk/legacy/agent_cloud_link.c b/aclk/legacy/agent_cloud_link.c index e51a01308..5767df3a7 100644 --- a/aclk/legacy/agent_cloud_link.c +++ b/aclk/legacy/agent_cloud_link.c @@ -189,7 +189,8 @@ unsigned long int aclk_reconnect_delay(int mode) delay = ACLK_MAX_BACKOFF_DELAY * 1000; } else { fail++; - delay = (delay * 1000) + (random() % 1000); + delay *= 1000; + delay += (random() % (MAX(1000, delay/2))); } return delay; diff --git a/backends/backends.c b/backends/backends.c index 6bf583e17..bc718cb29 100644 --- a/backends/backends.c +++ b/backends/backends.c @@ -206,7 +206,7 @@ inline int backends_can_send_rrdset(BACKEND_OPTIONS backend_options, RRDSET *st) } } - if(unlikely(!rrdset_is_available_for_backends(st))) { + if(unlikely(!rrdset_is_available_for_exporting_and_alarms(st))) { debug(D_BACKEND, "BACKEND: not sending chart '%s' of host '%s', because it is not available for backends.", st->id, host->hostname); return 0; } diff --git a/backends/prometheus/backend_prometheus.c b/backends/prometheus/backend_prometheus.c index a3ecf16ef..1fb3fd42c 100644 --- a/backends/prometheus/backend_prometheus.c +++ b/backends/prometheus/backend_prometheus.c @@ -545,7 +545,7 @@ inline static void remote_write_split_words(char *str, char **words, int max_wor int i = 0; while(*s && i < max_words - 1) { - while(*s && isspace(*s)) s++; // skip spaces to the begining of a tag name + while(*s && isspace(*s)) s++; // skip spaces to the beginning of a tag name if(*s) words[i] = s; @@ -560,7 +560,7 @@ inline static void remote_write_split_words(char *str, char **words, int max_wor s++; i++; - while(*s && isspace(*s)) s++; // skip spaces to the begining of a tag value + while(*s && isspace(*s)) s++; // skip spaces to the beginning of a tag value if(*s && *s == '"') s++; // strip an opening quote if(*s) diff --git a/build/subst.inc b/build/subst.inc index af9762334..cc8825e24 100644 --- a/build/subst.inc +++ b/build/subst.inc @@ -10,7 +10,7 @@ -e 's#[@]registrydir_POST@#$(registrydir)#g' \ -e 's#[@]varlibdir_POST@#$(varlibdir)#g' \ -e 's#[@]webdir_POST@#$(webdir)#g' \ - -e 's#[@]can_enable_aclk_POST@#$(can_enable_aclk)#g' \ + -e 's#[@]enable_aclk_POST@#$(enable_aclk)#g' \ -e 's#[@]enable_cloud_POST@#$(enable_cloud)#g' \ $< > $@.tmp; then \ mv "$@.tmp" "$@"; \ diff --git a/claim/claim.c b/claim/claim.c index bb9316082..9a3660f9e 100644 --- a/claim/claim.c +++ b/claim/claim.c @@ -2,7 +2,11 @@ #include "claim.h" #include "../registry/registry_internals.h" +#ifndef ACLK_NG #include "../aclk/legacy/aclk_common.h" +#else +#include "../aclk/aclk.h" +#endif char *claiming_pending_arguments = NULL; diff --git a/claim/netdata-claim.sh.in b/claim/netdata-claim.sh.in index 7ac91d7fb..e0fd85a1e 100755 --- a/claim/netdata-claim.sh.in +++ b/claim/netdata-claim.sh.in @@ -85,15 +85,22 @@ ERROR_MESSAGES[17]="Service Unavailable" # Exit code: 18 - Agent unique id is not generated yet. +NETDATA_RUNNING=1 + get_config_value() { conf_file="${1}" section="${2}" key_name="${3}" - config_result=$(@sbindir_POST@/netdatacli 2>/dev/null read-config "$conf_file|$section|$key_name"; exit $?) - # shellcheck disable=SC2181 - if [ "$?" != "0" ]; then - echo >&2 "cli failed, assume netdata is not running and query the on-disk config" - config_result=$(@sbindir_POST@/netdata 2>/dev/null -W get2 "$conf_file" "$section" "$key_name" unknown_default) + if [ "${NETDATA_RUNNING}" -eq 1 ]; then + config_result=$(@sbindir_POST@/netdatacli 2>/dev/null read-config "$conf_file|$section|$key_name"; exit $?) + result="$?" + if [ "${result}" -ne 0 ]; then + echo >&2 "Unable to communicate with Netdata daemon, querying config from disk instead." + NETDATA_RUNNING=0 + fi + fi + if [ "${NETDATA_RUNNING}" -eq 0 ]; then + config_result=$(@sbindir_POST@/netdata 2>/dev/null -W get2 "$conf_file" "$section" "$key_name" unknown_default) fi echo "$config_result" } @@ -116,7 +123,7 @@ if [ "@enable_cloud_POST@" = "no" ]; then exit 3 fi # shellcheck disable=SC2050 -if [ "@can_enable_aclk_POST@" != "yes" ]; then +if [ "@enable_aclk_POST@" != "yes" ]; then echo >&2 "This agent was built without the dependencies for Cloud and cannot be claimed" exit 3 fi @@ -141,13 +148,33 @@ NETDATA_USER=$(get_config_value netdata global "run as user") [ -z "$EUID" ] && EUID="$(id -u)" +gen_id() { + local id + + id="$(uuidgen)" + + if [ "${id}" = "8a795b0c-2311-11e6-8563-000c295076a6" ] || [ "${id}" = "4aed1458-1c3e-11e6-a53f-000c290fc8f5" ]; then + gen_id + else + echo "${id}" + fi +} + # get the MACHINE_GUID by default if [ -r "${MACHINE_GUID_FILE}" ]; then ID="$(cat "${MACHINE_GUID_FILE}")" MGUID=$ID -else - echo >&2 "netdata.public.unique.id is not generated yet or not readable. Please run agent at least once before attempting to claim. Agent generates this file on first startup. If the ID is generated already make sure you have rights to read it (Filename: ${MACHINE_GUID_FILE})." +elif [ -f "${MACHINE_GUID_FILE}" ]; then + echo >&2 "netdata.public.unique.id is not readable. Please make sure you have rights to read it (Filename: ${MACHINE_GUID_FILE})." exit 18 +else + if mkdir -p "${MACHINE_GUID_FILE%/*}" && /bin/echo -n "$(gen_id)" > "${MACHINE_GUID_FILE}"; then + ID="$(cat "${MACHINE_GUID_FILE}")" + MGUID=$ID + else + echo >&2 "Failed to write new machine GUID. Please make sure you have rights to write to ${MACHINE_GUID_FILE}." + exit 18 + fi fi # get token from file @@ -174,6 +201,7 @@ do -noproxy) NOPROXY=yes ;; -noreload) RELOAD=0 ;; -user=*) NETDATA_USER=${arg:6} ;; + -daemon-not-running) NETDATA_RUNNING=0 ;; *) echo >&2 "Unknown argument ${arg}" exit 1 ;; esac @@ -254,7 +282,7 @@ fi if [ "${URLTOOL}" = "curl" ] ; then - URLCOMMAND="curl --connect-timeout 5 --retry 0 -s -i -X PUT -d \"@${CLAIMING_DIR}/tmpin.txt\"" + URLCOMMAND="curl --connect-timeout 30 --retry 0 -s -i -X PUT -d \"@${CLAIMING_DIR}/tmpin.txt\"" if [ "${NOPROXY}" = "yes" ] ; then URLCOMMAND="${URLCOMMAND} -x \"\"" elif [ -n "${PROXY}" ] ; then @@ -313,7 +341,7 @@ attempt_contact () { return 0 } -for i in {1..5} +for i in {1..3} do if attempt_contact ; then echo "Connection attempt $i successful" @@ -344,7 +372,7 @@ case ${ERROR_KEY} in *) EXIT_CODE=7 ;; esac -HTTP_STATUS_CODE=$(grep "HTTP" "${CLAIMING_DIR}/tmpout.txt" | awk -F " " '{print $2}') +HTTP_STATUS_CODE=$(grep "HTTP" "${CLAIMING_DIR}/tmpout.txt" | tail -1 | awk -F " " '{print $2}') if [ "${HTTP_STATUS_CODE}" = "204" ] ; then EXIT_CODE=0 fi diff --git a/collectors/QUICKSTART.md b/collectors/QUICKSTART.md index a691ffc4c..809ec18af 100644 --- a/collectors/QUICKSTART.md +++ b/collectors/QUICKSTART.md @@ -104,8 +104,8 @@ parameters as a reference, to configure the collector. Most collectors are enabled and will auto-detect their app/service without manual configuration. However, you need to restart Netdata to trigger the auto-detection process. -To restart Netdata on most systems, use `service netdata restart`. For other systems, see the [other restart -methods](/docs/getting-started.md#start-stop-and-restart-netdata). +To restart Netdata on most systems, use `sudo systemctl restart netdata`, or the [appropriate +method](/docs/configure/start-stop-restart.md) for your system. Open Netdata's dashboard in your browser, or refresh the page if you already have it open. You should now see a new entry in the menu and new interactive charts! diff --git a/collectors/all.h b/collectors/all.h index 153fce931..295261b56 100644 --- a/collectors/all.h +++ b/collectors/all.h @@ -105,10 +105,11 @@ #define NETDATA_CHART_PRIO_DISK_OPS 2001 #define NETDATA_CHART_PRIO_DISK_QOPS 2002 #define NETDATA_CHART_PRIO_DISK_BACKLOG 2003 -#define NETDATA_CHART_PRIO_DISK_UTIL 2004 -#define NETDATA_CHART_PRIO_DISK_AWAIT 2005 -#define NETDATA_CHART_PRIO_DISK_AVGSZ 2006 -#define NETDATA_CHART_PRIO_DISK_SVCTM 2007 +#define NETDATA_CHART_PRIO_DISK_BUSY 2004 +#define NETDATA_CHART_PRIO_DISK_UTIL 2005 +#define NETDATA_CHART_PRIO_DISK_AWAIT 2006 +#define NETDATA_CHART_PRIO_DISK_AVGSZ 2007 +#define NETDATA_CHART_PRIO_DISK_SVCTM 2008 #define NETDATA_CHART_PRIO_DISK_MOPS 2021 #define NETDATA_CHART_PRIO_DISK_IOTIME 2022 #define NETDATA_CHART_PRIO_BCACHE_CACHE_ALLOC 2120 diff --git a/collectors/apps.plugin/apps_groups.conf b/collectors/apps.plugin/apps_groups.conf index 7242ed30d..cffd26c95 100644 --- a/collectors/apps.plugin/apps_groups.conf +++ b/collectors/apps.plugin/apps_groups.conf @@ -119,13 +119,13 @@ columndb: clickhouse-server* # ----------------------------------------------------------------------------- # email servers -email: dovecot imapd pop3d amavis* master zmstat* zmmailboxdmgr qmgr oqmgr saslauthd opendkim clamd freshclam tlsmgr postfwd2 postscreen postfix smtp* lmtp* sendmail +email: dovecot imapd pop3d amavis* zmstat* zmmailboxdmgr saslauthd opendkim postfwd2 smtp* lmtp* sendmail postfix master pickup qmgr showq tlsmgr postscreen oqmgr # ----------------------------------------------------------------------------- # network, routing, VPN ppp: ppp* -vpn: openvpn pptp* cjdroute gvpe tincd +vpn: openvpn pptp* cjdroute gvpe tincd wireguard wifi: hostapd wpa_supplicant NetworkManager routing: ospfd* ospf6d* bgpd bfdd fabricd isisd eigrpd sharpd staticd ripd ripngd pimd pbrd nhrpd ldpd zebra vrrpd vtysh bird* modem: ModemManager @@ -232,12 +232,12 @@ backup: rsync lsyncd bacula* borg rclone # ----------------------------------------------------------------------------- # cron -cron: cron* atd anacron systemd-cron* +cron: cron* atd anacron systemd-cron* incrond # ----------------------------------------------------------------------------- # UPS -ups: upsmon upsd */nut/* +ups: upsmon upsd */nut/* apcupsd # ----------------------------------------------------------------------------- # media players, servers, clients diff --git a/collectors/apps.plugin/apps_plugin.c b/collectors/apps.plugin/apps_plugin.c index 7cbbb075c..4d4626e6b 100644 --- a/collectors/apps.plugin/apps_plugin.c +++ b/collectors/apps.plugin/apps_plugin.c @@ -491,7 +491,7 @@ typedef enum fd_filetype { } FD_FILETYPE; struct file_descriptor { - avl avl; + avl_t avl; #ifdef NETDATA_INTERNAL_CHECKS uint32_t magic; @@ -514,7 +514,7 @@ static int // read users and groups from files struct user_or_group_id { - avl avl; + avl_t avl; union { uid_t uid; @@ -639,7 +639,7 @@ int read_user_or_group_ids(struct user_or_group_ids *ids, struct timespec *last_ struct user_or_group_id *existing_user_id = NULL; if(likely(ids->root)) - existing_user_id = (struct user_or_group_id *)avl_search(&ids->index, (avl *) user_or_group_id); + existing_user_id = (struct user_or_group_id *)avl_search(&ids->index, (avl_t *) user_or_group_id); if(unlikely(existing_user_id)) { freez(existing_user_id->name); @@ -648,7 +648,7 @@ int read_user_or_group_ids(struct user_or_group_ids *ids, struct timespec *last_ freez(user_or_group_id); } else { - if(unlikely(avl_insert(&ids->index, (avl *) user_or_group_id) != (void *) user_or_group_id)) { + if(unlikely(avl_insert(&ids->index, (avl_t *) user_or_group_id) != (void *) user_or_group_id)) { error("INTERNAL ERROR: duplicate indexing of id during realloc"); }; @@ -664,7 +664,7 @@ int read_user_or_group_ids(struct user_or_group_ids *ids, struct timespec *last_ while(user_or_group_id) { if(unlikely(!user_or_group_id->updated)) { - if(unlikely((struct user_or_group_id *)avl_remove(&ids->index, (avl *) user_or_group_id) != user_or_group_id)) + if(unlikely((struct user_or_group_id *)avl_remove(&ids->index, (avl_t *) user_or_group_id) != user_or_group_id)) error("INTERNAL ERROR: removal of unused id from index, removed a different id"); if(prev_user_id) @@ -716,7 +716,7 @@ static struct target *get_users_target(uid_t uid) { int ret = read_user_or_group_ids(&all_user_ids, &last_passwd_modification_time); if(likely(!ret && all_user_ids.index.root)) - user_or_group_id = (struct user_or_group_id *)avl_search(&all_user_ids.index, (avl *) &user_id_to_find); + user_or_group_id = (struct user_or_group_id *)avl_search(&all_user_ids.index, (avl_t *) &user_id_to_find); } if(user_or_group_id && user_or_group_id->name && *user_or_group_id->name) { @@ -764,7 +764,7 @@ struct target *get_groups_target(gid_t gid) int ret = read_user_or_group_ids(&all_group_ids, &last_group_modification_time); if(likely(!ret && all_group_ids.index.root)) - group_id = (struct user_or_group_id *)avl_search(&all_group_ids.index, (avl *) &group_id_to_find); + group_id = (struct user_or_group_id *)avl_search(&all_group_ids.index, (avl_t *) &group_id_to_find); } if(group_id && group_id->name && *group_id->name) { @@ -1690,7 +1690,7 @@ int file_descriptor_compare(void* a, void* b) { return strcmp(((struct file_descriptor *)a)->name, ((struct file_descriptor *)b)->name); } -// int file_descriptor_iterator(avl *a) { if(a) {}; return 0; } +// int file_descriptor_iterator(avl_t *a) { if(a) {}; return 0; } avl_tree_type all_files_index = { NULL, @@ -1707,11 +1707,11 @@ static struct file_descriptor *file_descriptor_find(const char *name, uint32_t h tmp.magic = 0x0BADCAFE; #endif /* NETDATA_INTERNAL_CHECKS */ - return (struct file_descriptor *)avl_search(&all_files_index, (avl *) &tmp); + return (struct file_descriptor *)avl_search(&all_files_index, (avl_t *) &tmp); } -#define file_descriptor_add(fd) avl_insert(&all_files_index, (avl *)(fd)) -#define file_descriptor_remove(fd) avl_remove(&all_files_index, (avl *)(fd)) +#define file_descriptor_add(fd) avl_insert(&all_files_index, (avl_t *)(fd)) +#define file_descriptor_remove(fd) avl_remove(&all_files_index, (avl_t *)(fd)) // ---------------------------------------------------------------------------- diff --git a/collectors/cgroups.plugin/sys_fs_cgroup.c b/collectors/cgroups.plugin/sys_fs_cgroup.c index df1f5f21a..ceffffe92 100644 --- a/collectors/cgroups.plugin/sys_fs_cgroup.c +++ b/collectors/cgroups.plugin/sys_fs_cgroup.c @@ -31,7 +31,7 @@ static int cgroup_enable_pressure_memory_full = CONFIG_BOOLEAN_AUTO; static int cgroup_enable_systemd_services = CONFIG_BOOLEAN_YES; static int cgroup_enable_systemd_services_detailed_memory = CONFIG_BOOLEAN_NO; -static int cgroup_used_memory_without_cache = CONFIG_BOOLEAN_YES; +static int cgroup_used_memory = CONFIG_BOOLEAN_YES; static int cgroup_use_unified_cgroups = CONFIG_BOOLEAN_NO; static int cgroup_unified_exist = CONFIG_BOOLEAN_AUTO; @@ -226,7 +226,7 @@ void read_cgroup_plugin_configuration() { cgroup_enable_cpuacct_stat = config_get_boolean_ondemand("plugin:cgroups", "enable cpuacct stat (total CPU)", cgroup_enable_cpuacct_stat); cgroup_enable_cpuacct_usage = config_get_boolean_ondemand("plugin:cgroups", "enable cpuacct usage (per core CPU)", cgroup_enable_cpuacct_usage); - cgroup_enable_memory = config_get_boolean_ondemand("plugin:cgroups", "enable memory (used mem including cache)", cgroup_enable_memory); + cgroup_enable_memory = config_get_boolean_ondemand("plugin:cgroups", "enable memory", cgroup_enable_memory); cgroup_enable_detailed_memory = config_get_boolean_ondemand("plugin:cgroups", "enable detailed memory", cgroup_enable_detailed_memory); cgroup_enable_memory_failcnt = config_get_boolean_ondemand("plugin:cgroups", "enable memory limits fail count", cgroup_enable_memory_failcnt); cgroup_enable_swap = config_get_boolean_ondemand("plugin:cgroups", "enable swap memory", cgroup_enable_swap); @@ -250,7 +250,7 @@ void read_cgroup_plugin_configuration() { cgroup_enable_systemd_services = config_get_boolean("plugin:cgroups", "enable systemd services", cgroup_enable_systemd_services); cgroup_enable_systemd_services_detailed_memory = config_get_boolean("plugin:cgroups", "enable systemd services detailed memory", cgroup_enable_systemd_services_detailed_memory); - cgroup_used_memory_without_cache = config_get_boolean("plugin:cgroups", "report used memory without cache", cgroup_used_memory_without_cache); + cgroup_used_memory = config_get_boolean("plugin:cgroups", "report used memory", cgroup_used_memory); char filename[FILENAME_MAX + 1], *s; struct mountinfo *mi, *root = mountinfo_read(0); @@ -327,7 +327,7 @@ void read_cgroup_plugin_configuration() { cgroup_enable_blkio_queued_ops = CONFIG_BOOLEAN_NO; cgroup_search_in_devices = 0; cgroup_enable_systemd_services_detailed_memory = CONFIG_BOOLEAN_NO; - cgroup_used_memory_without_cache = CONFIG_BOOLEAN_NO; //unified cgroups use different values + cgroup_used_memory = CONFIG_BOOLEAN_NO; //unified cgroups use different values //TODO: can there be more than 1 cgroup2 mount point? mi = mountinfo_find_by_filesystem_super_option(root, "cgroup2", "rw"); //there is no cgroup2 specific super option - for now use 'rw' option @@ -541,7 +541,11 @@ struct memory { /* unsigned long long total_inactive_anon; unsigned long long total_active_anon; +*/ + unsigned long long total_inactive_file; + +/* unsigned long long total_active_file; unsigned long long total_unevictable; */ @@ -628,6 +632,7 @@ struct cgroup { RRDSET *st_cpu_limit; RRDSET *st_cpu_per_core; RRDSET *st_mem; + RRDSET *st_mem_utilization; RRDSET *st_writeback; RRDSET *st_mem_activity; RRDSET *st_pgfaults; @@ -1069,6 +1074,7 @@ static inline void cgroup_read_memory(struct memory *mem, char parent_cg_is_unif arl_expect(mem->arl_base, "total_pgpgout", &mem->total_pgpgout); arl_expect(mem->arl_base, "total_pgfault", &mem->total_pgfault); arl_expect(mem->arl_base, "total_pgmajfault", &mem->total_pgmajfault); + arl_expect(mem->arl_base, "total_inactive_file", &mem->total_inactive_file); } else { mem->arl_base = arl_create("cgroup/memory", NULL, 60); @@ -1082,6 +1088,7 @@ static inline void cgroup_read_memory(struct memory *mem, char parent_cg_is_unif mem->arl_dirty = arl_expect(mem->arl_base, "file_dirty", &mem->total_dirty); arl_expect(mem->arl_base, "pgfault", &mem->total_pgfault); arl_expect(mem->arl_base, "pgmajfault", &mem->total_pgmajfault); + arl_expect(mem->arl_base, "inactive_file", &mem->total_inactive_file); } } @@ -1105,9 +1112,9 @@ static inline void cgroup_read_memory(struct memory *mem, char parent_cg_is_unif if(unlikely(mem->enabled_detailed == CONFIG_BOOLEAN_AUTO)) { if(( (!parent_cg_is_unified) && ( mem->total_cache || mem->total_dirty || mem->total_rss || mem->total_rss_huge || mem->total_mapped_file || mem->total_writeback - || mem->total_swap || mem->total_pgpgin || mem->total_pgpgout || mem->total_pgfault || mem->total_pgmajfault)) + || mem->total_swap || mem->total_pgpgin || mem->total_pgpgout || mem->total_pgfault || mem->total_pgmajfault || mem->total_inactive_file)) || (parent_cg_is_unified && ( mem->anon || mem->total_dirty || mem->kernel_stack || mem->slab || mem->sock || mem->total_writeback - || mem->anon_thp || mem->total_pgfault || mem->total_pgmajfault)) + || mem->anon_thp || mem->total_pgfault || mem->total_pgmajfault || mem->total_inactive_file)) || netdata_zero_metrics_enabled == CONFIG_BOOLEAN_YES) mem->enabled_detailed = CONFIG_BOOLEAN_YES; else @@ -1125,6 +1132,11 @@ memory_next: mem->enabled_usage_in_bytes = CONFIG_BOOLEAN_YES; } + if (likely(mem->updated_usage_in_bytes && mem->updated_detailed)) { + mem->usage_in_bytes = + (mem->usage_in_bytes > mem->total_inactive_file) ? (mem->usage_in_bytes - mem->total_inactive_file) : 0; + } + // read msw_usage_in_bytes if(likely(mem->filename_msw_usage_in_bytes)) { mem->updated_msw_usage_in_bytes = !read_single_number_file(mem->filename_msw_usage_in_bytes, &mem->msw_usage_in_bytes); @@ -1515,6 +1527,7 @@ static inline void cgroup_free(struct cgroup *cg) { if(cg->st_pgfaults) rrdset_is_obsolete(cg->st_pgfaults); if(cg->st_mem_usage) rrdset_is_obsolete(cg->st_mem_usage); if(cg->st_mem_usage_limit) rrdset_is_obsolete(cg->st_mem_usage_limit); + if(cg->st_mem_utilization) rrdset_is_obsolete(cg->st_mem_utilization); if(cg->st_mem_failcnt) rrdset_is_obsolete(cg->st_mem_failcnt); if(cg->st_io) rrdset_is_obsolete(cg->st_io); if(cg->st_serviced_ops) rrdset_is_obsolete(cg->st_serviced_ops); @@ -1751,7 +1764,7 @@ static inline void update_filenames() debug(D_CGROUP, "cpuacct.usage_percpu file for cgroup '%s': '%s' does not exist.", cg->id, filename); } - if(unlikely((cgroup_enable_detailed_memory || cgroup_used_memory_without_cache) && !cg->memory.filename_detailed && (cgroup_used_memory_without_cache || cgroup_enable_systemd_services_detailed_memory || !(cg->options & CGROUP_OPTIONS_SYSTEM_SLICE_SERVICE)))) { + if(unlikely((cgroup_enable_detailed_memory || cgroup_used_memory) && !cg->memory.filename_detailed && (cgroup_used_memory || cgroup_enable_systemd_services_detailed_memory || !(cg->options & CGROUP_OPTIONS_SYSTEM_SLICE_SERVICE)))) { snprintfz(filename, FILENAME_MAX, "%s%s/memory.stat", cgroup_memory_base, cg->id); if(likely(stat(filename, &buf) != -1)) { cg->memory.filename_detailed = strdupz(filename); @@ -1898,7 +1911,7 @@ static inline void update_filenames() else debug(D_CGROUP, "cpu.stat file for unified cgroup '%s': '%s' does not exist.", cg->id, filename); } - if(unlikely((cgroup_enable_detailed_memory || cgroup_used_memory_without_cache) && !cg->memory.filename_detailed && (cgroup_used_memory_without_cache || cgroup_enable_systemd_services_detailed_memory || !(cg->options & CGROUP_OPTIONS_SYSTEM_SLICE_SERVICE)))) { + if(unlikely((cgroup_enable_detailed_memory || cgroup_used_memory) && !cg->memory.filename_detailed && (cgroup_used_memory || cgroup_enable_systemd_services_detailed_memory || !(cg->options & CGROUP_OPTIONS_SYSTEM_SLICE_SERVICE)))) { snprintfz(filename, FILENAME_MAX, "%s%s/memory.stat", cgroup_unified_base, cg->id); if(likely(stat(filename, &buf) != -1)) { cg->memory.filename_detailed = strdupz(filename); @@ -2187,8 +2200,7 @@ void update_systemd_services_charts( , NULL , "mem" , "services.mem_usage" - , (cgroup_used_memory_without_cache) ? "Systemd Services Used Memory without Cache" - : "Systemd Services Used Memory" + , "Systemd Services Used Memory" , "MiB" , PLUGIN_CGROUPS_NAME , PLUGIN_CGROUPS_MODULE_SYSTEMD_NAME @@ -2705,7 +2717,7 @@ void update_systemd_services_charts( if(unlikely(!cg->rd_mem_usage)) cg->rd_mem_usage = rrddim_add(st_mem_usage, cg->chart_id, cg->chart_title, 1, 1024 * 1024, RRD_ALGORITHM_ABSOLUTE); - rrddim_set_by_pointer(st_mem_usage, cg->rd_mem_usage, cg->memory.usage_in_bytes - ((cgroup_used_memory_without_cache)?cg->memory.total_cache:0)); + rrddim_set_by_pointer(st_mem_usage, cg->rd_mem_usage, cg->memory.usage_in_bytes); } if(likely(do_mem_detailed && cg->memory.updated_detailed)) { @@ -2936,15 +2948,15 @@ static inline void update_cpu_limits(char **filename, unsigned long long *value, // parse the cpuset string and calculate the number of cpus the cgroup is allowed to use while(*s) { unsigned long long n = cpuset_str2ull(&s); + ncpus++; if(*s == ',') { s++; - ncpus++; continue; } if(*s == '-') { s++; unsigned long long m = cpuset_str2ull(&s); - ncpus += m - n + 1; // calculate the number of cpus in the region + ncpus += m - n; // calculate the number of cpus in the region } s++; } @@ -3275,7 +3287,7 @@ void update_cgroup_charts(int update_every) { , "MiB" , PLUGIN_CGROUPS_NAME , PLUGIN_CGROUPS_MODULE_CGROUPS_NAME - , cgroup_containers_chart_priority + 210 + , cgroup_containers_chart_priority + 220 , update_every , RRDSET_TYPE_STACKED ); @@ -3421,7 +3433,7 @@ void update_cgroup_charts(int update_every) { if(likely(cg->memory.updated_usage_in_bytes && cg->memory.enabled_usage_in_bytes == CONFIG_BOOLEAN_YES)) { if(unlikely(!cg->st_mem_usage)) { - snprintfz(title, CHART_TITLE_MAX, "Used Memory %s", (cgroup_used_memory_without_cache && cg->memory.updated_detailed)?"without Cache ":""); + snprintfz(title, CHART_TITLE_MAX, "Used Memory"); cg->st_mem_usage = rrdset_create_localhost( cgroup_chart_type(type, cg->chart_id, RRD_ID_LENGTH_MAX) @@ -3433,7 +3445,7 @@ void update_cgroup_charts(int update_every) { , "MiB" , PLUGIN_CGROUPS_NAME , PLUGIN_CGROUPS_MODULE_CGROUPS_NAME - , cgroup_containers_chart_priority + 200 + , cgroup_containers_chart_priority + 210 , update_every , RRDSET_TYPE_STACKED ); @@ -3446,9 +3458,13 @@ void update_cgroup_charts(int update_every) { else rrdset_next(cg->st_mem_usage); - rrddim_set(cg->st_mem_usage, "ram", cg->memory.usage_in_bytes - ((cgroup_used_memory_without_cache)?cg->memory.total_cache:0)); + rrddim_set(cg->st_mem_usage, "ram", cg->memory.usage_in_bytes); if(!(cg->options & CGROUP_OPTIONS_IS_UNIFIED)) { - rrddim_set(cg->st_mem_usage, "swap", (cg->memory.msw_usage_in_bytes > cg->memory.usage_in_bytes)?cg->memory.msw_usage_in_bytes - cg->memory.usage_in_bytes:0); + rrddim_set( + cg->st_mem_usage, + "swap", + (cg->memory.msw_usage_in_bytes > cg->memory.usage_in_bytes) ? + cg->memory.msw_usage_in_bytes - cg->memory.usage_in_bytes : 0); } else { rrddim_set(cg->st_mem_usage, "swap", cg->memory.msw_usage_in_bytes); } @@ -3484,7 +3500,7 @@ void update_cgroup_charts(int update_every) { memory_limit = cg->memory_limit; if(unlikely(!cg->st_mem_usage_limit)) { - snprintfz(title, CHART_TITLE_MAX, "Used RAM without Cache within the limits"); + snprintfz(title, CHART_TITLE_MAX, "Used RAM within the limits"); cg->st_mem_usage_limit = rrdset_create_localhost( cgroup_chart_type(type, cg->chart_id, RRD_ID_LENGTH_MAX) @@ -3496,7 +3512,7 @@ void update_cgroup_charts(int update_every) { , "MiB" , PLUGIN_CGROUPS_NAME , PLUGIN_CGROUPS_MODULE_CGROUPS_NAME - , cgroup_containers_chart_priority + 199 + , cgroup_containers_chart_priority + 200 , update_every , RRDSET_TYPE_STACKED ); @@ -3511,9 +3527,41 @@ void update_cgroup_charts(int update_every) { rrdset_isnot_obsolete(cg->st_mem_usage_limit); - rrddim_set(cg->st_mem_usage_limit, "available", memory_limit - (cg->memory.usage_in_bytes - ((cgroup_used_memory_without_cache)?cg->memory.total_cache:0))); - rrddim_set(cg->st_mem_usage_limit, "used", cg->memory.usage_in_bytes - ((cgroup_used_memory_without_cache)?cg->memory.total_cache:0)); + rrddim_set(cg->st_mem_usage_limit, "available", memory_limit - cg->memory.usage_in_bytes); + rrddim_set(cg->st_mem_usage_limit, "used", cg->memory.usage_in_bytes); rrdset_done(cg->st_mem_usage_limit); + + if (unlikely(!cg->st_mem_utilization)) { + snprintfz(title, CHART_TITLE_MAX, "Memory Utilization"); + + cg->st_mem_utilization = rrdset_create_localhost( + cgroup_chart_type(type, cg->chart_id, RRD_ID_LENGTH_MAX) + , "mem_utilization" + , NULL + , "mem" + , "cgroup.mem_utilization" + , title + , "percentage" + , PLUGIN_CGROUPS_NAME + , PLUGIN_CGROUPS_MODULE_CGROUPS_NAME + , cgroup_containers_chart_priority + 199 + , update_every + , RRDSET_TYPE_AREA + ); + + rrdset_update_labels(cg->st_mem_utilization, cg->chart_labels); + + rrddim_add(cg->st_mem_utilization, "utilization", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + } else + rrdset_next(cg->st_mem_utilization); + + if (memory_limit) { + rrdset_isnot_obsolete(cg->st_mem_utilization); + + rrddim_set( + cg->st_mem_utilization, "utilization", cg->memory.usage_in_bytes * 100 / memory_limit); + rrdset_done(cg->st_mem_utilization); + } } } else { @@ -3521,6 +3569,11 @@ void update_cgroup_charts(int update_every) { rrdset_is_obsolete(cg->st_mem_usage_limit); cg->st_mem_usage_limit = NULL; } + + if(unlikely(cg->st_mem_utilization)) { + rrdset_is_obsolete(cg->st_mem_utilization); + cg->st_mem_utilization = NULL; + } } update_memory_limits(&cg->filename_memoryswap_limit, &cg->chart_var_memoryswap_limit, &cg->memoryswap_limit, "memory_and_swap_limit", cg); diff --git a/collectors/cups.plugin/cups_plugin.c b/collectors/cups.plugin/cups_plugin.c index dc8643755..a80930e4d 100644 --- a/collectors/cups.plugin/cups_plugin.c +++ b/collectors/cups.plugin/cups_plugin.c @@ -7,6 +7,7 @@ */ #include "../../libnetdata/libnetdata.h" +#include #include // callback required by fatal() @@ -45,10 +46,6 @@ static int debug = 0; static int netdata_update_every = 1; static int netdata_priority = 100004; - -#ifdef HAVE_CUPS -#include - http_t *http; // connection to the cups daemon /* @@ -468,12 +465,3 @@ int main(int argc, char **argv) { httpClose(http); info("CUPS process exiting"); } - -#else // !HAVE_CUPS - -int main(int argc, char **argv) -{ - fatal("cups.plugin is not compiled."); -} - -#endif // !HAVE_CUPS diff --git a/collectors/ebpf.plugin/Makefile.am b/collectors/ebpf.plugin/Makefile.am index 1327d47a6..4fb2056fd 100644 --- a/collectors/ebpf.plugin/Makefile.am +++ b/collectors/ebpf.plugin/Makefile.am @@ -10,6 +10,12 @@ CLEANFILES = \ include $(top_srcdir)/build/subst.inc SUFFIXES = .in +userebpfconfigdir=$(configdir)/ebpf.d + +# Explicitly install directories to avoid permission issues due to umask +install-exec-local: + $(INSTALL) -d $(DESTDIR)$(userebpfconfigdir) + dist_plugins_SCRIPTS = \ reset_netdata_trace.sh \ $(NULL) @@ -19,7 +25,15 @@ dist_noinst_DATA = \ README.md \ $(NULL) +ebpfconfigdir=$(libconfigdir)/ebpf.d dist_libconfig_DATA = \ - ebpf.conf \ - ebpf_kernel_reject_list.txt \ + ebpf.d.conf \ + $(NULL) + +dist_ebpfconfig_DATA = \ + ebpf.d/ebpf_kernel_reject_list.txt \ + ebpf.d/cachestat.conf \ + ebpf.d/network.conf \ + ebpf.d/process.conf \ + ebpf.d/sync.conf \ $(NULL) diff --git a/collectors/ebpf.plugin/README.md b/collectors/ebpf.plugin/README.md index 5ea3b4951..405eab875 100644 --- a/collectors/ebpf.plugin/README.md +++ b/collectors/ebpf.plugin/README.md @@ -148,6 +148,7 @@ accepts the following values: โ€‹ - `return`: In the `return` mode, the eBPF collector monitors the same kernel functions as `entry`, but also creates new charts for the return of these functions, such as errors. Monitoring function returns can help in debugging software, such as failing to close file descriptors or creating zombie processes. +- `update every`: Number of seconds used for eBPF to send data for Netdata. #### Integration with `apps.plugin` @@ -186,16 +187,45 @@ If you want to _disable_ the integration with `apps.plugin` along with the above apps = yes ``` -### `[ebpf programs]` +#### `[ebpf programs]` The eBPF collector enables and runs the following eBPF programs by default: +- `cachestat`: Netdata's eBPF data collector creates charts about the memory page cache. When the integration with + [`apps.plugin`](/collectors/apps.plugin/README.md) is enabled, this collector creates charts for the whole host _and_ + for each application. - `process`: This eBPF program creates charts that show information about process creation, VFS IO, and files removed. When in `return` mode, it also creates charts showing errors when these operations are executed. - `network viewer`: This eBPF program creates charts with information about `TCP` and `UDP` functions, including the bandwidth consumed by each. +- `sync`: Montitor calls for syscalls sync(2), fsync(2), fdatasync(2), syncfs(2), msync(2), and sync_file_range(2). -### `[network connections]` +## Thread configuration + +You can configure each thread of the eBPF data collector by editing either the `cachestat.conf`, `process.conf`, +or `network.conf` files. Use [`edit-config`](/docs/configure/nodes.md) from your Netdata config directory: + +```bash +cd /etc/netdata/ # Replace with your Netdata configuration directory, if not /etc/netdata/ +./edit-config ebpf.d/process.conf +``` + +### Configuration files + +The following configuration files are available: + +- `cachestat.conf`: Configuration for the `cachestat` thread. +- `process.conf`: Configuration for the `process` thread. +- `network.conf`: Configuration for the `network viewer` thread. This config file overwrites the global options and + also lets you specify which network the eBPF collector monitors. +- `sync.conf`: Configuration for the `sync` thread. + +### Network configuration + +The network configuration has specific options to configure which network(s) the eBPF collector monitors. These options +are divided in the following sections: + +#### `[network connections]` You can configure the information shown on `outbound` and `inbound` charts with the settings in this section. @@ -232,7 +262,7 @@ The dimensions for the traffic charts are created using the destination IPs of t changed setting `resolve hostname ips = yes` and restarting Netdata, after this Netdata will create dimensions using the `hostnames` every time that is possible to resolve IPs to their hostnames. -### `[service name]` +#### `[service name]` Netdata uses the list of services in `/etc/services` to plot network connection charts. If this file does not contain the name for a particular service you use in your infrastructure, you will need to add it to the `[service name]` section. @@ -245,6 +275,21 @@ service in network connection charts, and thus see the name of the service inste 19999 = Netdata ``` +### Sync configuration + +The sync configuration has specific options to disable monitoring for syscalls, as default option all syscalls are +monitored. + +```conf +[syscalls] + sync = yes + msync = yes + fsync = yes + fdatasync = yes + syncfs = yes + sync_file_range = yes +``` + ## Troubleshooting If the eBPF collector does not work, you can troubleshoot it by running the `ebpf.plugin` command and investigating its diff --git a/collectors/ebpf.plugin/ebpf.c b/collectors/ebpf.plugin/ebpf.c index 26bcfcf17..26dacfd3e 100644 --- a/collectors/ebpf.plugin/ebpf.c +++ b/collectors/ebpf.plugin/ebpf.c @@ -52,8 +52,6 @@ void netdata_cleanup_and_exit(int ret) *****************************************************************/ char *ebpf_plugin_dir = PLUGINS_DIR; -char *ebpf_user_config_dir = CONFIG_DIR; -char *ebpf_stock_config_dir = LIBCONFIG_DIR; static char *ebpf_configured_log_dir = LOG_DIR; char *ebpf_algorithms[] = {"absolute", "incremental"}; @@ -79,13 +77,19 @@ pthread_cond_t collect_data_cond_var; ebpf_module_t ebpf_modules[] = { { .thread_name = "process", .config_name = "process", .enabled = 0, .start_routine = ebpf_process_thread, .update_time = 1, .global_charts = 1, .apps_charts = 1, .mode = MODE_ENTRY, - .optional = 0 }, + .optional = 0, .apps_routine = ebpf_process_create_apps_charts }, { .thread_name = "socket", .config_name = "socket", .enabled = 0, .start_routine = ebpf_socket_thread, .update_time = 1, .global_charts = 1, .apps_charts = 1, .mode = MODE_ENTRY, - .optional = 0 }, + .optional = 0, .apps_routine = ebpf_socket_create_apps_charts }, + { .thread_name = "cachestat", .config_name = "cachestat", .enabled = 0, .start_routine = ebpf_cachestat_thread, + .update_time = 1, .global_charts = 1, .apps_charts = 1, .mode = MODE_ENTRY, + .optional = 0, .apps_routine = ebpf_cachestat_create_apps_charts }, + { .thread_name = "sync", .config_name = "sync", .enabled = 0, .start_routine = ebpf_sync_thread, + .update_time = 1, .global_charts = 1, .apps_charts = 1, .mode = MODE_ENTRY, + .optional = 0, .apps_routine = NULL }, { .thread_name = NULL, .enabled = 0, .start_routine = NULL, .update_time = 1, .global_charts = 0, .apps_charts = 1, .mode = MODE_ENTRY, - .optional = 0 }, + .optional = 0, .apps_routine = NULL }, }; // Link with apps.plugin @@ -100,58 +104,6 @@ ebpf_network_viewer_options_t network_viewer_opt; * *****************************************************************/ -/** - * Cleanup publish syscall - * - * @param nps list of structures to clean - */ -void ebpf_cleanup_publish_syscall(netdata_publish_syscall_t *nps) -{ - while (nps) { - freez(nps->algorithm); - nps = nps->next; - } -} - -/** - * Clean port Structure - * - * Clean the allocated list. - * - * @param clean the list that will be cleaned - */ -void clean_port_structure(ebpf_network_viewer_port_list_t **clean) -{ - ebpf_network_viewer_port_list_t *move = *clean; - while (move) { - ebpf_network_viewer_port_list_t *next = move->next; - freez(move->value); - freez(move); - - move = next; - } - *clean = NULL; -} - -/** - * Clean IP structure - * - * Clean the allocated list. - * - * @param clean the list that will be cleaned - */ -static void clean_ip_structure(ebpf_network_viewer_ip_list_t **clean) -{ - ebpf_network_viewer_ip_list_t *move = *clean; - while (move) { - ebpf_network_viewer_ip_list_t *next = move->next; - freez(move); - - move = next; - } - *clean = NULL; -} - /** * Clean Loaded Events * @@ -318,6 +270,25 @@ void write_err_chart(char *name, char *family, netdata_publish_syscall_t *move, write_end_chart(); } +/** + * Write charts + * + * Write the current information to publish the charts. + * + * @param family chart family + * @param chart chart id + * @param dim dimension name + * @param v1 value. + */ +void ebpf_one_dimension_write_charts(char *family, char *chart, char *dim, long long v1) +{ + write_begin_chart(family, chart); + + write_chart_dimension(dim, v1); + + write_end_chart(); +} + /** * Call the necessary functions to create a chart. * @@ -343,23 +314,26 @@ void write_io_chart(char *chart, char *family, char *dwrite, long long vwrite, c /** * Write chart cmd on standard output * - * @param type the chart type - * @param id the chart id - * @param title the chart title - * @param units the units label - * @param family the group name used to attach the chart on dashaboard - * @param charttype the chart type - * @param order the chart order + * @param type chart type + * @param id chart id + * @param title chart title + * @param units units label + * @param family group name used to attach the chart on dashaboard + * @param charttype chart type + * @param context chart context + * @param order chart order */ -void ebpf_write_chart_cmd(char *type, char *id, char *title, char *units, char *family, char *charttype, int order) +void ebpf_write_chart_cmd(char *type, char *id, char *title, char *units, char *family, + char *charttype, char *context, int order) { - printf("CHART %s.%s '' '%s' '%s' '%s' '' %s %d %d\n", + printf("CHART %s.%s '' '%s' '%s' '%s' '%s' '%s' %d %d\n", type, id, title, units, - family, - charttype, + (family)?family:"", + (context)?context:"", + (charttype)?charttype:"", order, update_every); } @@ -398,26 +372,31 @@ void ebpf_create_global_dimension(void *ptr, int end) /** * Call write_chart_cmd to create the charts * - * @param type the chart type - * @param id the chart id - * @param units the axis label - * @param family the group name used to attach the chart on dashaboard - * @param order the order number of the specified chart - * @param ncd a pointer to a function called to create dimensions - * @param move a pointer for a structure that has the dimensions - * @param end number of dimensions for the chart created + * @param type chart type + * @param id chart id + * @param title chart title + * @param units axis label + * @param family group name used to attach the chart on dashaboard + * @param context chart context + * @param charttype chart type + * @param order order number of the specified chart + * @param ncd a pointer to a function called to create dimensions + * @param move a pointer for a structure that has the dimensions + * @param end number of dimensions for the chart created */ void ebpf_create_chart(char *type, char *id, char *title, char *units, char *family, + char *context, + char *charttype, int order, void (*ncd)(void *, int), void *move, int end) { - ebpf_write_chart_cmd(type, id, title, units, family, "line", order); + ebpf_write_chart_cmd(type, id, title, units, family, charttype, context, order); ncd(move, end); } @@ -429,15 +408,16 @@ void ebpf_create_chart(char *type, * @param title the value displayed on vertical axis. * @param units the value displayed on vertical axis. * @param family Submenu that the chart will be attached on dashboard. + * @param charttype chart type * @param order the chart order * @param algorithm the algorithm used by dimension * @param root structure used to create the dimensions. */ -void ebpf_create_charts_on_apps(char *id, char *title, char *units, char *family, int order, +void ebpf_create_charts_on_apps(char *id, char *title, char *units, char *family, char *charttype, int order, char *algorithm, struct target *root) { struct target *w; - ebpf_write_chart_cmd(NETDATA_APPS_FAMILY, id, title, units, family, "stacked", order); + ebpf_write_chart_cmd(NETDATA_APPS_FAMILY, id, title, units, family, charttype, NULL, order); for (w = root; w; w = w->next) { if (unlikely(w->exposed)) @@ -580,22 +560,26 @@ void ebpf_print_help() "\n" " Available command line options:\n" "\n" - " SECONDS set the data collection frequency.\n" + " SECONDS Set the data collection frequency.\n" "\n" - " --help or -h show this help.\n" + " --help or -h Show this help.\n" "\n" - " --version or -v show software version.\n" + " --version or -v Show software version.\n" "\n" - " --global or -g disable charts per application.\n" + " --global or -g Disable charts per application.\n" "\n" - " --all or -a Enable all chart groups (global and apps), unless -g is also given.\n" + " --all or -a Enable all chart groups (global and apps), unless -g is also given.\n" "\n" - " --net or -n Enable network viewer charts.\n" + " --cachestat or -c Enable charts related to process run time.\n" "\n" - " --process or -p Enable charts related to process run time.\n" + " --net or -n Enable network viewer charts.\n" "\n" - " --return or -r Run the collector in return mode.\n" + " --process or -p Enable charts related to process run time.\n" + "\n" + " --return or -r Run the collector in return mode.\n" "\n", + " --sync or -s Enable chart related to sync run time.\n" + "\n" VERSION, (year >= 116) ? year + 1900 : 2020); } @@ -606,87 +590,6 @@ void ebpf_print_help() * *****************************************************************/ -/** - * Is ip inside the range - * - * Check if the ip is inside a IP range - * - * @param rfirst the first ip address of the range - * @param rlast the last ip address of the range - * @param cmpfirst the first ip to compare - * @param cmplast the last ip to compare - * @param family the IP family - * - * @return It returns 1 if the IP is inside the range and 0 otherwise - */ -static int is_ip_inside_range(union netdata_ip_t *rfirst, union netdata_ip_t *rlast, - union netdata_ip_t *cmpfirst, union netdata_ip_t *cmplast, int family) -{ - if (family == AF_INET) { - if (ntohl(rfirst->addr32[0]) <= ntohl(cmpfirst->addr32[0]) && - ntohl(rlast->addr32[0]) >= ntohl(cmplast->addr32[0])) - return 1; - } else { - if (memcmp(rfirst->addr8, cmpfirst->addr8, sizeof(union netdata_ip_t)) <= 0 && - memcmp(rlast->addr8, cmplast->addr8, sizeof(union netdata_ip_t)) >= 0) { - return 1; - } - - } - return 0; -} - - -/** - * Fill IP list - * - * @param out a pointer to the link list. - * @param in the structure that will be linked. - */ -static inline void fill_ip_list(ebpf_network_viewer_ip_list_t **out, ebpf_network_viewer_ip_list_t *in, char *table) -{ -#ifndef NETDATA_INTERNAL_CHECKS - UNUSED(table); -#endif - if (likely(*out)) { - ebpf_network_viewer_ip_list_t *move = *out, *store = *out; - while (move) { - if (in->ver == move->ver && is_ip_inside_range(&move->first, &move->last, &in->first, &in->last, in->ver)) { - info("The range/value (%s) is inside the range/value (%s) already inserted, it will be ignored.", - in->value, move->value); - freez(in->value); - freez(in); - return; - } - store = move; - move = move->next; - } - - store->next = in; - } else { - *out = in; - } - -#ifdef NETDATA_INTERNAL_CHECKS - char first[512], last[512]; - if (in->ver == AF_INET) { - if (inet_ntop(AF_INET, in->first.addr8, first, INET_ADDRSTRLEN) && - inet_ntop(AF_INET, in->last.addr8, last, INET_ADDRSTRLEN)) - info("Adding values %s - %s to %s IP list \"%s\" used on network viewer", - first, last, - (*out == network_viewer_opt.included_ips)?"included":"excluded", - table); - } else { - if (inet_ntop(AF_INET6, in->first.addr8, first, INET6_ADDRSTRLEN) && - inet_ntop(AF_INET6, in->last.addr8, last, INET6_ADDRSTRLEN)) - info("Adding values %s - %s to %s IP list \"%s\" used on network viewer", - first, last, - (*out == network_viewer_opt.included_ips)?"included":"excluded", - table); - } -#endif -} - /** * Read Local Ports * @@ -838,789 +741,26 @@ void fill_ebpf_data(ebpf_data_t *ef) */ static inline void how_to_load(char *ptr) { - if (!strcasecmp(ptr, "return")) + if (!strcasecmp(ptr, EBPF_CFG_LOAD_MODE_RETURN)) ebpf_set_thread_mode(MODE_RETURN); - else if (!strcasecmp(ptr, "entry")) + else if (!strcasecmp(ptr, EBPF_CFG_LOAD_MODE_DEFAULT)) ebpf_set_thread_mode(MODE_ENTRY); else error("the option %s for \"ebpf load mode\" is not a valid option.", ptr); } /** - * Fill Port list - * - * @param out a pointer to the link list. - * @param in the structure that will be linked. - */ -static inline void fill_port_list(ebpf_network_viewer_port_list_t **out, ebpf_network_viewer_port_list_t *in) -{ - if (likely(*out)) { - ebpf_network_viewer_port_list_t *move = *out, *store = *out; - uint16_t first = ntohs(in->first); - uint16_t last = ntohs(in->last); - while (move) { - uint16_t cmp_first = ntohs(move->first); - uint16_t cmp_last = ntohs(move->last); - if (cmp_first <= first && first <= cmp_last && - cmp_first <= last && last <= cmp_last ) { - info("The range/value (%u, %u) is inside the range/value (%u, %u) already inserted, it will be ignored.", - first, last, cmp_first, cmp_last); - freez(in->value); - freez(in); - return; - } else if (first <= cmp_first && cmp_first <= last && - first <= cmp_last && cmp_last <= last) { - info("The range (%u, %u) is bigger than previous range (%u, %u) already inserted, the previous will be ignored.", - first, last, cmp_first, cmp_last); - freez(move->value); - move->value = in->value; - move->first = in->first; - move->last = in->last; - freez(in); - return; - } - - store = move; - move = move->next; - } - - store->next = in; - } else { - *out = in; - } - -#ifdef NETDATA_INTERNAL_CHECKS - info("Adding values %s( %u, %u) to %s port list used on network viewer", - in->value, ntohs(in->first), ntohs(in->last), - (*out == network_viewer_opt.included_port)?"included":"excluded"); -#endif -} - -/** - * Fill port list - * - * Fill an allocated port list with the range given - * - * @param out a pointer to store the link list - * @param range the informed range for the user. - */ -static void parse_port_list(void **out, char *range) -{ - int first, last; - ebpf_network_viewer_port_list_t **list = (ebpf_network_viewer_port_list_t **)out; - - char *copied = strdupz(range); - if (*range == '*' && *(range+1) == '\0') { - first = 1; - last = 65535; - - clean_port_structure(list); - goto fillenvpl; - } - - char *end = range; - //Move while I cannot find a separator - while (*end && *end != ':' && *end != '-') end++; - - //It has a range - if (likely(*end)) { - *end++ = '\0'; - if (*end == '!') { - info("The exclusion cannot be in the second part of the range, the range %s will be ignored.", copied); - freez(copied); - return; - } - last = str2i((const char *)end); - } else { - last = 0; - } - - first = str2i((const char *)range); - if (first < NETDATA_MINIMUM_PORT_VALUE || first > NETDATA_MAXIMUM_PORT_VALUE) { - info("The first port %d of the range \"%s\" is invalid and it will be ignored!", first, copied); - freez(copied); - return; - } - - if (!last) - last = first; - - if (last < NETDATA_MINIMUM_PORT_VALUE || last > NETDATA_MAXIMUM_PORT_VALUE) { - info("The second port %d of the range \"%s\" is invalid and the whole range will be ignored!", last, copied); - freez(copied); - return; - } - - if (first > last) { - info("The specified order %s is wrong, the smallest value is always the first, it will be ignored!", copied); - freez(copied); - return; - } - - ebpf_network_viewer_port_list_t *w; -fillenvpl: - w = callocz(1, sizeof(ebpf_network_viewer_port_list_t)); - w->value = copied; - w->hash = simple_hash(copied); - w->first = (uint16_t)htons((uint16_t)first); - w->last = (uint16_t)htons((uint16_t)last); - w->cmp_first = (uint16_t)first; - w->cmp_last = (uint16_t)last; - - fill_port_list(list, w); -} - -/** - * Parse Service List - * - * @param out a pointer to store the link list - * @param service the service used to create the structure that will be linked. - */ -static void parse_service_list(void **out, char *service) -{ - ebpf_network_viewer_port_list_t **list = (ebpf_network_viewer_port_list_t **)out; - struct servent *serv = getservbyname((const char *)service, "tcp"); - if (!serv) - serv = getservbyname((const char *)service, "udp"); - - if (!serv) { - info("Cannot resolv the service '%s' with protocols TCP and UDP, it will be ignored", service); - return; - } - - ebpf_network_viewer_port_list_t *w = callocz(1, sizeof(ebpf_network_viewer_port_list_t)); - w->value = strdupz(service); - w->hash = simple_hash(service); - - w->first = w->last = (uint16_t)serv->s_port; - - fill_port_list(list, w); -} - -/** - * Netmask - * - * Copied from iprange (https://github.com/firehol/iprange/blob/master/iprange.h) - * - * @param prefix create the netmask based in the CIDR value. - * - * @return - */ -static inline in_addr_t netmask(int prefix) { - - if (prefix == 0) - return (~((in_addr_t) - 1)); - else - return (in_addr_t)(~((1 << (32 - prefix)) - 1)); - -} - -/** - * Broadcast - * - * Copied from iprange (https://github.com/firehol/iprange/blob/master/iprange.h) - * - * @param addr is the ip address - * @param prefix is the CIDR value. - * - * @return It returns the last address of the range - */ -static inline in_addr_t broadcast(in_addr_t addr, int prefix) -{ - return (addr | ~netmask(prefix)); -} - -/** - * Network - * - * Copied from iprange (https://github.com/firehol/iprange/blob/master/iprange.h) - * - * @param addr is the ip address - * @param prefix is the CIDR value. - * - * @return It returns the first address of the range. - */ -static inline in_addr_t ipv4_network(in_addr_t addr, int prefix) -{ - return (addr & netmask(prefix)); -} - -/** - * IP to network long - * - * @param dst the vector to store the result - * @param ip the source ip given by our users. - * @param domain the ip domain (IPV4 or IPV6) - * @param source the original string - * - * @return it returns 0 on success and -1 otherwise. - */ -static inline int ip2nl(uint8_t *dst, char *ip, int domain, char *source) -{ - if (inet_pton(domain, ip, dst) <= 0) { - error("The address specified (%s) is invalid ", source); - return -1; - } - - return 0; -} - -/** - * Get IPV6 Last Address + * Update interval * - * @param out the address to store the last address. - * @param in the address used to do the math. - * @param prefix number of bits used to calculate the address + * Update default interval with value from user */ -static void get_ipv6_last_addr(union netdata_ip_t *out, union netdata_ip_t *in, uint64_t prefix) +static void ebpf_update_interval() { - uint64_t mask,tmp; - uint64_t ret[2]; - memcpy(ret, in->addr32, sizeof(union netdata_ip_t)); - - if (prefix == 128) { - memcpy(out->addr32, in->addr32, sizeof(union netdata_ip_t)); - return; - } else if (!prefix) { - ret[0] = ret[1] = 0xFFFFFFFFFFFFFFFF; - memcpy(out->addr32, ret, sizeof(union netdata_ip_t)); - return; - } else if (prefix <= 64) { - ret[1] = 0xFFFFFFFFFFFFFFFFULL; - - tmp = be64toh(ret[0]); - if (prefix > 0) { - mask = 0xFFFFFFFFFFFFFFFFULL << (64 - prefix); - tmp |= ~mask; - } - ret[0] = htobe64(tmp); - } else { - mask = 0xFFFFFFFFFFFFFFFFULL << (128 - prefix); - tmp = be64toh(ret[1]); - tmp |= ~mask; - ret[1] = htobe64(tmp); - } - - memcpy(out->addr32, ret, sizeof(union netdata_ip_t)); -} - -/** - * Calculate ipv6 first address - * - * @param out the address to store the first address. - * @param in the address used to do the math. - * @param prefix number of bits used to calculate the address - */ -static void get_ipv6_first_addr(union netdata_ip_t *out, union netdata_ip_t *in, uint64_t prefix) -{ - uint64_t mask,tmp; - uint64_t ret[2]; - - memcpy(ret, in->addr32, sizeof(union netdata_ip_t)); - - if (prefix == 128) { - memcpy(out->addr32, in->addr32, sizeof(union netdata_ip_t)); - return; - } else if (!prefix) { - ret[0] = ret[1] = 0; - memcpy(out->addr32, ret, sizeof(union netdata_ip_t)); - return; - } else if (prefix <= 64) { - ret[1] = 0ULL; - - tmp = be64toh(ret[0]); - if (prefix > 0) { - mask = 0xFFFFFFFFFFFFFFFFULL << (64 - prefix); - tmp &= mask; - } - ret[0] = htobe64(tmp); - } else { - mask = 0xFFFFFFFFFFFFFFFFULL << (128 - prefix); - tmp = be64toh(ret[1]); - tmp &= mask; - ret[1] = htobe64(tmp); - } - - memcpy(out->addr32, ret, sizeof(union netdata_ip_t)); -} - -/** - * Parse IP List - * - * Parse IP list and link it. - * - * @param out a pointer to store the link list - * @param ip the value given as parameter - */ -static void parse_ip_list(void **out, char *ip) -{ - ebpf_network_viewer_ip_list_t **list = (ebpf_network_viewer_ip_list_t **)out; - - char *ipdup = strdupz(ip); - union netdata_ip_t first = { }; - union netdata_ip_t last = { }; - char *is_ipv6; - if (*ip == '*' && *(ip+1) == '\0') { - memset(first.addr8, 0, sizeof(first.addr8)); - memset(last.addr8, 0xFF, sizeof(last.addr8)); - - is_ipv6 = ip; - - clean_ip_structure(list); - goto storethisip; - } - - char *end = ip; - // Move while I cannot find a separator - while (*end && *end != '/' && *end != '-') end++; - - // We will use only the classic IPV6 for while, but we could consider the base 85 in a near future - // https://tools.ietf.org/html/rfc1924 - is_ipv6 = strchr(ip, ':'); - - int select; - if (*end && !is_ipv6) { // IPV4 range - select = (*end == '/') ? 0 : 1; - *end++ = '\0'; - if (*end == '!') { - info("The exclusion cannot be in the second part of the range %s, it will be ignored.", ipdup); - goto cleanipdup; - } - - if (!select) { // CIDR - select = ip2nl(first.addr8, ip, AF_INET, ipdup); - if (select) - goto cleanipdup; - - select = (int) str2i(end); - if (select < NETDATA_MINIMUM_IPV4_CIDR || select > NETDATA_MAXIMUM_IPV4_CIDR) { - info("The specified CIDR %s is not valid, the IP %s will be ignored.", end, ip); - goto cleanipdup; - } - - last.addr32[0] = htonl(broadcast(ntohl(first.addr32[0]), select)); - // This was added to remove - // https://app.codacy.com/manual/netdata/netdata/pullRequest?prid=5810941&bid=19021977 - UNUSED(last.addr32[0]); - - uint32_t ipv4_test = htonl(ipv4_network(ntohl(first.addr32[0]), select)); - if (first.addr32[0] != ipv4_test) { - first.addr32[0] = ipv4_test; - struct in_addr ipv4_convert; - ipv4_convert.s_addr = ipv4_test; - char ipv4_msg[INET_ADDRSTRLEN]; - if(inet_ntop(AF_INET, &ipv4_convert, ipv4_msg, INET_ADDRSTRLEN)) - info("The network value of CIDR %s was updated for %s .", ipdup, ipv4_msg); - } - } else { // Range - select = ip2nl(first.addr8, ip, AF_INET, ipdup); - if (select) - goto cleanipdup; - - select = ip2nl(last.addr8, end, AF_INET, ipdup); - if (select) - goto cleanipdup; - } - - if (htonl(first.addr32[0]) > htonl(last.addr32[0])) { - info("The specified range %s is invalid, the second address is smallest than the first, it will be ignored.", - ipdup); - goto cleanipdup; - } - } else if (is_ipv6) { // IPV6 - if (!*end) { // Unique - select = ip2nl(first.addr8, ip, AF_INET6, ipdup); - if (select) - goto cleanipdup; - - memcpy(last.addr8, first.addr8, sizeof(first.addr8)); - } else if (*end == '-') { - *end++ = 0x00; - if (*end == '!') { - info("The exclusion cannot be in the second part of the range %s, it will be ignored.", ipdup); - goto cleanipdup; - } - - select = ip2nl(first.addr8, ip, AF_INET6, ipdup); - if (select) - goto cleanipdup; - - select = ip2nl(last.addr8, end, AF_INET6, ipdup); - if (select) - goto cleanipdup; - } else { // CIDR - *end++ = 0x00; - if (*end == '!') { - info("The exclusion cannot be in the second part of the range %s, it will be ignored.", ipdup); - goto cleanipdup; - } - - select = str2i(end); - if (select < 0 || select > 128) { - info("The CIDR %s is not valid, the address %s will be ignored.", end, ip); - goto cleanipdup; - } - - uint64_t prefix = (uint64_t)select; - select = ip2nl(first.addr8, ip, AF_INET6, ipdup); - if (select) - goto cleanipdup; - - get_ipv6_last_addr(&last, &first, prefix); - - union netdata_ip_t ipv6_test; - get_ipv6_first_addr(&ipv6_test, &first, prefix); - - if (memcmp(first.addr8, ipv6_test.addr8, sizeof(union netdata_ip_t)) != 0) { - memcpy(first.addr8, ipv6_test.addr8, sizeof(union netdata_ip_t)); - - struct in6_addr ipv6_convert; - memcpy(ipv6_convert.s6_addr, ipv6_test.addr8, sizeof(union netdata_ip_t)); - - char ipv6_msg[INET6_ADDRSTRLEN]; - if(inet_ntop(AF_INET6, &ipv6_convert, ipv6_msg, INET6_ADDRSTRLEN)) - info("The network value of CIDR %s was updated for %s .", ipdup, ipv6_msg); - } - } - - if ((be64toh(*(uint64_t *)&first.addr32[2]) > be64toh(*(uint64_t *)&last.addr32[2]) && - !memcmp(first.addr32, last.addr32, 2*sizeof(uint32_t))) || - (be64toh(*(uint64_t *)&first.addr32) > be64toh(*(uint64_t *)&last.addr32)) ) { - info("The specified range %s is invalid, the second address is smallest than the first, it will be ignored.", - ipdup); - goto cleanipdup; - } - } else { // Unique ip - select = ip2nl(first.addr8, ip, AF_INET, ipdup); - if (select) - goto cleanipdup; - - memcpy(last.addr8, first.addr8, sizeof(first.addr8)); - } - - ebpf_network_viewer_ip_list_t *store; - -storethisip: - store = callocz(1, sizeof(ebpf_network_viewer_ip_list_t)); - store->value = ipdup; - store->hash = simple_hash(ipdup); - store->ver = (uint8_t)(!is_ipv6)?AF_INET:AF_INET6; - memcpy(store->first.addr8, first.addr8, sizeof(first.addr8)); - memcpy(store->last.addr8, last.addr8, sizeof(last.addr8)); - - fill_ip_list(list, store, "socket"); - return; - -cleanipdup: - freez(ipdup); -} - -/** - * Parse IP Range - * - * Parse the IP ranges given and create Network Viewer IP Structure - * - * @param ptr is a pointer with the text to parse. - */ -static void parse_ips(char *ptr) -{ - // No value - if (unlikely(!ptr)) - return; - - while (likely(ptr)) { - // Move forward until next valid character - while (isspace(*ptr)) ptr++; - - // No valid value found - if (unlikely(!*ptr)) - return; - - // Find space that ends the list - char *end = strchr(ptr, ' '); - if (end) { - *end++ = '\0'; - } - - int neg = 0; - if (*ptr == '!') { - neg++; - ptr++; - } - - if (isascii(*ptr)) { // Parse port - parse_ip_list((!neg)?(void **)&network_viewer_opt.included_ips:(void **)&network_viewer_opt.excluded_ips, - ptr); - } - - ptr = end; - } -} - - -/** - * Parse Port Range - * - * Parse the port ranges given and create Network Viewer Port Structure - * - * @param ptr is a pointer with the text to parse. - */ -static void parse_ports(char *ptr) -{ - // No value - if (unlikely(!ptr)) - return; - - while (likely(ptr)) { - // Move forward until next valid character - while (isspace(*ptr)) ptr++; - - // No valid value found - if (unlikely(!*ptr)) - return; - - // Find space that ends the list - char *end = strchr(ptr, ' '); - if (end) { - *end++ = '\0'; - } - - int neg = 0; - if (*ptr == '!') { - neg++; - ptr++; - } - - if (isdigit(*ptr)) { // Parse port - parse_port_list((!neg)?(void **)&network_viewer_opt.included_port:(void **)&network_viewer_opt.excluded_port, - ptr); - } else if (isalpha(*ptr)) { // Parse service - parse_service_list((!neg)?(void **)&network_viewer_opt.included_port:(void **)&network_viewer_opt.excluded_port, - ptr); - } else if (*ptr == '*') { // All - parse_port_list((!neg)?(void **)&network_viewer_opt.included_port:(void **)&network_viewer_opt.excluded_port, - ptr); - } - - ptr = end; - } -} - -/** - * Link hostname - * - * @param out is the output link list - * @param in the hostname to add to list. - */ -static void link_hostname(ebpf_network_viewer_hostname_list_t **out, ebpf_network_viewer_hostname_list_t *in) -{ - if (likely(*out)) { - ebpf_network_viewer_hostname_list_t *move = *out; - for (; move->next ; move = move->next ) { - if (move->hash == in->hash && !strcmp(move->value, in->value)) { - info("The hostname %s was already inserted, it will be ignored.", in->value); - freez(in->value); - simple_pattern_free(in->value_pattern); - freez(in); - return; - } - } - - move->next = in; - } else { - *out = in; - } -#ifdef NETDATA_INTERNAL_CHECKS - info("Adding value %s to %s hostname list used on network viewer", - in->value, - (*out == network_viewer_opt.included_hostnames)?"included":"excluded"); -#endif -} - -/** - * Link Hostnames - * - * Parse the list of hostnames to create the link list. - * This is not associated with the IP, because simple patterns like *example* cannot be resolved to IP. - * - * @param out is the output link list - * @param parse is a pointer with the text to parser. - */ -static void link_hostnames(char *parse) -{ - // No value - if (unlikely(!parse)) - return; - - while (likely(parse)) { - // Find the first valid value - while (isspace(*parse)) parse++; - - // No valid value found - if (unlikely(!*parse)) - return; - - // Find space that ends the list - char *end = strchr(parse, ' '); - if (end) { - *end++ = '\0'; - } - - int neg = 0; - if (*parse == '!') { - neg++; - parse++; - } - - ebpf_network_viewer_hostname_list_t *hostname = callocz(1 , sizeof(ebpf_network_viewer_hostname_list_t)); - hostname->value = strdupz(parse); - hostname->hash = simple_hash(parse); - hostname->value_pattern = simple_pattern_create(parse, NULL, SIMPLE_PATTERN_EXACT); - - link_hostname((!neg)?&network_viewer_opt.included_hostnames:&network_viewer_opt.excluded_hostnames, - hostname); - - parse = end; - } -} - -/** - * Read max dimension. - * - * Netdata plot two dimensions per connection, so it is necessary to adjust the values. - */ -static void read_max_dimension() -{ - int maxdim ; - maxdim = (int) appconfig_get_number(&collector_config, - EBPF_NETWORK_VIEWER_SECTION, - "maximum dimensions", - NETDATA_NV_CAP_VALUE); - if (maxdim < 0) { - error("'maximum dimensions = %d' must be a positive number, Netdata will change for default value %ld.", - maxdim, NETDATA_NV_CAP_VALUE); - maxdim = NETDATA_NV_CAP_VALUE; - } - - maxdim /= 2; - if (!maxdim) { - info("The number of dimensions is too small (%u), we are setting it to minimum 2", network_viewer_opt.max_dim); - network_viewer_opt.max_dim = 1; - } - - network_viewer_opt.max_dim = (uint32_t)maxdim; -} - -/** - * Parse network viewer section - */ -static void parse_network_viewer_section() -{ - read_max_dimension(); - - network_viewer_opt.hostname_resolution_enabled = appconfig_get_boolean(&collector_config, - EBPF_NETWORK_VIEWER_SECTION, - "resolve hostnames", - CONFIG_BOOLEAN_NO); - - network_viewer_opt.service_resolution_enabled = appconfig_get_boolean(&collector_config, - EBPF_NETWORK_VIEWER_SECTION, - "resolve service names", - CONFIG_BOOLEAN_NO); - - char *value = appconfig_get(&collector_config, EBPF_NETWORK_VIEWER_SECTION, - "ports", NULL); - parse_ports(value); - - if (network_viewer_opt.hostname_resolution_enabled) { - value = appconfig_get(&collector_config, EBPF_NETWORK_VIEWER_SECTION, "hostnames", NULL); - link_hostnames(value); - } else { - info("Name resolution is disabled, collector will not parser \"hostnames\" list."); - } - - value = appconfig_get(&collector_config, EBPF_NETWORK_VIEWER_SECTION, - "ips", "!127.0.0.1/8 10.0.0.0/8 172.16.0.0/12 192.168.0.0/16 fc00::/7 !::1/128"); - parse_ips(value); -} - -/** - * Link dimension name - * - * Link user specified names inside a link list. - * - * @param port the port number associated to the dimension name. - * @param hash the calculated hash for the dimension name. - * @param name the dimension name. - */ -static void link_dimension_name(char *port, uint32_t hash, char *value) -{ - int test = str2i(port); - if (test < NETDATA_MINIMUM_PORT_VALUE || test > NETDATA_MAXIMUM_PORT_VALUE){ - error("The dimension given (%s = %s) has an invalid value and it will be ignored.", port, value); - return; - } - - ebpf_network_viewer_dim_name_t *w; - w = callocz(1, sizeof(ebpf_network_viewer_dim_name_t)); - - w->name = strdupz(value); - w->hash = hash; - - w->port = (uint16_t) htons(test); - - ebpf_network_viewer_dim_name_t *names = network_viewer_opt.names; - if (unlikely(!names)) { - network_viewer_opt.names = w; - } else { - for (; names->next; names = names->next) { - if (names->port == w->port) { - info("Dupplicated definition for a service, the name %s will be ignored. ", names->name); - freez(names->name); - names->name = w->name; - names->hash = w->hash; - freez(w); - return; - } - } - names->next = w; - } - -#ifdef NETDATA_INTERNAL_CHECKS - info("Adding values %s( %u) to dimension name list used on network viewer", w->name, htons(w->port)); -#endif -} - -/** - * Parse service Name section. - * - * This function gets the values that will be used to overwrite dimensions. - */ -static void parse_service_name_section() -{ - struct section *co = appconfig_get_section(&collector_config, EBPF_SERVICE_NAME_SECTION); - if (co) { - struct config_option *cv; - for (cv = co->values; cv ; cv = cv->next) { - link_dimension_name(cv->name, cv->hash, cv->value); - } - } - - // Always associated the default port to Netdata - ebpf_network_viewer_dim_name_t *names = network_viewer_opt.names; - if (names) { - uint16_t default_port = htons(19999); - while (names) { - if (names->port == default_port) - return; - - names = names->next; - } + int i; + int value = (int) appconfig_get_number(&collector_config, EBPF_GLOBAL_SECTION, EBPF_CFG_UPDATE_EVERY, 1); + for (i = 0; ebpf_modules[i].thread_name; i++) { + ebpf_modules[i].update_time = value; } - - char *port_string = getenv("NETDATA_LISTEN_PORT"); - if (port_string) - link_dimension_name(port_string, simple_hash(port_string), "Netdata"); } /** @@ -1633,18 +773,22 @@ static void read_collector_values(int *disable_apps) // Read global section char *value; if (appconfig_exists(&collector_config, EBPF_GLOBAL_SECTION, "load")) // Backward compatibility - value = appconfig_get(&collector_config, EBPF_GLOBAL_SECTION, "load", "entry"); + value = appconfig_get(&collector_config, EBPF_GLOBAL_SECTION, "load", + EBPF_CFG_LOAD_MODE_DEFAULT); else - value = appconfig_get(&collector_config, EBPF_GLOBAL_SECTION, "ebpf load mode", "entry"); + value = appconfig_get(&collector_config, EBPF_GLOBAL_SECTION, EBPF_CFG_LOAD_MODE, + EBPF_CFG_LOAD_MODE_DEFAULT); how_to_load(value); + ebpf_update_interval(); + // This is kept to keep compatibility uint32_t enabled = appconfig_get_boolean(&collector_config, EBPF_GLOBAL_SECTION, "disable apps", CONFIG_BOOLEAN_NO); if (!enabled) { // Apps is a positive sentence, so we need to invert the values to disable apps. - enabled = appconfig_get_boolean(&collector_config, EBPF_GLOBAL_SECTION, "apps", + enabled = appconfig_get_boolean(&collector_config, EBPF_GLOBAL_SECTION, EBPF_CFG_APPLICATION, CONFIG_BOOLEAN_YES); enabled = (enabled == CONFIG_BOOLEAN_NO)?CONFIG_BOOLEAN_YES:CONFIG_BOOLEAN_NO; } @@ -1652,7 +796,7 @@ static void read_collector_values(int *disable_apps) // Read ebpf programs section enabled = appconfig_get_boolean(&collector_config, EBPF_PROGRAMS_SECTION, - ebpf_modules[0].config_name, CONFIG_BOOLEAN_YES); + ebpf_modules[EBPF_MODULE_PROCESS_IDX].config_name, CONFIG_BOOLEAN_YES); int started = 0; if (enabled) { ebpf_enable_chart(EBPF_MODULE_PROCESS_IDX, *disable_apps); @@ -1663,14 +807,16 @@ static void read_collector_values(int *disable_apps) enabled = appconfig_get_boolean(&collector_config, EBPF_PROGRAMS_SECTION, "network viewer", CONFIG_BOOLEAN_NO); if (!enabled) - enabled = appconfig_get_boolean(&collector_config, EBPF_PROGRAMS_SECTION, ebpf_modules[1].config_name, + enabled = appconfig_get_boolean(&collector_config, EBPF_PROGRAMS_SECTION, + ebpf_modules[EBPF_MODULE_SOCKET_IDX].config_name, CONFIG_BOOLEAN_NO); if (enabled) { ebpf_enable_chart(EBPF_MODULE_SOCKET_IDX, *disable_apps); // Read network viewer section if network viewer is enabled - parse_network_viewer_section(); - parse_service_name_section(); + // This is kept here to keep backward compatibility + parse_network_viewer_section(&collector_config); + parse_service_name_section(&collector_config); started++; } @@ -1680,13 +826,30 @@ static void read_collector_values(int *disable_apps) if (!enabled) enabled = appconfig_get_boolean(&collector_config, EBPF_PROGRAMS_SECTION, "network connections", CONFIG_BOOLEAN_NO); - ebpf_modules[1].optional = enabled; + ebpf_modules[EBPF_MODULE_SOCKET_IDX].optional = enabled; + + enabled = appconfig_get_boolean(&collector_config, EBPF_PROGRAMS_SECTION, "cachestat", + CONFIG_BOOLEAN_NO); + + if (enabled) { + ebpf_enable_chart(EBPF_MODULE_CACHESTAT_IDX, *disable_apps); + started++; + } + + enabled = appconfig_get_boolean(&collector_config, EBPF_PROGRAMS_SECTION, "sync", + CONFIG_BOOLEAN_YES); + + if (enabled) { + ebpf_enable_chart(EBPF_MODULE_SYNC_IDX, *disable_apps); + started++; + } if (!started){ ebpf_enable_all_charts(*disable_apps); // Read network viewer section - parse_network_viewer_section(); - parse_service_name_section(); + // This is kept here to keep backward compatibility + parse_network_viewer_section(&collector_config); + parse_service_name_section(&collector_config); } } @@ -1702,10 +865,13 @@ static int load_collector_config(char *path, int *disable_apps) { char lpath[4096]; - snprintf(lpath, 4095, "%s/%s", path, "ebpf.conf"); - - if (!appconfig_load(&collector_config, lpath, 0, NULL)) - return -1; + snprintf(lpath, 4095, "%s/%s", path, NETDATA_EBPF_CONFIG_FILE); + if (!appconfig_load(&collector_config, lpath, 0, NULL)) { + snprintf(lpath, 4095, "%s/%s", path, NETDATA_EBPF_OLD_CONFIG_FILE); + if (!appconfig_load(&collector_config, lpath, 0, NULL)) { + return -1; + } + } read_collector_values(disable_apps); @@ -1756,13 +922,15 @@ static void parse_args(int argc, char **argv) int freq = 0; int option_index = 0; static struct option long_options[] = { - {"help", no_argument, 0, 'h' }, - {"version", no_argument, 0, 'v' }, - {"global", no_argument, 0, 'g' }, - {"all", no_argument, 0, 'a' }, - {"net", no_argument, 0, 'n' }, - {"process", no_argument, 0, 'p' }, - {"return", no_argument, 0, 'r' }, + {"help", no_argument, 0, 'h' }, + {"version", no_argument, 0, 'v' }, + {"global", no_argument, 0, 'g' }, + {"all", no_argument, 0, 'a' }, + {"cachestat", no_argument, 0, 'c' }, + {"net", no_argument, 0, 'n' }, + {"process", no_argument, 0, 'p' }, + {"return", no_argument, 0, 'r' }, + {"sync", no_argument, 0, 's' }, {0, 0, 0, 0} }; @@ -1777,7 +945,7 @@ static void parse_args(int argc, char **argv) } while (1) { - int c = getopt_long(argc, argv, "hvganpr", long_options, &option_index); + int c = getopt_long(argc, argv, "hvgcanprs", long_options, &option_index); if (c == -1) break; @@ -1803,6 +971,15 @@ static void parse_args(int argc, char **argv) ebpf_enable_all_charts(disable_apps); #ifdef NETDATA_INTERNAL_CHECKS info("EBPF running with all chart groups, because it was started with the option \"--all\" or \"-a\"."); +#endif + break; + } + case 'c': { + enabled = 1; + ebpf_enable_chart(EBPF_MODULE_CACHESTAT_IDX, disable_apps); +#ifdef NETDATA_INTERNAL_CHECKS + info( + "EBPF enabling \"CACHESTAT\" charts, because it was started with the option \"--cachestat\" or \"-c\"."); #endif break; } @@ -1827,6 +1004,14 @@ static void parse_args(int argc, char **argv) ebpf_set_thread_mode(MODE_RETURN); #ifdef NETDATA_INTERNAL_CHECKS info("EBPF running in \"return\" mode, because it was started with the option \"--return\" or \"-r\"."); +#endif + break; + } + case 's': { + enabled = 1; + ebpf_enable_chart(EBPF_MODULE_SYNC_IDX, disable_apps); +#ifdef NETDATA_INTERNAL_CHECKS + info("EBPF enabling \"sync\" chart, because it was started with the option \"--sync\" or \"-s\"."); #endif break; } @@ -1948,9 +1133,16 @@ int main(int argc, char **argv) read_local_ports("/proc/net/udp6", IPPROTO_UDP); struct netdata_static_thread ebpf_threads[] = { - {"EBPF PROCESS", NULL, NULL, 1, NULL, NULL, ebpf_modules[0].start_routine}, - {"EBPF SOCKET" , NULL, NULL, 1, NULL, NULL, ebpf_modules[1].start_routine}, - {NULL , NULL, NULL, 0, NULL, NULL, NULL} + {"EBPF PROCESS", NULL, NULL, 1, + NULL, NULL, ebpf_modules[EBPF_MODULE_PROCESS_IDX].start_routine}, + {"EBPF SOCKET" , NULL, NULL, 1, + NULL, NULL, ebpf_modules[EBPF_MODULE_SOCKET_IDX].start_routine}, + {"EBPF CACHESTAT" , NULL, NULL, 1, + NULL, NULL, ebpf_modules[EBPF_MODULE_CACHESTAT_IDX].start_routine}, + {"EBPF SYNC" , NULL, NULL, 1, + NULL, NULL, ebpf_modules[EBPF_MODULE_SYNC_IDX].start_routine}, + {NULL , NULL, NULL, 0, + NULL, NULL, NULL} }; //clean_loaded_events(); diff --git a/collectors/ebpf.plugin/ebpf.conf b/collectors/ebpf.plugin/ebpf.conf deleted file mode 100644 index 3a5b77395..000000000 --- a/collectors/ebpf.plugin/ebpf.conf +++ /dev/null @@ -1,45 +0,0 @@ -# -# Global options -# -# The `ebpf load mode` option accepts the following values : -# `entry` : The eBPF collector only monitors calls for the functions, and does not show charts related to errors. -# `return : In the `return` mode, the eBPF collector monitors the same kernel functions as `entry`, but also creates -# new charts for the return of these functions, such as errors. -# -# The eBPF collector also creates charts for each running application through an integration with the `apps plugin`. -# If you want to disable the integration with `apps.plugin` along with the above charts, change the setting `apps` to -# 'no'. -# -[global] - ebpf load mode = entry - apps = yes - -# -# eBPF Programs -# -# The eBPF collector enables and runs the following eBPF programs by default: -# -# `process` : This eBPF program creates charts that show information about process creation, VFS IO, and -# files removed. -# `socket` : This eBPF program creates charts with information about `TCP` and `UDP` functions, including the -# bandwidth consumed by each. -[ebpf programs] - process = yes - socket = yes - network connections = no - -# -# Network Connection -# -# This is a feature with status WIP(Work in Progress) -# -[network connections] - maximum dimensions = 50 - resolve hostnames = no - resolve service names = no - ports = * - ips = !127.0.0.1/8 10.0.0.0/8 172.16.0.0/12 192.168.0.0/16 fc00::/7 !::1/128 - hostnames = * - -[service name] - 19999 = Netdata diff --git a/collectors/ebpf.plugin/ebpf.d.conf b/collectors/ebpf.plugin/ebpf.d.conf new file mode 100644 index 000000000..7191d7416 --- /dev/null +++ b/collectors/ebpf.plugin/ebpf.d.conf @@ -0,0 +1,36 @@ +# +# Global options +# +# The `ebpf load mode` option accepts the following values : +# `entry` : The eBPF collector only monitors calls for the functions, and does not show charts related to errors. +# `return : In the `return` mode, the eBPF collector monitors the same kernel functions as `entry`, but also creates +# new charts for the return of these functions, such as errors. +# +# The eBPF collector also creates charts for each running application through an integration with the `apps plugin`. +# If you want to disable the integration with `apps.plugin` along with the above charts, change the setting `apps` to +# 'no'. +# +# The `update every` option defines the number of seconds used to read data from kernel and send to netdata +[global] + ebpf load mode = entry + apps = yes + update every = 1 + +# +# eBPF Programs +# +# The eBPF collector enables and runs the following eBPF programs by default: +# +# `cachestat`: Make charts for kernel functions related to page cache. +# `process` : This eBPF program creates charts that show information about process creation, VFS IO, and +# files removed. +# `socket` : This eBPF program creates charts with information about `TCP` and `UDP` functions, including the +# bandwidth consumed by each. +# `sync` : Montitor calls for syscall sync(2). +[ebpf programs] + cachestat = no + process = yes + socket = yes + sync = yes + network connections = no + diff --git a/collectors/ebpf.plugin/ebpf.d/cachestat.conf b/collectors/ebpf.plugin/ebpf.d/cachestat.conf new file mode 100644 index 000000000..78277cf56 --- /dev/null +++ b/collectors/ebpf.plugin/ebpf.d/cachestat.conf @@ -0,0 +1,14 @@ +# The `ebpf load mode` option accepts the following values : +# `entry` : The eBPF collector only monitors calls for the functions, and does not show charts related to errors. +# `return : In the `return` mode, the eBPF collector monitors the same kernel functions as `entry`, but also creates +# new charts for the return of these functions, such as errors. +# +# The eBPF collector also creates charts for each running application through an integration with the `apps plugin`. +# If you want to disable the integration with `apps.plugin` along with the above charts, change the setting `apps` to +# 'no'. +# +# +[global] + ebpf load mode = entry + apps = yes + update every = 2 diff --git a/collectors/ebpf.plugin/ebpf.d/ebpf_kernel_reject_list.txt b/collectors/ebpf.plugin/ebpf.d/ebpf_kernel_reject_list.txt new file mode 100644 index 000000000..539bf357f --- /dev/null +++ b/collectors/ebpf.plugin/ebpf.d/ebpf_kernel_reject_list.txt @@ -0,0 +1 @@ +Ubuntu 4.18.0 diff --git a/collectors/ebpf.plugin/ebpf.d/network.conf b/collectors/ebpf.plugin/ebpf.d/network.conf new file mode 100644 index 000000000..b033bc39c --- /dev/null +++ b/collectors/ebpf.plugin/ebpf.d/network.conf @@ -0,0 +1,30 @@ +# The `ebpf load mode` option accepts the following values : +# `entry` : The eBPF collector only monitors calls for the functions, and does not show charts related to errors. +# `return : In the `return` mode, the eBPF collector monitors the same kernel functions as `entry`, but also creates +# new charts for the return of these functions, such as errors. +# +# The eBPF collector also creates charts for each running application through an integration with the `apps plugin`. +# If you want to disable the integration with `apps.plugin` along with the above charts, change the setting `apps` to +# 'no'. +# +# +[global] + ebpf load mode = entry + apps = yes + update every = 1 + +# +# Network Connection +# +# This is a feature with status WIP(Work in Progress) +# +[network connections] + maximum dimensions = 50 + resolve hostnames = no + resolve service names = no + ports = * + ips = !127.0.0.1/8 10.0.0.0/8 172.16.0.0/12 192.168.0.0/16 fc00::/7 !::1/128 + hostnames = * + +[service name] + 19999 = Netdata diff --git a/collectors/ebpf.plugin/ebpf.d/process.conf b/collectors/ebpf.plugin/ebpf.d/process.conf new file mode 100644 index 000000000..7806dc844 --- /dev/null +++ b/collectors/ebpf.plugin/ebpf.d/process.conf @@ -0,0 +1,14 @@ +# The `ebpf load mode` option accepts the following values : +# `entry` : The eBPF collector only monitors calls for the functions, and does not show charts related to errors. +# `return : In the `return` mode, the eBPF collector monitors the same kernel functions as `entry`, but also creates +# new charts for the return of these functions, such as errors. +# +# The eBPF collector also creates charts for each running application through an integration with the `apps plugin`. +# If you want to disable the integration with `apps.plugin` along with the above charts, change the setting `apps` to +# 'no'. +# +# +[global] + ebpf load mode = entry + apps = yes + update every = 1 diff --git a/collectors/ebpf.plugin/ebpf.d/sync.conf b/collectors/ebpf.plugin/ebpf.d/sync.conf new file mode 100644 index 000000000..de28f3394 --- /dev/null +++ b/collectors/ebpf.plugin/ebpf.d/sync.conf @@ -0,0 +1,23 @@ +# The `ebpf load mode` option accepts the following values : +# `entry` : The eBPF collector only monitors calls for the functions, and does not show charts related to errors. +# `return : In the `return` mode, the eBPF collector monitors the same kernel functions as `entry`, but also creates +# new charts for the return of these functions, such as errors. +# +# The eBPF collector also creates charts for each running application through an integration with the `apps plugin`. +# If you want to disable the integration with `apps.plugin` along with the above charts, change the setting `apps` to +# 'no'. +# +# +[global] + ebpf load mode = entry + apps = yes + update every = 2 + +# List of monitored syscalls +[syscalls] + sync = yes + msync = yes + fsync = yes + fdatasync = yes + syncfs = yes + sync_file_range = yes diff --git a/collectors/ebpf.plugin/ebpf.h b/collectors/ebpf.plugin/ebpf.h index 35013c2b2..6796dcdad 100644 --- a/collectors/ebpf.plugin/ebpf.h +++ b/collectors/ebpf.plugin/ebpf.h @@ -31,6 +31,9 @@ #include "ebpf_apps.h" +#define NETDATA_EBPF_OLD_CONFIG_FILE "ebpf.conf" +#define NETDATA_EBPF_CONFIG_FILE "ebpf.d.conf" + typedef struct netdata_syscall_stat { unsigned long bytes; // total number of bytes uint64_t call; // total number of calls @@ -70,8 +73,12 @@ typedef struct netdata_error_report { } netdata_error_report_t; extern ebpf_module_t ebpf_modules[]; -#define EBPF_MODULE_PROCESS_IDX 0 -#define EBPF_MODULE_SOCKET_IDX 1 +enum ebpf_module_indexes { + EBPF_MODULE_PROCESS_IDX, + EBPF_MODULE_SOCKET_IDX, + EBPF_MODULE_CACHESTAT_IDX, + EBPF_MODULE_SYNC_IDX +}; // Copied from musl header #ifndef offsetof @@ -84,6 +91,9 @@ extern ebpf_module_t ebpf_modules[]; // Chart defintions #define NETDATA_EBPF_FAMILY "ebpf" +#define NETDATA_EBPF_CHART_TYPE_LINE "line" +#define NETDATA_EBPF_CHART_TYPE_STACKED "stacked" +#define NETDATA_EBPF_MEMORY_GROUP "mem" // Log file #define NETDATA_DEVELOPER_LOG_FILE "developer.log" @@ -133,6 +143,7 @@ extern void ebpf_write_chart_cmd(char *type, char *units, char *family, char *charttype, + char *context, int order); extern void ebpf_write_global_dimension(char *name, char *id, char *algorithm); @@ -144,6 +155,8 @@ extern void ebpf_create_chart(char *type, char *title, char *units, char *family, + char *context, + char *charttype, int order, void (*ncd)(void *, int), void *move, @@ -166,6 +179,7 @@ extern void ebpf_create_charts_on_apps(char *name, char *title, char *units, char *family, + char *charttype, int order, char *algorithm, struct target *root); @@ -174,11 +188,9 @@ extern void write_end_chart(); extern void ebpf_cleanup_publish_syscall(netdata_publish_syscall_t *nps); -#define EBPF_GLOBAL_SECTION "global" #define EBPF_PROGRAMS_SECTION "ebpf programs" -#define EBPF_NETWORK_VIEWER_SECTION "network connections" -#define EBPF_SERVICE_NAME_SECTION "service name" +#define EBPF_COMMON_DIMENSION_PERCENTAGE "%" #define EBPF_COMMON_DIMENSION_CALL "calls/s" #define EBPF_COMMON_DIMENSION_BITS "kilobits/s" #define EBPF_COMMON_DIMENSION_BYTES "bytes/s" @@ -186,22 +198,24 @@ extern void ebpf_cleanup_publish_syscall(netdata_publish_syscall_t *nps); #define EBPF_COMMON_DIMENSION_PACKETS "packets" // Common variables -extern char *ebpf_user_config_dir; -extern char *ebpf_stock_config_dir; extern int debug_enabled; extern struct pid_stat *root_of_pids; extern char *ebpf_algorithms[]; - -// Socket functions and variables -// Common functions -extern void ebpf_socket_create_apps_charts(ebpf_module_t *em, struct target *root); -extern collected_number get_value_from_structure(char *basis, size_t offset); +extern struct config collector_config; extern struct pid_stat *root_of_pids; extern ebpf_process_stat_t *global_process_stat; extern size_t all_pids_count; extern int update_every; extern uint32_t finalized_threads; +// Socket functions and variables +// Common functions +extern void ebpf_process_create_apps_charts(struct ebpf_module *em, void *ptr); +extern void ebpf_socket_create_apps_charts(struct ebpf_module *em, void *ptr); +extern void ebpf_cachestat_create_apps_charts(struct ebpf_module *em, void *root); +extern void ebpf_one_dimension_write_charts(char *family, char *chart, char *dim, long long v1); +extern collected_number get_value_from_structure(char *basis, size_t offset); + #define EBPF_MAX_SYNCHRONIZATION_TIME 300 #endif /* NETDATA_COLLECTOR_EBPF_H */ diff --git a/collectors/ebpf.plugin/ebpf_apps.c b/collectors/ebpf.plugin/ebpf_apps.c index 844ce23b8..1be7b9260 100644 --- a/collectors/ebpf.plugin/ebpf_apps.c +++ b/collectors/ebpf.plugin/ebpf_apps.c @@ -909,6 +909,26 @@ static inline void del_pid_entry(pid_t pid) all_pids_count--; } +/** + * Cleanup variable from other threads + * + * @param pid current pid. + */ +void cleanup_variables_from_other_threads(uint32_t pid) +{ + // Clean socket structures + if (socket_bandwidth_curr) { + freez(socket_bandwidth_curr[pid]); + socket_bandwidth_curr[pid] = NULL; + } + + // Clean cachestat strcture + if (cachestat_pid) { + freez(cachestat_pid[pid]); + cachestat_pid[pid] = NULL; + } +} + /** * Remove PIDs when they are not running more. */ @@ -932,11 +952,7 @@ void cleanup_exited_pids() freez(current_apps_data[r]); current_apps_data[r] = NULL; - // Clean socket structures - if (socket_bandwidth_curr) { - freez(socket_bandwidth_curr[r]); - socket_bandwidth_curr[r] = NULL; - } + cleanup_variables_from_other_threads(r); } else { if (unlikely(p->keep)) p->keeploops++; @@ -1054,11 +1070,7 @@ void collect_data_for_all_processes(int tbl_pid_stats_fd) freez(current_apps_data[key]); current_apps_data[key] = NULL; - // Clean socket structures - if (socket_bandwidth_curr) { - freez(socket_bandwidth_curr[key]); - socket_bandwidth_curr[key] = NULL; - } + cleanup_variables_from_other_threads(key); pids = pids->next; continue; diff --git a/collectors/ebpf.plugin/ebpf_apps.h b/collectors/ebpf.plugin/ebpf_apps.h index f8cb7ac72..eb54754c6 100644 --- a/collectors/ebpf.plugin/ebpf_apps.h +++ b/collectors/ebpf.plugin/ebpf_apps.h @@ -11,12 +11,15 @@ #include "libnetdata/ebpf/ebpf.h" #define NETDATA_APPS_FAMILY "apps" -#define NETDATA_APPS_FILE_GROUP "ebpf file" -#define NETDATA_APPS_VFS_GROUP "ebpf vfs" -#define NETDATA_APPS_PROCESS_GROUP "ebpf process" -#define NETDATA_APPS_NET_GROUP "ebpf net" +#define NETDATA_APPS_FILE_GROUP "file (eBPF)" +#define NETDATA_APPS_VFS_GROUP "vfs (eBPF)" +#define NETDATA_APPS_PROCESS_GROUP "process (eBPF)" +#define NETDATA_APPS_NET_GROUP "net (eBPF)" +#define NETDATA_APPS_CACHESTAT_GROUP "page cache (eBPF)" #include "ebpf_process.h" +#include "ebpf_cachestat.h" +#include "ebpf_sync.h" #define MAX_COMPARE_NAME 100 #define MAX_NAME 100 @@ -105,6 +108,9 @@ struct target { uid_t uid; gid_t gid; + // Page cache statistic per process + netdata_publish_cachestat_t cachestat; + /* These variables are not necessary for eBPF collector kernel_uint_t minflt; kernel_uint_t cminflt; @@ -426,5 +432,6 @@ extern void collect_data_for_all_processes(int tbl_pid_stats_fd); extern ebpf_process_stat_t **global_process_stats; extern ebpf_process_publish_apps_t **current_apps_data; +extern netdata_publish_cachestat_t **cachestat_pid; #endif /* NETDATA_EBPF_APPS_H */ diff --git a/collectors/ebpf.plugin/ebpf_cachestat.c b/collectors/ebpf.plugin/ebpf_cachestat.c new file mode 100644 index 000000000..6516d4da2 --- /dev/null +++ b/collectors/ebpf.plugin/ebpf_cachestat.c @@ -0,0 +1,655 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "ebpf.h" +#include "ebpf_cachestat.h" + +static ebpf_data_t cachestat_data; +netdata_publish_cachestat_t **cachestat_pid; + +static struct bpf_link **probe_links = NULL; +static struct bpf_object *objects = NULL; + +static char *cachestat_counter_dimension_name[NETDATA_CACHESTAT_END] = { "ratio", "dirty", "hit", + "miss" }; +static netdata_syscall_stat_t cachestat_counter_aggregated_data[NETDATA_CACHESTAT_END]; +static netdata_publish_syscall_t cachestat_counter_publish_aggregated[NETDATA_CACHESTAT_END]; + +netdata_cachestat_pid_t *cachestat_vector = NULL; + +static netdata_idx_t *cachestat_hash_values = NULL; + +static int read_thread_closed = 1; + +struct netdata_static_thread cachestat_threads = {"CACHESTAT KERNEL", + NULL, NULL, 1, NULL, + NULL, NULL}; + +static int *map_fd = NULL; + +struct config cachestat_config = { .first_section = NULL, + .last_section = NULL, + .mutex = NETDATA_MUTEX_INITIALIZER, + .index = { .avl_tree = { .root = NULL, .compar = appconfig_section_compare }, + .rwlock = AVL_LOCK_INITIALIZER } }; + +/***************************************************************** + * + * FUNCTIONS TO CLOSE THE THREAD + * + *****************************************************************/ + +/** + * Clean PID structures + * + * Clean the allocated structures. + */ +static void clean_pid_structures() { + struct pid_stat *pids = root_of_pids; + while (pids) { + freez(cachestat_pid[pids->pid]); + + pids = pids->next; + } +} + +/** + * Clean up the main thread. + * + * @param ptr thread data. + */ +static void ebpf_cachestat_cleanup(void *ptr) +{ + ebpf_module_t *em = (ebpf_module_t *)ptr; + if (!em->enabled) + return; + + heartbeat_t hb; + heartbeat_init(&hb); + uint32_t tick = 2*USEC_PER_MS; + while (!read_thread_closed) { + usec_t dt = heartbeat_next(&hb, tick); + UNUSED(dt); + } + + clean_pid_structures(); + freez(cachestat_pid); + + ebpf_cleanup_publish_syscall(cachestat_counter_publish_aggregated); + + freez(cachestat_vector); + freez(cachestat_hash_values); + + struct bpf_program *prog; + size_t i = 0 ; + bpf_object__for_each_program(prog, objects) { + bpf_link__destroy(probe_links[i]); + i++; + } + bpf_object__close(objects); +} + +/***************************************************************** + * + * COMMON FUNCTIONS + * + *****************************************************************/ + +/** + * Update publish + * + * Update publish values before to write dimension. + * + * @param out strcuture that will receive data. + * @param mpa calls for mark_page_accessed during the last second. + * @param mbd calls for mark_buffer_dirty during the last second. + * @param apcl calls for add_to_page_cache_lru during the last second. + * @param apd calls for account_page_dirtied during the last second. + */ +void cachestat_update_publish(netdata_publish_cachestat_t *out, uint64_t mpa, uint64_t mbd, + uint64_t apcl, uint64_t apd) +{ + // Adapted algorithm from https://github.com/iovisor/bcc/blob/master/tools/cachestat.py#L126-L138 + calculated_number total = (calculated_number) (((long long)mpa) - ((long long)mbd)); + if (total < 0) + total = 0; + + calculated_number misses = (calculated_number) ( ((long long) apcl) - ((long long) apd) ); + if (misses < 0) + misses = 0; + + // If hits are < 0, then its possible misses are overestimate due to possibly page cache read ahead adding + // more pages than needed. In this case just assume misses as total and reset hits. + calculated_number hits = total - misses; + if (hits < 0 ) { + misses = total; + hits = 0; + } + + calculated_number ratio = (total > 0) ? hits/total : 0; + + out->ratio = (long long )(ratio*100); + out->hit = (long long)hits; + out->miss = (long long)misses; +} + +/** + * Save previous values + * + * Save values used this time. + * + * @param publish + */ +static void save_previous_values(netdata_publish_cachestat_t *publish) { + publish->prev.mark_page_accessed = cachestat_hash_values[NETDATA_KEY_CALLS_MARK_PAGE_ACCESSED]; + publish->prev.account_page_dirtied = cachestat_hash_values[NETDATA_KEY_CALLS_ACCOUNT_PAGE_DIRTIED]; + publish->prev.add_to_page_cache_lru = cachestat_hash_values[NETDATA_KEY_CALLS_ADD_TO_PAGE_CACHE_LRU]; + publish->prev.mark_buffer_dirty = cachestat_hash_values[NETDATA_KEY_CALLS_MARK_BUFFER_DIRTY]; +} + +/** + * Calculate statistics + * + * @param publish the structure where we will store the data. + */ +static void calculate_stats(netdata_publish_cachestat_t *publish) { + if (!publish->prev.mark_page_accessed) { + save_previous_values(publish); + return; + } + + uint64_t mpa = cachestat_hash_values[NETDATA_KEY_CALLS_MARK_PAGE_ACCESSED] - publish->prev.mark_page_accessed; + uint64_t mbd = cachestat_hash_values[NETDATA_KEY_CALLS_MARK_BUFFER_DIRTY] - publish->prev.mark_buffer_dirty; + uint64_t apcl = cachestat_hash_values[NETDATA_KEY_CALLS_ADD_TO_PAGE_CACHE_LRU] - publish->prev.add_to_page_cache_lru; + uint64_t apd = cachestat_hash_values[NETDATA_KEY_CALLS_ACCOUNT_PAGE_DIRTIED] - publish->prev.account_page_dirtied; + + save_previous_values(publish); + + // We are changing the original algorithm to have a smooth ratio. + cachestat_update_publish(publish, mpa, mbd, apcl, apd); +} + + +/***************************************************************** + * + * APPS + * + *****************************************************************/ + +/** + * Apps Accumulator + * + * Sum all values read from kernel and store in the first address. + * + * @param out the vector with read values. + */ +static void cachestat_apps_accumulator(netdata_cachestat_pid_t *out) +{ + int i, end = (running_on_kernel >= NETDATA_KERNEL_V4_15) ? ebpf_nprocs : 1; + netdata_cachestat_pid_t *total = &out[0]; + for (i = 1; i < end; i++) { + netdata_cachestat_pid_t *w = &out[i]; + total->account_page_dirtied += w->account_page_dirtied; + total->add_to_page_cache_lru += w->add_to_page_cache_lru; + total->mark_buffer_dirty += w->mark_buffer_dirty; + total->mark_page_accessed += w->mark_page_accessed; + } +} + +/** + * Save Pid values + * + * Save the current values inside the structure + * + * @param out vector used to plot charts + * @param publish vector with values read from hash tables. + */ +static inline void cachestat_save_pid_values(netdata_publish_cachestat_t *out, netdata_cachestat_pid_t *publish) +{ + if (!out->current.mark_page_accessed) { + memcpy(&out->current, &publish[0], sizeof(netdata_cachestat_pid_t)); + return; + } + + memcpy(&out->prev, &out->current, sizeof(netdata_cachestat_pid_t)); + memcpy(&out->current, &publish[0], sizeof(netdata_cachestat_pid_t)); +} + +/** + * Fill PID + * + * Fill PID structures + * + * @param current_pid pid that we are collecting data + * @param out values read from hash tables; + */ +static void cachestat_fill_pid(uint32_t current_pid, netdata_cachestat_pid_t *publish) +{ + netdata_publish_cachestat_t *curr = cachestat_pid[current_pid]; + if (!curr) { + curr = callocz(1, sizeof(netdata_publish_cachestat_t)); + cachestat_pid[current_pid] = curr; + + cachestat_save_pid_values(curr, publish); + return; + } + + cachestat_save_pid_values(curr, publish); +} + +/** + * Read APPS table + * + * Read the apps table and store data inside the structure. + */ +static void read_apps_table() +{ + netdata_cachestat_pid_t *cv = cachestat_vector; + uint32_t key; + struct pid_stat *pids = root_of_pids; + int fd = map_fd[NETDATA_CACHESTAT_PID_STATS]; + size_t length = sizeof(netdata_cachestat_pid_t)*ebpf_nprocs; + while (pids) { + key = pids->pid; + + if (bpf_map_lookup_elem(fd, &key, cv)) { + pids = pids->next; + continue; + } + + cachestat_apps_accumulator(cv); + + cachestat_fill_pid(key, cv); + + // We are cleaning to avoid passing data read from one process to other. + memset(cv, 0, length); + + pids = pids->next; + } +} + +/** + * Create apps charts + * + * Call ebpf_create_chart to create the charts on apps submenu. + * + * @param em a pointer to the structure with the default values. + */ +void ebpf_cachestat_create_apps_charts(struct ebpf_module *em, void *ptr) +{ + UNUSED(em); + struct target *root = ptr; + ebpf_create_charts_on_apps(NETDATA_CACHESTAT_HIT_RATIO_CHART, + "The ratio is calculated dividing the Hit pages per total cache accesses without counting dirties.", + EBPF_COMMON_DIMENSION_PERCENTAGE, + NETDATA_APPS_CACHESTAT_GROUP, + NETDATA_EBPF_CHART_TYPE_STACKED, + 20090, + ebpf_algorithms[NETDATA_EBPF_ABSOLUTE_IDX], + root); + + ebpf_create_charts_on_apps(NETDATA_CACHESTAT_DIRTY_CHART, + "Number of pages marked as dirty. When a page is called dirty, this means that the data stored inside the page needs to be written to devices.", + EBPF_CACHESTAT_DIMENSION_PAGE, + NETDATA_APPS_CACHESTAT_GROUP, + NETDATA_EBPF_CHART_TYPE_STACKED, + 20091, + ebpf_algorithms[NETDATA_EBPF_INCREMENTAL_IDX], + root); + + ebpf_create_charts_on_apps(NETDATA_CACHESTAT_HIT_CHART, + "Number of cache access without counting dirty pages and page additions.", + EBPF_CACHESTAT_DIMENSION_HITS, + NETDATA_APPS_CACHESTAT_GROUP, + NETDATA_EBPF_CHART_TYPE_STACKED, + 20092, + ebpf_algorithms[NETDATA_EBPF_ABSOLUTE_IDX], + root); + + ebpf_create_charts_on_apps(NETDATA_CACHESTAT_MISSES_CHART, + "Page caches added without counting dirty pages", + EBPF_CACHESTAT_DIMENSION_MISSES, + NETDATA_APPS_CACHESTAT_GROUP, + NETDATA_EBPF_CHART_TYPE_STACKED, + 20093, + ebpf_algorithms[NETDATA_EBPF_ABSOLUTE_IDX], + root); +} + +/***************************************************************** + * + * MAIN LOOP + * + *****************************************************************/ + +/** + * Read global counter + * + * Read the table with number of calls for all functions + */ +static void read_global_table() +{ + uint32_t idx; + netdata_idx_t *val = cachestat_hash_values; + netdata_idx_t stored; + int fd = map_fd[NETDATA_CACHESTAT_GLOBAL_STATS]; + + for (idx = NETDATA_KEY_CALLS_ADD_TO_PAGE_CACHE_LRU; idx < NETDATA_CACHESTAT_END; idx++) { + if (!bpf_map_lookup_elem(fd, &idx, &stored)) { + val[idx] = stored; + } + } +} + +/** + * Socket read hash + * + * This is the thread callback. + * This thread is necessary, because we cannot freeze the whole plugin to read the data on very busy socket. + * + * @param ptr It is a NULL value for this thread. + * + * @return It always returns NULL. + */ +void *ebpf_cachestat_read_hash(void *ptr) +{ + read_thread_closed = 0; + + heartbeat_t hb; + heartbeat_init(&hb); + + ebpf_module_t *em = (ebpf_module_t *)ptr; + + usec_t step = NETDATA_LATENCY_CACHESTAT_SLEEP_MS * em->update_time; + int apps = em->apps_charts; + while (!close_ebpf_plugin) { + usec_t dt = heartbeat_next(&hb, step); + (void)dt; + + read_global_table(); + + if (apps) + read_apps_table(); + } + read_thread_closed = 1; + + return NULL; +} + +/** + * Send global + * + * Send global charts to Netdata + */ +static void cachestat_send_global(netdata_publish_cachestat_t *publish) +{ + calculate_stats(publish); + + netdata_publish_syscall_t *ptr = cachestat_counter_publish_aggregated; + // The algorithm sets this value to zero sometimes, we are not written them to have a smooth chart + if (publish->ratio) { + ebpf_one_dimension_write_charts( + NETDATA_EBPF_MEMORY_GROUP, NETDATA_CACHESTAT_HIT_RATIO_CHART, ptr[NETDATA_CACHESTAT_IDX_RATIO].dimension, + publish->ratio); + } + + ebpf_one_dimension_write_charts( + NETDATA_EBPF_MEMORY_GROUP, NETDATA_CACHESTAT_DIRTY_CHART, ptr[NETDATA_CACHESTAT_IDX_DIRTY].dimension, + cachestat_hash_values[NETDATA_KEY_CALLS_MARK_BUFFER_DIRTY]); + + ebpf_one_dimension_write_charts( + NETDATA_EBPF_MEMORY_GROUP, NETDATA_CACHESTAT_HIT_CHART, ptr[NETDATA_CACHESTAT_IDX_HIT].dimension, publish->hit); + + ebpf_one_dimension_write_charts( + NETDATA_EBPF_MEMORY_GROUP, NETDATA_CACHESTAT_MISSES_CHART, ptr[NETDATA_CACHESTAT_IDX_MISS].dimension, + publish->miss); +} + +/** + * Cachestat sum PIDs + * + * Sum values for all PIDs associated to a group + * + * @param publish output structure. + * @param root structure with listed IPs + */ +void ebpf_cachestat_sum_pids(netdata_publish_cachestat_t *publish, struct pid_on_target *root) +{ + memcpy(&publish->prev, &publish->current,sizeof(publish->current)); + memset(&publish->current, 0, sizeof(publish->current)); + + netdata_cachestat_pid_t *dst = &publish->current; + while (root) { + int32_t pid = root->pid; + netdata_publish_cachestat_t *w = cachestat_pid[pid]; + if (w) { + netdata_cachestat_pid_t *src = &w->current; + dst->account_page_dirtied += src->account_page_dirtied; + dst->add_to_page_cache_lru += src->add_to_page_cache_lru; + dst->mark_buffer_dirty += src->mark_buffer_dirty; + dst->mark_page_accessed += src->mark_page_accessed; + } + + root = root->next; + } +} + +/** + * Send data to Netdata calling auxiliar functions. + * + * @param root the target list. +*/ +void ebpf_cache_send_apps_data(struct target *root) +{ + struct target *w; + collected_number value; + + write_begin_chart(NETDATA_APPS_FAMILY, NETDATA_CACHESTAT_HIT_RATIO_CHART); + for (w = root; w; w = w->next) { + if (unlikely(w->exposed && w->processes)) { + ebpf_cachestat_sum_pids(&w->cachestat, w->root_pid); + netdata_cachestat_pid_t *current = &w->cachestat.current; + netdata_cachestat_pid_t *prev = &w->cachestat.prev; + + uint64_t mpa = current->mark_page_accessed - prev->mark_page_accessed; + uint64_t mbd = current->mark_buffer_dirty - prev->mark_buffer_dirty; + w->cachestat.dirty = current->mark_buffer_dirty; + uint64_t apcl = current->add_to_page_cache_lru - prev->add_to_page_cache_lru; + uint64_t apd = current->account_page_dirtied - prev->account_page_dirtied; + + cachestat_update_publish(&w->cachestat, mpa, mbd, apcl, apd); + value = (collected_number) w->cachestat.ratio; + // Here we are using different approach to have a chart more smooth + write_chart_dimension(w->name, value); + } + } + write_end_chart(); + + write_begin_chart(NETDATA_APPS_FAMILY, NETDATA_CACHESTAT_DIRTY_CHART); + for (w = root; w; w = w->next) { + if (unlikely(w->exposed && w->processes)) { + value = (collected_number) w->cachestat.dirty; + write_chart_dimension(w->name, value); + } + } + write_end_chart(); + + write_begin_chart(NETDATA_APPS_FAMILY, NETDATA_CACHESTAT_HIT_CHART); + for (w = root; w; w = w->next) { + if (unlikely(w->exposed && w->processes)) { + value = (collected_number) w->cachestat.hit; + write_chart_dimension(w->name, value); + } + } + write_end_chart(); + + write_begin_chart(NETDATA_APPS_FAMILY, NETDATA_CACHESTAT_MISSES_CHART); + for (w = root; w; w = w->next) { + if (unlikely(w->exposed && w->processes)) { + value = (collected_number) w->cachestat.miss; + write_chart_dimension(w->name, value); + } + } + write_end_chart(); +} + +/** +* Main loop for this collector. +*/ +static void cachestat_collector(ebpf_module_t *em) +{ + cachestat_threads.thread = mallocz(sizeof(netdata_thread_t)); + cachestat_threads.start_routine = ebpf_cachestat_read_hash; + + map_fd = cachestat_data.map_fd; + + netdata_thread_create(cachestat_threads.thread, cachestat_threads.name, NETDATA_THREAD_OPTION_JOINABLE, + ebpf_cachestat_read_hash, em); + + netdata_publish_cachestat_t publish; + memset(&publish, 0, sizeof(publish)); + int apps = em->apps_charts; + while (!close_ebpf_plugin) { + pthread_mutex_lock(&collect_data_mutex); + pthread_cond_wait(&collect_data_cond_var, &collect_data_mutex); + + pthread_mutex_lock(&lock); + + cachestat_send_global(&publish); + + if (apps) + ebpf_cache_send_apps_data(apps_groups_root_target); + + pthread_mutex_unlock(&lock); + pthread_mutex_unlock(&collect_data_mutex); + } +} + +/***************************************************************** + * + * INITIALIZE THREAD + * + *****************************************************************/ + +/** + * Create global charts + * + * Call ebpf_create_chart to create the charts for the collector. + */ +static void ebpf_create_memory_charts() +{ + ebpf_create_chart(NETDATA_EBPF_MEMORY_GROUP, NETDATA_CACHESTAT_HIT_RATIO_CHART, + "Hit is calculating using total cache added without dirties per total added because of red misses.", + EBPF_CACHESTAT_DIMENSION_HITS, NETDATA_CACHESTAT_SUBMENU, + NULL, + NETDATA_EBPF_CHART_TYPE_LINE, + 21100, + ebpf_create_global_dimension, + cachestat_counter_publish_aggregated, 1); + + ebpf_create_chart(NETDATA_EBPF_MEMORY_GROUP, NETDATA_CACHESTAT_DIRTY_CHART, + "Number of dirty pages added to the page cache.", + EBPF_CACHESTAT_DIMENSION_PAGE, NETDATA_CACHESTAT_SUBMENU, + NULL, + NETDATA_EBPF_CHART_TYPE_LINE, + 21101, + ebpf_create_global_dimension, + &cachestat_counter_publish_aggregated[NETDATA_CACHESTAT_IDX_DIRTY], 1); + + ebpf_create_chart(NETDATA_EBPF_MEMORY_GROUP, NETDATA_CACHESTAT_HIT_CHART, + "Hits are function calls that Netdata counts.", + EBPF_CACHESTAT_DIMENSION_HITS, NETDATA_CACHESTAT_SUBMENU, + NULL, + NETDATA_EBPF_CHART_TYPE_LINE, + 21102, + ebpf_create_global_dimension, + &cachestat_counter_publish_aggregated[NETDATA_CACHESTAT_IDX_HIT], 1); + + ebpf_create_chart(NETDATA_EBPF_MEMORY_GROUP, NETDATA_CACHESTAT_MISSES_CHART, + "Misses are function calls that Netdata counts.", + EBPF_CACHESTAT_DIMENSION_MISSES, NETDATA_CACHESTAT_SUBMENU, + NULL, + NETDATA_EBPF_CHART_TYPE_LINE, + 21103, + ebpf_create_global_dimension, + &cachestat_counter_publish_aggregated[NETDATA_CACHESTAT_IDX_MISS], 1); + + fflush(stdout); +} + +/** + * Allocate vectors used with this thread. + * + * We are not testing the return, because callocz does this and shutdown the software + * case it was not possible to allocate. + * + * @param length is the length for the vectors used inside the collector. + */ +static void ebpf_cachestat_allocate_global_vectors(size_t length) +{ + cachestat_pid = callocz((size_t)pid_max, sizeof(netdata_publish_cachestat_t *)); + cachestat_vector = callocz((size_t)ebpf_nprocs, sizeof(netdata_cachestat_pid_t)); + + cachestat_hash_values = callocz(length, sizeof(netdata_idx_t)); + + memset(cachestat_counter_aggregated_data, 0, length * sizeof(netdata_syscall_stat_t)); + memset(cachestat_counter_publish_aggregated, 0, length * sizeof(netdata_publish_syscall_t)); +} + +/***************************************************************** + * + * MAIN THREAD + * + *****************************************************************/ + +/** + * Cachestat thread + * + * Thread used to make cachestat thread + * + * @param ptr a pointer to `struct ebpf_module` + * + * @return It always return NULL + */ +void *ebpf_cachestat_thread(void *ptr) +{ + netdata_thread_cleanup_push(ebpf_cachestat_cleanup, ptr); + + ebpf_module_t *em = (ebpf_module_t *)ptr; + fill_ebpf_data(&cachestat_data); + + ebpf_update_module(em, &cachestat_config, NETDATA_CACHESTAT_CONFIG_FILE); + + if (!em->enabled) + goto endcachestat; + + pthread_mutex_lock(&lock); + ebpf_cachestat_allocate_global_vectors(NETDATA_CACHESTAT_END); + if (ebpf_update_kernel(&cachestat_data)) { + pthread_mutex_unlock(&lock); + goto endcachestat; + } + + probe_links = ebpf_load_program(ebpf_plugin_dir, em, kernel_string, &objects, cachestat_data.map_fd); + if (!probe_links) { + pthread_mutex_unlock(&lock); + goto endcachestat; + } + + int algorithms[NETDATA_CACHESTAT_END] = { + NETDATA_EBPF_ABSOLUTE_IDX, NETDATA_EBPF_INCREMENTAL_IDX, NETDATA_EBPF_ABSOLUTE_IDX, NETDATA_EBPF_ABSOLUTE_IDX + }; + + ebpf_global_labels(cachestat_counter_aggregated_data, cachestat_counter_publish_aggregated, + cachestat_counter_dimension_name, cachestat_counter_dimension_name, + algorithms, NETDATA_CACHESTAT_END); + + ebpf_create_memory_charts(); + + pthread_mutex_unlock(&lock); + + cachestat_collector(em); + +endcachestat: + netdata_thread_cleanup_pop(1); + return NULL; +} diff --git a/collectors/ebpf.plugin/ebpf_cachestat.h b/collectors/ebpf.plugin/ebpf_cachestat.h new file mode 100644 index 000000000..daf678975 --- /dev/null +++ b/collectors/ebpf.plugin/ebpf_cachestat.h @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_EBPF_CACHESTAT_H +#define NETDATA_EBPF_CACHESTAT_H 1 + +// charts +#define NETDATA_CACHESTAT_HIT_RATIO_CHART "cachestat_ratio" +#define NETDATA_CACHESTAT_DIRTY_CHART "cachestat_dirties" +#define NETDATA_CACHESTAT_HIT_CHART "cachestat_hits" +#define NETDATA_CACHESTAT_MISSES_CHART "cachestat_misses" + +#define NETDATA_CACHESTAT_SUBMENU "page cache (eBPF)" + +#define EBPF_CACHESTAT_DIMENSION_PAGE "pages/s" +#define EBPF_CACHESTAT_DIMENSION_HITS "hits/s" +#define EBPF_CACHESTAT_DIMENSION_MISSES "misses/s" + +#define NETDATA_LATENCY_CACHESTAT_SLEEP_MS 600000ULL + +// configuration file +#define NETDATA_CACHESTAT_CONFIG_FILE "cachestat.conf" + +// variables +enum cachestat_counters { + NETDATA_KEY_CALLS_ADD_TO_PAGE_CACHE_LRU, + NETDATA_KEY_CALLS_MARK_PAGE_ACCESSED, + NETDATA_KEY_CALLS_ACCOUNT_PAGE_DIRTIED, + NETDATA_KEY_CALLS_MARK_BUFFER_DIRTY, + + NETDATA_CACHESTAT_END +}; + +enum cachestat_indexes { + NETDATA_CACHESTAT_IDX_RATIO, + NETDATA_CACHESTAT_IDX_DIRTY, + NETDATA_CACHESTAT_IDX_HIT, + NETDATA_CACHESTAT_IDX_MISS +}; + +enum cachesta_tables { + NETDATA_CACHESTAT_GLOBAL_STATS, + NETDATA_CACHESTAT_PID_STATS +}; + +typedef struct netdata_publish_cachestat_pid { + uint64_t add_to_page_cache_lru; + uint64_t mark_page_accessed; + uint64_t account_page_dirtied; + uint64_t mark_buffer_dirty; +} netdata_cachestat_pid_t; + +typedef struct netdata_publish_cachestat { + long long ratio; + long long dirty; + long long hit; + long long miss; + + netdata_cachestat_pid_t current; + netdata_cachestat_pid_t prev; +} netdata_publish_cachestat_t; + +extern void *ebpf_cachestat_thread(void *ptr); + +#endif // NETDATA_EBPF_CACHESTAT_H diff --git a/collectors/ebpf.plugin/ebpf_kernel_reject_list.txt b/collectors/ebpf.plugin/ebpf_kernel_reject_list.txt deleted file mode 100644 index d56b216a9..000000000 --- a/collectors/ebpf.plugin/ebpf_kernel_reject_list.txt +++ /dev/null @@ -1 +0,0 @@ -Ubuntu 4.18.0-13. diff --git a/collectors/ebpf.plugin/ebpf_process.c b/collectors/ebpf.plugin/ebpf_process.c index 27e39d1a5..5fa930b2d 100644 --- a/collectors/ebpf.plugin/ebpf_process.c +++ b/collectors/ebpf.plugin/ebpf_process.c @@ -19,8 +19,8 @@ static char *process_id_names[NETDATA_KEY_PUBLISH_PROCESS_END] = { "do_sys_open" static char *status[] = { "process", "zombie" }; static netdata_idx_t *process_hash_values = NULL; -static netdata_syscall_stat_t *process_aggregated_data = NULL; -static netdata_publish_syscall_t *process_publish_aggregated = NULL; +static netdata_syscall_stat_t process_aggregated_data[NETDATA_KEY_PUBLISH_PROCESS_END]; +static netdata_publish_syscall_t process_publish_aggregated[NETDATA_KEY_PUBLISH_PROCESS_END]; static ebpf_data_t process_data; @@ -33,6 +33,12 @@ static int *map_fd = NULL; static struct bpf_object *objects = NULL; static struct bpf_link **probe_links = NULL; +struct config process_config = { .first_section = NULL, + .last_section = NULL, + .mutex = NETDATA_MUTEX_INITIALIZER, + .index = { .avl_tree = { .root = NULL, .compar = appconfig_section_compare }, + .rwlock = AVL_LOCK_INITIALIZER } }; + /***************************************************************** * * PROCESS DATA AND SEND TO NETDATA @@ -520,6 +526,8 @@ static void ebpf_create_global_charts(ebpf_module_t *em) "Open and close calls", EBPF_COMMON_DIMENSION_CALL, NETDATA_FILE_GROUP, + NULL, + NETDATA_EBPF_CHART_TYPE_LINE, 21000, ebpf_create_global_dimension, process_publish_aggregated, @@ -531,6 +539,8 @@ static void ebpf_create_global_charts(ebpf_module_t *em) "Open fails", EBPF_COMMON_DIMENSION_CALL, NETDATA_FILE_GROUP, + NULL, + NETDATA_EBPF_CHART_TYPE_LINE, 21001, ebpf_create_global_dimension, process_publish_aggregated, @@ -542,6 +552,8 @@ static void ebpf_create_global_charts(ebpf_module_t *em) "Remove files", EBPF_COMMON_DIMENSION_CALL, NETDATA_VFS_GROUP, + NULL, + NETDATA_EBPF_CHART_TYPE_LINE, 21002, ebpf_create_global_dimension, &process_publish_aggregated[NETDATA_DEL_START], @@ -552,6 +564,8 @@ static void ebpf_create_global_charts(ebpf_module_t *em) "Calls to IO", EBPF_COMMON_DIMENSION_CALL, NETDATA_VFS_GROUP, + NULL, + NETDATA_EBPF_CHART_TYPE_LINE, 21003, ebpf_create_global_dimension, &process_publish_aggregated[NETDATA_IN_START_BYTE], @@ -569,6 +583,8 @@ static void ebpf_create_global_charts(ebpf_module_t *em) "Fails to write or read", EBPF_COMMON_DIMENSION_CALL, NETDATA_VFS_GROUP, + NULL, + NETDATA_EBPF_CHART_TYPE_LINE, 21005, ebpf_create_global_dimension, &process_publish_aggregated[2], @@ -580,6 +596,8 @@ static void ebpf_create_global_charts(ebpf_module_t *em) "Start process", EBPF_COMMON_DIMENSION_CALL, NETDATA_PROCESS_GROUP, + NULL, + NETDATA_EBPF_CHART_TYPE_LINE, 21006, ebpf_create_global_dimension, &process_publish_aggregated[NETDATA_PROCESS_START], @@ -590,6 +608,8 @@ static void ebpf_create_global_charts(ebpf_module_t *em) "Exit process", EBPF_COMMON_DIMENSION_CALL, NETDATA_PROCESS_GROUP, + NULL, + NETDATA_EBPF_CHART_TYPE_LINE, 21007, ebpf_create_global_dimension, &process_publish_aggregated[NETDATA_EXIT_START], @@ -608,6 +628,8 @@ static void ebpf_create_global_charts(ebpf_module_t *em) "Fails to create process", EBPF_COMMON_DIMENSION_CALL, NETDATA_PROCESS_GROUP, + NULL, + NETDATA_EBPF_CHART_TYPE_LINE, 21009, ebpf_create_global_dimension, &process_publish_aggregated[NETDATA_PROCESS_START], @@ -621,14 +643,16 @@ static void ebpf_create_global_charts(ebpf_module_t *em) * Call ebpf_create_chart to create the charts on apps submenu. * * @param em a pointer to the structure with the default values. - * @param root a pointer for the targets. + * @param ptr a pointer for the targets. */ -static void ebpf_process_create_apps_charts(ebpf_module_t *em, struct target *root) +void ebpf_process_create_apps_charts(struct ebpf_module *em, void *ptr) { + struct target *root = ptr; ebpf_create_charts_on_apps(NETDATA_SYSCALL_APPS_FILE_OPEN, "Number of open files", EBPF_COMMON_DIMENSION_CALL, NETDATA_APPS_FILE_GROUP, + NETDATA_EBPF_CHART_TYPE_STACKED, 20061, ebpf_algorithms[NETDATA_EBPF_INCREMENTAL_IDX], root); @@ -638,6 +662,7 @@ static void ebpf_process_create_apps_charts(ebpf_module_t *em, struct target *ro "Fails to open files", EBPF_COMMON_DIMENSION_CALL, NETDATA_APPS_FILE_GROUP, + NETDATA_EBPF_CHART_TYPE_STACKED, 20062, ebpf_algorithms[NETDATA_EBPF_INCREMENTAL_IDX], root); @@ -647,6 +672,7 @@ static void ebpf_process_create_apps_charts(ebpf_module_t *em, struct target *ro "Files closed", EBPF_COMMON_DIMENSION_CALL, NETDATA_APPS_FILE_GROUP, + NETDATA_EBPF_CHART_TYPE_STACKED, 20063, ebpf_algorithms[NETDATA_EBPF_INCREMENTAL_IDX], root); @@ -656,6 +682,7 @@ static void ebpf_process_create_apps_charts(ebpf_module_t *em, struct target *ro "Fails to close files", EBPF_COMMON_DIMENSION_CALL, NETDATA_APPS_FILE_GROUP, + NETDATA_EBPF_CHART_TYPE_STACKED, 20064, ebpf_algorithms[NETDATA_EBPF_INCREMENTAL_IDX], root); @@ -665,6 +692,7 @@ static void ebpf_process_create_apps_charts(ebpf_module_t *em, struct target *ro "Files deleted", EBPF_COMMON_DIMENSION_CALL, NETDATA_APPS_VFS_GROUP, + NETDATA_EBPF_CHART_TYPE_STACKED, 20065, ebpf_algorithms[NETDATA_EBPF_INCREMENTAL_IDX], root); @@ -673,6 +701,7 @@ static void ebpf_process_create_apps_charts(ebpf_module_t *em, struct target *ro "Write to disk", EBPF_COMMON_DIMENSION_CALL, NETDATA_APPS_VFS_GROUP, + NETDATA_EBPF_CHART_TYPE_STACKED, 20066, ebpf_algorithms[NETDATA_EBPF_INCREMENTAL_IDX], apps_groups_root_target); @@ -682,6 +711,7 @@ static void ebpf_process_create_apps_charts(ebpf_module_t *em, struct target *ro "Fails to write", EBPF_COMMON_DIMENSION_CALL, NETDATA_APPS_VFS_GROUP, + NETDATA_EBPF_CHART_TYPE_STACKED, 20067, ebpf_algorithms[NETDATA_EBPF_INCREMENTAL_IDX], root); @@ -691,6 +721,7 @@ static void ebpf_process_create_apps_charts(ebpf_module_t *em, struct target *ro "Read from disk", EBPF_COMMON_DIMENSION_CALL, NETDATA_APPS_VFS_GROUP, + NETDATA_EBPF_CHART_TYPE_STACKED, 20068, ebpf_algorithms[NETDATA_EBPF_INCREMENTAL_IDX], root); @@ -700,6 +731,7 @@ static void ebpf_process_create_apps_charts(ebpf_module_t *em, struct target *ro "Fails to read", EBPF_COMMON_DIMENSION_CALL, NETDATA_APPS_VFS_GROUP, + NETDATA_EBPF_CHART_TYPE_STACKED, 20069, ebpf_algorithms[NETDATA_EBPF_INCREMENTAL_IDX], root); @@ -708,6 +740,7 @@ static void ebpf_process_create_apps_charts(ebpf_module_t *em, struct target *ro ebpf_create_charts_on_apps(NETDATA_SYSCALL_APPS_VFS_WRITE_BYTES, "Bytes written on disk", EBPF_COMMON_DIMENSION_BYTES, NETDATA_APPS_VFS_GROUP, + NETDATA_EBPF_CHART_TYPE_STACKED, 20070, ebpf_algorithms[NETDATA_EBPF_INCREMENTAL_IDX], root); @@ -715,6 +748,7 @@ static void ebpf_process_create_apps_charts(ebpf_module_t *em, struct target *ro ebpf_create_charts_on_apps(NETDATA_SYSCALL_APPS_VFS_READ_BYTES, "Bytes read from disk", EBPF_COMMON_DIMENSION_BYTES, NETDATA_APPS_VFS_GROUP, + NETDATA_EBPF_CHART_TYPE_STACKED, 20071, ebpf_algorithms[NETDATA_EBPF_INCREMENTAL_IDX], root); @@ -723,6 +757,7 @@ static void ebpf_process_create_apps_charts(ebpf_module_t *em, struct target *ro "Process started", EBPF_COMMON_DIMENSION_CALL, NETDATA_APPS_PROCESS_GROUP, + NETDATA_EBPF_CHART_TYPE_STACKED, 20072, ebpf_algorithms[NETDATA_EBPF_ABSOLUTE_IDX], root); @@ -731,6 +766,7 @@ static void ebpf_process_create_apps_charts(ebpf_module_t *em, struct target *ro "Threads started", EBPF_COMMON_DIMENSION_CALL, NETDATA_APPS_PROCESS_GROUP, + NETDATA_EBPF_CHART_TYPE_STACKED, 20073, ebpf_algorithms[NETDATA_EBPF_ABSOLUTE_IDX], root); @@ -739,6 +775,7 @@ static void ebpf_process_create_apps_charts(ebpf_module_t *em, struct target *ro "Tasks closed", EBPF_COMMON_DIMENSION_CALL, NETDATA_APPS_PROCESS_GROUP, + NETDATA_EBPF_CHART_TYPE_STACKED, 20074, ebpf_algorithms[NETDATA_EBPF_ABSOLUTE_IDX], root); @@ -786,11 +823,12 @@ static void ebpf_create_apps_charts(ebpf_module_t *em, struct target *root) if (!newly_added) return; - if (ebpf_modules[EBPF_MODULE_PROCESS_IDX].apps_charts) - ebpf_process_create_apps_charts(em, root); - - if (ebpf_modules[EBPF_MODULE_SOCKET_IDX].apps_charts) - ebpf_socket_create_apps_charts(NULL, root); + int counter; + for (counter = 0; ebpf_modules[counter].thread_name; counter++) { + ebpf_module_t *current = &ebpf_modules[counter]; + if (current->enabled && current->apps_charts && current->apps_routine) + current->apps_routine(em, root); + } } /***************************************************************** @@ -904,9 +942,7 @@ static void ebpf_process_cleanup(void *ptr) UNUSED(dt); } - freez(process_aggregated_data); ebpf_cleanup_publish_syscall(process_publish_aggregated); - freez(process_publish_aggregated); freez(process_hash_values); clean_global_memory(); @@ -940,8 +976,8 @@ static void ebpf_process_cleanup(void *ptr) */ static void ebpf_process_allocate_global_vectors(size_t length) { - process_aggregated_data = callocz(length, sizeof(netdata_syscall_stat_t)); - process_publish_aggregated = callocz(length, sizeof(netdata_publish_syscall_t)); + memset(process_aggregated_data, 0, length * sizeof(netdata_syscall_stat_t)); + memset(process_publish_aggregated, 0, length * sizeof(netdata_publish_syscall_t)); process_hash_values = callocz(ebpf_nprocs, sizeof(netdata_idx_t)); global_process_stats = callocz((size_t)pid_max, sizeof(ebpf_process_stat_t *)); @@ -1018,13 +1054,15 @@ void *ebpf_process_thread(void *ptr) fill_ebpf_data(&process_data); pthread_mutex_lock(&lock); - ebpf_process_allocate_global_vectors(NETDATA_MAX_MONITOR_VECTOR); + ebpf_process_allocate_global_vectors(NETDATA_KEY_PUBLISH_PROCESS_END); if (ebpf_update_kernel(&process_data)) { pthread_mutex_unlock(&lock); goto endprocess; } + ebpf_update_module(em, &process_config, NETDATA_PROCESS_CONFIG_FILE); + set_local_pointers(); probe_links = ebpf_load_program(ebpf_plugin_dir, em, kernel_string, &objects, process_data.map_fd); if (!probe_links) { @@ -1040,7 +1078,7 @@ void *ebpf_process_thread(void *ptr) ebpf_global_labels( process_aggregated_data, process_publish_aggregated, process_dimension_names, process_id_names, - algorithms, NETDATA_MAX_MONITOR_VECTOR); + algorithms, NETDATA_KEY_PUBLISH_PROCESS_END); if (process_enabled) { ebpf_create_global_charts(em); diff --git a/collectors/ebpf.plugin/ebpf_process.h b/collectors/ebpf.plugin/ebpf_process.h index aa6ed66d6..a731227e1 100644 --- a/collectors/ebpf.plugin/ebpf_process.h +++ b/collectors/ebpf.plugin/ebpf_process.h @@ -10,7 +10,6 @@ // Internal constants #define NETDATA_GLOBAL_VECTOR 24 -#define NETDATA_MAX_MONITOR_VECTOR 9 #define NETDATA_VFS_ERRORS 3 // Map index @@ -52,6 +51,9 @@ #define NETDATA_SYSCALL_APPS_VFS_WRITE_CALLS_ERROR "vfs_write_error" #define NETDATA_SYSCALL_APPS_VFS_READ_CALLS_ERROR "vfs_read_error" +// Process configuration name +#define NETDATA_PROCESS_CONFIG_FILE "process.conf" + // Index from kernel typedef enum ebpf_process_index { NETDATA_KEY_CALLS_DO_SYS_OPEN, diff --git a/collectors/ebpf.plugin/ebpf_socket.c b/collectors/ebpf.plugin/ebpf_socket.c index 7fbc24421..a142d43b3 100644 --- a/collectors/ebpf.plugin/ebpf_socket.c +++ b/collectors/ebpf.plugin/ebpf_socket.c @@ -17,8 +17,8 @@ static char *socket_id_names[NETDATA_MAX_SOCKET_VECTOR] = { "tcp_sendmsg", "tcp_ "udp_sendmsg", "udp_recvmsg", "tcp_retransmit_skb" }; static netdata_idx_t *socket_hash_values = NULL; -static netdata_syscall_stat_t *socket_aggregated_data = NULL; -static netdata_publish_syscall_t *socket_publish_aggregated = NULL; +static netdata_syscall_stat_t socket_aggregated_data[NETDATA_MAX_SOCKET_VECTOR]; +static netdata_publish_syscall_t socket_publish_aggregated[NETDATA_MAX_SOCKET_VECTOR]; static ebpf_data_t socket_data; @@ -40,6 +40,12 @@ static int *map_fd = NULL; static struct bpf_object *objects = NULL; static struct bpf_link **probe_links = NULL; +struct config socket_config = { .first_section = NULL, + .last_section = NULL, + .mutex = NETDATA_MUTEX_INITIALIZER, + .index = { .avl_tree = { .root = NULL, .compar = appconfig_section_compare }, + .rwlock = AVL_LOCK_INITIALIZER } }; + /***************************************************************** * * PROCESS DATA AND SEND TO NETDATA @@ -279,17 +285,20 @@ static void ebpf_socket_send_data(ebpf_module_t *em) NETDATA_TCP_FUNCTION_ERROR, NETDATA_EBPF_FAMILY, socket_publish_aggregated, 2); } write_count_chart( - NETDATA_TCP_RETRANSMIT, NETDATA_EBPF_FAMILY, &socket_publish_aggregated[NETDATA_RETRANSMIT_START], 1); + NETDATA_TCP_RETRANSMIT, NETDATA_EBPF_FAMILY, &socket_publish_aggregated[NETDATA_IDX_TCP_RETRANSMIT], + 1); write_count_chart( - NETDATA_UDP_FUNCTION_COUNT, NETDATA_EBPF_FAMILY, &socket_publish_aggregated[NETDATA_UDP_START], 2); + NETDATA_UDP_FUNCTION_COUNT, NETDATA_EBPF_FAMILY, &socket_publish_aggregated[NETDATA_IDX_UDP_RECVBUF], + 2); write_io_chart( NETDATA_UDP_FUNCTION_BITS, NETDATA_EBPF_FAMILY, socket_id_names[3],(long long)common_udp.write*8/100, socket_id_names[4], (long long)common_udp.read*8/1000); if (em->mode < MODE_ENTRY) { write_err_chart( - NETDATA_UDP_FUNCTION_ERROR, NETDATA_EBPF_FAMILY, &socket_publish_aggregated[NETDATA_UDP_START], 2); + NETDATA_UDP_FUNCTION_ERROR, NETDATA_EBPF_FAMILY, &socket_publish_aggregated[NETDATA_UDP_START], + 2); } } @@ -427,6 +436,8 @@ static void ebpf_create_global_charts(ebpf_module_t *em) "Calls to internal functions", EBPF_COMMON_DIMENSION_CALL, NETDATA_SOCKET_GROUP, + NULL, + NETDATA_EBPF_CHART_TYPE_LINE, 21070, ebpf_create_global_dimension, socket_publish_aggregated, @@ -435,6 +446,8 @@ static void ebpf_create_global_charts(ebpf_module_t *em) ebpf_create_chart(NETDATA_EBPF_FAMILY, NETDATA_TCP_FUNCTION_BITS, "TCP bandwidth", EBPF_COMMON_DIMENSION_BITS, NETDATA_SOCKET_GROUP, + NULL, + NETDATA_EBPF_CHART_TYPE_LINE, 21071, ebpf_create_global_dimension, socket_publish_aggregated, @@ -446,6 +459,8 @@ static void ebpf_create_global_charts(ebpf_module_t *em) "TCP errors", EBPF_COMMON_DIMENSION_CALL, NETDATA_SOCKET_GROUP, + NULL, + NETDATA_EBPF_CHART_TYPE_LINE, 21072, ebpf_create_global_dimension, socket_publish_aggregated, @@ -457,9 +472,11 @@ static void ebpf_create_global_charts(ebpf_module_t *em) "Packages retransmitted", EBPF_COMMON_DIMENSION_CALL, NETDATA_SOCKET_GROUP, + NULL, + NETDATA_EBPF_CHART_TYPE_LINE, 21073, ebpf_create_global_dimension, - &socket_publish_aggregated[NETDATA_RETRANSMIT_START], + &socket_publish_aggregated[NETDATA_IDX_TCP_RETRANSMIT], 1); ebpf_create_chart(NETDATA_EBPF_FAMILY, @@ -467,17 +484,21 @@ static void ebpf_create_global_charts(ebpf_module_t *em) "UDP calls", EBPF_COMMON_DIMENSION_CALL, NETDATA_SOCKET_GROUP, + NULL, + NETDATA_EBPF_CHART_TYPE_LINE, 21074, ebpf_create_global_dimension, - &socket_publish_aggregated[NETDATA_UDP_START], + &socket_publish_aggregated[NETDATA_IDX_UDP_RECVBUF], 2); ebpf_create_chart(NETDATA_EBPF_FAMILY, NETDATA_UDP_FUNCTION_BITS, "UDP bandwidth", EBPF_COMMON_DIMENSION_BITS, NETDATA_SOCKET_GROUP, + NULL, + NETDATA_EBPF_CHART_TYPE_LINE, 21075, ebpf_create_global_dimension, - &socket_publish_aggregated[NETDATA_UDP_START], + &socket_publish_aggregated[NETDATA_IDX_UDP_RECVBUF], 2); if (em->mode < MODE_ENTRY) { @@ -486,9 +507,11 @@ static void ebpf_create_global_charts(ebpf_module_t *em) "UDP errors", EBPF_COMMON_DIMENSION_CALL, NETDATA_SOCKET_GROUP, + NULL, + NETDATA_EBPF_CHART_TYPE_LINE, 21076, ebpf_create_global_dimension, - &socket_publish_aggregated[NETDATA_UDP_START], + &socket_publish_aggregated[NETDATA_IDX_UDP_RECVBUF], 2); } } @@ -498,14 +521,17 @@ static void ebpf_create_global_charts(ebpf_module_t *em) * * Call ebpf_create_chart to create the charts on apps submenu. * - * @param em a pointer to the structure with the default values. + * @param em a pointer to the structure with the default values. + * @param ptr a pointer for targets */ -void ebpf_socket_create_apps_charts(ebpf_module_t *em, struct target *root) +void ebpf_socket_create_apps_charts(struct ebpf_module *em, void *ptr) { UNUSED(em); + struct target *root = ptr;; ebpf_create_charts_on_apps(NETDATA_NET_APPS_BANDWIDTH_SENT, "Bytes sent", EBPF_COMMON_DIMENSION_BITS, NETDATA_APPS_NET_GROUP, + NETDATA_EBPF_CHART_TYPE_STACKED, 20080, ebpf_algorithms[NETDATA_EBPF_INCREMENTAL_IDX], root); @@ -513,6 +539,7 @@ void ebpf_socket_create_apps_charts(ebpf_module_t *em, struct target *root) ebpf_create_charts_on_apps(NETDATA_NET_APPS_BANDWIDTH_RECV, "bytes received", EBPF_COMMON_DIMENSION_BITS, NETDATA_APPS_NET_GROUP, + NETDATA_EBPF_CHART_TYPE_STACKED, 20081, ebpf_algorithms[NETDATA_EBPF_INCREMENTAL_IDX], root); @@ -521,6 +548,7 @@ void ebpf_socket_create_apps_charts(ebpf_module_t *em, struct target *root) "Calls for tcp_sendmsg", EBPF_COMMON_DIMENSION_CALL, NETDATA_APPS_NET_GROUP, + NETDATA_EBPF_CHART_TYPE_STACKED, 20082, ebpf_algorithms[NETDATA_EBPF_INCREMENTAL_IDX], root); @@ -529,6 +557,7 @@ void ebpf_socket_create_apps_charts(ebpf_module_t *em, struct target *root) "Calls for tcp_cleanup_rbuf", EBPF_COMMON_DIMENSION_CALL, NETDATA_APPS_NET_GROUP, + NETDATA_EBPF_CHART_TYPE_STACKED, 20083, ebpf_algorithms[NETDATA_EBPF_INCREMENTAL_IDX], root); @@ -537,6 +566,7 @@ void ebpf_socket_create_apps_charts(ebpf_module_t *em, struct target *root) "Calls for tcp_retransmit", EBPF_COMMON_DIMENSION_CALL, NETDATA_APPS_NET_GROUP, + NETDATA_EBPF_CHART_TYPE_STACKED, 20084, ebpf_algorithms[NETDATA_EBPF_INCREMENTAL_IDX], root); @@ -545,6 +575,7 @@ void ebpf_socket_create_apps_charts(ebpf_module_t *em, struct target *root) "Calls for udp_sendmsg", EBPF_COMMON_DIMENSION_CALL, NETDATA_APPS_NET_GROUP, + NETDATA_EBPF_CHART_TYPE_STACKED, 20085, ebpf_algorithms[NETDATA_EBPF_INCREMENTAL_IDX], root); @@ -553,6 +584,7 @@ void ebpf_socket_create_apps_charts(ebpf_module_t *em, struct target *root) "Calls for udp_recvmsg", EBPF_COMMON_DIMENSION_CALL, NETDATA_APPS_NET_GROUP, + NETDATA_EBPF_CHART_TYPE_STACKED, 20086, ebpf_algorithms[NETDATA_EBPF_INCREMENTAL_IDX], root); @@ -580,7 +612,8 @@ static void ebpf_socket_create_nv_chart(char *id, char *title, char *units, title, units, family, - "stacked", + NETDATA_EBPF_CHART_TYPE_STACKED, + NULL, order); uint32_t i; @@ -616,7 +649,8 @@ static void ebpf_socket_create_nv_retransmit(char *id, char *title, char *units, title, units, family, - "stacked", + NETDATA_EBPF_CHART_TYPE_STACKED, + NULL, order); uint32_t i; @@ -1137,7 +1171,7 @@ static void store_socket_inside_avl(netdata_vector_plot_t *out, netdata_socket_t memcpy(&test.index, lindex, sizeof(netdata_socket_idx_t)); test.flags = flags; - ret = (netdata_socket_plot_t *) avl_search_lock(&out->tree, (avl *)&test); + ret = (netdata_socket_plot_t *) avl_search_lock(&out->tree, (avl_t *)&test); if (ret) { if (lvalues->ct > ret->plot.last_time) { update_socket_data(&ret->sock, lvalues); @@ -1175,7 +1209,7 @@ static void store_socket_inside_avl(netdata_vector_plot_t *out, netdata_socket_t w->flags = flags; netdata_socket_plot_t *check ; - check = (netdata_socket_plot_t *) avl_insert_lock(&out->tree, (avl *)w); + check = (netdata_socket_plot_t *) avl_insert_lock(&out->tree, (avl_t *)w); if (check != w) error("Internal error, cannot insert the AVL tree."); @@ -1427,7 +1461,7 @@ void *ebpf_socket_read_hash(void *ptr) read_thread_closed = 0; heartbeat_t hb; heartbeat_init(&hb); - usec_t step = NETDATA_SOCKET_READ_SLEEP_MS; + usec_t step = NETDATA_SOCKET_READ_SLEEP_MS * em->update_time; int fd_ipv4 = map_fd[NETDATA_SOCKET_IPV4_HASH_TABLE]; int fd_ipv6 = map_fd[NETDATA_SOCKET_IPV6_HASH_TABLE]; int network_connection = em->optional; @@ -1471,22 +1505,22 @@ static void read_hash_global_tables() } } - socket_aggregated_data[0].call = res[NETDATA_KEY_CALLS_TCP_SENDMSG]; - socket_aggregated_data[1].call = res[NETDATA_KEY_CALLS_TCP_CLEANUP_RBUF]; - socket_aggregated_data[2].call = res[NETDATA_KEY_CALLS_TCP_CLOSE]; - socket_aggregated_data[3].call = res[NETDATA_KEY_CALLS_UDP_RECVMSG]; - socket_aggregated_data[4].call = res[NETDATA_KEY_CALLS_UDP_SENDMSG]; - socket_aggregated_data[5].call = res[NETDATA_KEY_TCP_RETRANSMIT]; - - socket_aggregated_data[0].ecall = res[NETDATA_KEY_ERROR_TCP_SENDMSG]; - socket_aggregated_data[1].ecall = res[NETDATA_KEY_ERROR_TCP_CLEANUP_RBUF]; - socket_aggregated_data[3].ecall = res[NETDATA_KEY_ERROR_UDP_RECVMSG]; - socket_aggregated_data[4].ecall = res[NETDATA_KEY_ERROR_UDP_SENDMSG]; - - socket_aggregated_data[0].bytes = res[NETDATA_KEY_BYTES_TCP_SENDMSG]; - socket_aggregated_data[1].bytes = res[NETDATA_KEY_BYTES_TCP_CLEANUP_RBUF]; - socket_aggregated_data[3].bytes = res[NETDATA_KEY_BYTES_UDP_RECVMSG]; - socket_aggregated_data[4].bytes = res[NETDATA_KEY_BYTES_UDP_SENDMSG]; + socket_aggregated_data[NETDATA_IDX_TCP_SENDMSG].call = res[NETDATA_KEY_CALLS_TCP_SENDMSG]; + socket_aggregated_data[NETDATA_IDX_TCP_CLEANUP_RBUF].call = res[NETDATA_KEY_CALLS_TCP_CLEANUP_RBUF]; + socket_aggregated_data[NETDATA_IDX_TCP_CLOSE].call = res[NETDATA_KEY_CALLS_TCP_CLOSE]; + socket_aggregated_data[NETDATA_IDX_UDP_RECVBUF].call = res[NETDATA_KEY_CALLS_UDP_RECVMSG]; + socket_aggregated_data[NETDATA_IDX_UDP_SENDMSG].call = res[NETDATA_KEY_CALLS_UDP_SENDMSG]; + socket_aggregated_data[NETDATA_IDX_TCP_RETRANSMIT].call = res[NETDATA_KEY_TCP_RETRANSMIT]; + + socket_aggregated_data[NETDATA_IDX_TCP_SENDMSG].ecall = res[NETDATA_KEY_ERROR_TCP_SENDMSG]; + socket_aggregated_data[NETDATA_IDX_TCP_CLEANUP_RBUF].ecall = res[NETDATA_KEY_ERROR_TCP_CLEANUP_RBUF]; + socket_aggregated_data[NETDATA_IDX_UDP_RECVBUF].ecall = res[NETDATA_KEY_ERROR_UDP_RECVMSG]; + socket_aggregated_data[NETDATA_IDX_UDP_SENDMSG].ecall = res[NETDATA_KEY_ERROR_UDP_SENDMSG]; + + socket_aggregated_data[NETDATA_IDX_TCP_SENDMSG].bytes = res[NETDATA_KEY_BYTES_TCP_SENDMSG]; + socket_aggregated_data[NETDATA_IDX_TCP_CLEANUP_RBUF].bytes = res[NETDATA_KEY_BYTES_TCP_CLEANUP_RBUF]; + socket_aggregated_data[NETDATA_IDX_UDP_RECVBUF].bytes = res[NETDATA_KEY_BYTES_UDP_RECVMSG]; + socket_aggregated_data[NETDATA_IDX_UDP_SENDMSG].bytes = res[NETDATA_KEY_BYTES_UDP_SENDMSG]; } /** @@ -1744,6 +1778,59 @@ void clean_thread_structures() { } } +/** + * Cleanup publish syscall + * + * @param nps list of structures to clean + */ +void ebpf_cleanup_publish_syscall(netdata_publish_syscall_t *nps) +{ + while (nps) { + freez(nps->algorithm); + nps = nps->next; + } +} + +/** + * Clean port Structure + * + * Clean the allocated list. + * + * @param clean the list that will be cleaned + */ +void clean_port_structure(ebpf_network_viewer_port_list_t **clean) +{ + ebpf_network_viewer_port_list_t *move = *clean; + while (move) { + ebpf_network_viewer_port_list_t *next = move->next; + freez(move->value); + freez(move); + + move = next; + } + *clean = NULL; +} + +/** + * Clean IP structure + * + * Clean the allocated list. + * + * @param clean the list that will be cleaned + */ +static void clean_ip_structure(ebpf_network_viewer_ip_list_t **clean) +{ + ebpf_network_viewer_ip_list_t *move = *clean; + while (move) { + ebpf_network_viewer_ip_list_t *next = move->next; + freez(move->value); + freez(move); + + move = next; + } + *clean = NULL; +} + /** * Clean up the main thread. * @@ -1763,9 +1850,7 @@ static void ebpf_socket_cleanup(void *ptr) UNUSED(dt); } - freez(socket_aggregated_data); ebpf_cleanup_publish_syscall(socket_publish_aggregated); - freez(socket_publish_aggregated); freez(socket_hash_values); clean_thread_structures(); @@ -1817,8 +1902,8 @@ static void ebpf_socket_cleanup(void *ptr) */ static void ebpf_socket_allocate_global_vectors(size_t length) { - socket_aggregated_data = callocz(length, sizeof(netdata_syscall_stat_t)); - socket_publish_aggregated = callocz(length, sizeof(netdata_publish_syscall_t)); + memset(socket_aggregated_data, 0 ,length * sizeof(netdata_syscall_stat_t)); + memset(socket_publish_aggregated, 0 ,length * sizeof(netdata_publish_syscall_t)); socket_hash_values = callocz(ebpf_nprocs, sizeof(netdata_idx_t)); socket_bandwidth_curr = callocz((size_t)pid_max, sizeof(ebpf_socket_publish_apps_t *)); @@ -1856,6 +1941,874 @@ static void initialize_inbound_outbound() * *****************************************************************/ +/** + * Fill Port list + * + * @param out a pointer to the link list. + * @param in the structure that will be linked. + */ +static inline void fill_port_list(ebpf_network_viewer_port_list_t **out, ebpf_network_viewer_port_list_t *in) +{ + if (likely(*out)) { + ebpf_network_viewer_port_list_t *move = *out, *store = *out; + uint16_t first = ntohs(in->first); + uint16_t last = ntohs(in->last); + while (move) { + uint16_t cmp_first = ntohs(move->first); + uint16_t cmp_last = ntohs(move->last); + if (cmp_first <= first && first <= cmp_last && + cmp_first <= last && last <= cmp_last ) { + info("The range/value (%u, %u) is inside the range/value (%u, %u) already inserted, it will be ignored.", + first, last, cmp_first, cmp_last); + freez(in->value); + freez(in); + return; + } else if (first <= cmp_first && cmp_first <= last && + first <= cmp_last && cmp_last <= last) { + info("The range (%u, %u) is bigger than previous range (%u, %u) already inserted, the previous will be ignored.", + first, last, cmp_first, cmp_last); + freez(move->value); + move->value = in->value; + move->first = in->first; + move->last = in->last; + freez(in); + return; + } + + store = move; + move = move->next; + } + + store->next = in; + } else { + *out = in; + } + +#ifdef NETDATA_INTERNAL_CHECKS + info("Adding values %s( %u, %u) to %s port list used on network viewer", + in->value, ntohs(in->first), ntohs(in->last), + (*out == network_viewer_opt.included_port)?"included":"excluded"); +#endif +} + +/** + * Parse Service List + * + * @param out a pointer to store the link list + * @param service the service used to create the structure that will be linked. + */ +static void parse_service_list(void **out, char *service) +{ + ebpf_network_viewer_port_list_t **list = (ebpf_network_viewer_port_list_t **)out; + struct servent *serv = getservbyname((const char *)service, "tcp"); + if (!serv) + serv = getservbyname((const char *)service, "udp"); + + if (!serv) { + info("Cannot resolv the service '%s' with protocols TCP and UDP, it will be ignored", service); + return; + } + + ebpf_network_viewer_port_list_t *w = callocz(1, sizeof(ebpf_network_viewer_port_list_t)); + w->value = strdupz(service); + w->hash = simple_hash(service); + + w->first = w->last = (uint16_t)serv->s_port; + + fill_port_list(list, w); +} + +/** + * Netmask + * + * Copied from iprange (https://github.com/firehol/iprange/blob/master/iprange.h) + * + * @param prefix create the netmask based in the CIDR value. + * + * @return + */ +static inline in_addr_t netmask(int prefix) { + + if (prefix == 0) + return (~((in_addr_t) - 1)); + else + return (in_addr_t)(~((1 << (32 - prefix)) - 1)); + +} + +/** + * Broadcast + * + * Copied from iprange (https://github.com/firehol/iprange/blob/master/iprange.h) + * + * @param addr is the ip address + * @param prefix is the CIDR value. + * + * @return It returns the last address of the range + */ +static inline in_addr_t broadcast(in_addr_t addr, int prefix) +{ + return (addr | ~netmask(prefix)); +} + +/** + * Network + * + * Copied from iprange (https://github.com/firehol/iprange/blob/master/iprange.h) + * + * @param addr is the ip address + * @param prefix is the CIDR value. + * + * @return It returns the first address of the range. + */ +static inline in_addr_t ipv4_network(in_addr_t addr, int prefix) +{ + return (addr & netmask(prefix)); +} + +/** + * IP to network long + * + * @param dst the vector to store the result + * @param ip the source ip given by our users. + * @param domain the ip domain (IPV4 or IPV6) + * @param source the original string + * + * @return it returns 0 on success and -1 otherwise. + */ +static inline int ip2nl(uint8_t *dst, char *ip, int domain, char *source) +{ + if (inet_pton(domain, ip, dst) <= 0) { + error("The address specified (%s) is invalid ", source); + return -1; + } + + return 0; +} + +/** + * Get IPV6 Last Address + * + * @param out the address to store the last address. + * @param in the address used to do the math. + * @param prefix number of bits used to calculate the address + */ +static void get_ipv6_last_addr(union netdata_ip_t *out, union netdata_ip_t *in, uint64_t prefix) +{ + uint64_t mask,tmp; + uint64_t ret[2]; + memcpy(ret, in->addr32, sizeof(union netdata_ip_t)); + + if (prefix == 128) { + memcpy(out->addr32, in->addr32, sizeof(union netdata_ip_t)); + return; + } else if (!prefix) { + ret[0] = ret[1] = 0xFFFFFFFFFFFFFFFF; + memcpy(out->addr32, ret, sizeof(union netdata_ip_t)); + return; + } else if (prefix <= 64) { + ret[1] = 0xFFFFFFFFFFFFFFFFULL; + + tmp = be64toh(ret[0]); + if (prefix > 0) { + mask = 0xFFFFFFFFFFFFFFFFULL << (64 - prefix); + tmp |= ~mask; + } + ret[0] = htobe64(tmp); + } else { + mask = 0xFFFFFFFFFFFFFFFFULL << (128 - prefix); + tmp = be64toh(ret[1]); + tmp |= ~mask; + ret[1] = htobe64(tmp); + } + + memcpy(out->addr32, ret, sizeof(union netdata_ip_t)); +} + +/** + * Calculate ipv6 first address + * + * @param out the address to store the first address. + * @param in the address used to do the math. + * @param prefix number of bits used to calculate the address + */ +static void get_ipv6_first_addr(union netdata_ip_t *out, union netdata_ip_t *in, uint64_t prefix) +{ + uint64_t mask,tmp; + uint64_t ret[2]; + + memcpy(ret, in->addr32, sizeof(union netdata_ip_t)); + + if (prefix == 128) { + memcpy(out->addr32, in->addr32, sizeof(union netdata_ip_t)); + return; + } else if (!prefix) { + ret[0] = ret[1] = 0; + memcpy(out->addr32, ret, sizeof(union netdata_ip_t)); + return; + } else if (prefix <= 64) { + ret[1] = 0ULL; + + tmp = be64toh(ret[0]); + if (prefix > 0) { + mask = 0xFFFFFFFFFFFFFFFFULL << (64 - prefix); + tmp &= mask; + } + ret[0] = htobe64(tmp); + } else { + mask = 0xFFFFFFFFFFFFFFFFULL << (128 - prefix); + tmp = be64toh(ret[1]); + tmp &= mask; + ret[1] = htobe64(tmp); + } + + memcpy(out->addr32, ret, sizeof(union netdata_ip_t)); +} + +/** + * Is ip inside the range + * + * Check if the ip is inside a IP range + * + * @param rfirst the first ip address of the range + * @param rlast the last ip address of the range + * @param cmpfirst the first ip to compare + * @param cmplast the last ip to compare + * @param family the IP family + * + * @return It returns 1 if the IP is inside the range and 0 otherwise + */ +static int is_ip_inside_range(union netdata_ip_t *rfirst, union netdata_ip_t *rlast, + union netdata_ip_t *cmpfirst, union netdata_ip_t *cmplast, int family) +{ + if (family == AF_INET) { + if (ntohl(rfirst->addr32[0]) <= ntohl(cmpfirst->addr32[0]) && + ntohl(rlast->addr32[0]) >= ntohl(cmplast->addr32[0])) + return 1; + } else { + if (memcmp(rfirst->addr8, cmpfirst->addr8, sizeof(union netdata_ip_t)) <= 0 && + memcmp(rlast->addr8, cmplast->addr8, sizeof(union netdata_ip_t)) >= 0) { + return 1; + } + + } + return 0; +} + +/** + * Fill IP list + * + * @param out a pointer to the link list. + * @param in the structure that will be linked. + */ +void fill_ip_list(ebpf_network_viewer_ip_list_t **out, ebpf_network_viewer_ip_list_t *in, char *table) +{ +#ifndef NETDATA_INTERNAL_CHECKS + UNUSED(table); +#endif + if (likely(*out)) { + ebpf_network_viewer_ip_list_t *move = *out, *store = *out; + while (move) { + if (in->ver == move->ver && is_ip_inside_range(&move->first, &move->last, &in->first, &in->last, in->ver)) { + info("The range/value (%s) is inside the range/value (%s) already inserted, it will be ignored.", + in->value, move->value); + freez(in->value); + freez(in); + return; + } + store = move; + move = move->next; + } + + store->next = in; + } else { + *out = in; + } + +#ifdef NETDATA_INTERNAL_CHECKS + char first[512], last[512]; + if (in->ver == AF_INET) { + if (inet_ntop(AF_INET, in->first.addr8, first, INET_ADDRSTRLEN) && + inet_ntop(AF_INET, in->last.addr8, last, INET_ADDRSTRLEN)) + info("Adding values %s - %s to %s IP list \"%s\" used on network viewer", + first, last, + (*out == network_viewer_opt.included_ips)?"included":"excluded", + table); + } else { + if (inet_ntop(AF_INET6, in->first.addr8, first, INET6_ADDRSTRLEN) && + inet_ntop(AF_INET6, in->last.addr8, last, INET6_ADDRSTRLEN)) + info("Adding values %s - %s to %s IP list \"%s\" used on network viewer", + first, last, + (*out == network_viewer_opt.included_ips)?"included":"excluded", + table); + } +#endif +} + +/** + * Parse IP List + * + * Parse IP list and link it. + * + * @param out a pointer to store the link list + * @param ip the value given as parameter + */ +static void parse_ip_list(void **out, char *ip) +{ + ebpf_network_viewer_ip_list_t **list = (ebpf_network_viewer_ip_list_t **)out; + + char *ipdup = strdupz(ip); + union netdata_ip_t first = { }; + union netdata_ip_t last = { }; + char *is_ipv6; + if (*ip == '*' && *(ip+1) == '\0') { + memset(first.addr8, 0, sizeof(first.addr8)); + memset(last.addr8, 0xFF, sizeof(last.addr8)); + + is_ipv6 = ip; + + clean_ip_structure(list); + goto storethisip; + } + + char *end = ip; + // Move while I cannot find a separator + while (*end && *end != '/' && *end != '-') end++; + + // We will use only the classic IPV6 for while, but we could consider the base 85 in a near future + // https://tools.ietf.org/html/rfc1924 + is_ipv6 = strchr(ip, ':'); + + int select; + if (*end && !is_ipv6) { // IPV4 range + select = (*end == '/') ? 0 : 1; + *end++ = '\0'; + if (*end == '!') { + info("The exclusion cannot be in the second part of the range %s, it will be ignored.", ipdup); + goto cleanipdup; + } + + if (!select) { // CIDR + select = ip2nl(first.addr8, ip, AF_INET, ipdup); + if (select) + goto cleanipdup; + + select = (int) str2i(end); + if (select < NETDATA_MINIMUM_IPV4_CIDR || select > NETDATA_MAXIMUM_IPV4_CIDR) { + info("The specified CIDR %s is not valid, the IP %s will be ignored.", end, ip); + goto cleanipdup; + } + + last.addr32[0] = htonl(broadcast(ntohl(first.addr32[0]), select)); + // This was added to remove + // https://app.codacy.com/manual/netdata/netdata/pullRequest?prid=5810941&bid=19021977 + UNUSED(last.addr32[0]); + + uint32_t ipv4_test = htonl(ipv4_network(ntohl(first.addr32[0]), select)); + if (first.addr32[0] != ipv4_test) { + first.addr32[0] = ipv4_test; + struct in_addr ipv4_convert; + ipv4_convert.s_addr = ipv4_test; + char ipv4_msg[INET_ADDRSTRLEN]; + if(inet_ntop(AF_INET, &ipv4_convert, ipv4_msg, INET_ADDRSTRLEN)) + info("The network value of CIDR %s was updated for %s .", ipdup, ipv4_msg); + } + } else { // Range + select = ip2nl(first.addr8, ip, AF_INET, ipdup); + if (select) + goto cleanipdup; + + select = ip2nl(last.addr8, end, AF_INET, ipdup); + if (select) + goto cleanipdup; + } + + if (htonl(first.addr32[0]) > htonl(last.addr32[0])) { + info("The specified range %s is invalid, the second address is smallest than the first, it will be ignored.", + ipdup); + goto cleanipdup; + } + } else if (is_ipv6) { // IPV6 + if (!*end) { // Unique + select = ip2nl(first.addr8, ip, AF_INET6, ipdup); + if (select) + goto cleanipdup; + + memcpy(last.addr8, first.addr8, sizeof(first.addr8)); + } else if (*end == '-') { + *end++ = 0x00; + if (*end == '!') { + info("The exclusion cannot be in the second part of the range %s, it will be ignored.", ipdup); + goto cleanipdup; + } + + select = ip2nl(first.addr8, ip, AF_INET6, ipdup); + if (select) + goto cleanipdup; + + select = ip2nl(last.addr8, end, AF_INET6, ipdup); + if (select) + goto cleanipdup; + } else { // CIDR + *end++ = 0x00; + if (*end == '!') { + info("The exclusion cannot be in the second part of the range %s, it will be ignored.", ipdup); + goto cleanipdup; + } + + select = str2i(end); + if (select < 0 || select > 128) { + info("The CIDR %s is not valid, the address %s will be ignored.", end, ip); + goto cleanipdup; + } + + uint64_t prefix = (uint64_t)select; + select = ip2nl(first.addr8, ip, AF_INET6, ipdup); + if (select) + goto cleanipdup; + + get_ipv6_last_addr(&last, &first, prefix); + + union netdata_ip_t ipv6_test; + get_ipv6_first_addr(&ipv6_test, &first, prefix); + + if (memcmp(first.addr8, ipv6_test.addr8, sizeof(union netdata_ip_t)) != 0) { + memcpy(first.addr8, ipv6_test.addr8, sizeof(union netdata_ip_t)); + + struct in6_addr ipv6_convert; + memcpy(ipv6_convert.s6_addr, ipv6_test.addr8, sizeof(union netdata_ip_t)); + + char ipv6_msg[INET6_ADDRSTRLEN]; + if(inet_ntop(AF_INET6, &ipv6_convert, ipv6_msg, INET6_ADDRSTRLEN)) + info("The network value of CIDR %s was updated for %s .", ipdup, ipv6_msg); + } + } + + if ((be64toh(*(uint64_t *)&first.addr32[2]) > be64toh(*(uint64_t *)&last.addr32[2]) && + !memcmp(first.addr32, last.addr32, 2*sizeof(uint32_t))) || + (be64toh(*(uint64_t *)&first.addr32) > be64toh(*(uint64_t *)&last.addr32)) ) { + info("The specified range %s is invalid, the second address is smallest than the first, it will be ignored.", + ipdup); + goto cleanipdup; + } + } else { // Unique ip + select = ip2nl(first.addr8, ip, AF_INET, ipdup); + if (select) + goto cleanipdup; + + memcpy(last.addr8, first.addr8, sizeof(first.addr8)); + } + + ebpf_network_viewer_ip_list_t *store; + + storethisip: + store = callocz(1, sizeof(ebpf_network_viewer_ip_list_t)); + store->value = ipdup; + store->hash = simple_hash(ipdup); + store->ver = (uint8_t)(!is_ipv6)?AF_INET:AF_INET6; + memcpy(store->first.addr8, first.addr8, sizeof(first.addr8)); + memcpy(store->last.addr8, last.addr8, sizeof(last.addr8)); + + fill_ip_list(list, store, "socket"); + return; + +cleanipdup: + freez(ipdup); +} + +/** + * Parse IP Range + * + * Parse the IP ranges given and create Network Viewer IP Structure + * + * @param ptr is a pointer with the text to parse. + */ +static void parse_ips(char *ptr) +{ + // No value + if (unlikely(!ptr)) + return; + + while (likely(ptr)) { + // Move forward until next valid character + while (isspace(*ptr)) ptr++; + + // No valid value found + if (unlikely(!*ptr)) + return; + + // Find space that ends the list + char *end = strchr(ptr, ' '); + if (end) { + *end++ = '\0'; + } + + int neg = 0; + if (*ptr == '!') { + neg++; + ptr++; + } + + if (isascii(*ptr)) { // Parse port + parse_ip_list((!neg)?(void **)&network_viewer_opt.included_ips:(void **)&network_viewer_opt.excluded_ips, + ptr); + } + + ptr = end; + } +} + + + +/** + * Parse port list + * + * Parse an allocated port list with the range given + * + * @param out a pointer to store the link list + * @param range the informed range for the user. + */ +static void parse_port_list(void **out, char *range) +{ + int first, last; + ebpf_network_viewer_port_list_t **list = (ebpf_network_viewer_port_list_t **)out; + + char *copied = strdupz(range); + if (*range == '*' && *(range+1) == '\0') { + first = 1; + last = 65535; + + clean_port_structure(list); + goto fillenvpl; + } + + char *end = range; + //Move while I cannot find a separator + while (*end && *end != ':' && *end != '-') end++; + + //It has a range + if (likely(*end)) { + *end++ = '\0'; + if (*end == '!') { + info("The exclusion cannot be in the second part of the range, the range %s will be ignored.", copied); + freez(copied); + return; + } + last = str2i((const char *)end); + } else { + last = 0; + } + + first = str2i((const char *)range); + if (first < NETDATA_MINIMUM_PORT_VALUE || first > NETDATA_MAXIMUM_PORT_VALUE) { + info("The first port %d of the range \"%s\" is invalid and it will be ignored!", first, copied); + freez(copied); + return; + } + + if (!last) + last = first; + + if (last < NETDATA_MINIMUM_PORT_VALUE || last > NETDATA_MAXIMUM_PORT_VALUE) { + info("The second port %d of the range \"%s\" is invalid and the whole range will be ignored!", last, copied); + freez(copied); + return; + } + + if (first > last) { + info("The specified order %s is wrong, the smallest value is always the first, it will be ignored!", copied); + freez(copied); + return; + } + + ebpf_network_viewer_port_list_t *w; +fillenvpl: + w = callocz(1, sizeof(ebpf_network_viewer_port_list_t)); + w->value = copied; + w->hash = simple_hash(copied); + w->first = (uint16_t)htons((uint16_t)first); + w->last = (uint16_t)htons((uint16_t)last); + w->cmp_first = (uint16_t)first; + w->cmp_last = (uint16_t)last; + + fill_port_list(list, w); +} + +/** + * Read max dimension. + * + * Netdata plot two dimensions per connection, so it is necessary to adjust the values. + * + * @param cfg the configuration structure + */ +static void read_max_dimension(struct config *cfg) +{ + int maxdim ; + maxdim = (int) appconfig_get_number(cfg, + EBPF_NETWORK_VIEWER_SECTION, + EBPF_MAXIMUM_DIMENSIONS, + NETDATA_NV_CAP_VALUE); + if (maxdim < 0) { + error("'maximum dimensions = %d' must be a positive number, Netdata will change for default value %ld.", + maxdim, NETDATA_NV_CAP_VALUE); + maxdim = NETDATA_NV_CAP_VALUE; + } + + maxdim /= 2; + if (!maxdim) { + info("The number of dimensions is too small (%u), we are setting it to minimum 2", network_viewer_opt.max_dim); + network_viewer_opt.max_dim = 1; + return; + } + + network_viewer_opt.max_dim = (uint32_t)maxdim; +} + +/** + * Parse Port Range + * + * Parse the port ranges given and create Network Viewer Port Structure + * + * @param ptr is a pointer with the text to parse. + */ +static void parse_ports(char *ptr) +{ + // No value + if (unlikely(!ptr)) + return; + + while (likely(ptr)) { + // Move forward until next valid character + while (isspace(*ptr)) ptr++; + + // No valid value found + if (unlikely(!*ptr)) + return; + + // Find space that ends the list + char *end = strchr(ptr, ' '); + if (end) { + *end++ = '\0'; + } + + int neg = 0; + if (*ptr == '!') { + neg++; + ptr++; + } + + if (isdigit(*ptr)) { // Parse port + parse_port_list((!neg)?(void **)&network_viewer_opt.included_port:(void **)&network_viewer_opt.excluded_port, + ptr); + } else if (isalpha(*ptr)) { // Parse service + parse_service_list((!neg)?(void **)&network_viewer_opt.included_port:(void **)&network_viewer_opt.excluded_port, + ptr); + } else if (*ptr == '*') { // All + parse_port_list((!neg)?(void **)&network_viewer_opt.included_port:(void **)&network_viewer_opt.excluded_port, + ptr); + } + + ptr = end; + } +} + +/** + * Link hostname + * + * @param out is the output link list + * @param in the hostname to add to list. + */ +static void link_hostname(ebpf_network_viewer_hostname_list_t **out, ebpf_network_viewer_hostname_list_t *in) +{ + if (likely(*out)) { + ebpf_network_viewer_hostname_list_t *move = *out; + for (; move->next ; move = move->next ) { + if (move->hash == in->hash && !strcmp(move->value, in->value)) { + info("The hostname %s was already inserted, it will be ignored.", in->value); + freez(in->value); + simple_pattern_free(in->value_pattern); + freez(in); + return; + } + } + + move->next = in; + } else { + *out = in; + } +#ifdef NETDATA_INTERNAL_CHECKS + info("Adding value %s to %s hostname list used on network viewer", + in->value, + (*out == network_viewer_opt.included_hostnames)?"included":"excluded"); +#endif +} + +/** + * Link Hostnames + * + * Parse the list of hostnames to create the link list. + * This is not associated with the IP, because simple patterns like *example* cannot be resolved to IP. + * + * @param out is the output link list + * @param parse is a pointer with the text to parser. + */ +static void link_hostnames(char *parse) +{ + // No value + if (unlikely(!parse)) + return; + + while (likely(parse)) { + // Find the first valid value + while (isspace(*parse)) parse++; + + // No valid value found + if (unlikely(!*parse)) + return; + + // Find space that ends the list + char *end = strchr(parse, ' '); + if (end) { + *end++ = '\0'; + } + + int neg = 0; + if (*parse == '!') { + neg++; + parse++; + } + + ebpf_network_viewer_hostname_list_t *hostname = callocz(1 , sizeof(ebpf_network_viewer_hostname_list_t)); + hostname->value = strdupz(parse); + hostname->hash = simple_hash(parse); + hostname->value_pattern = simple_pattern_create(parse, NULL, SIMPLE_PATTERN_EXACT); + + link_hostname((!neg)?&network_viewer_opt.included_hostnames:&network_viewer_opt.excluded_hostnames, + hostname); + + parse = end; + } +} + +/** + * Parse network viewer section + * + * @param cfg the configuration structure + */ +void parse_network_viewer_section(struct config *cfg) +{ + read_max_dimension(cfg); + + network_viewer_opt.hostname_resolution_enabled = appconfig_get_boolean(cfg, + EBPF_NETWORK_VIEWER_SECTION, + EBPF_CONFIG_RESOLVE_HOSTNAME, + CONFIG_BOOLEAN_NO); + + network_viewer_opt.service_resolution_enabled = appconfig_get_boolean(cfg, + EBPF_NETWORK_VIEWER_SECTION, + EBPF_CONFIG_RESOLVE_SERVICE, + CONFIG_BOOLEAN_NO); + + char *value = appconfig_get(cfg, EBPF_NETWORK_VIEWER_SECTION, EBPF_CONFIG_PORTS, NULL); + parse_ports(value); + + if (network_viewer_opt.hostname_resolution_enabled) { + value = appconfig_get(cfg, EBPF_NETWORK_VIEWER_SECTION, EBPF_CONFIG_HOSTNAMES, NULL); + link_hostnames(value); + } else { + info("Name resolution is disabled, collector will not parser \"hostnames\" list."); + } + + value = appconfig_get(cfg, EBPF_NETWORK_VIEWER_SECTION, + "ips", "!127.0.0.1/8 10.0.0.0/8 172.16.0.0/12 192.168.0.0/16 fc00::/7 !::1/128"); + parse_ips(value); +} + +/** + * Link dimension name + * + * Link user specified names inside a link list. + * + * @param port the port number associated to the dimension name. + * @param hash the calculated hash for the dimension name. + * @param name the dimension name. + */ +static void link_dimension_name(char *port, uint32_t hash, char *value) +{ + int test = str2i(port); + if (test < NETDATA_MINIMUM_PORT_VALUE || test > NETDATA_MAXIMUM_PORT_VALUE){ + error("The dimension given (%s = %s) has an invalid value and it will be ignored.", port, value); + return; + } + + ebpf_network_viewer_dim_name_t *w; + w = callocz(1, sizeof(ebpf_network_viewer_dim_name_t)); + + w->name = strdupz(value); + w->hash = hash; + + w->port = (uint16_t) htons(test); + + ebpf_network_viewer_dim_name_t *names = network_viewer_opt.names; + if (unlikely(!names)) { + network_viewer_opt.names = w; + } else { + for (; names->next; names = names->next) { + if (names->port == w->port) { + info("Dupplicated definition for a service, the name %s will be ignored. ", names->name); + freez(names->name); + names->name = w->name; + names->hash = w->hash; + freez(w); + return; + } + } + names->next = w; + } + +#ifdef NETDATA_INTERNAL_CHECKS + info("Adding values %s( %u) to dimension name list used on network viewer", w->name, htons(w->port)); +#endif +} + +/** + * Parse service Name section. + * + * This function gets the values that will be used to overwrite dimensions. + * + * @param cfg the configuration structure + */ +void parse_service_name_section(struct config *cfg) +{ + struct section *co = appconfig_get_section(cfg, EBPF_SERVICE_NAME_SECTION); + if (co) { + struct config_option *cv; + for (cv = co->values; cv ; cv = cv->next) { + link_dimension_name(cv->name, cv->hash, cv->value); + } + } + + // Always associated the default port to Netdata + ebpf_network_viewer_dim_name_t *names = network_viewer_opt.names; + if (names) { + uint16_t default_port = htons(19999); + while (names) { + if (names->port == default_port) + return; + + names = names->next; + } + } + + char *port_string = getenv("NETDATA_LISTEN_PORT"); + if (port_string) { + // if variable has an invalid value, we assume netdata is using 19999 + int default_port = str2i(port_string); + if (default_port > 0 && default_port < 65536) + link_dimension_name(port_string, simple_hash(port_string), "Netdata"); + } +} + /** * Socket thread * @@ -1875,6 +2828,10 @@ void *ebpf_socket_thread(void *ptr) ebpf_module_t *em = (ebpf_module_t *)ptr; fill_ebpf_data(&socket_data); + ebpf_update_module(em, &socket_config, NETDATA_NETWORK_CONFIG_FILE); + parse_network_viewer_section(&socket_config); + parse_service_name_section(&socket_config); + if (!em->enabled) goto endsocket; diff --git a/collectors/ebpf.plugin/ebpf_socket.h b/collectors/ebpf.plugin/ebpf_socket.h index 1316c003a..81001bab6 100644 --- a/collectors/ebpf.plugin/ebpf_socket.h +++ b/collectors/ebpf.plugin/ebpf_socket.h @@ -5,7 +5,6 @@ #include "libnetdata/avl/avl.h" // Vector indexes -#define NETDATA_MAX_SOCKET_VECTOR 6 #define NETDATA_UDP_START 3 #define NETDATA_RETRANSMIT_START 5 @@ -17,6 +16,28 @@ #define NETDATA_SOCKET_READ_SLEEP_MS 800000ULL +// config file +#define NETDATA_NETWORK_CONFIG_FILE "network.conf" +#define EBPF_NETWORK_VIEWER_SECTION "network connections" +#define EBPF_SERVICE_NAME_SECTION "service name" +#define EBPF_CONFIG_RESOLVE_HOSTNAME "resolve hostnames" +#define EBPF_CONFIG_RESOLVE_SERVICE "resolve service names" +#define EBPF_CONFIG_PORTS "ports" +#define EBPF_CONFIG_HOSTNAMES "hostnames" +#define EBPF_MAXIMUM_DIMENSIONS "maximum dimensions" + +enum ebpf_socket_publish_index { + NETDATA_IDX_TCP_SENDMSG, + NETDATA_IDX_TCP_CLEANUP_RBUF, + NETDATA_IDX_TCP_CLOSE, + NETDATA_IDX_UDP_RECVBUF, + NETDATA_IDX_UDP_SENDMSG, + NETDATA_IDX_TCP_RETRANSMIT, + + // Keep this as last and don't skip numbers as it is used as element counter + NETDATA_MAX_SOCKET_VECTOR +}; + typedef enum ebpf_socket_idx { NETDATA_KEY_CALLS_TCP_SENDMSG, NETDATA_KEY_ERROR_TCP_SENDMSG, @@ -38,6 +59,7 @@ typedef enum ebpf_socket_idx { NETDATA_KEY_TCP_RETRANSMIT, + // Keep this as last and don't skip numbers as it is used as element counter NETDATA_SOCKET_COUNTER } ebpf_socket_index_t; @@ -232,7 +254,7 @@ typedef struct netdata_socket_idx { */ typedef struct netdata_socket_plot { // Search - avl avl; + avl_t avl; netdata_socket_idx_t index; // Current data @@ -269,6 +291,9 @@ typedef struct netdata_vector_plot { extern void clean_port_structure(ebpf_network_viewer_port_list_t **clean); extern ebpf_network_viewer_port_list_t *listen_ports; extern void update_listen_table(uint16_t value, uint8_t proto); +extern void parse_network_viewer_section(struct config *cfg); +extern void fill_ip_list(ebpf_network_viewer_ip_list_t **out, ebpf_network_viewer_ip_list_t *in, char *table); +extern void parse_service_name_section(struct config *cfg); extern ebpf_socket_publish_apps_t **socket_bandwidth_curr; diff --git a/collectors/ebpf.plugin/ebpf_sync.c b/collectors/ebpf.plugin/ebpf_sync.c new file mode 100644 index 000000000..f0db1cc4a --- /dev/null +++ b/collectors/ebpf.plugin/ebpf_sync.c @@ -0,0 +1,389 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "ebpf.h" +#include "ebpf_sync.h" + +static ebpf_data_t sync_data; + +static char *sync_counter_dimension_name[NETDATA_SYNC_IDX_END] = { "sync", "syncfs", "msync", "fsync", "fdatasync", + "sync_file_range" }; +static netdata_syscall_stat_t sync_counter_aggregated_data[NETDATA_SYNC_IDX_END]; +static netdata_publish_syscall_t sync_counter_publish_aggregated[NETDATA_SYNC_IDX_END]; + +static int read_thread_closed = 1; + +static netdata_idx_t sync_hash_values[NETDATA_SYNC_IDX_END]; + +struct netdata_static_thread sync_threads = {"SYNC KERNEL", NULL, NULL, 1, + NULL, NULL, NULL}; + +struct config sync_config = { .first_section = NULL, + .last_section = NULL, + .mutex = NETDATA_MUTEX_INITIALIZER, + .index = { .avl_tree = { .root = NULL, .compar = appconfig_section_compare }, + .rwlock = AVL_LOCK_INITIALIZER } }; + +ebpf_sync_syscalls_t local_syscalls[] = { + {.syscall = "sync", .enabled = CONFIG_BOOLEAN_YES, .objects = NULL, .probe_links = NULL}, + {.syscall = "syncfs", .enabled = CONFIG_BOOLEAN_YES, .objects = NULL, .probe_links = NULL}, + {.syscall = "msync", .enabled = CONFIG_BOOLEAN_YES, .objects = NULL, .probe_links = NULL}, + {.syscall = "fsync", .enabled = CONFIG_BOOLEAN_YES, .objects = NULL, .probe_links = NULL}, + {.syscall = "fdatasync", .enabled = CONFIG_BOOLEAN_YES, .objects = NULL, .probe_links = NULL}, + {.syscall = "sync_file_range", .enabled = CONFIG_BOOLEAN_YES, .objects = NULL, .probe_links = NULL}, + {.syscall = NULL, .enabled = CONFIG_BOOLEAN_NO, .objects = NULL, .probe_links = NULL} +}; + +/***************************************************************** + * + * INITIALIZE THREAD + * + *****************************************************************/ + +/* + * Initialize Syscalls + * + * Load the eBPF programs to monitor syscalls + * + * @return 0 on success and -1 otherwise. + */ +static int ebpf_sync_initialize_syscall(ebpf_module_t *em) +{ + int i; + const char *saved_name = em->thread_name; + for (i = 0; local_syscalls[i].syscall; i++) { + ebpf_sync_syscalls_t *w = &local_syscalls[i]; + if (!w->probe_links && w->enabled) { + fill_ebpf_data(&w->kernel_info); + if (ebpf_update_kernel(&w->kernel_info)) { + em->thread_name = saved_name; + error("Cannot update the kernel for eBPF module %s", w->syscall); + return -1; + } + + em->thread_name = w->syscall; + w->probe_links = ebpf_load_program(ebpf_plugin_dir, em, kernel_string, &w->objects, w->kernel_info.map_fd); + if (!w->probe_links) { + em->thread_name = saved_name; + return -1; + } + } + } + em->thread_name = saved_name; + + memset(sync_counter_aggregated_data, 0 , NETDATA_SYNC_IDX_END * sizeof(netdata_syscall_stat_t)); + memset(sync_counter_publish_aggregated, 0 , NETDATA_SYNC_IDX_END * sizeof(netdata_publish_syscall_t)); + memset(sync_hash_values, 0 , NETDATA_SYNC_IDX_END * sizeof(netdata_idx_t)); + + return 0; +} + +/***************************************************************** + * + * DATA THREAD + * + *****************************************************************/ + +/** + * Read global table + * + * Read the table with number of calls for all functions + */ +static void read_global_table() +{ + netdata_idx_t stored; + uint32_t idx = NETDATA_SYNC_CALL; + int i; + for (i = 0; local_syscalls[i].syscall; i++) { + if (local_syscalls[i].enabled) { + int fd = local_syscalls[i].kernel_info.map_fd[NETDATA_SYNC_GLOBLAL_TABLE]; + if (!bpf_map_lookup_elem(fd, &idx, &stored)) { + sync_hash_values[i] = stored; + } + } + } +} + +/** + * Sync read hash + * + * This is the thread callback. + * + * @param ptr It is a NULL value for this thread. + * + * @return It always returns NULL. + */ +void *ebpf_sync_read_hash(void *ptr) +{ + ebpf_module_t *em = (ebpf_module_t *)ptr; + read_thread_closed = 0; + + heartbeat_t hb; + heartbeat_init(&hb); + usec_t step = NETDATA_EBPF_SYNC_SLEEP_MS * em->update_time; + + while (!close_ebpf_plugin) { + usec_t dt = heartbeat_next(&hb, step); + (void)dt; + + read_global_table(); + } + read_thread_closed = 1; + + return NULL; +} + +/** + * Create Sync charts + * + * Create charts and dimensions according user input. + * + * @param id chart id + * @param idx the first index with data. + * @param end the last index with data. + */ +static void ebpf_send_sync_chart(char *id, + int idx, + int end) +{ + write_begin_chart(NETDATA_EBPF_MEMORY_GROUP, id); + + netdata_publish_syscall_t *move = &sync_counter_publish_aggregated[idx]; + + while (move && idx <= end) { + if (local_syscalls[idx].enabled) + write_chart_dimension(move->name, sync_hash_values[idx]); + + move = move->next; + idx++; + } + + write_end_chart(); +} + +/** + * Send data + * + * Send global charts to Netdata + */ +static void sync_send_data() +{ + if (local_syscalls[NETDATA_SYNC_FSYNC_IDX].enabled || local_syscalls[NETDATA_SYNC_FDATASYNC_IDX].enabled) { + ebpf_send_sync_chart(NETDATA_EBPF_FILE_SYNC_CHART, NETDATA_SYNC_FSYNC_IDX, NETDATA_SYNC_FDATASYNC_IDX); + } + + if (local_syscalls[NETDATA_SYNC_MSYNC_IDX].enabled) + ebpf_one_dimension_write_charts(NETDATA_EBPF_MEMORY_GROUP, NETDATA_EBPF_MSYNC_CHART, + sync_counter_publish_aggregated[NETDATA_SYNC_MSYNC_IDX].dimension, + sync_hash_values[NETDATA_SYNC_MSYNC_IDX]); + + if (local_syscalls[NETDATA_SYNC_SYNC_IDX].enabled || local_syscalls[NETDATA_SYNC_SYNCFS_IDX].enabled) { + ebpf_send_sync_chart(NETDATA_EBPF_SYNC_CHART, NETDATA_SYNC_SYNC_IDX, NETDATA_SYNC_SYNCFS_IDX); + } + + if (local_syscalls[NETDATA_SYNC_SYNC_FILE_RANGE_IDX].enabled) + ebpf_one_dimension_write_charts(NETDATA_EBPF_MEMORY_GROUP, NETDATA_EBPF_FILE_SEGMENT_CHART, + sync_counter_publish_aggregated[NETDATA_SYNC_SYNC_FILE_RANGE_IDX].dimension, + sync_hash_values[NETDATA_SYNC_SYNC_FILE_RANGE_IDX]); +} + +/** +* Main loop for this collector. +*/ +static void sync_collector(ebpf_module_t *em) +{ + sync_threads.thread = mallocz(sizeof(netdata_thread_t)); + sync_threads.start_routine = ebpf_sync_read_hash; + + netdata_thread_create(sync_threads.thread, sync_threads.name, NETDATA_THREAD_OPTION_JOINABLE, + ebpf_sync_read_hash, em); + + while (!close_ebpf_plugin) { + pthread_mutex_lock(&collect_data_mutex); + pthread_cond_wait(&collect_data_cond_var, &collect_data_mutex); + + pthread_mutex_lock(&lock); + + sync_send_data(); + + pthread_mutex_unlock(&lock); + pthread_mutex_unlock(&collect_data_mutex); + } +} + + +/***************************************************************** + * + * CLEANUP THREAD + * + *****************************************************************/ + +/** + * Cleanup Objects + * + * Cleanup loaded objects when thread was initialized. + */ +void ebpf_sync_cleanup_objects() +{ + int i; + for (i = 0; local_syscalls[i].syscall; i++) { + ebpf_sync_syscalls_t *w = &local_syscalls[i]; + if (w->probe_links) { + freez(w->kernel_info.map_fd); + + struct bpf_program *prog; + size_t j = 0 ; + bpf_object__for_each_program(prog, w->objects) { + bpf_link__destroy(w->probe_links[j]); + j++; + } + bpf_object__close(w->objects); + } + } +} + +/** + * Clean up the main thread. + * + * @param ptr thread data. + */ +static void ebpf_sync_cleanup(void *ptr) +{ + ebpf_module_t *em = (ebpf_module_t *)ptr; + if (!em->enabled) + return; + + heartbeat_t hb; + heartbeat_init(&hb); + uint32_t tick = 2*USEC_PER_MS; + while (!read_thread_closed) { + usec_t dt = heartbeat_next(&hb, tick); + UNUSED(dt); + } + + ebpf_sync_cleanup_objects(); + freez(sync_threads.thread); +} + +/***************************************************************** + * + * MAIN THREAD + * + *****************************************************************/ + +/** + * Create Sync charts + * + * Create charts and dimensions according user input. + * + * @param id chart id + * @param title chart title + * @param order order number of the specified chart + * @param idx the first index with data. + * @param end the last index with data. + */ +static void ebpf_create_sync_chart(char *id, + char *title, + int order, + int idx, + int end) +{ + ebpf_write_chart_cmd(NETDATA_EBPF_MEMORY_GROUP, id, title, EBPF_COMMON_DIMENSION_CALL, + NETDATA_EBPF_SYNC_SUBMENU, NETDATA_EBPF_CHART_TYPE_LINE, NULL, order); + + netdata_publish_syscall_t *move = &sync_counter_publish_aggregated[idx]; + + while (move && idx <= end) { + if (local_syscalls[idx].enabled) + ebpf_write_global_dimension(move->name, move->dimension, move->algorithm); + + move = move->next; + idx++; + } +} + +/** + * Create global charts + * + * Call ebpf_create_chart to create the charts for the collector. + */ +static void ebpf_create_sync_charts() +{ + if (local_syscalls[NETDATA_SYNC_FSYNC_IDX].enabled || local_syscalls[NETDATA_SYNC_FDATASYNC_IDX].enabled) + ebpf_create_sync_chart(NETDATA_EBPF_FILE_SYNC_CHART, + "Monitor calls for fsync(2) and fdatasync(2).", 21300, + NETDATA_SYNC_FSYNC_IDX, NETDATA_SYNC_FDATASYNC_IDX); + + if (local_syscalls[NETDATA_SYNC_MSYNC_IDX].enabled) + ebpf_create_sync_chart(NETDATA_EBPF_MSYNC_CHART, + "Monitor calls for msync(2).", 21301, + NETDATA_SYNC_MSYNC_IDX, NETDATA_SYNC_MSYNC_IDX); + + if (local_syscalls[NETDATA_SYNC_SYNC_IDX].enabled || local_syscalls[NETDATA_SYNC_SYNCFS_IDX].enabled) + ebpf_create_sync_chart(NETDATA_EBPF_SYNC_CHART, + "Monitor calls for sync(2) and syncfs(2).", 21302, + NETDATA_SYNC_SYNC_IDX, NETDATA_SYNC_SYNCFS_IDX); + + if (local_syscalls[NETDATA_SYNC_SYNC_FILE_RANGE_IDX].enabled) + ebpf_create_sync_chart(NETDATA_EBPF_FILE_SEGMENT_CHART, + "Monitor calls for sync_file_range(2).", 21303, + NETDATA_SYNC_SYNC_FILE_RANGE_IDX, NETDATA_SYNC_SYNC_FILE_RANGE_IDX); +} + +/** + * Parse Syscalls + * + * Parse syscall options available inside ebpf.d/sync.conf + */ +static void ebpf_sync_parse_syscalls() +{ + int i; + for (i = 0; local_syscalls[i].syscall; i++) { + local_syscalls[i].enabled = appconfig_get_boolean(&sync_config, NETDATA_SYNC_CONFIG_NAME, + local_syscalls[i].syscall, CONFIG_BOOLEAN_YES); + } +} + +/** + * Sync thread + * + * Thread used to make sync thread + * + * @param ptr a pointer to `struct ebpf_module` + * + * @return It always return NULL + */ +void *ebpf_sync_thread(void *ptr) +{ + netdata_thread_cleanup_push(ebpf_sync_cleanup, ptr); + + ebpf_module_t *em = (ebpf_module_t *)ptr; + fill_ebpf_data(&sync_data); + + ebpf_update_module(em, &sync_config, NETDATA_SYNC_CONFIG_FILE); + ebpf_sync_parse_syscalls(); + + if (!em->enabled) + goto endsync; + + if (ebpf_sync_initialize_syscall(em)) { + pthread_mutex_unlock(&lock); + goto endsync; + } + + int algorithms[NETDATA_SYNC_IDX_END] = { NETDATA_EBPF_INCREMENTAL_IDX, NETDATA_EBPF_INCREMENTAL_IDX, + NETDATA_EBPF_INCREMENTAL_IDX, NETDATA_EBPF_INCREMENTAL_IDX, + NETDATA_EBPF_INCREMENTAL_IDX, NETDATA_EBPF_INCREMENTAL_IDX }; + ebpf_global_labels(sync_counter_aggregated_data, sync_counter_publish_aggregated, + sync_counter_dimension_name, sync_counter_dimension_name, + algorithms, NETDATA_SYNC_IDX_END); + + pthread_mutex_lock(&lock); + ebpf_create_sync_charts(); + pthread_mutex_unlock(&lock); + + sync_collector(em); + +endsync: + netdata_thread_cleanup_pop(1); + return NULL; +} diff --git a/collectors/ebpf.plugin/ebpf_sync.h b/collectors/ebpf.plugin/ebpf_sync.h new file mode 100644 index 000000000..458318218 --- /dev/null +++ b/collectors/ebpf.plugin/ebpf_sync.h @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_EBPF_SYNC_H +#define NETDATA_EBPF_SYNC_H 1 + +// charts +#define NETDATA_EBPF_SYNC_CHART "sync" +#define NETDATA_EBPF_MSYNC_CHART "memory_map" +#define NETDATA_EBPF_FILE_SYNC_CHART "file_sync" +#define NETDATA_EBPF_FILE_SEGMENT_CHART "file_segment" +#define NETDATA_EBPF_SYNC_SUBMENU "synchronization (eBPF)" + +#define NETDATA_EBPF_SYNC_SLEEP_MS 800000ULL + +// configuration file +#define NETDATA_SYNC_CONFIG_FILE "sync.conf" +#define NETDATA_SYNC_CONFIG_NAME "syscalls" + +enum sync_syscalls_index { + NETDATA_SYNC_SYNC_IDX, + NETDATA_SYNC_SYNCFS_IDX, + NETDATA_SYNC_MSYNC_IDX, + NETDATA_SYNC_FSYNC_IDX, + NETDATA_SYNC_FDATASYNC_IDX, + NETDATA_SYNC_SYNC_FILE_RANGE_IDX, + + NETDATA_SYNC_IDX_END +}; + +typedef struct ebpf_sync_syscalls { + char *syscall; + int enabled; + uint32_t flags; + + struct bpf_object *objects; + struct bpf_link **probe_links; + + ebpf_data_t kernel_info; +} ebpf_sync_syscalls_t; + +enum netdata_sync_charts { + NETDATA_SYNC_CALL, + + // Keep this as last and don't skip numbers as it is used as element counter + NETDATA_SYNC_END +}; + +enum netdata_sync_table { + NETDATA_SYNC_GLOBLAL_TABLE +}; + +extern void *ebpf_sync_thread(void *ptr); + +#endif /* NETDATA_EBPF_SYNC_H */ diff --git a/collectors/freeipmi.plugin/README.md b/collectors/freeipmi.plugin/README.md index 52945e3c6..02a61dd2f 100644 --- a/collectors/freeipmi.plugin/README.md +++ b/collectors/freeipmi.plugin/README.md @@ -45,7 +45,7 @@ The plugin does a speed test when it starts, to find out the duration needed by The plugin supports a few options. To see them, run: -```sh +```text # /usr/libexec/netdata/plugins.d/freeipmi.plugin -h netdata freeipmi.plugin 1.8.0-546-g72ce5d6b_rolling @@ -72,6 +72,8 @@ The plugin supports a few options. To see them, run: password PASS connect to remote IPMI host default: local IPMI processor + noauthcodecheck don't check the authentication codes returned + driver-type IPMIDRIVER Specify the driver type to use instead of doing an auto selection. The currently available outofband drivers are LAN and LAN_2_0, diff --git a/collectors/freeipmi.plugin/freeipmi_plugin.c b/collectors/freeipmi.plugin/freeipmi_plugin.c index bd3c533ca..e9702e785 100644 --- a/collectors/freeipmi.plugin/freeipmi_plugin.c +++ b/collectors/freeipmi.plugin/freeipmi_plugin.c @@ -26,8 +26,6 @@ #include #include -#ifdef HAVE_FREEIPMI - #define IPMI_PARSE_DEVICE_LAN_STR "lan" #define IPMI_PARSE_DEVICE_LAN_2_0_STR "lan_2_0" #define IPMI_PARSE_DEVICE_LAN_2_0_STR2 "lan20" @@ -1621,6 +1619,14 @@ int parse_outofband_driver_type (const char *str) return (-1); } +int host_is_local(const char *host) +{ + if (host && (!strcmp(host, "localhost") || !strcmp(host, "127.0.0.1") || !strcmp(host, "::1"))) + return (1); + + return (0); +} + int main (int argc, char **argv) { // ------------------------------------------------------------------------ @@ -1691,6 +1697,8 @@ int main (int argc, char **argv) { " password PASS connect to remote IPMI host\n" " default: local IPMI processor\n" "\n" + " noauthcodecheck don't check the authentication codes returned\n" + "\n" " driver-type IPMIDRIVER\n" " Specify the driver type to use instead of doing an auto selection. \n" " The currently available outofband drivers are LAN and LAN_2_0,\n" @@ -1765,6 +1773,23 @@ int main (int argc, char **argv) { if(debug) fprintf(stderr, "freeipmi.plugin: inband driver type set to '%d'\n", driver_type); } continue; + } else if (i < argc && strcmp("noauthcodecheck", argv[i]) == 0) { + if (!hostname || host_is_local(hostname)) { + if (debug) + fprintf( + stderr, + "freeipmi.plugin: noauthcodecheck workaround flag is ignored for inband configuration\n"); + } else if (protocol_version < 0 || protocol_version == IPMI_MONITORING_PROTOCOL_VERSION_1_5) { + workaround_flags |= IPMI_MONITORING_WORKAROUND_FLAGS_PROTOCOL_VERSION_1_5_NO_AUTH_CODE_CHECK; + if (debug) + fprintf(stderr, "freeipmi.plugin: noauthcodecheck workaround flag enabled\n"); + } else { + if (debug) + fprintf( + stderr, + "freeipmi.plugin: noauthcodecheck workaround flag is ignored for protocol version 2.0\n"); + } + continue; } else if(i < argc && strcmp("sdr-cache-dir", argv[i]) == 0) { sdr_cache_directory = argv[++i]; @@ -1861,11 +1886,3 @@ int main (int argc, char **argv) { if(now_monotonic_sec() - started_t > 14400) exit(0); } } - -#else // !HAVE_FREEIPMI - -int main(int argc, char **argv) { - fatal("freeipmi.plugin is not compiled."); -} - -#endif // !HAVE_FREEIPMI diff --git a/collectors/nfacct.plugin/plugin_nfacct.c b/collectors/nfacct.plugin/plugin_nfacct.c index 996070f1c..acdd0586d 100644 --- a/collectors/nfacct.plugin/plugin_nfacct.c +++ b/collectors/nfacct.plugin/plugin_nfacct.c @@ -1,6 +1,9 @@ // SPDX-License-Identifier: GPL-3.0-or-later #include "../../libnetdata/libnetdata.h" +#include +#include +#include #define PLUGIN_NFACCT_NAME "nfacct.plugin" @@ -13,9 +16,6 @@ #define NETDATA_CHART_PRIO_NETFILTER_PACKETS 8906 #define NETDATA_CHART_PRIO_NETFILTER_BYTES 8907 -#ifdef HAVE_LIBMNL -#include - static inline size_t mnl_buffer_size() { long s = MNL_SOCKET_BUFFER_SIZE; if(s <= 0) return 8192; @@ -50,25 +50,13 @@ int health_variable_lookup(const char *variable, uint32_t hash, struct rrdcalc * // required by get_system_cpus() char *netdata_configured_host_prefix = ""; - -// Variables - +// variables static int debug = 0; - static int netdata_update_every = 1; -// ---------------------------------------------------------------------------- -// DO_NFSTAT - collect netfilter connection tracker statistics via netlink -// example: https://github.com/formorer/pkg-conntrack-tools/blob/master/src/conntrack.c - -#ifdef HAVE_LINUX_NETFILTER_NFNETLINK_CONNTRACK_H -#define DO_NFSTAT 1 - #define RRD_TYPE_NET_STAT_NETFILTER "netfilter" #define RRD_TYPE_NET_STAT_CONNTRACK "netlink" -#include - static struct { int update_every; char *buf; @@ -530,16 +518,6 @@ static void nfstat_send_metrics() { printf("END\n"); } -#endif // HAVE_LINUX_NETFILTER_NFNETLINK_CONNTRACK_H - - -// ---------------------------------------------------------------------------- -// DO_NFACCT - collect netfilter accounting statistics via netlink - -#ifdef HAVE_LIBNETFILTER_ACCT -#define DO_NFACCT 1 - -#include struct nfacct_data { char *name; @@ -760,8 +738,6 @@ static void nfacct_send_metrics() { printf("END\n"); } -#endif // HAVE_LIBNETFILTER_ACCT - static void nfacct_signal_handler(int signo) { exit((signo == SIGPIPE)?1:0); @@ -866,15 +842,13 @@ int main(int argc, char **argv) { else if(freq) error("update frequency %d seconds is too small for NFACCT. Using %d.", freq, netdata_update_every); -#ifdef DO_NFACCT - if(debug) fprintf(stderr, "nfacct.plugin: calling nfacct_init()\n"); + if (debug) + fprintf(stderr, "nfacct.plugin: calling nfacct_init()\n"); int nfacct = !nfacct_init(netdata_update_every); -#endif -#ifdef DO_NFSTAT - if(debug) fprintf(stderr, "nfacct.plugin: calling nfstat_init()\n"); + if (debug) + fprintf(stderr, "nfacct.plugin: calling nfstat_init()\n"); int nfstat = !nfstat_init(netdata_update_every); -#endif // ------------------------------------------------------------------------ // the main loop @@ -899,7 +873,6 @@ int main(int argc, char **argv) { , dt ); -#ifdef DO_NFACCT if(likely(nfacct)) { if(debug) fprintf(stderr, "nfacct.plugin: calling nfacct_collect()\n"); nfacct = !nfacct_collect(); @@ -909,9 +882,7 @@ int main(int argc, char **argv) { nfacct_send_metrics(); } } -#endif -#ifdef DO_NFSTAT if(likely(nfstat)) { if(debug) fprintf(stderr, "nfacct.plugin: calling nfstat_collect()\n"); nfstat = !nfstat_collect(); @@ -921,7 +892,6 @@ int main(int argc, char **argv) { nfstat_send_metrics(); } } -#endif fflush(stdout); @@ -931,14 +901,3 @@ int main(int argc, char **argv) { info("NFACCT process exiting"); } - -#else // !HAVE_LIBMNL - -int main(int argc, char **argv) { - (void)argc; - (void)argv; - - fatal("nfacct.plugin is not compiled."); -} - -#endif // !HAVE_LIBMNL diff --git a/collectors/proc.plugin/proc_diskstats.c b/collectors/proc.plugin/proc_diskstats.c index eee0cbe7f..b5d02f329 100644 --- a/collectors/proc.plugin/proc_diskstats.c +++ b/collectors/proc.plugin/proc_diskstats.c @@ -17,6 +17,7 @@ static struct disk { char *disk; // the name of the disk (sda, sdb, etc, after being looked up) char *device; // the device of the disk (before being looked up) + uint32_t hash; unsigned long major; unsigned long minor; int sector_size; @@ -73,6 +74,9 @@ static struct disk { RRDSET *st_backlog; RRDDIM *rd_backlog_backlog; + RRDSET *st_busy; + RRDDIM *rd_busy_busy; + RRDSET *st_util; RRDDIM *rd_util_utilization; @@ -390,7 +394,7 @@ static inline int get_disk_name_from_path(const char *path, char *result, size_t continue; } - if(major(sb.st_rdev) != major || minor(sb.st_rdev) != minor) { + if(major(sb.st_rdev) != major || minor(sb.st_rdev) != minor || strcmp(basename(filename), disk)) { //info("DEVICE-MAPPER ('%s', %lu:%lu): filename '%s' does not match %lu:%lu.", disk, major, minor, filename, (unsigned long)major(sb.st_rdev), (unsigned long)minor(sb.st_rdev)); continue; } @@ -544,13 +548,17 @@ static struct disk *get_disk(unsigned long major, unsigned long minor, char *dis struct disk *d; + uint32_t hash = simple_hash(disk); + // search for it in our RAM list. // this is sequential, but since we just walk through // and the number of disks / partitions in a system // should not be that many, it should be acceptable - for(d = disk_root; d ; d = d->next) - if(unlikely(d->major == major && d->minor == minor)) + for(d = disk_root; d ; d = d->next){ + if (unlikely( + d->major == major && d->minor == minor && d->hash == hash && !strcmp(d->device, disk))) return d; + } // not found // create a new disk structure @@ -558,6 +566,7 @@ static struct disk *get_disk(unsigned long major, unsigned long minor, char *dis d->disk = get_disk_name(major, minor, disk); d->device = strdupz(disk); + d->hash = simple_hash(d->device); d->major = major; d->minor = minor; d->type = DISK_TYPE_UNKNOWN; // Default type. Changed later if not correct. @@ -624,12 +633,12 @@ static struct disk *get_disk(unsigned long major, unsigned long minor, char *dis // check if we can find its mount point // mountinfo_find() can be called with NULL disk_mountinfo_root - struct mountinfo *mi = mountinfo_find(disk_mountinfo_root, d->major, d->minor); + struct mountinfo *mi = mountinfo_find(disk_mountinfo_root, d->major, d->minor, d->device); if(unlikely(!mi)) { // mountinfo_free_all can be called with NULL mountinfo_free_all(disk_mountinfo_root); disk_mountinfo_root = mountinfo_read(0); - mi = mountinfo_find(disk_mountinfo_root, d->major, d->minor); + mi = mountinfo_find(disk_mountinfo_root, d->major, d->minor, d->device); } if(unlikely(mi)) @@ -942,13 +951,6 @@ int do_proc_diskstats(int update_every, usec_t dt) { // I/O completion time and the backlog that may be accumulating. backlog_ms = str2ull(procfile_lineword(ff, l, 13)); // rq_ticks - - // -------------------------------------------------------------------------- - // remove slashes from disk names - char *s; - for(s = disk; *s ;s++) - if(*s == '/') *s = '_'; - // -------------------------------------------------------------------------- // get a disk structure for the disk @@ -1094,7 +1096,7 @@ int do_proc_diskstats(int update_every, usec_t dt) { rrdset_flag_set(d->st_backlog, RRDSET_FLAG_DETAIL); - d->rd_backlog_backlog = rrddim_add(d->st_backlog, "backlog", NULL, 1, 10, RRD_ALGORITHM_INCREMENTAL); + d->rd_backlog_backlog = rrddim_add(d->st_backlog, "backlog", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); } else rrdset_next(d->st_backlog); @@ -1108,6 +1110,34 @@ int do_proc_diskstats(int update_every, usec_t dt) { (busy_ms || netdata_zero_metrics_enabled == CONFIG_BOOLEAN_YES))) { d->do_util = CONFIG_BOOLEAN_YES; + if(unlikely(!d->st_busy)) { + d->st_busy = rrdset_create_localhost( + "disk_busy" + , d->device + , d->disk + , family + , "disk.busy" + , "Disk Busy Time" + , "milliseconds" + , PLUGIN_PROC_NAME + , PLUGIN_PROC_MODULE_DISKSTATS_NAME + , NETDATA_CHART_PRIO_DISK_BUSY + , update_every + , RRDSET_TYPE_AREA + ); + + rrdset_flag_set(d->st_busy, RRDSET_FLAG_DETAIL); + + d->rd_busy_busy = + rrddim_add(d->st_busy, "busy", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); + } + else rrdset_next(d->st_busy); + + last_busy_ms = rrddim_set_by_pointer(d->st_busy, d->rd_busy_busy, busy_ms); + rrdset_done(d->st_busy); + + // -------------------------------------------------------------------- + if(unlikely(!d->st_util)) { d->st_util = rrdset_create_localhost( "disk_util" @@ -1126,11 +1156,15 @@ int do_proc_diskstats(int update_every, usec_t dt) { rrdset_flag_set(d->st_util, RRDSET_FLAG_DETAIL); - d->rd_util_utilization = rrddim_add(d->st_util, "utilization", NULL, 1, 10, RRD_ALGORITHM_INCREMENTAL); + d->rd_util_utilization = rrddim_add(d->st_util, "utilization", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); } else rrdset_next(d->st_util); - last_busy_ms = rrddim_set_by_pointer(d->st_util, d->rd_util_utilization, busy_ms); + collected_number disk_utilization = (busy_ms - last_busy_ms) / (10 * update_every); + if (disk_utilization > 100) + disk_utilization = 100; + + rrddim_set_by_pointer(d->st_util, d->rd_util_utilization, disk_utilization); rrdset_done(d->st_util); } diff --git a/collectors/proc.plugin/proc_net_dev.c b/collectors/proc.plugin/proc_net_dev.c index a7db37e61..24715f296 100644 --- a/collectors/proc.plugin/proc_net_dev.c +++ b/collectors/proc.plugin/proc_net_dev.c @@ -5,6 +5,8 @@ #define PLUGIN_PROC_MODULE_NETDEV_NAME "/proc/net/dev" #define CONFIG_SECTION_PLUGIN_PROC_NETDEV "plugin:" PLUGIN_PROC_CONFIG_NAME ":" PLUGIN_PROC_MODULE_NETDEV_NAME +#define STATE_LENGTH_MAX 32 + // As defined in https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-class-net const char *operstate_names[] = { "unknown", "notpresent", "down", "lowerlayerdown", "testing", "dormant", "up" }; @@ -41,6 +43,11 @@ static struct netdev { int do_fifo; int do_compressed; int do_events; + int do_speed; + int do_duplex; + int do_operstate; + int do_carrier; + int do_mtu; const char *chart_type_net_bytes; const char *chart_type_net_packets; @@ -49,6 +56,11 @@ static struct netdev { const char *chart_type_net_events; const char *chart_type_net_drops; const char *chart_type_net_compressed; + const char *chart_type_net_speed; + const char *chart_type_net_duplex; + const char *chart_type_net_operstate; + const char *chart_type_net_carrier; + const char *chart_type_net_mtu; const char *chart_id_net_bytes; const char *chart_id_net_packets; @@ -57,6 +69,11 @@ static struct netdev { const char *chart_id_net_events; const char *chart_id_net_drops; const char *chart_id_net_compressed; + const char *chart_id_net_speed; + const char *chart_id_net_duplex; + const char *chart_id_net_operstate; + const char *chart_id_net_carrier; + const char *chart_id_net_mtu; const char *chart_family; @@ -86,6 +103,8 @@ static struct netdev { kernel_uint_t speed; kernel_uint_t duplex; kernel_uint_t operstate; + unsigned long long carrier; + unsigned long long mtu; // charts RRDSET *st_bandwidth; @@ -95,6 +114,11 @@ static struct netdev { RRDSET *st_fifo; RRDSET *st_compressed; RRDSET *st_events; + RRDSET *st_speed; + RRDSET *st_duplex; + RRDSET *st_operstate; + RRDSET *st_carrier; + RRDSET *st_mtu; // dimensions RRDDIM *rd_rbytes; @@ -115,18 +139,19 @@ static struct netdev { RRDDIM *rd_tcarrier; RRDDIM *rd_tcompressed; - usec_t speed_last_collected_usec; - usec_t duplex_last_collected_usec; - usec_t operstate_last_collected_usec; + RRDDIM *rd_speed; + RRDDIM *rd_duplex; + RRDDIM *rd_operstate; + RRDDIM *rd_carrier; + RRDDIM *rd_mtu; char *filename_speed; RRDSETVAR *chart_var_speed; char *filename_duplex; - RRDSETVAR *chart_var_duplex; - char *filename_operstate; - RRDSETVAR *chart_var_operstate; + char *filename_carrier; + char *filename_mtu; struct netdev *next; } *netdev_root = NULL, *netdev_last_used = NULL; @@ -143,6 +168,11 @@ static void netdev_charts_release(struct netdev *d) { if(d->st_fifo) rrdset_is_obsolete(d->st_fifo); if(d->st_compressed) rrdset_is_obsolete(d->st_compressed); if(d->st_events) rrdset_is_obsolete(d->st_events); + if(d->st_speed) rrdset_is_obsolete(d->st_speed); + if(d->st_duplex) rrdset_is_obsolete(d->st_duplex); + if(d->st_operstate) rrdset_is_obsolete(d->st_operstate); + if(d->st_carrier) rrdset_is_obsolete(d->st_carrier); + if(d->st_mtu) rrdset_is_obsolete(d->st_mtu); d->st_bandwidth = NULL; d->st_compressed = NULL; @@ -151,6 +181,11 @@ static void netdev_charts_release(struct netdev *d) { d->st_events = NULL; d->st_fifo = NULL; d->st_packets = NULL; + d->st_speed = NULL; + d->st_duplex = NULL; + d->st_operstate = NULL; + d->st_carrier = NULL; + d->st_mtu = NULL; d->rd_rbytes = NULL; d->rd_rpackets = NULL; @@ -170,9 +205,13 @@ static void netdev_charts_release(struct netdev *d) { d->rd_tcarrier = NULL; d->rd_tcompressed = NULL; + d->rd_speed = NULL; + d->rd_duplex = NULL; + d->rd_operstate = NULL; + d->rd_carrier = NULL; + d->rd_mtu = NULL; + d->chart_var_speed = NULL; - d->chart_var_duplex = NULL; - d->chart_var_operstate = NULL; } static void netdev_free_chart_strings(struct netdev *d) { @@ -183,6 +222,11 @@ static void netdev_free_chart_strings(struct netdev *d) { freez((void *)d->chart_type_net_events); freez((void *)d->chart_type_net_fifo); freez((void *)d->chart_type_net_packets); + freez((void *)d->chart_type_net_speed); + freez((void *)d->chart_type_net_duplex); + freez((void *)d->chart_type_net_operstate); + freez((void *)d->chart_type_net_carrier); + freez((void *)d->chart_type_net_mtu); freez((void *)d->chart_id_net_bytes); freez((void *)d->chart_id_net_compressed); @@ -191,6 +235,11 @@ static void netdev_free_chart_strings(struct netdev *d) { freez((void *)d->chart_id_net_events); freez((void *)d->chart_id_net_fifo); freez((void *)d->chart_id_net_packets); + freez((void *)d->chart_id_net_speed); + freez((void *)d->chart_id_net_duplex); + freez((void *)d->chart_id_net_operstate); + freez((void *)d->chart_id_net_carrier); + freez((void *)d->chart_id_net_mtu); freez((void *)d->chart_family); } @@ -204,6 +253,8 @@ static void netdev_free(struct netdev *d) { freez((void *)d->filename_speed); freez((void *)d->filename_duplex); freez((void *)d->filename_operstate); + freez((void *)d->filename_carrier); + freez((void *)d->filename_mtu); freez((void *)d); netdev_added--; } @@ -325,6 +376,11 @@ static inline void netdev_rename_cgroup(struct netdev *d, struct netdev_rename * d->chart_type_net_events = strdupz(buffer); d->chart_type_net_fifo = strdupz(buffer); d->chart_type_net_packets = strdupz(buffer); + d->chart_type_net_speed = strdupz(buffer); + d->chart_type_net_duplex = strdupz(buffer); + d->chart_type_net_operstate = strdupz(buffer); + d->chart_type_net_carrier = strdupz(buffer); + d->chart_type_net_mtu = strdupz(buffer); snprintfz(buffer, RRD_ID_LENGTH_MAX, "net_%s", r->container_device); d->chart_id_net_bytes = strdupz(buffer); @@ -347,6 +403,21 @@ static inline void netdev_rename_cgroup(struct netdev *d, struct netdev_rename * snprintfz(buffer, RRD_ID_LENGTH_MAX, "net_packets_%s", r->container_device); d->chart_id_net_packets = strdupz(buffer); + snprintfz(buffer, RRD_ID_LENGTH_MAX, "net_speed_%s", r->container_device); + d->chart_id_net_speed = strdupz(buffer); + + snprintfz(buffer, RRD_ID_LENGTH_MAX, "net_duplex_%s", r->container_device); + d->chart_id_net_duplex = strdupz(buffer); + + snprintfz(buffer, RRD_ID_LENGTH_MAX, "net_operstate_%s", r->container_device); + d->chart_id_net_operstate = strdupz(buffer); + + snprintfz(buffer, RRD_ID_LENGTH_MAX, "net_carrier_%s", r->container_device); + d->chart_id_net_carrier = strdupz(buffer); + + snprintfz(buffer, RRD_ID_LENGTH_MAX, "net_mtu_%s", r->container_device); + d->chart_id_net_mtu = strdupz(buffer); + snprintfz(buffer, RRD_ID_LENGTH_MAX, "net %s", r->container_device); d->chart_family = strdupz(buffer); @@ -451,6 +522,11 @@ static struct netdev *get_netdev(const char *name) { d->chart_type_net_events = strdupz("net_events"); d->chart_type_net_fifo = strdupz("net_fifo"); d->chart_type_net_packets = strdupz("net_packets"); + d->chart_type_net_speed = strdupz("net_speed"); + d->chart_type_net_duplex = strdupz("net_duplex"); + d->chart_type_net_operstate = strdupz("net_operstate"); + d->chart_type_net_carrier = strdupz("net_carrier"); + d->chart_type_net_mtu = strdupz("net_mtu"); d->chart_id_net_bytes = strdupz(d->name); d->chart_id_net_compressed = strdupz(d->name); @@ -459,6 +535,11 @@ static struct netdev *get_netdev(const char *name) { d->chart_id_net_events = strdupz(d->name); d->chart_id_net_fifo = strdupz(d->name); d->chart_id_net_packets = strdupz(d->name); + d->chart_id_net_speed = strdupz(d->name); + d->chart_id_net_duplex = strdupz(d->name); + d->chart_id_net_operstate = strdupz(d->name); + d->chart_id_net_carrier = strdupz(d->name); + d->chart_id_net_mtu = strdupz(d->name); d->chart_family = strdupz(d->name); d->priority = NETDATA_CHART_PRIO_FIRST_NET_IFACE; @@ -485,14 +566,13 @@ int do_proc_net_dev(int update_every, usec_t dt) { static procfile *ff = NULL; static int enable_new_interfaces = -1; static int do_bandwidth = -1, do_packets = -1, do_errors = -1, do_drops = -1, do_fifo = -1, do_compressed = -1, - do_events = -1; + do_events = -1, do_speed = -1, do_duplex = -1, do_operstate = -1, do_carrier = -1, do_mtu = -1; static char *path_to_sys_devices_virtual_net = NULL, *path_to_sys_class_net_speed = NULL, *proc_net_dev_filename = NULL; static char *path_to_sys_class_net_duplex = NULL; static char *path_to_sys_class_net_operstate = NULL; - static long long int dt_to_refresh_speed = 0; - static long long int dt_to_refresh_duplex = 0; - static long long int dt_to_refresh_operstate = 0; + static char *path_to_sys_class_net_carrier = NULL; + static char *path_to_sys_class_net_mtu = NULL; if(unlikely(enable_new_interfaces == -1)) { char filename[FILENAME_MAX + 1]; @@ -512,6 +592,12 @@ int do_proc_net_dev(int update_every, usec_t dt) { snprintfz(filename, FILENAME_MAX, "%s%s", netdata_configured_host_prefix, "/sys/class/net/%s/operstate"); path_to_sys_class_net_operstate = config_get(CONFIG_SECTION_PLUGIN_PROC_NETDEV, "path to get net device operstate", filename); + snprintfz(filename, FILENAME_MAX, "%s%s", netdata_configured_host_prefix, "/sys/class/net/%s/carrier"); + path_to_sys_class_net_carrier = config_get(CONFIG_SECTION_PLUGIN_PROC_NETDEV, "path to get net device carrier", filename); + + snprintfz(filename, FILENAME_MAX, "%s%s", netdata_configured_host_prefix, "/sys/class/net/%s/mtu"); + path_to_sys_class_net_mtu = config_get(CONFIG_SECTION_PLUGIN_PROC_NETDEV, "path to get net device mtu", filename); + enable_new_interfaces = config_get_boolean_ondemand(CONFIG_SECTION_PLUGIN_PROC_NETDEV, "enable new interfaces detected at runtime", CONFIG_BOOLEAN_AUTO); @@ -522,19 +608,13 @@ int do_proc_net_dev(int update_every, usec_t dt) { do_fifo = config_get_boolean_ondemand(CONFIG_SECTION_PLUGIN_PROC_NETDEV, "fifo for all interfaces", CONFIG_BOOLEAN_AUTO); do_compressed = config_get_boolean_ondemand(CONFIG_SECTION_PLUGIN_PROC_NETDEV, "compressed packets for all interfaces", CONFIG_BOOLEAN_AUTO); do_events = config_get_boolean_ondemand(CONFIG_SECTION_PLUGIN_PROC_NETDEV, "frames, collisions, carrier counters for all interfaces", CONFIG_BOOLEAN_AUTO); + do_speed = config_get_boolean_ondemand(CONFIG_SECTION_PLUGIN_PROC_NETDEV, "speed for all interfaces", CONFIG_BOOLEAN_AUTO); + do_duplex = config_get_boolean_ondemand(CONFIG_SECTION_PLUGIN_PROC_NETDEV, "duplex for all interfaces", CONFIG_BOOLEAN_AUTO); + do_operstate = config_get_boolean_ondemand(CONFIG_SECTION_PLUGIN_PROC_NETDEV, "operstate for all interfaces", CONFIG_BOOLEAN_AUTO); + do_carrier = config_get_boolean_ondemand(CONFIG_SECTION_PLUGIN_PROC_NETDEV, "carrier for all interfaces", CONFIG_BOOLEAN_AUTO); + do_mtu = config_get_boolean_ondemand(CONFIG_SECTION_PLUGIN_PROC_NETDEV, "mtu for all interfaces", CONFIG_BOOLEAN_AUTO); disabled_list = simple_pattern_create(config_get(CONFIG_SECTION_PLUGIN_PROC_NETDEV, "disable by default interfaces matching", "lo fireqos* *-ifb"), NULL, SIMPLE_PATTERN_EXACT); - - dt_to_refresh_speed = config_get_number(CONFIG_SECTION_PLUGIN_PROC_NETDEV, "refresh interface speed every seconds", 10) * USEC_PER_SEC; - dt_to_refresh_duplex = config_get_number(CONFIG_SECTION_PLUGIN_PROC_NETDEV, "refresh interface duplex every seconds", 10) * USEC_PER_SEC; - dt_to_refresh_operstate = config_get_number(CONFIG_SECTION_PLUGIN_PROC_NETDEV, "refresh interface operstate every seconds", 10) * USEC_PER_SEC; - - if (dt_to_refresh_operstate < 0) - dt_to_refresh_operstate = 0; - if (dt_to_refresh_duplex < 0) - dt_to_refresh_duplex = 0; - if (dt_to_refresh_speed < 0) - dt_to_refresh_speed = 0; } if(unlikely(!ff)) { @@ -595,9 +675,16 @@ int do_proc_net_dev(int update_every, usec_t dt) { snprintfz(buffer, FILENAME_MAX, path_to_sys_class_net_duplex, d->name); d->filename_duplex = strdupz(buffer); } + snprintfz(buffer, FILENAME_MAX, path_to_sys_class_net_operstate, d->name); d->filename_operstate = strdupz(buffer); + snprintfz(buffer, FILENAME_MAX, path_to_sys_class_net_carrier, d->name); + d->filename_carrier = strdupz(buffer); + + snprintfz(buffer, FILENAME_MAX, path_to_sys_class_net_mtu, d->name); + d->filename_mtu = strdupz(buffer); + snprintfz(buffer, FILENAME_MAX, "plugin:proc:/proc/net/dev:%s", d->name); d->enabled = config_get_boolean_ondemand(buffer, "enabled", d->enabled); d->virtual = config_get_boolean(buffer, "virtual", d->virtual); @@ -612,6 +699,11 @@ int do_proc_net_dev(int update_every, usec_t dt) { d->do_fifo = config_get_boolean_ondemand(buffer, "fifo", do_fifo); d->do_compressed = config_get_boolean_ondemand(buffer, "compressed", do_compressed); d->do_events = config_get_boolean_ondemand(buffer, "events", do_events); + d->do_speed = config_get_boolean_ondemand(buffer, "speed", do_speed); + d->do_duplex = config_get_boolean_ondemand(buffer, "duplex", do_duplex); + d->do_operstate = config_get_boolean_ondemand(buffer, "operstate", do_operstate); + d->do_carrier = config_get_boolean_ondemand(buffer, "carrier", do_carrier); + d->do_mtu = config_get_boolean_ondemand(buffer, "mtu", do_mtu); } if(unlikely(!d->enabled)) @@ -659,6 +751,55 @@ int do_proc_net_dev(int update_every, usec_t dt) { d->tcarrier = str2kernel_uint_t(procfile_lineword(ff, l, 15)); } + if (d->do_duplex != CONFIG_BOOLEAN_NO && d->filename_duplex) { + char buffer[STATE_LENGTH_MAX + 1]; + + if (read_file(d->filename_duplex, buffer, STATE_LENGTH_MAX)) { + error("Cannot refresh interface %s duplex state by reading '%s'. I will stop updating it.", d->name, d->filename_duplex); + freez(d->filename_duplex); + d->filename_duplex = NULL; + } else { + // values can be unknown, half or full -- just check the first letter for speed + if (buffer[0] == 'f') + d->duplex = 2; + else if (buffer[0] == 'h') + d->duplex = 1; + else + d->duplex = 0; + } + } + + if(d->do_operstate != CONFIG_BOOLEAN_NO && d->filename_operstate) { + char buffer[STATE_LENGTH_MAX + 1], *trimmed_buffer; + + if (read_file(d->filename_operstate, buffer, STATE_LENGTH_MAX)) { + error( + "Cannot refresh %s operstate by reading '%s'. Will not update its status anymore.", + d->name, d->filename_operstate); + freez(d->filename_operstate); + d->filename_operstate = NULL; + } else { + trimmed_buffer = trim(buffer); + d->operstate = get_operstate(trimmed_buffer); + } + } + + if (d->do_carrier != CONFIG_BOOLEAN_NO && d->filename_carrier) { + if (read_single_number_file(d->filename_carrier, &d->carrier)) { + error("Cannot refresh interface %s carrier state by reading '%s'. Stop updating it.", d->name, d->filename_carrier); + freez(d->filename_carrier); + d->filename_carrier = NULL; + } + } + + if (d->do_mtu != CONFIG_BOOLEAN_NO && d->filename_mtu) { + if (read_single_number_file(d->filename_mtu, &d->mtu)) { + error("Cannot refresh mtu for interface %s by reading '%s'. Stop updating it.", d->name, d->filename_carrier); + freez(d->filename_carrier); + d->filename_carrier = NULL; + } + } + //info("PROC_NET_DEV: %s speed %zu, bytes %zu/%zu, packets %zu/%zu/%zu, errors %zu/%zu, drops %zu/%zu, fifo %zu/%zu, compressed %zu/%zu, rframe %zu, tcollisions %zu, tcarrier %zu" // , d->name, d->speed // , d->rbytes, d->tbytes @@ -715,102 +856,179 @@ int do_proc_net_dev(int update_every, usec_t dt) { // update the interface speed if(d->filename_speed) { - d->speed_last_collected_usec += dt; - - if(unlikely(d->speed_last_collected_usec >= (usec_t)dt_to_refresh_speed)) { - - if(unlikely(!d->chart_var_speed)) { - d->chart_var_speed = rrdsetvar_custom_chart_variable_create(d->st_bandwidth, "nic_speed_max"); - if(!d->chart_var_speed) { - error("Cannot create interface %s chart variable 'nic_speed_max'. Will not update its speed anymore.", d->name); - freez(d->filename_speed); - d->filename_speed = NULL; - } + if(unlikely(!d->chart_var_speed)) { + d->chart_var_speed = rrdsetvar_custom_chart_variable_create(d->st_bandwidth, "nic_speed_max"); + if(!d->chart_var_speed) { + error("Cannot create interface %s chart variable 'nic_speed_max'. Will not update its speed anymore.", d->name); + freez(d->filename_speed); + d->filename_speed = NULL; } + } - if(d->filename_speed && d->chart_var_speed) { - if(read_single_number_file(d->filename_speed, (unsigned long long *) &d->speed)) { - error("Cannot refresh interface %s speed by reading '%s'. Will not update its speed anymore.", d->name, d->filename_speed); - freez(d->filename_speed); - d->filename_speed = NULL; - } - else { - rrdsetvar_custom_chart_variable_set(d->chart_var_speed, (calculated_number) d->speed); - d->speed_last_collected_usec = 0; + if(d->filename_speed && d->chart_var_speed) { + if(read_single_number_file(d->filename_speed, (unsigned long long *) &d->speed)) { + error("Cannot refresh interface %s speed by reading '%s'. Will not update its speed anymore.", d->name, d->filename_speed); + freez(d->filename_speed); + d->filename_speed = NULL; + } + else { + rrdsetvar_custom_chart_variable_set(d->chart_var_speed, (calculated_number) d->speed * KILOBITS_IN_A_MEGABIT); + + if(d->do_speed != CONFIG_BOOLEAN_NO) { + if(unlikely(!d->st_speed)) { + d->st_speed = rrdset_create_localhost( + d->chart_type_net_speed + , d->chart_id_net_speed + , NULL + , d->chart_family + , "net.speed" + , "Interface Speed" + , "kilobits/s" + , PLUGIN_PROC_NAME + , PLUGIN_PROC_MODULE_NETDEV_NAME + , d->priority + 7 + , update_every + , RRDSET_TYPE_LINE + ); + + rrdset_flag_set(d->st_speed, RRDSET_FLAG_DETAIL); + + rrdset_update_labels(d->st_speed, d->chart_labels); + + d->rd_speed = rrddim_add(d->st_speed, "speed", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + } + else rrdset_next(d->st_speed); + + rrddim_set_by_pointer(d->st_speed, d->rd_speed, (collected_number)d->speed * KILOBITS_IN_A_MEGABIT); + rrdset_done(d->st_speed); } } } } + } - if (d->filename_duplex) { - d->duplex_last_collected_usec += dt; + // -------------------------------------------------------------------- - if (unlikely(d->duplex_last_collected_usec >= (usec_t)dt_to_refresh_duplex)) { - if (unlikely(!d->chart_var_duplex)) { - d->chart_var_duplex = rrdsetvar_custom_chart_variable_create(d->st_bandwidth, "duplex"); - if (!d->chart_var_duplex) { - error("Cannot create interface %s chart variable 'duplex'. Will not update the duplex status anymore.", d->name); - freez(d->filename_duplex); - d->filename_duplex = NULL; - } - } + if(d->do_duplex != CONFIG_BOOLEAN_NO && d->filename_duplex) { + if(unlikely(!d->st_duplex)) { + d->st_duplex = rrdset_create_localhost( + d->chart_type_net_duplex + , d->chart_id_net_duplex + , NULL + , d->chart_family + , "net.duplex" + , "Interface Duplex State" + , "state" + , PLUGIN_PROC_NAME + , PLUGIN_PROC_MODULE_NETDEV_NAME + , d->priority + 8 + , update_every + , RRDSET_TYPE_LINE + ); - if (d->filename_duplex && d->chart_var_duplex) { - char buffer[32 + 1]; - - if (read_file(d->filename_duplex, buffer, 32)) { - error("Cannot refresh interface %s duplex state by reading '%s'. I will stop updating it.", d->name, d->filename_duplex); - freez(d->filename_duplex); - d->filename_duplex = NULL; - } else { - // values can be unknown, half or full -- just check the first letter for speed - if (buffer[0] == 'f') - d->duplex = 2; - else if (buffer[0] == 'h') - d->duplex = 1; - else - d->duplex = 0; - - rrdsetvar_custom_chart_variable_set(d->chart_var_duplex, (calculated_number)d->duplex); - d->duplex_last_collected_usec = 0; - } - } - } + rrdset_flag_set(d->st_duplex, RRDSET_FLAG_DETAIL); + + rrdset_update_labels(d->st_duplex, d->chart_labels); + + d->rd_duplex = rrddim_add(d->st_duplex, "duplex", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); } + else rrdset_next(d->st_duplex); - if (d->filename_operstate) { - d->operstate_last_collected_usec += dt; - - if (unlikely(d->operstate_last_collected_usec >= (usec_t)dt_to_refresh_operstate)) { - if (unlikely(!d->chart_var_operstate)) { - d->chart_var_operstate = rrdsetvar_custom_chart_variable_create(d->st_bandwidth, "operstate"); - if (!d->chart_var_operstate) { - error( - "Cannot create interface %s chart variable 'operstate'. I will stop updating it.", - d->name); - freez(d->filename_operstate); - d->filename_operstate = NULL; - } - } + rrddim_set_by_pointer(d->st_duplex, d->rd_duplex, (collected_number)d->duplex); + rrdset_done(d->st_duplex); + } - if (d->filename_operstate && d->chart_var_operstate) { - char buffer[32 + 1], *trimmed_buffer; - - if (read_file(d->filename_operstate, buffer, 32)) { - error( - "Cannot refresh %s operstate by reading '%s'. Will not update its status anymore.", - d->name, d->filename_operstate); - freez(d->filename_operstate); - d->filename_operstate = NULL; - } else { - trimmed_buffer = trim(buffer); - d->operstate = get_operstate(trimmed_buffer); - rrdsetvar_custom_chart_variable_set(d->chart_var_operstate, (calculated_number)d->operstate); - d->operstate_last_collected_usec = 0; - } - } - } + // -------------------------------------------------------------------- + + if(d->do_operstate != CONFIG_BOOLEAN_NO && d->filename_operstate) { + if(unlikely(!d->st_operstate)) { + d->st_operstate = rrdset_create_localhost( + d->chart_type_net_operstate + , d->chart_id_net_operstate + , NULL + , d->chart_family + , "net.operstate" + , "Interface Operational State" + , "state" + , PLUGIN_PROC_NAME + , PLUGIN_PROC_MODULE_NETDEV_NAME + , d->priority + 9 + , update_every + , RRDSET_TYPE_LINE + ); + + rrdset_flag_set(d->st_operstate, RRDSET_FLAG_DETAIL); + + rrdset_update_labels(d->st_operstate, d->chart_labels); + + d->rd_operstate = rrddim_add(d->st_operstate, "state", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + } + else rrdset_next(d->st_operstate); + + rrddim_set_by_pointer(d->st_operstate, d->rd_operstate, (collected_number)d->operstate); + rrdset_done(d->st_operstate); + } + + // -------------------------------------------------------------------- + + if(d->do_carrier != CONFIG_BOOLEAN_NO && d->filename_carrier) { + if(unlikely(!d->st_carrier)) { + d->st_carrier = rrdset_create_localhost( + d->chart_type_net_carrier + , d->chart_id_net_carrier + , NULL + , d->chart_family + , "net.carrier" + , "Inteface Physical Link State" + , "state" + , PLUGIN_PROC_NAME + , PLUGIN_PROC_MODULE_NETDEV_NAME + , d->priority + 10 + , update_every + , RRDSET_TYPE_LINE + ); + + rrdset_flag_set(d->st_carrier, RRDSET_FLAG_DETAIL); + + rrdset_update_labels(d->st_carrier, d->chart_labels); + + d->rd_carrier = rrddim_add(d->st_carrier, "carrier", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); } + else rrdset_next(d->st_carrier); + + rrddim_set_by_pointer(d->st_carrier, d->rd_carrier, (collected_number)d->carrier); + rrdset_done(d->st_carrier); + } + + // -------------------------------------------------------------------- + + if(d->do_mtu != CONFIG_BOOLEAN_NO && d->filename_mtu) { + if(unlikely(!d->st_mtu)) { + d->st_mtu = rrdset_create_localhost( + d->chart_type_net_mtu + , d->chart_id_net_mtu + , NULL + , d->chart_family + , "net.mtu" + , "Interface MTU" + , "octets" + , PLUGIN_PROC_NAME + , PLUGIN_PROC_MODULE_NETDEV_NAME + , d->priority + 11 + , update_every + , RRDSET_TYPE_LINE + ); + + rrdset_flag_set(d->st_mtu, RRDSET_FLAG_DETAIL); + + rrdset_update_labels(d->st_mtu, d->chart_labels); + + d->rd_mtu = rrddim_add(d->st_mtu, "mtu", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + } + else rrdset_next(d->st_mtu); + + rrddim_set_by_pointer(d->st_mtu, d->rd_mtu, (collected_number)d->mtu); + rrdset_done(d->st_mtu); } // -------------------------------------------------------------------- diff --git a/collectors/proc.plugin/proc_self_mountinfo.c b/collectors/proc.plugin/proc_self_mountinfo.c index 3f17ccce2..ca00f8a89 100644 --- a/collectors/proc.plugin/proc_self_mountinfo.c +++ b/collectors/proc.plugin/proc_self_mountinfo.c @@ -47,11 +47,17 @@ // find the mount info with the given major:minor // in the supplied linked list of mountinfo structures -struct mountinfo *mountinfo_find(struct mountinfo *root, unsigned long major, unsigned long minor) { +struct mountinfo *mountinfo_find(struct mountinfo *root, unsigned long major, unsigned long minor, char *device) { struct mountinfo *mi; + uint32_t hash = simple_hash(device); + for(mi = root; mi ; mi = mi->next) - if(unlikely(mi->major == major && mi->minor == minor)) + if (unlikely( + mi->major == major && + mi->minor == minor && + mi->mount_source_name_hash == hash && + !strcmp(mi->mount_source_name, device))) return mi; return NULL; @@ -120,6 +126,7 @@ static void mountinfo_free(struct mountinfo *mi) { */ freez(mi->filesystem); freez(mi->mount_source); + freez(mi->mount_source_name); freez(mi->super_options); freez(mi); } @@ -273,6 +280,9 @@ struct mountinfo *mountinfo_read(int do_statvfs) { mi->mount_source = strdupz_decoding_octal(procfile_lineword(ff, l, w)); w++; mi->mount_source_hash = simple_hash(mi->mount_source); + mi->mount_source_name = strdupz(basename(mi->mount_source)); + mi->mount_source_name_hash = simple_hash(mi->mount_source_name); + mi->super_options = strdupz(procfile_lineword(ff, l, w)); w++; if(unlikely(is_read_only(mi->super_options))) @@ -316,6 +326,9 @@ struct mountinfo *mountinfo_read(int do_statvfs) { mi->mount_source = NULL; mi->mount_source_hash = 0; + mi->mount_source_name = NULL; + mi->mount_source_name_hash = 0; + mi->super_options = NULL; mi->st_dev = 0; diff --git a/collectors/proc.plugin/proc_self_mountinfo.h b/collectors/proc.plugin/proc_self_mountinfo.h index 15d63c786..92918a73d 100644 --- a/collectors/proc.plugin/proc_self_mountinfo.h +++ b/collectors/proc.plugin/proc_self_mountinfo.h @@ -38,6 +38,9 @@ struct mountinfo { char *mount_source; // mount source: filesystem-specific information or "none". uint32_t mount_source_hash; + char *mount_source_name; + uint32_t mount_source_name_hash; + char *super_options; // super options: per-superblock options. uint32_t flags; @@ -47,11 +50,11 @@ struct mountinfo { struct mountinfo *next; }; -extern struct mountinfo *mountinfo_find(struct mountinfo *root, unsigned long major, unsigned long minor); +extern struct mountinfo *mountinfo_find(struct mountinfo *root, unsigned long major, unsigned long minor, char *device); extern struct mountinfo *mountinfo_find_by_filesystem_mount_source(struct mountinfo *root, const char *filesystem, const char *mount_source); extern struct mountinfo *mountinfo_find_by_filesystem_super_option(struct mountinfo *root, const char *filesystem, const char *super_options); extern void mountinfo_free_all(struct mountinfo *mi); extern struct mountinfo *mountinfo_read(int do_statvfs); -#endif /* NETDATA_PROC_SELF_MOUNTINFO_H */ \ No newline at end of file +#endif /* NETDATA_PROC_SELF_MOUNTINFO_H */ diff --git a/collectors/python.d.plugin/README.md b/collectors/python.d.plugin/README.md index a05bc81dd..312986e48 100644 --- a/collectors/python.d.plugin/README.md +++ b/collectors/python.d.plugin/README.md @@ -84,7 +84,12 @@ If you plan to submit the module in a PR, make sure and go through the [PR check For a quick start, you can look at the [example plugin](https://raw.githubusercontent.com/netdata/netdata/master/collectors/python.d.plugin/example/example.chart.py). -**Note**: If you are working 'locally' on a new collector and would like to run it in an already installed and running Netdata (as opposed to having to install Netdata from source again with your new changes) to can copy over the relevant file to where Netdata expects it and then either `sudo service netdata restart` to have it be picked up and used by Netdata or you can just run the updated collector in debug mode by following a process like below (this assumes you have [installed Netdata from a GitHub fork](https://learn.netdata.cloud/docs/agent/packaging/installer/methods/manual) you have made to do your development on). +**Note**: If you are working 'locally' on a new collector and would like to run it in an already installed and running +Netdata (as opposed to having to install Netdata from source again with your new changes) to can copy over the relevant +file to where Netdata expects it and then either `sudo systemctl restart netdata` to have it be picked up and used by +Netdata or you can just run the updated collector in debug mode by following a process like below (this assumes you have +[installed Netdata from a GitHub fork](https://learn.netdata.cloud/docs/agent/packaging/installer/methods/manual) you +have made to do your development on). ```bash # clone your fork (done once at the start but shown here for clarity) diff --git a/collectors/python.d.plugin/adaptec_raid/README.md b/collectors/python.d.plugin/adaptec_raid/README.md index b14e8f9ba..4e99508b3 100644 --- a/collectors/python.d.plugin/adaptec_raid/README.md +++ b/collectors/python.d.plugin/adaptec_raid/README.md @@ -60,8 +60,8 @@ cd /etc/netdata # Replace this path with your Netdata config directory, if dif sudo ./edit-config python.d.conf ``` -Change the value of the `adaptec_raid` setting to `yes`. Save the file and restart the Netdata Agent -with `sudo systemctl restart netdata`, or the appropriate method for your system. +Change the value of the `adaptec_raid` setting to `yes`. Save the file and restart the Netdata Agent with `sudo +systemctl restart netdata`, or the [appropriate method](/docs/configure/start-stop-restart.md) for your system. ## Configuration diff --git a/collectors/python.d.plugin/adaptec_raid/adaptec_raid.chart.py b/collectors/python.d.plugin/adaptec_raid/adaptec_raid.chart.py index 564c2ce87..bb59d88e1 100644 --- a/collectors/python.d.plugin/adaptec_raid/adaptec_raid.chart.py +++ b/collectors/python.d.plugin/adaptec_raid/adaptec_raid.chart.py @@ -23,20 +23,22 @@ ORDER = [ CHARTS = { 'ld_status': { - 'options': [None, 'Status Is Not OK', 'bool', 'logical devices', 'adapter_raid.ld_status', 'line'], + 'options': [None, 'Status of logical devices (1: Failed or Degraded)', 'bool', 'logical devices', + 'adaptec_raid.ld_status', 'line'], 'lines': [] }, 'pd_state': { - 'options': [None, 'State Is Not OK', 'bool', 'physical devices', 'adapter_raid.pd_state', 'line'], + 'options': [None, 'State of physical devices (1: not Online)', 'bool', 'physical devices', + 'adaptec_raid.pd_state', 'line'], 'lines': [] }, 'pd_smart_warnings': { 'options': [None, 'S.M.A.R.T warnings', 'count', 'physical devices', - 'adapter_raid.smart_warnings', 'line'], + 'adaptec_raid.smart_warnings', 'line'], 'lines': [] }, 'pd_temperature': { - 'options': [None, 'Temperature', 'celsius', 'physical devices', 'adapter_raid.temperature', 'line'], + 'options': [None, 'Temperature', 'celsius', 'physical devices', 'adaptec_raid.temperature', 'line'], 'lines': [] }, } diff --git a/collectors/python.d.plugin/alarms/README.md b/collectors/python.d.plugin/alarms/README.md index ea96061cc..3f2a8176e 100644 --- a/collectors/python.d.plugin/alarms/README.md +++ b/collectors/python.d.plugin/alarms/README.md @@ -23,7 +23,7 @@ Below is an example of the chart produced when running `stress-ng --all 2` for a ## Configuration -Enable the collector and restart Netdata. +Enable the collector and [restart Netdata](/docs/configure/start-stop-restart.md). ```bash cd /etc/netdata/ @@ -51,6 +51,8 @@ local: CLEAR: 0 WARNING: 1 CRITICAL: 2 + # set to true to include a chart with calculated alarm values over time + collect_alarm_values: false ``` It will default to pulling all alarms at each time step from the Netdata rest api at `http://127.0.0.1:19999/api/v1/alarms?all` diff --git a/collectors/python.d.plugin/alarms/alarms.chart.py b/collectors/python.d.plugin/alarms/alarms.chart.py index 973a1f382..1eec40450 100644 --- a/collectors/python.d.plugin/alarms/alarms.chart.py +++ b/collectors/python.d.plugin/alarms/alarms.chart.py @@ -11,36 +11,44 @@ update_every = 10 disabled_by_default = True -def charts_template(sm): +def charts_template(sm, alarm_status_chart_type='line'): order = [ 'alarms', + 'values' ] mappings = ', '.join(['{0}={1}'.format(k, v) for k, v in sm.items()]) charts = { 'alarms': { - 'options': [None, 'Alarms ({0})'.format(mappings), 'status', 'alarms', 'alarms.status', 'line'], + 'options': [None, 'Alarms ({0})'.format(mappings), 'status', 'status', 'alarms.status', alarm_status_chart_type], 'lines': [], 'variables': [ ['alarms_num'], ] + }, + 'values': { + 'options': [None, 'Alarm Values', 'value', 'value', 'alarms.value', 'line'], + 'lines': [], } } return order, charts DEFAULT_STATUS_MAP = {'CLEAR': 0, 'WARNING': 1, 'CRITICAL': 2} - DEFAULT_URL = 'http://127.0.0.1:19999/api/v1/alarms?all' +DEFAULT_COLLECT_ALARM_VALUES = False +DEFAULT_ALARM_STATUS_CHART_TYPE = 'line' class Service(UrlService): def __init__(self, configuration=None, name=None): UrlService.__init__(self, configuration=configuration, name=name) self.sm = self.configuration.get('status_map', DEFAULT_STATUS_MAP) - self.order, self.definitions = charts_template(self.sm) + self.alarm_status_chart_type = self.configuration.get('alarm_status_chart_type', DEFAULT_ALARM_STATUS_CHART_TYPE) + self.order, self.definitions = charts_template(self.sm, self.alarm_status_chart_type) self.url = self.configuration.get('url', DEFAULT_URL) - self.collected_alarms = set() + self.collect_alarm_values = bool(self.configuration.get('collect_alarm_values', DEFAULT_COLLECT_ALARM_VALUES)) + self.collected_dims = {'alarms': set(), 'values': set()} def _get_data(self): raw_data = self._get_raw_data() @@ -51,21 +59,26 @@ class Service(UrlService): alarms = raw_data.get('alarms', {}) data = {a: self.sm[alarms[a]['status']] for a in alarms if alarms[a]['status'] in self.sm} - self.update_charts(alarms, data) + self.update_charts('alarms', data) data['alarms_num'] = len(data) + if self.collect_alarm_values: + data_values = {'{}_value'.format(a): alarms[a]['value'] * 100 for a in alarms if 'value' in alarms[a] and alarms[a]['value'] is not None} + self.update_charts('values', data_values, divisor=100) + data.update(data_values) + return data - def update_charts(self, alarms, data): + def update_charts(self, chart, data, algorithm='absolute', multiplier=1, divisor=1): if not self.charts: return - for a in data: - if a not in self.collected_alarms: - self.collected_alarms.add(a) - self.charts['alarms'].add_dimension([a, a, 'absolute', '1', '1']) + for dim in data: + if dim not in self.collected_dims[chart]: + self.collected_dims[chart].add(dim) + self.charts[chart].add_dimension([dim, dim, algorithm, multiplier, divisor]) - for a in list(self.collected_alarms): - if a not in alarms: - self.collected_alarms.remove(a) - self.charts['alarms'].del_dimension(a, hide=False) + for dim in list(self.collected_dims[chart]): + if dim not in data: + self.collected_dims[chart].remove(dim) + self.charts[chart].del_dimension(dim, hide=False) diff --git a/collectors/python.d.plugin/alarms/alarms.conf b/collectors/python.d.plugin/alarms/alarms.conf index fd7780c59..5e83d8f56 100644 --- a/collectors/python.d.plugin/alarms/alarms.conf +++ b/collectors/python.d.plugin/alarms/alarms.conf @@ -48,3 +48,7 @@ local: CLEAR: 0 WARNING: 1 CRITICAL: 2 + # set to true to include a chart with calculated alarm values over time + collect_alarm_values: false + # define the type of chart for plotting status over time e.g. 'line' or 'stacked' + alarm_status_chart_type: 'line' diff --git a/collectors/python.d.plugin/anomalies/README.md b/collectors/python.d.plugin/anomalies/README.md index 862f4f345..bcbfdbcd7 100644 --- a/collectors/python.d.plugin/anomalies/README.md +++ b/collectors/python.d.plugin/anomalies/README.md @@ -45,7 +45,8 @@ pip3 install --user netdata-pandas==0.0.32 numba==0.50.1 scikit-learn==0.23.2 py ## Configuration -Install the Python requirements above, enable the collector and restart Netdata. +Install the Python requirements above, enable the collector and [restart +Netdata](/docs/configure/start-stop-restart.md). ```bash cd /etc/netdata/ diff --git a/collectors/python.d.plugin/chrony/README.md b/collectors/python.d.plugin/chrony/README.md index b1e7ec35c..4681b4f6d 100644 --- a/collectors/python.d.plugin/chrony/README.md +++ b/collectors/python.d.plugin/chrony/README.md @@ -55,7 +55,7 @@ local: command: 'chronyc -n tracking' ``` -Save the file and restart the Netdata Agent with `sudo systemctl restart netdata`, or the appropriate method for your -system, to finish configuring the `chrony` collector. +Save the file and restart the Netdata Agent with `sudo systemctl restart netdata`, or the [appropriate +method](/docs/configure/start-stop-restart.md) for your system, to finish configuring the `chrony` collector. [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fcollectors%2Fpython.d.plugin%2Fchrony%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)](<>) diff --git a/collectors/python.d.plugin/example/example.chart.py b/collectors/python.d.plugin/example/example.chart.py index 61ae47f22..d6c0b6658 100644 --- a/collectors/python.d.plugin/example/example.chart.py +++ b/collectors/python.d.plugin/example/example.chart.py @@ -29,6 +29,9 @@ class Service(SimpleService): self.order = ORDER self.definitions = CHARTS self.random = SystemRandom() + self.num_lines = self.configuration.get('num_lines', 4) + self.lower = self.configuration.get('lower', 0) + self.upper = self.configuration.get('upper', 100) @staticmethod def check(): @@ -37,12 +40,12 @@ class Service(SimpleService): def get_data(self): data = dict() - for i in range(1, 4): + for i in range(0, self.num_lines): dimension_id = ''.join(['random', str(i)]) if dimension_id not in self.charts['random']: self.charts['random'].add_dimension([dimension_id]) - data[dimension_id] = self.random.randint(0, 100) + data[dimension_id] = self.random.randint(self.lower, self.upper) return data diff --git a/collectors/python.d.plugin/example/example.conf b/collectors/python.d.plugin/example/example.conf index 3d8435173..31261b840 100644 --- a/collectors/python.d.plugin/example/example.conf +++ b/collectors/python.d.plugin/example/example.conf @@ -51,7 +51,7 @@ # predefined parameters. These are: # # job_name: -# name: myname # the JOB's name as it will appear at the +# name: myname # the JOB's name as it will appear on the dashboard # # dashboard (by default is the job_name) # # JOBs sharing a name are mutually exclusive # update_every: 1 # the JOB's data collection frequency @@ -61,8 +61,27 @@ # # Additionally to the above, example also supports the following: # -# - none +# num_lines: 4 # the number of lines to create +# lower: 0 # the lower bound of numbers to randomly sample from +# upper: 100 # the upper bound of numbers to randomly sample from # # ---------------------------------------------------------------------- # AUTO-DETECTION JOBS -# only one of them will run (they have the same name) + +four_lines: + name: "Four Lines" # the JOB's name as it will appear on the dashboard + update_every: 1 # the JOB's data collection frequency + priority: 60000 # the JOB's order on the dashboard + penalty: yes # the JOB's penalty + autodetection_retry: 0 # the JOB's re-check interval in seconds + num_lines: 4 # the number of lines to create + lower: 0 # the lower bound of numbers to randomly sample from + upper: 100 # the upper bound of numbers to randomly sample from + +# if you wanted to make another job to run in addition to the one above then +# you would just uncomment the job configuration below. +# two_lines: +# name: "Two Lines" # the JOB's name as it will appear on the dashboard +# num_lines: 2 # the number of lines to create +# lower: 50 # the lower bound of numbers to randomly sample from +# upper: 75 # the upper bound of numbers to randomly sample from diff --git a/collectors/python.d.plugin/hpssa/README.md b/collectors/python.d.plugin/hpssa/README.md index af8c4378e..69c8d8a33 100644 --- a/collectors/python.d.plugin/hpssa/README.md +++ b/collectors/python.d.plugin/hpssa/README.md @@ -59,8 +59,8 @@ cd /etc/netdata # Replace this path with your Netdata config directory, if dif sudo ./edit-config python.d.conf ``` -Change the value of the `hpssa` setting to `yes`. Save the file and restart the Netdata Agent -with `sudo systemctl restart netdata`, or the appropriate method for your system. +Change the value of the `hpssa` setting to `yes`. Save the file and restart the Netdata Agent with `sudo systemctl +restart netdata`, or the [appropriate method](/docs/configure/start-stop-restart.md) for your system. ## Configuration @@ -78,4 +78,7 @@ If `ssacli` cannot be found in the `PATH`, configure it in `hpssa.conf`. ssacli_path: /usr/sbin/ssacli ``` +Save the file and restart the Netdata Agent with `sudo systemctl restart netdata`, or the [appropriate +method](/docs/configure/start-stop-restart.md) for your system. + [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fcollectors%2Fpython.d.plugin%2Fhpssa%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() diff --git a/collectors/python.d.plugin/megacli/README.md b/collectors/python.d.plugin/megacli/README.md index 4fb7eb1c2..e411c4c11 100644 --- a/collectors/python.d.plugin/megacli/README.md +++ b/collectors/python.d.plugin/megacli/README.md @@ -80,6 +80,7 @@ Battery stats disabled by default. To enable them, modify `megacli.conf`. do_battery: yes ``` ---- +Save the file and restart the Netdata Agent with `sudo systemctl restart netdata`, or the [appropriate +method](/docs/configure/start-stop-restart.md) for your system. [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fcollectors%2Fpython.d.plugin%2Fmegacli%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)](<>) diff --git a/collectors/python.d.plugin/samba/README.md b/collectors/python.d.plugin/samba/README.md index a5126510f..04cb7dcf7 100644 --- a/collectors/python.d.plugin/samba/README.md +++ b/collectors/python.d.plugin/samba/README.md @@ -103,8 +103,8 @@ cd /etc/netdata # Replace this path with your Netdata config directory, if dif sudo ./edit-config python.d.conf ``` -Change the value of the `samba` setting to `yes`. Save the file and restart the Netdata Agent -with `sudo systemctl restart netdata`, or the appropriate method for your system. +Change the value of the `samba` setting to `yes`. Save the file and restart the Netdata Agent with `sudo systemctl +restart netdata`, or the [appropriate method](/docs/configure/start-stop-restart.md) for your system. ## Configuration diff --git a/collectors/python.d.plugin/smartd_log/smartd_log.chart.py b/collectors/python.d.plugin/smartd_log/smartd_log.chart.py index 8f10a5351..e4a19d411 100644 --- a/collectors/python.d.plugin/smartd_log/smartd_log.chart.py +++ b/collectors/python.d.plugin/smartd_log/smartd_log.chart.py @@ -49,6 +49,7 @@ ATTR198 = '198' ATTR199 = '199' ATTR202 = '202' ATTR206 = '206' +ATTR233 = '233' ATTR_READ_ERR_COR = 'read-total-err-corrected' ATTR_READ_ERR_UNC = 'read-total-unc-errors' ATTR_WRITE_ERR_COR = 'write-total-err-corrected' @@ -111,6 +112,7 @@ ORDER = [ 'current_pending_sector_count', 'offline_uncorrectable_sector_count', 'percent_lifetime_used', + 'media_wearout_indicator', ] CHARTS = { @@ -322,6 +324,12 @@ CHARTS = { 'lines': [], 'attrs': [ATTR202], 'algo': ABSOLUTE, + }, + 'media_wearout_indicator': { + 'options': [None, 'Media Wearout Indicator', 'percentage', 'wear', 'smartd_log.media_wearout_indicator', 'line'], + 'lines': [], + 'attrs': [ATTR233], + 'algo': ABSOLUTE, } } @@ -506,6 +514,7 @@ def ata_attribute_factory(value): ATTR7, ATTR202, ATTR206, + ATTR233, ]: return AtaNormalized(*value) diff --git a/collectors/statsd.plugin/Makefile.am b/collectors/statsd.plugin/Makefile.am index b01302d16..71f2d468d 100644 --- a/collectors/statsd.plugin/Makefile.am +++ b/collectors/statsd.plugin/Makefile.am @@ -10,6 +10,7 @@ dist_noinst_DATA = \ statsdconfigdir=$(libconfigdir)/statsd.d dist_statsdconfig_DATA = \ example.conf \ + k6.conf \ $(NULL) userstatsdconfigdir=$(configdir)/statsd.d diff --git a/collectors/statsd.plugin/README.md b/collectors/statsd.plugin/README.md index 070bfc554..0e9c954fc 100644 --- a/collectors/statsd.plugin/README.md +++ b/collectors/statsd.plugin/README.md @@ -1,6 +1,6 @@ @@ -10,7 +10,6 @@ StatsD is a system to collect data from any application. Applications send metri If you want to learn more about the StatsD protocol, we have written a [blog post](https://www.netdata.cloud/blog/introduction-to-statsd/) about it! -There is a [plethora of client libraries](https://github.com/etsy/statsd/wiki#client-implementations) for embedding statsd metrics to any application framework. This makes statsd quite popular for custom application metrics. Netdata is a fully featured statsd server. It can collect statsd formatted metrics, visualize them on its dashboards and store them in it's database for long-term retention. @@ -22,11 +21,11 @@ Netdata statsd is fast. It can collect more than **1.200.000 metrics per second* ## Metrics supported by Netdata -Netdata fully supports the statsd protocol. All statsd client libraries can be used with Netdata too. +Netdata fully supports the StatsD protocol. All StatsD client libraries can be used with Netdata too. - **Gauges** - The application sends `name:value|g`, where `value` is any **decimal/fractional** number, statsd reports the latest value collected and the number of times it was updated (events). + The application sends `name:value|g`, where `value` is any **decimal/fractional** number, StatsD reports the latest value collected and the number of times it was updated (events). The application may increment or decrement a previous value, by setting the first character of the value to `+` or `-` (so, the only way to set a gauge to an absolute negative value, is to first set it to zero). @@ -36,11 +35,11 @@ Netdata fully supports the statsd protocol. All statsd client libraries can be u - **Counters** and **Meters** - The application sends `name:value|c`, `name:value|C` or `name:value|m`, where `value` is a positive or negative **integer** number of events occurred, statsd reports the **rate** and the number of times it was updated (events). + The application sends `name:value|c`, `name:value|C` or `name:value|m`, where `value` is a positive or negative **integer** number of events occurred, StatsD reports the **rate** and the number of times it was updated (events). - `:value` can be omitted and statsd will assume it is `1`. `|c`, `|C` and `|m` can be omitted an statsd will assume it is `|m`. So, the application may send just `name` and statsd will parse it as `name:1|m`. + `:value` can be omitted and StatsD will assume it is `1`. `|c`, `|C` and `|m` can be omitted an StatsD will assume it is `|m`. So, the application may send just `name` and StatsD will parse it as `name:1|m`. - - Counters use `|c` (etsy/statsd compatible) or `|C` (brubeck compatible) + - Counters use `|c` (etsy/StatsD compatible) or `|C` (brubeck compatible) - Meters use `|m` [Sampling rate](#sampling-rates) is supported. @@ -49,7 +48,7 @@ Netdata fully supports the statsd protocol. All statsd client libraries can be u - **Timers** and **Histograms** - The application sends `name:value|ms` or `name:value|h`, where `value` is any **decimal/fractional** number, statsd reports **min**, **max**, **average**, **sum**, **95th percentile**, **median** and **standard deviation** and the total number of times it was updated (events). + The application sends `name:value|ms` or `name:value|h`, where `value` is any **decimal/fractional** number, StatsD reports **min**, **max**, **average**, **sum**, **95th percentile**, **median** and **standard deviation** and the total number of times it was updated (events). - Timers use `|ms` - Histograms use `|h` @@ -62,7 +61,7 @@ Netdata fully supports the statsd protocol. All statsd client libraries can be u - **Sets** - The application sends `name:value|s`, where `value` is anything (**number or text**, leading and trailing spaces are removed), statsd reports the number of unique values sent and the number of times it was updated (events). + The application sends `name:value|s`, where `value` is anything (**number or text**, leading and trailing spaces are removed), StatsD reports the number of unique values sent and the number of times it was updated (events). Sampling rate is **not** supported for Sets. `value` is always considered text. @@ -88,7 +87,7 @@ On disconnect, Netdata will process the entire buffer, even if it is not termina #### UDP packets -When sending multiple packets over UDP, it is important not to exceed the network MTU, usually about 1500 packets. +When sending multiple packets over UDP, it is important not to exceed the network MTU, which is usually 1500 bytes. Netdata will accept UDP packets up to 9000 bytes, but the underlying network will not exceed MTU. @@ -152,7 +151,7 @@ Netdata can visualize StatsD collected metrics in 2 ways: ### Private metric charts -Private charts are controlled with `create private charts for metrics matching = *`. This setting accepts a space separated list of [simple patterns](/libnetdata/simple_pattern/README.md). Netdata will create private charts for all metrics **by default** +Private charts are controlled with `create private charts for metrics matching = *`. This setting accepts a space-separated list of [simple patterns](/libnetdata/simple_pattern/README.md). Netdata will create private charts for all metrics **by default**. For example, to render charts for all `myapp.*` metrics, except `myapp.*.badmetric`, use: @@ -166,7 +165,8 @@ The default behavior is to use the same settings as the rest of the Netdata Agen - `private charts memory mode` - `private charts history` -### Optimise private metric charts visualization and storage +### Optimize private metric charts visualization and storage + If you have thousands of metrics, each with its own private chart, you may notice that your web browser becomes slow when you view the Netdata dashboard (this is a web browser issue we need to address at the Netdata UI). So, Netdata has a protection to stop creating charts when `max private charts allowed = 200` (soft limit) is reached. @@ -246,16 +246,15 @@ Synthetic charts are organized in For each application you need to create a `.conf` file in `/etc/netdata/statsd.d`. -For example, if you want to monitor the application `myapp` using StatD and Netdata, create the file `/etc/netdata/statsd.d/myapp.conf`, with this content: - +For example, if you want to monitor the application `myapp` using StatsD and Netdata, create the file `/etc/netdata/statsd.d/myapp.conf`, with this content: ``` [app] name = myapp metrics = myapp.* private charts = no gaps when not collected = no - memory mode = ram history = 60 +# memory mode = ram [dictionary] m1 = metric1 @@ -283,8 +282,9 @@ Using the above configuration `myapp` should get its own section on the dashboar - `metrics` is a Netdata [simple pattern](/libnetdata/simple_pattern/README.md). This pattern should match all the possible StatsD metrics that will be participating in the application `myapp`. - `private charts = yes|no`, enables or disables private charts for the metrics matched. - `gaps when not collected = yes|no`, enables or disables gaps on the charts of the application in case that no metrics are collected. -- `memory mode` sets the memory mode for all charts of the application. The default is the global default for Netdata (not the global default for StatsD private charts). -- `history` sets the size of the round robin database for this application. The default is the global default for Netdata (not the global default for StatsD private charts). This is only relevant if you use `memory mode = save`. Read more on our guide: [longer metrics storage](https://learn.netdata.cloud/guides/longer-metrics-storage). +- `memory mode` sets the memory mode for all charts of the application. The default is the global default for Netdata (not the global default for StatsD private charts). We suggest not to use this (we have commented it out in the example) and let your app use the global default for Netdata, which is our dbengine. + +- `history` sets the size of the round robin database for this application. The default is the global default for Netdata (not the global default for StatsD private charts). This is only relevant if you use `memory mode = save`. Read more on our [metrics storage(]/docs/store/change-metrics-storage.md) doc. `[dictionary]` defines name-value associations. These are used to renaming metrics, when added to synthetic charts. Metric names are also defined at each `dimension` line. However, using the dictionary dimension names can be declared globally, for each app and is the only way to rename dimensions when using patterns. Of course the dictionary can be empty or missing. @@ -526,7 +526,7 @@ You can also use StatsD with: ### Shell -Getting the proper support for a programming language is not always easy, but shell is always available on most UNIX systems. You can use shell and `nc` to easily instrument your systems and send metric data to Netdata StatsD. Here is how: +Getting the proper support for a programming language is not always easy, but the Unix shell is available on most Unix systems. You can use shell and `nc` to instrument your systems and send metric data to Netdata's StatsD implementation. Here's how: The command you need to run is: @@ -600,6 +600,6 @@ StatsD "metric1:10|g" "metric2:10|c" ... ``` The function is smart enough to call `nc` just once and pass all the metrics to it. It will also automatically switch to TCP if the metrics to send are above 1000 bytes. -If you have gotten thus far, make sure to check out our [Community Forums](https://community.netdata.cloud) to share your experience using Netdata with StatsD. +If you have gotten thus far, make sure to check out our [community forums](https://community.netdata.cloud) to share your experience using Netdata with StatsD. [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fcollectors%2Fstatsd.plugin%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)](<>) diff --git a/collectors/statsd.plugin/k6.conf b/collectors/statsd.plugin/k6.conf new file mode 100644 index 000000000..775f53060 --- /dev/null +++ b/collectors/statsd.plugin/k6.conf @@ -0,0 +1,104 @@ +[app] + name = k6 + metrics = k6* + private charts = no + gaps when not collected = yes + +[dictionary] + http_reqs = HTTP Requests + vus = Virtual active users + vus_max = max Virtual active users + iteration_duration = iteration duration + iteration_duration_max = max iteration duration + iteration_duration_min = min iteration duration + iteration_duration_avg = avg iteration duration + dropped_iterations = Dropped iterations + http_req_blocked = Blocked HTTP requests + http_req_connecting = Connecting HTTP requests + http_req_sending = Sending HTTP requests + http_req_receiving = Receiving HTTP requests + http_req_waiting = Waiting HTTP requests + http_req_duration_median = Median HTTP req duration + http_req_duration_average = AVG HTTP req duration + http_req_duration = HTTP req duration + http_req_duration_max = max HTTP req duration + http_req_duration_min = min HTTP req duration + http_req_duration_p95 = 95 percentile of HTTP req duration + data_received = Received data + data_sent = Sent data + + +[http_reqs] + name = http_reqs + title = HTTP Requests + family = http requests + context = k6.http_requests + dimension = k6.http_reqs http_reqs last 1 1 sum + type = line + units = requests/s + +[vus] + name = vus + title = Virtual Active Users + family = k6_metrics + dimension = k6.vus vus last 1 1 + dimension = k6.vus_max vus_max last 1 1 + type = line + units = vus + +[iteration_duration] + name = iteration_duration_2 + title = Iteration duration + family = k6_metrics + dimension = k6.iteration_duration iteration_duration last 1 1 + dimension = k6.iteration_duration iteration_duration_max max 1 1 + dimension = k6.iteration_duration iteration_duration_min min 1 1 + dimension = k6.iteration_duration iteration_duration_avg avg 1 1 + type = line + units = s + +[dropped_iterations] + name = dropped_iterations + title = Dropped Iterations + family = k6_metrics + dimension = k6.dropped_iterations dropped_iterations last 1 1 + units = iterations + type = line + +[data] + name = data + title = K6 Data + family = k6_metrics + dimension = k6.data_received data_received last 1 1 + dimension = k6.data_sent data_sent last -1 1 + units = kb/s + type = area + +[http_req_status] + name = http_req_status + title = Time spent on HTTP + family = http requests + dimension = k6.http_req_blocked http_req_blocked last 1 1 + dimension = k6.http_req_connecting http_req_connecting last 1 1 + units = ms + type = line + +[http_req_duration_types] + name = http_req_duration_types + title = Time spent on HTTP connection states + family = http requests + dimension = k6.http_req_sending http_req_sending last 1 1 + dimension = k6.http_req_waiting http_req_waiting last 1 1 + dimension = k6.http_req_receiving http_req_receiving last 1 1 + units = ms + type = stacked + +[http_req_duration] + name = http_req_duration + title = Total time for HTTP request + family = http requests + dimension = k6.http_req_duration http_req_duration_median median 1 1 + dimension = k6.http_req_duration http_req_duration_max max 1 1 + dimension = k6.http_req_duration http_req_duration_average avg 1 1 + dimension = k6.http_req_duration http_req_duration_min min 1 1 + dimension = k6.http_req_duration httP_req_duration_p95 percentile 1 1 diff --git a/collectors/statsd.plugin/statsd.c b/collectors/statsd.plugin/statsd.c index a8f94130a..e89585719 100644 --- a/collectors/statsd.plugin/statsd.c +++ b/collectors/statsd.plugin/statsd.c @@ -107,7 +107,7 @@ typedef enum statsd_metric_type { typedef struct statsd_metric { - avl avl; // indexing - has to be first + avl_t avl; // indexing - has to be first const char *name; // the name of the metric uint32_t hash; // hash of the name @@ -376,7 +376,7 @@ static inline STATSD_METRIC *statsd_metric_index_find(STATSD_INDEX *index, const tmp.name = name; tmp.hash = (hash)?hash:simple_hash(tmp.name); - return (STATSD_METRIC *)STATSD_AVL_SEARCH(&index->index, (avl *)&tmp); + return (STATSD_METRIC *)STATSD_AVL_SEARCH(&index->index, (avl_t *)&tmp); } static inline STATSD_METRIC *statsd_find_or_add_metric(STATSD_INDEX *index, const char *name, STATSD_METRIC_TYPE type) { @@ -398,7 +398,7 @@ static inline STATSD_METRIC *statsd_find_or_add_metric(STATSD_INDEX *index, cons m->histogram.ext = callocz(sizeof(STATSD_METRIC_HISTOGRAM_EXTENSIONS), 1); netdata_mutex_init(&m->histogram.ext->mutex); } - STATSD_METRIC *n = (STATSD_METRIC *)STATSD_AVL_INSERT(&index->index, (avl *)m); + STATSD_METRIC *n = (STATSD_METRIC *)STATSD_AVL_INSERT(&index->index, (avl_t *)m); if(unlikely(n != m)) { freez((void *)m->histogram.ext); freez((void *)m->name); diff --git a/collectors/tc.plugin/plugin_tc.c b/collectors/tc.plugin/plugin_tc.c index b92450efe..26affee09 100644 --- a/collectors/tc.plugin/plugin_tc.c +++ b/collectors/tc.plugin/plugin_tc.c @@ -12,7 +12,7 @@ #define TC_LINE_MAX 1024 struct tc_class { - avl avl; + avl_t avl; char *id; uint32_t hash; @@ -56,7 +56,7 @@ struct tc_class { }; struct tc_device { - avl avl; + avl_t avl; char *id; uint32_t hash; @@ -107,15 +107,15 @@ avl_tree_type tc_device_root_index = { tc_device_compare }; -#define tc_device_index_add(st) (struct tc_device *)avl_insert(&tc_device_root_index, (avl *)(st)) -#define tc_device_index_del(st) (struct tc_device *)avl_remove(&tc_device_root_index, (avl *)(st)) +#define tc_device_index_add(st) (struct tc_device *)avl_insert(&tc_device_root_index, (avl_t *)(st)) +#define tc_device_index_del(st) (struct tc_device *)avl_remove(&tc_device_root_index, (avl_t *)(st)) static inline struct tc_device *tc_device_index_find(const char *id, uint32_t hash) { struct tc_device tmp; tmp.id = (char *)id; tmp.hash = (hash)?hash:simple_hash(tmp.id); - return (struct tc_device *)avl_search(&(tc_device_root_index), (avl *)&tmp); + return (struct tc_device *)avl_search(&(tc_device_root_index), (avl_t *)&tmp); } @@ -128,15 +128,15 @@ static int tc_class_compare(void* a, void* b) { else return strcmp(((struct tc_class *)a)->id, ((struct tc_class *)b)->id); } -#define tc_class_index_add(st, rd) (struct tc_class *)avl_insert(&((st)->classes_index), (avl *)(rd)) -#define tc_class_index_del(st, rd) (struct tc_class *)avl_remove(&((st)->classes_index), (avl *)(rd)) +#define tc_class_index_add(st, rd) (struct tc_class *)avl_insert(&((st)->classes_index), (avl_t *)(rd)) +#define tc_class_index_del(st, rd) (struct tc_class *)avl_remove(&((st)->classes_index), (avl_t *)(rd)) static inline struct tc_class *tc_class_index_find(struct tc_device *st, const char *id, uint32_t hash) { struct tc_class tmp; tmp.id = (char *)id; tmp.hash = (hash)?hash:simple_hash(tmp.id); - return (struct tc_class *)avl_search(&(st->classes_index), (avl *) &tmp); + return (struct tc_class *)avl_search(&(st->classes_index), (avl_t *) &tmp); } // ---------------------------------------------------------------------------- diff --git a/collectors/xenstat.plugin/xenstat_plugin.c b/collectors/xenstat.plugin/xenstat_plugin.c index 647ac1db7..a322dd1c1 100644 --- a/collectors/xenstat.plugin/xenstat_plugin.c +++ b/collectors/xenstat.plugin/xenstat_plugin.c @@ -2,6 +2,9 @@ #include "../../libnetdata/libnetdata.h" +#include +#include + #define PLUGIN_XENSTAT_NAME "xenstat.plugin" #define NETDATA_CHART_PRIO_XENSTAT_NODE_CPUS 30001 @@ -62,15 +65,9 @@ int health_variable_lookup(const char *variable, uint32_t hash, struct rrdcalc * char *netdata_configured_host_prefix = ""; // Variables - static int debug = 0; - static int netdata_update_every = 1; -#ifdef HAVE_LIBXENSTAT -#include -#include - struct vcpu_metrics { unsigned int id; @@ -1093,14 +1090,3 @@ int main(int argc, char **argv) { xenstat_uninit(xhandle); info("XENSTAT process exiting"); } - -#else // !HAVE_LIBXENSTAT - -int main(int argc, char **argv) { - (void)argc; - (void)argv; - - fatal("xenstat.plugin is not compiled."); -} - -#endif // !HAVE_LIBXENSTAT diff --git a/configure.ac b/configure.ac index 252cd3dd3..b42b007de 100644 --- a/configure.ac +++ b/configure.ac @@ -197,6 +197,14 @@ AC_ARG_ENABLE( [ enable_cloud="detect" ] ) +AC_ARG_WITH( + [aclk-ng], + [AS_HELP_STRING([--with-aclk-ng], + [Requires ACLK-NG to be used even in case ACLK Legacy can run on this system])], + [aclk_ng="$withval"], + [aclk_ng="fallback"] +) + if test "${enable_cloud}" = "no"; then AC_DEFINE([DISABLE_CLOUD], [1], [disable netdata cloud functionality]) fi @@ -632,7 +640,7 @@ AM_CONDITIONAL([ENABLE_CAPABILITY], [test "${with_libcap}" = "yes"]) AC_MSG_CHECKING([if cloud functionality should be enabled]) AC_MSG_RESULT([${enable_cloud}]) -if test "$enable_cloud" != "no"; then +if test "$enable_cloud" != "no" -a "$aclk_ng" != "yes"; then # just to have all messages that can fail ACLK build in one place # so it is easier to see why it can't be built if test -n "${SSL_LIBS}"; then @@ -706,6 +714,7 @@ if test "$enable_cloud" != "no"; then fi AC_MSG_RESULT([${can_enable_aclk}]) +# TODO fix this (you need to try fallback) test "${enable_cloud}" = "yes" -a "${can_enable_aclk}" = "no" && \ AC_MSG_ERROR([User required agent-cloud-link but it can't be built!]) @@ -715,7 +724,6 @@ if test "$enable_cloud" != "no"; then else enable_aclk=$enable_cloud fi - AC_SUBST([can_enable_aclk]) if test "${enable_aclk}" = "yes"; then AC_DEFINE([ENABLE_ACLK], [1], [netdata ACLK]) @@ -723,7 +731,35 @@ if test "$enable_cloud" != "no"; then AC_MSG_RESULT([${enable_aclk}]) fi + +if test "$enable_cloud" = "no" -a "$aclk_ng" = "yes"; then + AC_MSG_ERROR([--disable-cloud && --aclk-ng not allowed together (such configuration is self contradicting)]) +fi + +if test "$enable_cloud" != "no" -a "$aclk_ng" != "no"; then + AC_MSG_CHECKING([if JSON-C available for ACLK Next Generation]) + if test "$enable_jsonc" != "yes"; then + AC_MSG_RESULT([no]) + else + AC_MSG_RESULT([yes]) + if test "$aclk_ng" != "yes" -a "$enable_aclk" == "no"; then #default "fallback" + AC_MSG_NOTICE([ACLK Legacy could not be built. Trying ACLK-NG as fallback.]) + aclk_ng="yes" + fi + if test "$aclk_ng" = "yes"; then + #TODO Check OpenSSL and JSON-C + AC_MSG_CHECKING([if ACLK Next Generation can be built]) + AC_DEFINE([ACLK_NG], [1], [ACLK Next Generation Should be used]) + AC_DEFINE([ENABLE_ACLK], [1], [netdata ACLK]) + enable_aclk="yes" + AC_MSG_RESULT([yes]) + OPTIONAL_MQTT_WSS_CFLAGS="-Imqtt_websockets/src/include" + fi + fi +fi AC_SUBST([enable_cloud]) +AC_SUBST([enable_aclk]) +AM_CONDITIONAL([ACLK_NG], [test "${aclk_ng}" = "yes"]) AM_CONDITIONAL([ENABLE_ACLK], [test "${enable_aclk}" = "yes"]) # ----------------------------------------------------------------------------- @@ -888,9 +924,7 @@ if test "${enable_plugin_nfacct}" != "no" -a "${have_libnetfilter_acct}" = "yes" -a "${have_libmnl}" = "yes" \ -a "${have_nfnetlink_conntrack}" = "yes"; then enable_plugin_nfacct="yes" - AC_DEFINE([HAVE_LIBMNL], [1], [libmnl usability]) - AC_DEFINE([HAVE_LIBNETFILTER_ACCT], [1], [libnetfilter_acct usability]) - AC_DEFINE([HAVE_LINUX_NETFILTER_NFNETLINK_CONNTRACK_H], [1], [libnetfilter_nfnetlink_conntrack header usability]) + AC_DEFINE([HAVE_NFACCT], [1], [netfilter accounting usability]) OPTIONAL_NFACCT_CFLAGS="${NFACCT_CFLAGS} ${LIBMNL_CFLAGS}" OPTIONAL_NFACCT_LIBS="${NFACCT_LIBS} ${LIBMNL_LIBS}" else @@ -956,8 +990,6 @@ AC_MSG_CHECKING([if xenstat.plugin should be enabled]) if test "${enable_plugin_xenstat}" != "no" -a "${have_libxenstat}" = "yes" -a "${have_libxenlight}" = "yes" -a "${have_libyajl}" = "yes"; then enable_plugin_xenstat="yes" AC_DEFINE([HAVE_LIBXENSTAT], [1], [libxenstat usability]) - AC_DEFINE([HAVE_LIBXENLIGHT], [1], [libxenlight usability]) - AC_DEFINE([HAVE_LIBYAJL], [1], [libyajl usability]) OPTIONAL_XENSTAT_CFLAGS="${XENLIGHT_CFLAGS} ${YAJL_CFLAGS}" OPTIONAL_XENSTAT_LIBS="-lxenstat ${XENLIGHT_LIBS} ${YAJL_LIBS}" else @@ -1424,7 +1456,8 @@ AC_SUBST([webdir]) CFLAGS="${CFLAGS} ${OPTIONAL_MATH_CFLAGS} ${OPTIONAL_NFACCT_CFLAGS} ${OPTIONAL_ZLIB_CFLAGS} ${OPTIONAL_UUID_CFLAGS} \ ${OPTIONAL_LIBCAP_CFLAGS} ${OPTIONAL_IPMIMONITORING_CFLAGS} ${OPTIONAL_CUPS_CFLAGS} ${OPTIONAL_XENSTAT_FLAGS} \ ${OPTIONAL_KINESIS_CFLAGS} ${OPTIONAL_PUBSUB_CFLAGS} ${OPTIONAL_PROMETHEUS_REMOTE_WRITE_CFLAGS} \ - ${OPTIONAL_MONGOC_CFLAGS} ${LWS_CFLAGS} ${OPTIONAL_JSONC_STATIC_CFLAGS} ${OPTIONAL_BPF_CFLAGS} ${OPTIONAL_JUDY_CFLAGS}" + ${OPTIONAL_MONGOC_CFLAGS} ${LWS_CFLAGS} ${OPTIONAL_JSONC_STATIC_CFLAGS} ${OPTIONAL_BPF_CFLAGS} ${OPTIONAL_JUDY_CFLAGS} \ + ${OPTIONAL_MQTT_WSS_CFLAGS}" CXXFLAGS="${CFLAGS} ${CXX11FLAG}" @@ -1474,6 +1507,7 @@ AC_SUBST([OPTIONAL_PROMETHEUS_REMOTE_WRITE_LIBS]) AC_SUBST([OPTIONAL_MONGOC_CFLAGS]) AC_SUBST([OPTIONAL_MONGOC_LIBS]) AC_SUBST([OPTIONAL_LWS_LIBS]) +AC_SUBST([OPTIONAL_MQTT_WSS_CFLAGS]) # ----------------------------------------------------------------------------- # Check if cmocka is available - needed for unit testing @@ -1619,6 +1653,7 @@ AC_CONFIG_FILES([ spawn/Makefile parser/Makefile ]) + AC_OUTPUT test "${with_math}" != "yes" && AC_MSG_WARN([You are building without math. math allows accurate calculations. It should be enabled.]) || : diff --git a/contrib/debian/netdata.postinst b/contrib/debian/netdata.postinst index eb9104bb7..17182c7e9 100644 --- a/contrib/debian/netdata.postinst +++ b/contrib/debian/netdata.postinst @@ -54,9 +54,9 @@ case "$1" in chown -R root:netdata /usr/libexec/netdata/plugins.d chown -R root:netdata /var/lib/netdata/www setcap cap_dac_read_search,cap_sys_ptrace+ep /usr/libexec/netdata/plugins.d/apps.plugin + setcap cap_dac_read_search+ep /usr/libexec/netdata/plugins.d/slabinfo.plugin + setcap cap_perfmon+ep /usr/libexec/netdata/plugins.d/perf.plugin || setcap cap_sys_admin+ep /usr/libexec/netdata/plugins.d/perf.plugin - chmod 4750 /usr/libexec/netdata/plugins.d/perf.plugin - chmod 4750 /usr/libexec/netdata/plugins.d/slabinfo.plugin chmod 4750 /usr/libexec/netdata/plugins.d/cgroup-network chmod 4750 /usr/libexec/netdata/plugins.d/nfacct.plugin diff --git a/contrib/debian/rules b/contrib/debian/rules index 9bfd057f1..533c2ecfb 100755 --- a/contrib/debian/rules +++ b/contrib/debian/rules @@ -103,9 +103,9 @@ override_dh_fixperms: # apps.plugin should only be runnable by the netdata user. It will be # given extra capabilities in the postinst script. # - chmod 0754 $(TOP)/usr/libexec/netdata/plugins.d/apps.plugin - chmod 0754 $(TOP)/usr/libexec/netdata/plugins.d/perf.plugin - chmod 0754 $(TOP)/usr/libexec/netdata/plugins.d/slabinfo.plugin + chmod 0750 $(TOP)/usr/libexec/netdata/plugins.d/apps.plugin + chmod 0750 $(TOP)/usr/libexec/netdata/plugins.d/perf.plugin + chmod 0750 $(TOP)/usr/libexec/netdata/plugins.d/slabinfo.plugin chmod 0750 $(TOP)/usr/libexec/netdata/plugins.d/go.d.plugin # CUPS plugin package diff --git a/coverity-scan.sh b/coverity-scan.sh index cd2ff0279..047d882cd 100755 --- a/coverity-scan.sh +++ b/coverity-scan.sh @@ -40,7 +40,7 @@ set -e INSTALL_DIR="/opt" # the version of coverity to use -COVERITY_BUILD_VERSION="${COVERITY_BUILD_VERSION:-cov-analysis-linux64-2019.03}" +COVERITY_BUILD_VERSION="${COVERITY_BUILD_VERSION:-cov-analysis-linux64-2020.09}" # TODO: For some reasons this does not fully load on Debian 10 (Haven't checked if it happens on other distros yet), it breaks source packaging/installer/functions.sh || echo "Failed to fully load the functions library" diff --git a/daemon/README.md b/daemon/README.md index ec1f1c7cc..359b3ea39 100644 --- a/daemon/README.md +++ b/daemon/README.md @@ -395,10 +395,10 @@ all programs), edit `netdata.conf` and set: process nice level = -1 ``` -then execute this to restart netdata: +then execute this to [restart Netdata](/docs/configure/start-stop-restart.md): ```sh -sudo service netdata restart +sudo systemctl restart netdata ``` #### Example 2: Netdata with nice -1 on systemd systems diff --git a/daemon/anonymous-statistics.sh.in b/daemon/anonymous-statistics.sh.in index f0d9c10ef..47004f3d0 100755 --- a/daemon/anonymous-statistics.sh.in +++ b/daemon/anonymous-statistics.sh.in @@ -26,79 +26,66 @@ fi NETDATA_VERSION=$(echo "${NETDATA_VERSION}" | sed 's/-.*//g' | tr -d 'v') # ------------------------------------------------------------------------------------------------- -# send the anonymous statistics to GA -# https://developers.google.com/analytics/devguides/collection/protocol/v1/parameters -# The maximum index for a cd parameter is 20 so we have effectively run out. + +# define body of request to be sent +REQ_BODY="$(cat << EOF +{ + "api_key": "mqkwGT0JNFqO-zX2t0mW6Tec9yooaVu7xCBlXtHnt5Y", + "event": "${ACTION} ${ACTION_RESULT}", + "properties": { + "distinct_id": "${NETDATA_REGISTRY_UNIQUE_ID}", + "\$current_url": "agent backend", + "\$pathname": "netdata-backend", + "\$host": "backend.netdata.io", + "\$ip": "127.0.0.1", + "event_source": "agent backend", + "action": "${ACTION}", + "action_result": "${ACTION_RESULT}", + "action_data": "${ACTION_DATA}", + "netdata_machine_guid": "${NETDATA_REGISTRY_UNIQUE_ID}", + "netdata_version": "${NETDATA_VERSION}", + "host_os_name": "${NETDATA_HOST_OS_NAME}", + "host_os_id": "${NETDATA_HOST_OS_ID}", + "host_os_id_like": "${NETDATA_HOST_OS_ID_LIKE}", + "host_os_version": "${NETDATA_HOST_OS_VERSION}", + "host_os_version_id": "${NETDATA_HOST_OS_VERSION_ID}", + "host_os_detection": "${NETDATA_HOST_OS_DETECTION}", + "host_is_k8s_node": "${NETDATA_HOST_IS_K8S_NODE}", + "system_kernel_name": "${NETDATA_SYSTEM_KERNEL_NAME}", + "system_kernel_version": "${NETDATA_SYSTEM_KERNEL_VERSION}", + "system_architecture": "${NETDATA_SYSTEM_ARCHITECTURE}", + "system_virtualization": "${NETDATA_SYSTEM_VIRTUALIZATION}", + "system_virt_detection": "${NETDATA_SYSTEM_VIRT_DETECTION}", + "system_container": "${NETDATA_SYSTEM_CONTAINER}", + "system_container_detection": "${NETDATA_SYSTEM_CONTAINER_DETECTION}", + "container_os_name": "${NETDATA_CONTAINER_OS_NAME}", + "container_os_id": "${NETDATA_CONTAINER_OS_ID}", + "container_os_id_like": "${NETDATA_CONTAINER_OS_ID_LIKE}", + "container_os_version": "${NETDATA_CONTAINER_OS_VERSION}", + "container_os_version_id": "${NETDATA_CONTAINER_OS_VERSION_ID}", + "container_os_detection": "${NETDATA_CONTAINER_OS_DETECTION}", + "system_cpu_detection": "${NETDATA_SYSTEM_CPU_DETECTION}", + "system_cpu_freq": "${NETDATA_SYSTEM_CPU_FREQ}", + "system_cpu_logical_cpu_count": "${NETDATA_SYSTEM_CPU_LOGICAL_CPU_COUNT}", + "system_cpu_model": "${NETDATA_SYSTEM_CPU_MODEL}", + "system_cpu_vendor": "${NETDATA_SYSTEM_CPU_VENDOR}", + "system_disk_detection": "${NETDATA_SYSTEM_DISK_DETECTION}", + "system_ram_detection": "${NETDATA_SYSTEM_RAM_DETECTION}", + "system_total_disk_size": "${NETDATA_SYSTEM_TOTAL_DISK_SIZE}", + "system_total_ram": "${NETDATA_SYSTEM_TOTAL_RAM}" + } +} +EOF +)" + +# send the anonymous statistics to the Netdata PostHog if [ -n "$(command -v curl 2> /dev/null)" ]; then - curl -X POST -Ss --max-time 2 \ - --data "v=1" \ - --data "tid=UA-64295674-3" \ - --data "aip=1" \ - --data "ds=shell" \ - --data-urlencode "cid=${NETDATA_REGISTRY_UNIQUE_ID}" \ - --data-urlencode "cs=${NETDATA_REGISTRY_UNIQUE_ID}" \ - --data "t=event" \ - --data "ni=1" \ - --data "an=anonymous-statistics" \ - --data-urlencode "av=${NETDATA_VERSION}" \ - --data-urlencode "ec=${ACTION}" \ - --data-urlencode "ea=${ACTION_RESULT}" \ - --data-urlencode "el=${ACTION_DATA}" \ - --data-urlencode "cd1=${NETDATA_HOST_OS_NAME}" \ - --data-urlencode "cd2=${NETDATA_HOST_OS_ID}" \ - --data-urlencode "cd3=${NETDATA_HOST_OS_ID_LIKE}" \ - --data-urlencode "cd4=${NETDATA_HOST_OS_VERSION}" \ - --data-urlencode "cd5=${NETDATA_HOST_OS_VERSION_ID}" \ - --data-urlencode "cd6=${NETDATA_HOST_OS_DETECTION}" \ - --data-urlencode "cd7=${NETDATA_SYSTEM_KERNEL_NAME}" \ - --data-urlencode "cd8=${NETDATA_SYSTEM_KERNEL_VERSION}" \ - --data-urlencode "cd9=${NETDATA_SYSTEM_ARCHITECTURE}" \ - --data-urlencode "cd10=${NETDATA_SYSTEM_VIRTUALIZATION}" \ - --data-urlencode "cd11=${NETDATA_SYSTEM_VIRT_DETECTION}" \ - --data-urlencode "cd12=${NETDATA_SYSTEM_CONTAINER}" \ - --data-urlencode "cd13=${NETDATA_SYSTEM_CONTAINER_DETECTION}" \ - --data-urlencode "cd14=${NETDATA_CONTAINER_OS_NAME}" \ - --data-urlencode "cd15=${NETDATA_CONTAINER_OS_ID}" \ - --data-urlencode "cd16=${NETDATA_CONTAINER_OS_ID_LIKE}" \ - --data-urlencode "cd17=${NETDATA_CONTAINER_OS_VERSION}" \ - --data-urlencode "cd18=${NETDATA_CONTAINER_OS_VERSION_ID}" \ - --data-urlencode "cd19=${NETDATA_CONTAINER_OS_DETECTION}" \ - --data-urlencode "cd20=${NETDATA_HOST_IS_K8S_NODE}" \ - "https://www.google-analytics.com/collect" > /dev/null 2>&1 + curl -X POST --header "Content-Type: application/json" -d "${REQ_BODY}" https://posthog.netdata.cloud/capture/ > /dev/null 2>&1 else - wget -q -O - --timeout=1 "https://www.google-analytics.com/collect?\ -&v=1\ -&tid=UA-64295674-3\ -&aip=1\ -&ds=shell\ -&cid=${NETDATA_REGISTRY_UNIQUE_ID}\ -&cs=${NETDATA_REGISTRY_UNIQUE_ID}\ -&t=event\ -&ni=1\ -&an=anonymous-statistics\ -&av=${NETDATA_VERSION}\ -&ec=${ACTION}\ -&ea=${ACTION_RESULT}\ -&el=${ACTION_DATA}\ -&cd1=${NETDATA_HOST_OS_NAME}\ -&cd2=${NETDATA_HOST_OS_ID}\ -&cd3=${NETDATA_HOST_OS_ID_LIKE}\ -&cd4=${NETDATA_HOST_OS_VERSION}\ -&cd5=${NETDATA_HOST_OS_VERSION_ID}\ -&cd6=${NETDATA_HOST_OS_DETECTION}\ -&cd7=${NETDATA_SYSTEM_KERNEL_NAME}\ -&cd8=${NETDATA_SYSTEM_KERNEL_VERSION}\ -&cd9=${NETDATA_SYSTEM_ARCHITECTURE}\ -&cd10=${NETDATA_SYSTEM_VIRTUALIZATION}\ -&cd11=${NETDATA_SYSTEM_VIRT_DETECTION}\ -&cd12=${NETDATA_SYSTEM_CONTAINER}\ -&cd13=${NETDATA_SYSTEM_CONTAINER_DETECTION}\ -&cd14=${NETDATA_CONTAINER_OS_NAME} \ -&cd15=${NETDATA_CONTAINER_OS_ID} \ -&cd16=${NETDATA_CONTAINER_OS_ID_LIKE} \ -&cd17=${NETDATA_CONTAINER_OS_VERSION} \ -&cd18=${NETDATA_CONTAINER_OS_VERSION_ID} \ -&cd19=${NETDATA_CONTAINER_OS_DETECTION} \ -&cd20=${NETDATA_HOST_IS_K8S_NODE} \ -" > /dev/null 2>&1 + wget -q -O - --no-check-certificate \ + --method POST \ + --timeout=1 \ + --header 'Content-Type: application/json' \ + --body-data "${REQ_BODY}" \ + 'https://posthog.netdata.cloud/capture/' > /dev/null 2>&1 fi diff --git a/daemon/buildinfo.c b/daemon/buildinfo.c index de02a72e1..b16390544 100644 --- a/daemon/buildinfo.c +++ b/daemon/buildinfo.c @@ -6,224 +6,309 @@ // Optional features #ifdef ENABLE_ACLK -#define FEAT_CLOUD "YES" +#define FEAT_CLOUD 1 +#define FEAT_CLOUD_MSG "" +#ifdef ACLK_NG +#define ACLK_IMPL "Next Generation" #else +#define ACLK_IMPL "Legacy" +#endif +#else +#define ACLK_IMPL "" #ifdef DISABLE_CLOUD -#define FEAT_CLOUD "NO (by user request e.g. '--disable-cloud')" +#define FEAT_CLOUD 0 +#define FEAT_CLOUD_MSG "(by user request)" #else -#define FEAT_CLOUD "NO" +#define FEAT_CLOUD 0 +#define FEAT_CLOUD_MSG "" #endif #endif #ifdef ENABLE_DBENGINE -#define FEAT_DBENGINE "YES" +#define FEAT_DBENGINE 1 #else -#define FEAT_DBENGINE "NO" +#define FEAT_DBENGINE 0 #endif #if defined(HAVE_X509_VERIFY_PARAM_set1_host) && HAVE_X509_VERIFY_PARAM_set1_host == 1 -#define FEAT_TLS_HOST_VERIFY "YES" +#define FEAT_TLS_HOST_VERIFY 1 #else -#define FEAT_TLS_HOST_VERIFY "NO" +#define FEAT_TLS_HOST_VERIFY 0 #endif #ifdef ENABLE_HTTPS -#define FEAT_NATIVE_HTTPS "YES" +#define FEAT_NATIVE_HTTPS 1 #else -#define FEAT_NATIVE_HTTPS "NO" +#define FEAT_NATIVE_HTTPS 0 #endif // Optional libraries #ifdef ENABLE_JSONC -#define FEAT_JSONC "YES" +#define FEAT_JSONC 1 #else -#define FEAT_JSONC "NO" +#define FEAT_JSONC 0 #endif #ifdef ENABLE_JEMALLOC -#define FEAT_JEMALLOC "YES" +#define FEAT_JEMALLOC 1 #else -#define FEAT_JEMALLOC "NO" +#define FEAT_JEMALLOC 0 #endif #ifdef ENABLE_TCMALLOC -#define FEAT_TCMALLOC "YES" +#define FEAT_TCMALLOC 1 #else -#define FEAT_TCMALLOC "NO" +#define FEAT_TCMALLOC 0 #endif #ifdef HAVE_CAPABILITY -#define FEAT_LIBCAP "YES" +#define FEAT_LIBCAP 1 #else -#define FEAT_LIBCAP "NO" +#define FEAT_LIBCAP 0 #endif +#ifndef ACLK_NG #ifdef ACLK_NO_LIBMOSQ -#define FEAT_MOSQUITTO "NO" +#define FEAT_MOSQUITTO 0 #else -#define FEAT_MOSQUITTO "YES" +#define FEAT_MOSQUITTO 1 #endif #ifdef ACLK_NO_LWS -#define FEAT_LWS "NO" +#define FEAT_LWS 0 +#define FEAT_LWS_MSG "" #else #ifdef ENABLE_ACLK #include #endif #ifdef BUNDLED_LWS -#define FEAT_LWS "YES static" +#define FEAT_LWS 1 +#define FEAT_LWS_MSG "static" #else -#define FEAT_LWS "YES shared-lib" +#define FEAT_LWS 1 +#define FEAT_LWS_MSG "shared-lib" #endif #endif +#endif /* ACLK_NG */ #ifdef NETDATA_WITH_ZLIB -#define FEAT_ZLIB "YES" +#define FEAT_ZLIB 1 #else -#define FEAT_ZLIB "NO" +#define FEAT_ZLIB 0 #endif #ifdef STORAGE_WITH_MATH -#define FEAT_LIBM "YES" +#define FEAT_LIBM 1 #else -#define FEAT_LIBM "NO" +#define FEAT_LIBM 0 #endif #ifdef HAVE_CRYPTO -#define FEAT_CRYPTO "YES" +#define FEAT_CRYPTO 1 #else -#define FEAT_CRYPTO "NO" +#define FEAT_CRYPTO 0 #endif // Optional plugins #ifdef ENABLE_APPS_PLUGIN -#define FEAT_APPS_PLUGIN "YES" +#define FEAT_APPS_PLUGIN 1 #else -#define FEAT_APPS_PLUGIN "NO" +#define FEAT_APPS_PLUGIN 0 #endif #ifdef HAVE_FREEIPMI -#define FEAT_IPMI "YES" +#define FEAT_IPMI 1 #else -#define FEAT_IPMI "NO" +#define FEAT_IPMI 0 #endif #ifdef HAVE_CUPS -#define FEAT_CUPS "YES" +#define FEAT_CUPS 1 #else -#define FEAT_CUPS "NO" +#define FEAT_CUPS 0 #endif -#ifdef HAVE_LIBMNL -#define FEAT_NFACCT "YES" +#ifdef HAVE_NFACCT +#define FEAT_NFACCT 1 #else -#define FEAT_NFACCT "NO" +#define FEAT_NFACCT 0 #endif #ifdef HAVE_LIBXENSTAT -#define FEAT_XEN "YES" +#define FEAT_XEN 1 #else -#define FEAT_XEN "NO" +#define FEAT_XEN 0 #endif #ifdef HAVE_XENSTAT_VBD_ERROR -#define FEAT_XEN_VBD_ERROR "YES" +#define FEAT_XEN_VBD_ERROR 1 #else -#define FEAT_XEN_VBD_ERROR "NO" +#define FEAT_XEN_VBD_ERROR 0 #endif #ifdef HAVE_LIBBPF -#define FEAT_EBPF "YES" +#define FEAT_EBPF 1 #else -#define FEAT_EBPF "NO" +#define FEAT_EBPF 0 #endif #ifdef HAVE_SETNS -#define FEAT_CGROUP_NET "YES" +#define FEAT_CGROUP_NET 1 #else -#define FEAT_CGROUP_NET "NO" +#define FEAT_CGROUP_NET 0 #endif #ifdef ENABLE_PERF_PLUGIN -#define FEAT_PERF "YES" +#define FEAT_PERF 1 #else -#define FEAT_PERF "NO" +#define FEAT_PERF 0 #endif #ifdef ENABLE_SLABINFO -#define FEAT_SLABINFO "YES" +#define FEAT_SLABINFO 1 #else -#define FEAT_SLABINFO "NO" +#define FEAT_SLABINFO 0 #endif // Optional Exporters #ifdef HAVE_KINESIS -#define FEAT_KINESIS "YES" +#define FEAT_KINESIS 1 #else -#define FEAT_KINESIS "NO" +#define FEAT_KINESIS 0 #endif #ifdef ENABLE_EXPORTING_PUBSUB -#define FEAT_PUBSUB "YES" +#define FEAT_PUBSUB 1 #else -#define FEAT_PUBSUB "NO" +#define FEAT_PUBSUB 0 #endif #ifdef HAVE_MONGOC -#define FEAT_MONGO "YES" +#define FEAT_MONGO 1 #else -#define FEAT_MONGO "NO" +#define FEAT_MONGO 0 #endif #ifdef ENABLE_PROMETHEUS_REMOTE_WRITE -#define FEAT_REMOTE_WRITE "YES" +#define FEAT_REMOTE_WRITE 1 #else -#define FEAT_REMOTE_WRITE "NO" +#define FEAT_REMOTE_WRITE 0 #endif +#define FEAT_YES_NO(x) ((x) ? "YES" : "NO") void print_build_info(void) { printf("Configure options: %s\n", CONFIGURE_COMMAND); printf("Features:\n"); - printf(" dbengine: %s\n", FEAT_DBENGINE); - printf(" Native HTTPS: %s\n", FEAT_NATIVE_HTTPS); - printf(" Netdata Cloud: %s\n", FEAT_CLOUD); - printf(" TLS Host Verification: %s\n", FEAT_TLS_HOST_VERIFY); + printf(" dbengine: %s\n", FEAT_YES_NO(FEAT_DBENGINE)); + printf(" Native HTTPS: %s\n", FEAT_YES_NO(FEAT_NATIVE_HTTPS)); + printf(" Netdata Cloud: %s %s\n", FEAT_YES_NO(FEAT_CLOUD), FEAT_CLOUD_MSG); +#if FEAT_CLOUD == 1 + printf(" Cloud Implementation: %s\n", ACLK_IMPL); +#endif + printf(" TLS Host Verification: %s\n", FEAT_YES_NO(FEAT_TLS_HOST_VERIFY)); printf("Libraries:\n"); - printf(" jemalloc: %s\n", FEAT_JEMALLOC); - printf(" JSON-C: %s\n", FEAT_JSONC); - printf(" libcap: %s\n", FEAT_LIBCAP); - printf(" libcrypto: %s\n", FEAT_CRYPTO); - printf(" libm: %s\n", FEAT_LIBM); + printf(" jemalloc: %s\n", FEAT_YES_NO(FEAT_JEMALLOC)); + printf(" JSON-C: %s\n", FEAT_YES_NO(FEAT_JSONC)); + printf(" libcap: %s\n", FEAT_YES_NO(FEAT_LIBCAP)); + printf(" libcrypto: %s\n", FEAT_YES_NO(FEAT_CRYPTO)); + printf(" libm: %s\n", FEAT_YES_NO(FEAT_LIBM)); +#ifndef ACLK_NG #if defined(ENABLE_ACLK) - printf(" LWS: %s v%d.%d.%d\n", FEAT_LWS, LWS_LIBRARY_VERSION_MAJOR, LWS_LIBRARY_VERSION_MINOR, LWS_LIBRARY_VERSION_PATCH); + printf(" LWS: %s %s v%d.%d.%d\n", FEAT_YES_NO(FEAT_LWS), FEAT_LWS_MSG, LWS_LIBRARY_VERSION_MAJOR, LWS_LIBRARY_VERSION_MINOR, LWS_LIBRARY_VERSION_PATCH); #else - printf(" LWS: %s\n", FEAT_LWS); + printf(" LWS: %s %s\n", FEAT_YES_NO(FEAT_LWS), FEAT_LWS_MSG); +#endif + printf(" mosquitto: %s\n", FEAT_YES_NO(FEAT_MOSQUITTO)); #endif - printf(" mosquitto: %s\n", FEAT_MOSQUITTO); - printf(" tcalloc: %s\n", FEAT_TCMALLOC); - printf(" zlib: %s\n", FEAT_ZLIB); + printf(" tcalloc: %s\n", FEAT_YES_NO(FEAT_TCMALLOC)); + printf(" zlib: %s\n", FEAT_YES_NO(FEAT_ZLIB)); printf("Plugins:\n"); - printf(" apps: %s\n", FEAT_APPS_PLUGIN); - printf(" cgroup Network Tracking: %s\n", FEAT_CGROUP_NET); - printf(" CUPS: %s\n", FEAT_CUPS); - printf(" EBPF: %s\n", FEAT_EBPF); - printf(" IPMI: %s\n", FEAT_IPMI); - printf(" NFACCT: %s\n", FEAT_NFACCT); - printf(" perf: %s\n", FEAT_PERF); - printf(" slabinfo: %s\n", FEAT_SLABINFO); - printf(" Xen: %s\n", FEAT_XEN); - printf(" Xen VBD Error Tracking: %s\n", FEAT_XEN_VBD_ERROR); + printf(" apps: %s\n", FEAT_YES_NO(FEAT_APPS_PLUGIN)); + printf(" cgroup Network Tracking: %s\n", FEAT_YES_NO(FEAT_CGROUP_NET)); + printf(" CUPS: %s\n", FEAT_YES_NO(FEAT_CUPS)); + printf(" EBPF: %s\n", FEAT_YES_NO(FEAT_EBPF)); + printf(" IPMI: %s\n", FEAT_YES_NO(FEAT_IPMI)); + printf(" NFACCT: %s\n", FEAT_YES_NO(FEAT_NFACCT)); + printf(" perf: %s\n", FEAT_YES_NO(FEAT_PERF)); + printf(" slabinfo: %s\n", FEAT_YES_NO(FEAT_SLABINFO)); + printf(" Xen: %s\n", FEAT_YES_NO(FEAT_XEN)); + printf(" Xen VBD Error Tracking: %s\n", FEAT_YES_NO(FEAT_XEN_VBD_ERROR)); printf("Exporters:\n"); - printf(" AWS Kinesis: %s\n", FEAT_KINESIS); - printf(" GCP PubSub: %s\n", FEAT_PUBSUB); - printf(" MongoDB: %s\n", FEAT_MONGO); - printf(" Prometheus Remote Write: %s\n", FEAT_REMOTE_WRITE); + printf(" AWS Kinesis: %s\n", FEAT_YES_NO(FEAT_KINESIS)); + printf(" GCP PubSub: %s\n", FEAT_YES_NO(FEAT_PUBSUB)); + printf(" MongoDB: %s\n", FEAT_YES_NO(FEAT_MONGO)); + printf(" Prometheus Remote Write: %s\n", FEAT_YES_NO(FEAT_REMOTE_WRITE)); +}; + + +#define FEAT_JSON_BOOL(x) ((x) ? "true" : "false") +// This intentionally does not use JSON-C so it works even if JSON-C is not present +// This is used for anonymous statistics reporting, so it intentionally +// does not include the configure options, which would be very easy to use +// for tracking custom builds (and complicate outputting valid JSON). +void print_build_info_json(void) { + printf("{\n"); + printf(" \"features\": {\n"); + printf(" \"dbengine\": %s,\n", FEAT_JSON_BOOL(FEAT_DBENGINE)); + printf(" \"native-https\": %s,\n", FEAT_JSON_BOOL(FEAT_NATIVE_HTTPS)); + printf(" \"cloud\": %s,\n", FEAT_JSON_BOOL(FEAT_CLOUD)); +#ifdef DISABLE_CLOUD + printf(" \"cloud-disabled\": true,\n"); +#else + printf(" \"cloud-disabled\": false,\n"); +#endif +#if FEAT_CLOUD == 1 + printf(" \"cloud-implementation\": \"%s\",\n", ACLK_IMPL); +#endif + printf(" \"tls-host-verify\": %s\n", FEAT_JSON_BOOL(FEAT_TLS_HOST_VERIFY)); + printf(" },\n"); + + printf(" \"libs\": {\n"); + printf(" \"jemalloc\": %s,\n", FEAT_JSON_BOOL(FEAT_JEMALLOC)); + printf(" \"jsonc\": %s,\n", FEAT_JSON_BOOL(FEAT_JSONC)); + printf(" \"libcap\": %s,\n", FEAT_JSON_BOOL(FEAT_LIBCAP)); + printf(" \"libcrypto\": %s,\n", FEAT_JSON_BOOL(FEAT_CRYPTO)); + printf(" \"libm\": %s,\n", FEAT_JSON_BOOL(FEAT_LIBM)); +#ifndef ACLK_NG +#if defined(ENABLE_ACLK) + printf(" \"lws\": %s,\n", FEAT_JSON_BOOL(FEAT_LWS)); + printf(" \"lws-version\": \"%d.%d.%d\",\n", LWS_LIBRARY_VERSION_MAJOR, LWS_LIBRARY_VERSION_MINOR, LWS_LIBRARY_VERSION_PATCH); + printf(" \"lws-type\": \"%s\",\n", FEAT_LWS_MSG); +#else + printf(" \"lws\": %s,\n", FEAT_JSON_BOOL(FEAT_LWS)); +#endif + printf(" \"mosquitto\": %s,\n", FEAT_JSON_BOOL(FEAT_MOSQUITTO)); +#endif + printf(" \"tcmalloc\": %s,\n", FEAT_JSON_BOOL(FEAT_TCMALLOC)); + printf(" \"zlib\": %s\n", FEAT_JSON_BOOL(FEAT_ZLIB)); + printf(" },\n"); + + printf(" \"plugins\": {\n"); + printf(" \"apps\": %s,\n", FEAT_JSON_BOOL(FEAT_APPS_PLUGIN)); + printf(" \"cgroup-net\": %s,\n", FEAT_JSON_BOOL(FEAT_CGROUP_NET)); + printf(" \"cups\": %s,\n", FEAT_JSON_BOOL(FEAT_CUPS)); + printf(" \"ebpf\": %s,\n", FEAT_JSON_BOOL(FEAT_EBPF)); + printf(" \"ipmi\": %s,\n", FEAT_JSON_BOOL(FEAT_IPMI)); + printf(" \"nfacct\": %s,\n", FEAT_JSON_BOOL(FEAT_NFACCT)); + printf(" \"perf\": %s,\n", FEAT_JSON_BOOL(FEAT_PERF)); + printf(" \"slabinfo\": %s,\n", FEAT_JSON_BOOL(FEAT_SLABINFO)); + printf(" \"xen\": %s,\n", FEAT_JSON_BOOL(FEAT_XEN)); + printf(" \"xen-vbd-error\": %s\n", FEAT_JSON_BOOL(FEAT_XEN_VBD_ERROR)); + printf(" },\n"); + + printf(" \"exporters\": {\n"); + printf(" \"kinesis\": %s,\n", FEAT_JSON_BOOL(FEAT_KINESIS)); + printf(" \"pubsub\": %s,\n", FEAT_JSON_BOOL(FEAT_PUBSUB)); + printf(" \"mongodb\": %s,\n", FEAT_JSON_BOOL(FEAT_MONGO)); + printf(" \"prom-remote-write\": %s\n", FEAT_JSON_BOOL(FEAT_REMOTE_WRITE)); + printf(" }\n"); + printf("}\n"); }; diff --git a/daemon/buildinfo.h b/daemon/buildinfo.h index 76912ea67..05e5efb5a 100644 --- a/daemon/buildinfo.h +++ b/daemon/buildinfo.h @@ -5,4 +5,6 @@ extern void print_build_info(void); +extern void print_build_info_json(void); + #endif // NETDATA_BUILDINFO_H diff --git a/daemon/common.h b/daemon/common.h index 68af95740..1a58ddda8 100644 --- a/daemon/common.h +++ b/daemon/common.h @@ -66,7 +66,11 @@ #include "claim/claim.h" // netdata agent cloud link +#ifndef ACLK_NG #include "aclk/legacy/agent_cloud_link.h" +#else +#include "aclk/aclk.h" +#endif // global GUID map functions diff --git a/daemon/config/README.md b/daemon/config/README.md index a1e2b04b5..b1e790a21 100644 --- a/daemon/config/README.md +++ b/daemon/config/README.md @@ -28,10 +28,11 @@ The configuration file is a `name = value` dictionary. Netdata will not complain ## Applying changes -After `netdata.conf` has been modified, Netdata needs to be restarted for changes to apply: +After `netdata.conf` has been modified, Netdata needs to be [restarted](/docs/configure/start-stop-restart.md) for +changes to apply: ```bash -sudo service netdata restart +sudo systemctl restart netdata ``` If the above does not work, try the following: diff --git a/daemon/main.c b/daemon/main.c index 7c002ac47..346486a84 100644 --- a/daemon/main.c +++ b/daemon/main.c @@ -50,6 +50,7 @@ void netdata_cleanup_and_exit(int ret) { rrdeng_exit(&multidb_ctx); #endif } + sql_close_database(); // unlink the pid if(pidfile[0]) { @@ -80,7 +81,7 @@ struct netdata_static_thread static_threads[] = { NETDATA_PLUGIN_HOOK_IDLEJITTER NETDATA_PLUGIN_HOOK_STATSD -#ifdef ENABLE_ACLK +#if defined(ENABLE_ACLK) || defined(ACLK_NG) NETDATA_ACLK_HOOK #endif @@ -1258,6 +1259,10 @@ int main(int argc, char **argv) { print_build_info(); return 0; } + else if(strcmp(optarg, "buildinfojson") == 0) { + print_build_info_json(); + return 0; + } else { fprintf(stderr, "Unknown -W parameter '%s'\n", optarg); return help(1); diff --git a/daemon/main.h b/daemon/main.h index 9d9f4ef0f..4000ab70b 100644 --- a/daemon/main.h +++ b/daemon/main.h @@ -19,7 +19,7 @@ struct option_def { const char val; /** The name of the long option. */ const char *description; - /** Short descripton what the option does */ + /** Short description what the option does */ /** Name of the argument displayed in SYNOPSIS */ const char *arg_name; /** Default value if not set */ diff --git a/daemon/unit_test.c b/daemon/unit_test.c index e6a69e354..9a17aa762 100644 --- a/daemon/unit_test.c +++ b/daemon/unit_test.c @@ -371,7 +371,7 @@ int unit_test_str2ld() { return -1; } } - else if(mine != sys && abs(mine-sys) > 0.000001) { + else if(mine != sys && ABS(mine-sys) > 0.000001) { fprintf(stderr, "Value '%s' is parsed as %" LONG_DOUBLE_MODIFIER ", but system believes it is %" LONG_DOUBLE_MODIFIER ", delta %" LONG_DOUBLE_MODIFIER ".\n", values[i], mine, sys, sys-mine); return -1; } diff --git a/database/engine/pagecache.c b/database/engine/pagecache.c index a18207100..d7698de01 100644 --- a/database/engine/pagecache.c +++ b/database/engine/pagecache.c @@ -699,6 +699,74 @@ void pg_cache_get_filtered_info_prev(struct rrdengine_instance *ctx, struct pg_c } uv_rwlock_rdunlock(&page_index->lock); } + +/** + * Searches for an unallocated page without triggering disk I/O. Attempts to reserve the page and get a reference. + * @param ctx DB context + * @param id lookup by UUID + * @param start_time exact starting time in usec + * @param ret_page_indexp Sets the page index pointer (*ret_page_indexp) for the given UUID. + * @return the page descriptor or NULL on failure. It can fail if: + * 1. The page is already allocated to the page cache. + * 2. It did not succeed to get a reference. + * 3. It did not succeed to reserve a spot in the page cache. + */ +struct rrdeng_page_descr *pg_cache_lookup_unpopulated_and_lock(struct rrdengine_instance *ctx, uuid_t *id, + usec_t start_time) +{ + struct page_cache *pg_cache = &ctx->pg_cache; + struct rrdeng_page_descr *descr = NULL; + struct page_cache_descr *pg_cache_descr = NULL; + unsigned long flags; + Pvoid_t *PValue; + struct pg_cache_page_index *page_index = NULL; + Word_t Index; + + uv_rwlock_rdlock(&pg_cache->metrics_index.lock); + PValue = JudyHSGet(pg_cache->metrics_index.JudyHS_array, id, sizeof(uuid_t)); + if (likely(NULL != PValue)) { + page_index = *PValue; + } + uv_rwlock_rdunlock(&pg_cache->metrics_index.lock); + + if ((NULL == PValue) || !pg_cache_try_reserve_pages(ctx, 1)) { + /* Failed to find page or failed to reserve a spot in the cache */ + return NULL; + } + + uv_rwlock_rdlock(&page_index->lock); + Index = (Word_t)(start_time / USEC_PER_SEC); + PValue = JudyLGet(page_index->JudyL_array, Index, PJE0); + if (likely(NULL != PValue)) { + descr = *PValue; + } + if (NULL == PValue || 0 == descr->page_length) { + /* Failed to find non-empty page */ + uv_rwlock_rdunlock(&page_index->lock); + + pg_cache_release_pages(ctx, 1); + return NULL; + } + + rrdeng_page_descr_mutex_lock(ctx, descr); + pg_cache_descr = descr->pg_cache_descr; + flags = pg_cache_descr->flags; + uv_rwlock_rdunlock(&page_index->lock); + + if ((flags & RRD_PAGE_POPULATED) || !pg_cache_try_get_unsafe(descr, 1)) { + /* Failed to get reference or page is already populated */ + rrdeng_page_descr_mutex_unlock(ctx, descr); + + pg_cache_release_pages(ctx, 1); + return NULL; + } + /* success */ + rrdeng_page_descr_mutex_unlock(ctx, descr); + rrd_stat_atomic_add(&ctx->stats.pg_cache_misses, 1); + + return descr; +} + /** * Searches for pages in a time range and triggers disk I/O if necessary and possible. * Does not get a reference. diff --git a/database/engine/pagecache.h b/database/engine/pagecache.h index 31e9739da..d5350ef56 100644 --- a/database/engine/pagecache.h +++ b/database/engine/pagecache.h @@ -172,6 +172,8 @@ extern usec_t pg_cache_oldest_time_in_range(struct rrdengine_instance *ctx, uuid extern void pg_cache_get_filtered_info_prev(struct rrdengine_instance *ctx, struct pg_cache_page_index *page_index, usec_t point_in_time, pg_cache_page_info_filter_t *filter, struct rrdeng_page_info *page_info); +extern struct rrdeng_page_descr *pg_cache_lookup_unpopulated_and_lock(struct rrdengine_instance *ctx, uuid_t *id, + usec_t start_time); extern unsigned pg_cache_preload(struct rrdengine_instance *ctx, uuid_t *id, usec_t start_time, usec_t end_time, struct rrdeng_page_info **page_info_arrayp, struct pg_cache_page_index **ret_page_indexp); diff --git a/database/engine/rrdengine.c b/database/engine/rrdengine.c index 43135ff01..0c4a401cb 100644 --- a/database/engine/rrdengine.c +++ b/database/engine/rrdengine.c @@ -9,6 +9,8 @@ rrdeng_stats_t rrdeng_reserved_file_descriptors = 0; rrdeng_stats_t global_pg_cache_over_half_dirty_events = 0; rrdeng_stats_t global_flushing_pressure_page_deletions = 0; +static unsigned pages_per_extent = MAX_PAGES_PER_EXTENT; + static void sanity_check(void) { /* Magic numbers must fit in the super-blocks */ @@ -305,19 +307,32 @@ after_crc_check: } } - for (i = 0 ; i < xt_io_descr->descr_count; ++i) { - page = mallocz(RRDENG_BLOCK_SIZE); - descr = xt_io_descr->descr_array[i]; - for (j = 0, page_offset = 0; j < count; ++j) { + for (i = 0, page_offset = 0; i < count; page_offset += header->descr[i++].page_length) { + uint8_t is_prefetched_page; + descr = NULL; + for (j = 0 ; j < xt_io_descr->descr_count; ++j) { + struct rrdeng_page_descr *descrj; + + descrj = xt_io_descr->descr_array[j]; /* care, we don't hold the descriptor mutex */ - if (!uuid_compare(*(uuid_t *) header->descr[j].uuid, *descr->id) && - header->descr[j].page_length == descr->page_length && - header->descr[j].start_time == descr->start_time && - header->descr[j].end_time == descr->end_time) { + if (!uuid_compare(*(uuid_t *) header->descr[i].uuid, *descrj->id) && + header->descr[i].page_length == descrj->page_length && + header->descr[i].start_time == descrj->start_time && + header->descr[i].end_time == descrj->end_time) { + descr = descrj; break; } - page_offset += header->descr[j].page_length; } + is_prefetched_page = 0; + if (!descr) { /* This extent page has not been requested. Try populating it for locality (best effort). */ + descr = pg_cache_lookup_unpopulated_and_lock(ctx, (uuid_t *)header->descr[i].uuid, + header->descr[i].start_time); + if (!descr) + continue; /* Failed to reserve a suitable page */ + is_prefetched_page = 1; + } + page = mallocz(RRDENG_BLOCK_SIZE); + /* care, we don't hold the descriptor mutex */ if (have_read_error) { /* Applications should make sure NULL values match 0 as does SN_EMPTY_SLOT */ @@ -334,7 +349,7 @@ after_crc_check: pg_cache_descr->flags &= ~RRD_PAGE_READ_PENDING; rrdeng_page_descr_mutex_unlock(ctx, descr); pg_cache_replaceQ_insert(ctx, descr); - if (xt_io_descr->release_descr) { + if (xt_io_descr->release_descr || is_prefetched_page) { pg_cache_put(ctx, descr); } else { debug(D_RRDENGINE, "%s: Waking up waiters.", __func__); @@ -666,7 +681,7 @@ static int do_flush_pages(struct rrdengine_worker_config* wc, int force, struct PValue = JudyLFirst(pg_cache->committed_page_index.JudyL_array, &Index, PJE0), descr = unlikely(NULL == PValue) ? NULL : *PValue ; - descr != NULL && count != MAX_PAGES_PER_EXTENT ; + descr != NULL && count != pages_per_extent ; PValue = JudyLNext(pg_cache->committed_page_index.JudyL_array, &Index, PJE0), descr = unlikely(NULL == PValue) ? NULL : *PValue) { @@ -1031,6 +1046,21 @@ struct rrdeng_cmd rrdeng_deq_cmd(struct rrdengine_worker_config* wc) return ret; } +static void load_configuration_dynamic(void) +{ + unsigned read_num; + static int printed_error = 0; + + read_num = (unsigned) config_get_number(CONFIG_SECTION_GLOBAL, "dbengine extent pages", + MAX_PAGES_PER_EXTENT); + if (read_num > 0 && read_num <= MAX_PAGES_PER_EXTENT) { + pages_per_extent = read_num; + } else if (!printed_error) { + printed_error = 1; + error("Invalid dbengine extent pages %u given. Defaulting to %u.", read_num, pages_per_extent); + } +} + void async_cb(uv_async_t *handle) { uv_stop(handle->loop); @@ -1084,6 +1114,7 @@ void timer_cb(uv_timer_t* handle) } } } + load_configuration_dynamic(); #ifdef NETDATA_INTERNAL_CHECKS { char buf[4096]; diff --git a/database/engine/rrdengine.h b/database/engine/rrdengine.h index 87af04bff..2d48665f8 100644 --- a/database/engine/rrdengine.h +++ b/database/engine/rrdengine.h @@ -56,16 +56,20 @@ enum rrdeng_opcode { RRDENG_MAX_OPCODE }; +struct rrdeng_read_page { + struct rrdeng_page_descr *page_cache_descr; +}; + +struct rrdeng_read_extent { + struct rrdeng_page_descr *page_cache_descr[MAX_PAGES_PER_EXTENT]; + int page_count; +}; + struct rrdeng_cmd { enum rrdeng_opcode opcode; union { - struct rrdeng_read_page { - struct rrdeng_page_descr *page_cache_descr; - } read_page; - struct rrdeng_read_extent { - struct rrdeng_page_descr *page_cache_descr[MAX_PAGES_PER_EXTENT]; - int page_count; - } read_extent; + struct rrdeng_read_page read_page; + struct rrdeng_read_extent read_extent; struct completion *completion; }; }; @@ -230,4 +234,4 @@ extern void rrdeng_worker(void* arg); extern void rrdeng_enq_cmd(struct rrdengine_worker_config* wc, struct rrdeng_cmd *cmd); extern struct rrdeng_cmd rrdeng_deq_cmd(struct rrdengine_worker_config* wc); -#endif /* NETDATA_RRDENGINE_H */ \ No newline at end of file +#endif /* NETDATA_RRDENGINE_H */ diff --git a/database/engine/rrdengineapi.c b/database/engine/rrdengineapi.c index 7b2ff5b72..cb46e06e3 100755 --- a/database/engine/rrdengineapi.c +++ b/database/engine/rrdengineapi.c @@ -691,6 +691,36 @@ time_t rrdeng_metric_oldest_time(RRDDIM *rd) return page_index->oldest_time / USEC_PER_SEC; } +int rrdeng_metric_latest_time_by_uuid(uuid_t *dim_uuid, time_t *first_entry_t, time_t *last_entry_t) +{ + struct page_cache *pg_cache; + struct rrdengine_instance *ctx; + Pvoid_t *PValue; + struct pg_cache_page_index *page_index = NULL; + + ctx = get_rrdeng_ctx_from_host(localhost); + if (unlikely(!ctx)) { + error("Failed to fetch multidb context"); + return 1; + } + pg_cache = &ctx->pg_cache; + + uv_rwlock_rdlock(&pg_cache->metrics_index.lock); + PValue = JudyHSGet(pg_cache->metrics_index.JudyHS_array, dim_uuid, sizeof(uuid_t)); + if (likely(NULL != PValue)) { + page_index = *PValue; + } + uv_rwlock_rdunlock(&pg_cache->metrics_index.lock); + + if (likely(page_index)) { + *first_entry_t = page_index->oldest_time / USEC_PER_SEC; + *last_entry_t = page_index->latest_time / USEC_PER_SEC; + return 0; + } + + return 1; +} + /* Also gets a reference for the page */ void *rrdeng_create_page(struct rrdengine_instance *ctx, uuid_t *id, struct rrdeng_page_descr **ret_descr) { diff --git a/database/engine/rrdengineapi.h b/database/engine/rrdengineapi.h index 41375b980..00e55e662 100644 --- a/database/engine/rrdengineapi.h +++ b/database/engine/rrdengineapi.h @@ -59,5 +59,6 @@ extern int rrdeng_init(RRDHOST *host, struct rrdengine_instance **ctxp, char *db extern int rrdeng_exit(struct rrdengine_instance *ctx); extern void rrdeng_prepare_exit(struct rrdengine_instance *ctx); +extern int rrdeng_metric_latest_time_by_uuid(uuid_t *dim_uuid, time_t *first_entry_t, time_t *last_entry_t); #endif /* NETDATA_RRDENGINEAPI_H */ diff --git a/database/rrd.h b/database/rrd.h index b16fcdc16..59d0501bd 100644 --- a/database/rrd.h +++ b/database/rrd.h @@ -34,12 +34,24 @@ struct pg_cache_page_index; #include "rrdcalc.h" #include "rrdcalctemplate.h" #include "../streaming/rrdpush.h" + +#ifndef ACLK_NG #include "../aclk/legacy/aclk_rrdhost_state.h" +#else +#include "aclk/aclk.h" +#endif + +enum { + CONTEXT_FLAGS_ARCHIVE = 0x01, + CONTEXT_FLAGS_CHART = 0x02, + CONTEXT_FLAGS_CONTEXT = 0x04 +}; struct context_param { RRDDIM *rd; time_t first_entry_t; time_t last_entry_t; + uint8_t flags; }; #define META_CHART_UPDATED 1 @@ -130,7 +142,7 @@ extern const char *rrd_algorithm_name(RRD_ALGORITHM algorithm); // RRD FAMILY struct rrdfamily { - avl avl; + avl_t avl; const char *family; uint32_t hash_family; @@ -235,7 +247,7 @@ struct rrddim { // ------------------------------------------------------------------------ // binary indexing structures - avl avl; // the binary index - this has to be first member! + avl_t avl; // the binary index - this has to be first member! // ------------------------------------------------------------------------ // the dimension definition @@ -336,6 +348,18 @@ union rrddim_collect_handle { // ---------------------------------------------------------------------------- // iterator state for RRD dimension data queries + +#ifdef ENABLE_DBENGINE +struct rrdeng_query_handle { + struct rrdeng_page_descr *descr; + struct rrdengine_instance *ctx; + struct pg_cache_page_index *page_index; + time_t next_page_time; + time_t now; + unsigned position; +}; +#endif + struct rrddim_query_handle { RRDDIM *rd; time_t start_time; @@ -347,14 +371,7 @@ struct rrddim_query_handle { uint8_t finished; } slotted; // state the legacy code uses #ifdef ENABLE_DBENGINE - struct rrdeng_query_handle { - struct rrdeng_page_descr *descr; - struct rrdengine_instance *ctx; - struct pg_cache_page_index *page_index; - time_t next_page_time; - time_t now; - unsigned position; - } rrdeng; // state the database engine uses + struct rrdeng_query_handle rrdeng; // state the database engine uses #endif }; }; @@ -365,9 +382,9 @@ struct rrddim_query_handle { struct rrddim_volatile { #ifdef ENABLE_DBENGINE uuid_t *rrdeng_uuid; // database engine metric UUID - uuid_t *metric_uuid; // global UUID for this metric (unique_across hosts) struct pg_cache_page_index *page_index; #endif + uuid_t *metric_uuid; // global UUID for this metric (unique_across hosts) union rrddim_collect_handle handle; // ------------------------------------------------------------------------ // function pointers that handle data collection @@ -469,8 +486,8 @@ struct rrdset { // ------------------------------------------------------------------------ // binary indexing structures - avl avl; // the index, with key the id - this has to be first! - avl avlname; // the index, with key the name + avl_t avl; // the index, with key the id - this has to be first! + avl_t avlname; // the index, with key the name // ------------------------------------------------------------------------ // the set configuration @@ -523,7 +540,10 @@ struct rrdset { size_t counter; // the number of times we added values to this database size_t counter_done; // the number of times rrdset_done() has been called - time_t last_accessed_time; // the last time this RRDSET has been accessed + union { + time_t last_accessed_time; // the last time this RRDSET has been accessed + time_t last_entry_t; // the last_entry_t computed for transient RRDSET + }; time_t upstream_resync_time; // the timestamp up to which we should resync clock upstream char *plugin_name; // the name of the plugin that generated this @@ -722,7 +742,7 @@ struct rrdhost_system_info { }; struct rrdhost { - avl avl; // the index of hosts + avl_t avl; // the index of hosts // ------------------------------------------------------------------------ // host information @@ -851,8 +871,8 @@ struct rrdhost { #ifdef ENABLE_DBENGINE struct rrdengine_instance *rrdeng_ctx; // DB engine instance for this host - uuid_t host_uuid; // Global GUID for this host #endif + uuid_t host_uuid; // Global GUID for this host #ifdef ENABLE_HTTPS struct netdata_ssl ssl; //Structure used to encrypt the connection @@ -999,13 +1019,13 @@ extern void rrdhost_free_all(void); extern void rrdhost_save_all(void); extern void rrdhost_cleanup_all(void); -extern void rrdhost_cleanup_orphan_hosts_nolock(RRDHOST *protected); +extern void rrdhost_cleanup_orphan_hosts_nolock(RRDHOST *protected_host); extern void rrdhost_system_info_free(struct rrdhost_system_info *system_info); extern void rrdhost_free(RRDHOST *host); extern void rrdhost_save_charts(RRDHOST *host); extern void rrdhost_delete_charts(RRDHOST *host); -extern int rrdhost_should_be_removed(RRDHOST *host, RRDHOST *protected, time_t now); +extern int rrdhost_should_be_removed(RRDHOST *host, RRDHOST *protected_host, time_t now); extern void rrdset_update_heterogeneous_flag(RRDSET *st); @@ -1053,7 +1073,7 @@ extern void rrdset_isnot_obsolete(RRDSET *st); // checks if the RRDSET should be offered to viewers #define rrdset_is_available_for_viewers(st) (rrdset_flag_check(st, RRDSET_FLAG_ENABLED) && !rrdset_flag_check(st, RRDSET_FLAG_HIDDEN) && !rrdset_flag_check(st, RRDSET_FLAG_OBSOLETE) && !rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED) && (st)->dimensions && (st)->rrd_memory_mode != RRD_MEMORY_MODE_NONE) -#define rrdset_is_available_for_backends(st) (rrdset_flag_check(st, RRDSET_FLAG_ENABLED) && !rrdset_flag_check(st, RRDSET_FLAG_OBSOLETE) && !rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED) && (st)->dimensions) +#define rrdset_is_available_for_exporting_and_alarms(st) (rrdset_flag_check(st, RRDSET_FLAG_ENABLED) && !rrdset_flag_check(st, RRDSET_FLAG_OBSOLETE) && !rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED) && (st)->dimensions) #define rrdset_is_archived(st) (rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED) && (st)->dimensions) // get the total duration in seconds of the round robin database @@ -1281,8 +1301,8 @@ extern int rrdfamily_compare(void *a, void *b); extern RRDFAMILY *rrdfamily_create(RRDHOST *host, const char *id); extern void rrdfamily_free(RRDHOST *host, RRDFAMILY *rc); -#define rrdset_index_add(host, st) (RRDSET *)avl_insert_lock(&((host)->rrdset_root_index), (avl *)(st)) -#define rrdset_index_del(host, st) (RRDSET *)avl_remove_lock(&((host)->rrdset_root_index), (avl *)(st)) +#define rrdset_index_add(host, st) (RRDSET *)avl_insert_lock(&((host)->rrdset_root_index), (avl_t *)(st)) +#define rrdset_index_del(host, st) (RRDSET *)avl_remove_lock(&((host)->rrdset_root_index), (avl_t *)(st)) extern RRDSET *rrdset_index_del_name(RRDHOST *host, RRDSET *st); extern void rrdset_free(RRDSET *st); @@ -1312,7 +1332,7 @@ extern void set_host_properties( #ifdef ENABLE_DBENGINE #include "database/engine/rrdengineapi.h" -#include "sqlite/sqlite_functions.h" #endif +#include "sqlite/sqlite_functions.h" #endif /* NETDATA_RRD_H */ diff --git a/database/rrdcalc.c b/database/rrdcalc.c index 935ee9c05..bc91da64f 100644 --- a/database/rrdcalc.c +++ b/database/rrdcalc.c @@ -472,7 +472,7 @@ inline RRDCALC *rrdcalc_create_from_template(RRDHOST *host, RRDCALCTEMPLATE *rt, rrdcalc_add_to_host(host, rc); if(!rt->foreachdim) { - RRDCALC *rdcmp = (RRDCALC *) avl_insert_lock(&(host)->alarms_idx_health_log,(avl *)rc); + RRDCALC *rdcmp = (RRDCALC *) avl_insert_lock(&(host)->alarms_idx_health_log,(avl_t *)rc); if (rdcmp != rc) { error("Cannot insert the alarm index ID %s",rc->name); } @@ -605,17 +605,17 @@ void rrdcalc_unlink_and_free(RRDHOST *host, RRDCALC *rc) { error("Cannot unlink alarm '%s.%s' from host '%s': not found", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname); } - RRDCALC *rdcmp = (RRDCALC *) avl_search_lock(&(host)->alarms_idx_health_log, (avl *)rc); + RRDCALC *rdcmp = (RRDCALC *) avl_search_lock(&(host)->alarms_idx_health_log, (avl_t *)rc); if (rdcmp) { - rdcmp = (RRDCALC *) avl_remove_lock(&(host)->alarms_idx_health_log, (avl *)rc); + rdcmp = (RRDCALC *) avl_remove_lock(&(host)->alarms_idx_health_log, (avl_t *)rc); if (!rdcmp) { error("Cannot remove the health alarm index from health_log"); } } - rdcmp = (RRDCALC *) avl_search_lock(&(host)->alarms_idx_name, (avl *)rc); + rdcmp = (RRDCALC *) avl_search_lock(&(host)->alarms_idx_name, (avl_t *)rc); if (rdcmp) { - rdcmp = (RRDCALC *) avl_remove_lock(&(host)->alarms_idx_name, (avl *)rc); + rdcmp = (RRDCALC *) avl_remove_lock(&(host)->alarms_idx_name, (avl_t *)rc); if (!rdcmp) { error("Cannot remove the health alarm index from idx_name"); } @@ -727,7 +727,7 @@ void rrdcalc_labels_unlink() { int alarm_isrepeating(RRDHOST *host, uint32_t alarm_id) { RRDCALC findme; findme.id = alarm_id; - RRDCALC *rc = (RRDCALC *)avl_search_lock(&host->alarms_idx_health_log, (avl *)&findme); + RRDCALC *rc = (RRDCALC *)avl_search_lock(&host->alarms_idx_health_log, (avl_t *)&findme); if (!rc) { return 0; } @@ -761,7 +761,7 @@ RRDCALC *alarm_max_last_repeat(RRDHOST *host, char *alarm_name,uint32_t hash) { RRDCALC findme; findme.name = alarm_name; findme.hash = hash; - RRDCALC *rc = (RRDCALC *)avl_search_lock(&host->alarms_idx_name, (avl *)&findme); + RRDCALC *rc = (RRDCALC *)avl_search_lock(&host->alarms_idx_name, (avl_t *)&findme); return rc; } diff --git a/database/rrdcalc.h b/database/rrdcalc.h index cd0d70049..27ff99a89 100644 --- a/database/rrdcalc.h +++ b/database/rrdcalc.h @@ -32,7 +32,7 @@ struct rrdcalc { - avl avl; // the index, with key the id - this has to be first! + avl_t avl; // the index, with key the id - this has to be first! uint32_t id; // the unique id of this alarm uint32_t next_event_id; // the next event id that will be used for this alarm diff --git a/database/rrddim.c b/database/rrddim.c index 6a1408595..b4ea34d2d 100644 --- a/database/rrddim.c +++ b/database/rrddim.c @@ -38,15 +38,15 @@ int rrddim_compare(void* a, void* b) { else return strcmp(((RRDDIM *)a)->id, ((RRDDIM *)b)->id); } -#define rrddim_index_add(st, rd) (RRDDIM *)avl_insert_lock(&((st)->dimensions_index), (avl *)(rd)) -#define rrddim_index_del(st,rd ) (RRDDIM *)avl_remove_lock(&((st)->dimensions_index), (avl *)(rd)) +#define rrddim_index_add(st, rd) (RRDDIM *)avl_insert_lock(&((st)->dimensions_index), (avl_t *)(rd)) +#define rrddim_index_del(st,rd ) (RRDDIM *)avl_remove_lock(&((st)->dimensions_index), (avl_t *)(rd)) static inline RRDDIM *rrddim_index_find(RRDSET *st, const char *id, uint32_t hash) { RRDDIM tmp = { .id = id, .hash = (hash)?hash:simple_hash(id) }; - return (RRDDIM *)avl_search_lock(&(st->dimensions_index), (avl *) &tmp); + return (RRDDIM *)avl_search_lock(&(st->dimensions_index), (avl_t *) &tmp); } @@ -197,7 +197,7 @@ void rrdcalc_link_to_rrddim(RRDDIM *rd, RRDSET *st, RRDHOST *host) { RRDCALC *child = rrdcalc_create_from_rrdcalc(rrdc, host, usename, rd->name); if (child) { rrdcalc_add_to_host(host, child); - RRDCALC *rdcmp = (RRDCALC *) avl_insert_lock(&(host)->alarms_idx_health_log,(avl *)child); + RRDCALC *rdcmp = (RRDCALC *) avl_insert_lock(&(host)->alarms_idx_health_log,(avl_t *)child); if (rdcmp != child) { error("Cannot insert the alarm index ID %s",child->name); } @@ -232,9 +232,7 @@ RRDDIM *rrddim_add_custom(RRDSET *st, const char *id, const char *name, collecte rc += rrddim_set_multiplier(st, rd, multiplier); rc += rrddim_set_divisor(st, rd, divisor); if (rrddim_flag_check(rd, RRDDIM_FLAG_ARCHIVED)) { -#ifdef ENABLE_DBENGINE store_active_dimension(rd->state->metric_uuid); -#endif rd->state->collect_ops.init(rd); rrddim_flag_clear(rd, RRDDIM_FLAG_ARCHIVED); rrddimvar_create(rd, RRDVAR_TYPE_CALCULATED, NULL, NULL, &rd->last_stored_value, RRDVAR_OPTION_DEFAULT); @@ -242,14 +240,11 @@ RRDDIM *rrddim_add_custom(RRDSET *st, const char *id, const char *name, collecte rrddimvar_create(rd, RRDVAR_TYPE_TIME_T, NULL, "_last_collected_t", &rd->last_collected_time.tv_sec, RRDVAR_OPTION_DEFAULT); calc_link_to_rrddim(rd); } - // DBENGINE available and activated? -#ifdef ENABLE_DBENGINE - if (likely(rd->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) && unlikely(rc)) { + if (unlikely(rc)) { debug(D_METADATALOG, "DIMENSION [%s] metadata updated", rd->id); (void)sql_store_dimension(rd->state->metric_uuid, rd->rrdset->chart_uuid, rd->id, rd->name, rd->multiplier, rd->divisor, rd->algorithm); } -#endif rrdset_unlock(st); return rd; } @@ -277,7 +272,7 @@ RRDDIM *rrddim_add_custom(RRDSET *st, const char *id, const char *name, collecte if(likely(rd)) { // we have a file mapped for rd - memset(&rd->avl, 0, sizeof(avl)); + memset(&rd->avl, 0, sizeof(avl_t)); rd->id = NULL; rd->name = NULL; rd->cache_filename = NULL; @@ -396,7 +391,6 @@ RRDDIM *rrddim_add_custom(RRDSET *st, const char *id, const char *name, collecte #ifdef ENABLE_DBENGINE uuid_t *dim_uuid = find_dimension_uuid(st, rd); rrdeng_metric_init(rd, dim_uuid); - store_active_dimension(rd->state->metric_uuid); rd->state->collect_ops.init = rrdeng_store_metric_init; rd->state->collect_ops.store_metric = rrdeng_store_metric_next; rd->state->collect_ops.finalize = rrdeng_store_metric_finalize; @@ -408,6 +402,9 @@ RRDDIM *rrddim_add_custom(RRDSET *st, const char *id, const char *name, collecte rd->state->query_ops.oldest_time = rrdeng_metric_oldest_time; #endif } else { + rd->state->metric_uuid = find_dimension_uuid(st, rd); + if (unlikely(!rd->state->metric_uuid)) + rd->state->metric_uuid = create_dimension_uuid(rd->rrdset, rd); rd->state->collect_ops.init = rrddim_collect_init; rd->state->collect_ops.store_metric = rrddim_collect_store_metric; rd->state->collect_ops.finalize = rrddim_collect_finalize; @@ -418,6 +415,7 @@ RRDDIM *rrddim_add_custom(RRDSET *st, const char *id, const char *name, collecte rd->state->query_ops.latest_time = rrddim_query_latest_time; rd->state->query_ops.oldest_time = rrddim_query_oldest_time; } + store_active_dimension(rd->state->metric_uuid); rd->state->collect_ops.init(rd); // append this dimension if(!st->dimensions) @@ -425,7 +423,7 @@ RRDDIM *rrddim_add_custom(RRDSET *st, const char *id, const char *name, collecte else { RRDDIM *td = st->dimensions; - if(td->algorithm != rd->algorithm || abs(td->multiplier) != abs(rd->multiplier) || abs(td->divisor) != abs(rd->divisor)) { + if(td->algorithm != rd->algorithm || ABS(td->multiplier) != ABS(rd->multiplier) || ABS(td->divisor) != ABS(rd->divisor)) { if(!rrdset_flag_check(st, RRDSET_FLAG_HETEROGENEOUS)) { #ifdef NETDATA_INTERNAL_CHECKS info("Dimension '%s' added on chart '%s' of host '%s' is not homogeneous to other dimensions already present (algorithm is '%s' vs '%s', multiplier is " COLLECTED_NUMBER_FORMAT " vs " COLLECTED_NUMBER_FORMAT ", divisor is " COLLECTED_NUMBER_FORMAT " vs " COLLECTED_NUMBER_FORMAT ").", @@ -476,10 +474,8 @@ void rrddim_free_custom(RRDSET *st, RRDDIM *rd, int db_rotated) if (!rrddim_flag_check(rd, RRDDIM_FLAG_ARCHIVED)) { uint8_t can_delete_metric = rd->state->collect_ops.finalize(rd); if (can_delete_metric && rd->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) { -#ifdef ENABLE_DBENGINE /* This metric has no data and no references */ delete_dimension_uuid(rd->state->metric_uuid); -#endif } } @@ -503,6 +499,7 @@ void rrddim_free_custom(RRDSET *st, RRDDIM *rd, int db_rotated) error("RRDDIM: INTERNAL ERROR: attempt to remove from index dimension '%s' on chart '%s', removed a different dimension.", rd->id, st->id); // free(rd->annotations); + freez(rd->state->metric_uuid); RRD_MEMORY_MODE rrd_memory_mode = rd->rrd_memory_mode; switch(rrd_memory_mode) { @@ -522,11 +519,6 @@ void rrddim_free_custom(RRDSET *st, RRDDIM *rd, int db_rotated) debug(D_RRD_CALLS, "Removing dimension '%s'.", rd->name); freez((void *)rd->id); freez(rd->cache_filename); -#ifdef ENABLE_DBENGINE - if (rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) { - freez(rd->state->metric_uuid); - } -#endif freez(rd->state); freez(rd); break; diff --git a/database/rrdfamily.c b/database/rrdfamily.c index f75f0adc3..3d91c3788 100644 --- a/database/rrdfamily.c +++ b/database/rrdfamily.c @@ -12,15 +12,15 @@ int rrdfamily_compare(void *a, void *b) { else return strcmp(((RRDFAMILY *)a)->family, ((RRDFAMILY *)b)->family); } -#define rrdfamily_index_add(host, rc) (RRDFAMILY *)avl_insert_lock(&((host)->rrdfamily_root_index), (avl *)(rc)) -#define rrdfamily_index_del(host, rc) (RRDFAMILY *)avl_remove_lock(&((host)->rrdfamily_root_index), (avl *)(rc)) +#define rrdfamily_index_add(host, rc) (RRDFAMILY *)avl_insert_lock(&((host)->rrdfamily_root_index), (avl_t *)(rc)) +#define rrdfamily_index_del(host, rc) (RRDFAMILY *)avl_remove_lock(&((host)->rrdfamily_root_index), (avl_t *)(rc)) static RRDFAMILY *rrdfamily_index_find(RRDHOST *host, const char *id, uint32_t hash) { RRDFAMILY tmp; tmp.family = id; tmp.hash_family = (hash)?hash:simple_hash(tmp.family); - return (RRDFAMILY *)avl_search_lock(&(host->rrdfamily_root_index), (avl *) &tmp); + return (RRDFAMILY *)avl_search_lock(&(host->rrdfamily_root_index), (avl_t *) &tmp); } RRDFAMILY *rrdfamily_create(RRDHOST *host, const char *id) { diff --git a/database/rrdhost.c b/database/rrdhost.c index 45c314602..ae49036a8 100644 --- a/database/rrdhost.c +++ b/database/rrdhost.c @@ -31,7 +31,7 @@ RRDHOST *rrdhost_find_by_guid(const char *guid, uint32_t hash) { strncpyz(tmp.machine_guid, guid, GUID_LEN); tmp.hash_machine_guid = (hash)?hash:simple_hash(tmp.machine_guid); - return (RRDHOST *)avl_search_lock(&(rrdhost_root_index), (avl *) &tmp); + return (RRDHOST *)avl_search_lock(&(rrdhost_root_index), (avl_t *) &tmp); } RRDHOST *rrdhost_find_by_hostname(const char *hostname, uint32_t hash) { @@ -53,8 +53,8 @@ RRDHOST *rrdhost_find_by_hostname(const char *hostname, uint32_t hash) { return NULL; } -#define rrdhost_index_add(rrdhost) (RRDHOST *)avl_insert_lock(&(rrdhost_root_index), (avl *)(rrdhost)) -#define rrdhost_index_del(rrdhost) (RRDHOST *)avl_remove_lock(&(rrdhost_root_index), (avl *)(rrdhost)) +#define rrdhost_index_add(rrdhost) (RRDHOST *)avl_insert_lock(&(rrdhost_root_index), (avl_t *)(rrdhost)) +#define rrdhost_index_del(rrdhost) (RRDHOST *)avl_remove_lock(&(rrdhost_root_index), (avl_t *)(rrdhost)) // ---------------------------------------------------------------------------- @@ -298,15 +298,16 @@ RRDHOST *rrdhost_create(const char *hostname, return NULL; } + if (likely(!uuid_parse(host->machine_guid, host->host_uuid))) { + int rc = sql_store_host(&host->host_uuid, hostname, registry_hostname, update_every, os, timezone, tags); + if (unlikely(rc)) + error_report("Failed to store machine GUID to the database"); + } + else + error_report("Host machine GUID %s is not valid", host->machine_guid); + if (host->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) { #ifdef ENABLE_DBENGINE - if (likely(!uuid_parse(host->machine_guid, host->host_uuid))) { - int rc = sql_store_host(&host->host_uuid, hostname, registry_hostname, update_every, os, timezone, tags); - if (unlikely(rc)) - error_report("Failed to store machine GUID to the database"); - } - else - error_report("Host machine GUID %s is not valid", host->machine_guid); char dbenginepath[FILENAME_MAX + 1]; int ret; @@ -335,6 +336,11 @@ RRDHOST *rrdhost_create(const char *hostname, fatal("RRD_MEMORY_MODE_DBENGINE is not supported in this platform."); #endif } + else { +#ifdef ENABLE_DBENGINE + host->rrdeng_ctx = &multidb_ctx; +#endif + } // ------------------------------------------------------------------------ // link it and add it to the index @@ -582,8 +588,8 @@ RRDHOST *rrdhost_find_or_create( return host; } -inline int rrdhost_should_be_removed(RRDHOST *host, RRDHOST *protected, time_t now) { - if(host != protected +inline int rrdhost_should_be_removed(RRDHOST *host, RRDHOST *protected_host, time_t now) { + if(host != protected_host && host != localhost && rrdhost_flag_check(host, RRDHOST_FLAG_ORPHAN) && host->receiver @@ -594,14 +600,14 @@ inline int rrdhost_should_be_removed(RRDHOST *host, RRDHOST *protected, time_t n return 0; } -void rrdhost_cleanup_orphan_hosts_nolock(RRDHOST *protected) { +void rrdhost_cleanup_orphan_hosts_nolock(RRDHOST *protected_host) { time_t now = now_realtime_sec(); RRDHOST *host; restart_after_removal: rrdhost_foreach_write(host) { - if(rrdhost_should_be_removed(host, protected, now)) { + if(rrdhost_should_be_removed(host, protected_host, now)) { info("Host '%s' with machine guid '%s' is obsolete - cleaning up.", host->hostname, host->machine_guid); if (rrdhost_flag_check(host, RRDHOST_FLAG_DELETE_ORPHAN_HOST) @@ -629,11 +635,11 @@ int rrd_init(char *hostname, struct rrdhost_system_info *system_info) { if (gap_when_lost_iterations_above < 1) gap_when_lost_iterations_above = 1; -#ifdef ENABLE_DBENGINE if (unlikely(sql_init_database())) { - return 1; + if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) + return 1; + info("Skipping SQLITE metadata initialization since memory mode is not db engine"); } -#endif health_init(); diff --git a/database/rrdset.c b/database/rrdset.c index d16fe737c..15640d3ed 100644 --- a/database/rrdset.c +++ b/database/rrdset.c @@ -35,7 +35,7 @@ static RRDSET *rrdset_index_find(RRDHOST *host, const char *id, uint32_t hash) { strncpyz(tmp.id, id, RRD_ID_LENGTH_MAX); tmp.hash = (hash)?hash:simple_hash(tmp.id); - return (RRDSET *)avl_search_lock(&(host->rrdset_root_index), (avl *) &tmp); + return (RRDSET *)avl_search_lock(&(host->rrdset_root_index), (avl_t *) &tmp); } // ---------------------------------------------------------------------------- @@ -57,7 +57,7 @@ int rrdset_compare_name(void* a, void* b) { RRDSET *rrdset_index_add_name(RRDHOST *host, RRDSET *st) { void *result; // fprintf(stderr, "ADDING: %s (name: %s)\n", st->id, st->name); - result = avl_insert_lock(&host->rrdset_root_index_name, (avl *) (&st->avlname)); + result = avl_insert_lock(&host->rrdset_root_index_name, (avl_t *) (&st->avlname)); if(result) return rrdset_from_avlname(result); return NULL; } @@ -65,7 +65,7 @@ RRDSET *rrdset_index_add_name(RRDHOST *host, RRDSET *st) { RRDSET *rrdset_index_del_name(RRDHOST *host, RRDSET *st) { void *result; // fprintf(stderr, "DELETING: %s (name: %s)\n", st->id, st->name); - result = (RRDSET *)avl_remove_lock(&((host)->rrdset_root_index_name), (avl *)(&st->avlname)); + result = (RRDSET *)avl_remove_lock(&((host)->rrdset_root_index_name), (avl_t *)(&st->avlname)); if(result) return rrdset_from_avlname(result); return NULL; } @@ -81,7 +81,7 @@ static inline RRDSET *rrdset_index_find_name(RRDHOST *host, const char *name, ui tmp.hash_name = (hash)?hash:simple_hash(tmp.name); // fprintf(stderr, "SEARCHING: %s\n", name); - result = avl_search_lock(&host->rrdset_root_index_name, (avl *) (&(tmp.avlname))); + result = avl_search_lock(&host->rrdset_root_index_name, (avl_t *) (&(tmp.avlname))); if(result) { RRDSET *st = rrdset_from_avlname(result); if(strcmp(st->magic, RRDSET_MAGIC) != 0) @@ -219,11 +219,11 @@ inline void rrdset_update_heterogeneous_flag(RRDSET *st) { rrdset_flag_clear(st, RRDSET_FLAG_HOMOGENEOUS_CHECK); RRD_ALGORITHM algorithm = st->dimensions->algorithm; - collected_number multiplier = abs(st->dimensions->multiplier); - collected_number divisor = abs(st->dimensions->divisor); + collected_number multiplier = ABS(st->dimensions->multiplier); + collected_number divisor = ABS(st->dimensions->divisor); rrddim_foreach_read(rd, st) { - if(algorithm != rd->algorithm || multiplier != abs(rd->multiplier) || divisor != abs(rd->divisor)) { + if(algorithm != rd->algorithm || multiplier != ABS(rd->multiplier) || divisor != ABS(rd->divisor)) { if(!rrdset_flag_check(st, RRDSET_FLAG_HETEROGENEOUS)) { #ifdef NETDATA_INTERNAL_CHECKS info("Dimension '%s' added on chart '%s' of host '%s' is not homogeneous to other dimensions already present (algorithm is '%s' vs '%s', multiplier is " COLLECTED_NUMBER_FORMAT " vs " COLLECTED_NUMBER_FORMAT ", divisor is " COLLECTED_NUMBER_FORMAT " vs " COLLECTED_NUMBER_FORMAT ").", @@ -385,6 +385,7 @@ void rrdset_free(RRDSET *st) { freez(st->state->old_context); free_label_list(st->state->labels.head); freez(st->state); + freez(st->chart_uuid); switch(st->rrd_memory_mode) { case RRD_MEMORY_MODE_SAVE: @@ -397,10 +398,6 @@ void rrdset_free(RRDSET *st) { case RRD_MEMORY_MODE_ALLOC: case RRD_MEMORY_MODE_NONE: case RRD_MEMORY_MODE_DBENGINE: -#ifdef ENABLE_DBENGINE - if (st->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) - freez(st->chart_uuid); -#endif freez(st); break; } @@ -660,15 +657,12 @@ RRDSET *rrdset_create_custom( sched_yield(); } } -#ifdef ENABLE_DBENGINE - if (st->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE && - (mark_rebuild & (META_CHART_UPDATED | META_PLUGIN_UPDATED | META_MODULE_UPDATED))) { + if (mark_rebuild & (META_CHART_UPDATED | META_PLUGIN_UPDATED | META_MODULE_UPDATED)) { debug(D_METADATALOG, "CHART [%s] metadata updated", st->id); int rc = update_chart_metadata(st->chart_uuid, st, id, name); if (unlikely(rc)) error_report("Failed to update chart metadata in the database"); } -#endif /* Fall-through during switch from archived to active so that the host lock is taken and health is linked */ if (!changed_from_archived_to_active) return st; @@ -744,8 +738,8 @@ RRDSET *rrdset_create_custom( ); if(st) { - memset(&st->avl, 0, sizeof(avl)); - memset(&st->avlname, 0, sizeof(avl)); + memset(&st->avl, 0, sizeof(avl_t)); + memset(&st->avlname, 0, sizeof(avl_t)); memset(&st->rrdvar_root_index, 0, sizeof(avl_tree_lock)); memset(&st->dimensions_index, 0, sizeof(avl_tree_lock)); memset(&st->rrdset_rwlock, 0, sizeof(netdata_rwlock_t)); @@ -925,15 +919,14 @@ RRDSET *rrdset_create_custom( rrdsetcalc_link_matching(st); rrdcalctemplate_link_matching(st); -#ifdef ENABLE_DBENGINE - if (st->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) { - st->chart_uuid = find_chart_uuid(host, type, id, name); - if (unlikely(!st->chart_uuid)) - st->chart_uuid = create_chart_uuid(st, id, name); - store_active_chart(st->chart_uuid); - } -#endif + st->chart_uuid = find_chart_uuid(host, type, id, name); + if (unlikely(!st->chart_uuid)) + st->chart_uuid = create_chart_uuid(st, id, name); + else + update_chart_metadata(st->chart_uuid, st, id, name); + + store_active_chart(st->chart_uuid); rrdhost_cleanup_obsolete_charts(host); @@ -1932,6 +1925,15 @@ void rrdset_finalize_labels(RRDSET *st) } else { replace_label_list(labels, new_labels); } + + netdata_rwlock_wrlock(&labels->labels_rwlock); + struct label *lbl = labels->head; + while (lbl) { + sql_store_chart_label(st->chart_uuid, (int)lbl->label_source, lbl->key, lbl->value); + lbl = lbl->next; + } + netdata_rwlock_unlock(&labels->labels_rwlock); + st->state->new_labels = NULL; } diff --git a/database/rrdvar.c b/database/rrdvar.c index 6b824d0d3..25b8ca69e 100644 --- a/database/rrdvar.c +++ b/database/rrdvar.c @@ -27,7 +27,7 @@ int rrdvar_compare(void* a, void* b) { } static inline RRDVAR *rrdvar_index_add(avl_tree_lock *tree, RRDVAR *rv) { - RRDVAR *ret = (RRDVAR *)avl_insert_lock(tree, (avl *)(rv)); + RRDVAR *ret = (RRDVAR *)avl_insert_lock(tree, (avl_t *)(rv)); if(ret != rv) debug(D_VARIABLES, "Request to insert RRDVAR '%s' into index failed. Already exists.", rv->name); @@ -35,7 +35,7 @@ static inline RRDVAR *rrdvar_index_add(avl_tree_lock *tree, RRDVAR *rv) { } static inline RRDVAR *rrdvar_index_del(avl_tree_lock *tree, RRDVAR *rv) { - RRDVAR *ret = (RRDVAR *)avl_remove_lock(tree, (avl *)(rv)); + RRDVAR *ret = (RRDVAR *)avl_remove_lock(tree, (avl_t *)(rv)); if(!ret) error("Request to remove RRDVAR '%s' from index failed. Not Found.", rv->name); @@ -47,7 +47,7 @@ static inline RRDVAR *rrdvar_index_find(avl_tree_lock *tree, const char *name, u tmp.name = (char *)name; tmp.hash = (hash)?hash:simple_hash(tmp.name); - return (RRDVAR *)avl_search_lock(tree, (avl *)&tmp); + return (RRDVAR *)avl_search_lock(tree, (avl_t *)&tmp); } inline void rrdvar_free(RRDHOST *host, avl_tree_lock *tree, RRDVAR *rv) { diff --git a/database/rrdvar.h b/database/rrdvar.h index 6d1461b2a..ec6e80a43 100644 --- a/database/rrdvar.h +++ b/database/rrdvar.h @@ -32,7 +32,7 @@ typedef enum rrdvar_options { // 2. at each context (RRDFAMILY.rrdvar_root_index) // 3. at each host (RRDHOST.rrdvar_root_index) struct rrdvar { - avl avl; + avl_t avl; char *name; uint32_t hash; diff --git a/database/sqlite/Makefile.am b/database/sqlite/Makefile.am new file mode 100644 index 000000000..babdcf0df --- /dev/null +++ b/database/sqlite/Makefile.am @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-3.0-or-later + +AUTOMAKE_OPTIONS = subdir-objects +MAINTAINERCLEANFILES = $(srcdir)/Makefile.in diff --git a/database/sqlite/sqlite_functions.c b/database/sqlite/sqlite_functions.c index ab6c59cfa..694b86330 100644 --- a/database/sqlite/sqlite_functions.c +++ b/database/sqlite/sqlite_functions.c @@ -17,12 +17,14 @@ const char *database_config[] = { "CREATE TABLE IF NOT EXISTS metadata_migration(filename text, file_size, date_created int);", "CREATE INDEX IF NOT EXISTS ind_d1 on dimension (chart_id, id, name);", "CREATE INDEX IF NOT EXISTS ind_c1 on chart (host_id, id, type, name);", + "CREATE TABLE IF NOT EXISTS chart_label(chart_id blob, source_type int, label_key text, " + "label_value text, date_created int, PRIMARY KEY (chart_id, label_key));", "delete from chart_active;", "delete from dimension_active;", - "delete from chart where chart_id not in (select chart_id from dimension);", "delete from host where host_id not in (select host_id from chart);", + "delete from chart_label where chart_id not in (select chart_id from chart);", NULL }; @@ -46,6 +48,31 @@ static int execute_insert(sqlite3_stmt *res) return rc; } +#define MAX_OPEN_STATEMENTS (512) + +static void add_stmt_to_list(sqlite3_stmt *res) +{ + static int idx = 0; + static sqlite3_stmt *statements[MAX_OPEN_STATEMENTS]; + + if (unlikely(!res)) { + while (idx > 0) + sqlite3_finalize(statements[--idx]); + return; + } + + if (unlikely(idx == MAX_OPEN_STATEMENTS)) + return; + statements[idx++] = res; +} + +static int prepare_statement(sqlite3 *database, char *query, sqlite3_stmt **statement) { + int rc = sqlite3_prepare_v2(database, query, -1, statement, 0); + if (likely(rc == SQLITE_OK)) + add_stmt_to_list(*statement); + return rc; +} + /* * Store a chart or dimension UUID in chart_active or dimension_active * The statement that will be prepared determines that @@ -82,7 +109,8 @@ void store_active_chart(uuid_t *chart_uuid) int rc; if (unlikely(!db_meta)) { - error_report("Database has not been initialized"); + if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) + error_report("Database has not been initialized"); return; } @@ -109,7 +137,8 @@ void store_active_dimension(uuid_t *dimension_uuid) int rc; if (unlikely(!db_meta)) { - error_report("Database has not been initialized"); + if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) + error_report("Database has not been initialized"); return; } @@ -141,7 +170,9 @@ int sql_init_database(void) snprintfz(sqlite_database, FILENAME_MAX, "%s/netdata-meta.db", netdata_configured_cache_dir); rc = sqlite3_open(sqlite_database, &db_meta); if (rc != SQLITE_OK) { - error_report("Failed to initialize database at %s", sqlite_database); + error_report("Failed to initialize database at %s, due to \"%s\"", sqlite_database, sqlite3_errstr(rc)); + sqlite3_close(db_meta); + db_meta = NULL; return 1; } @@ -172,9 +203,12 @@ void sql_close_database(void) return; info("Closing SQLite database"); - rc = sqlite3_close(db_meta); + + add_stmt_to_list(NULL); + + rc = sqlite3_close_v2(db_meta); if (unlikely(rc != SQLITE_OK)) - error_report("Error %d while closing the SQLite database", rc); + error_report("Error %d while closing the SQLite database, %s", rc, sqlite3_errstr(rc)); return; } @@ -187,7 +221,7 @@ int find_uuid_type(uuid_t *uuid) int uuid_type = 3; if (unlikely(!res)) { - rc = sqlite3_prepare_v2(db_meta, FIND_UUID_TYPE, -1, &res, 0); + rc = prepare_statement(db_meta, FIND_UUID_TYPE, &res); if (rc != SQLITE_OK) { error_report("Failed to bind prepare statement to find UUID type in the database"); return 0; @@ -218,8 +252,11 @@ uuid_t *find_dimension_uuid(RRDSET *st, RRDDIM *rd) uuid_t *uuid = NULL; int rc; + if (unlikely(!db_meta) && default_rrd_memory_mode != RRD_MEMORY_MODE_DBENGINE) + return NULL; + if (unlikely(!res)) { - rc = sqlite3_prepare_v2(db_meta, SQL_FIND_DIMENSION_UUID, -1, &res, 0); + rc = prepare_statement(db_meta, SQL_FIND_DIMENSION_UUID, &res); if (rc != SQLITE_OK) { error_report("Failed to bind prepare statement to lookup dimension UUID in the database"); return NULL; @@ -299,7 +336,7 @@ void delete_dimension_uuid(uuid_t *dimension_uuid) #endif if (unlikely(!res)) { - rc = sqlite3_prepare_v2(db_meta, DELETE_DIMENSION_UUID, -1, &res, 0); + rc = prepare_statement(db_meta, DELETE_DIMENSION_UUID, &res); if (rc != SQLITE_OK) { error_report("Failed to prepare statement to delete a dimension uuid"); return; @@ -331,8 +368,11 @@ uuid_t *find_chart_uuid(RRDHOST *host, const char *type, const char *id, const c uuid_t *uuid = NULL; int rc; + if (unlikely(!db_meta) && default_rrd_memory_mode != RRD_MEMORY_MODE_DBENGINE) + return NULL; + if (unlikely(!res)) { - rc = sqlite3_prepare_v2(db_meta, SQL_FIND_CHART_UUID, -1, &res, 0); + rc = prepare_statement(db_meta, SQL_FIND_CHART_UUID, &res); if (rc != SQLITE_OK) { error_report("Failed to prepare statement to lookup chart UUID in the database"); return NULL; @@ -388,6 +428,9 @@ int update_chart_metadata(uuid_t *chart_uuid, RRDSET *st, const char *id, const { int rc; + if (unlikely(!db_meta) && default_rrd_memory_mode != RRD_MEMORY_MODE_DBENGINE) + return 0; + rc = sql_store_chart( chart_uuid, &st->rrdhost->host_uuid, st->type, id, name, st->family, st->context, st->title, st->units, st->plugin_name, st->module_name, st->priority, st->update_every, st->chart_type, st->rrd_memory_mode, st->entries); @@ -427,12 +470,14 @@ int sql_store_host( int rc; if (unlikely(!db_meta)) { + if (default_rrd_memory_mode != RRD_MEMORY_MODE_DBENGINE) + return 0; error_report("Database has not been initialized"); return 1; } if (unlikely((!res))) { - rc = sqlite3_prepare_v2(db_meta, SQL_STORE_HOST, -1, &res, 0); + rc = prepare_statement(db_meta, SQL_STORE_HOST, &res); if (unlikely(rc != SQLITE_OK)) { error_report("Failed to prepare statement to store host, rc = %d", rc); return 1; @@ -493,16 +538,18 @@ int sql_store_chart( const char *context, const char *title, const char *units, const char *plugin, const char *module, long priority, int update_every, int chart_type, int memory_mode, long history_entries) { - static __thread sqlite3_stmt *res; + static __thread sqlite3_stmt *res = NULL; int rc, param = 0; if (unlikely(!db_meta)) { + if (default_rrd_memory_mode != RRD_MEMORY_MODE_DBENGINE) + return 0; error_report("Database has not been initialized"); return 1; } if (unlikely(!res)) { - rc = sqlite3_prepare_v2(db_meta, SQL_STORE_CHART, -1, &res, 0); + rc = prepare_statement(db_meta, SQL_STORE_CHART, &res); if (unlikely(rc != SQLITE_OK)) { error_report("Failed to prepare statement to store chart, rc = %d", rc); return 1; @@ -530,11 +577,12 @@ int sql_store_chart( goto bind_fail; param++; - if (name) { + if (name && *name) rc = sqlite3_bind_text(res, 5, name, -1, SQLITE_STATIC); - if (unlikely(rc != SQLITE_OK)) - goto bind_fail; - } + else + rc = sqlite3_bind_null(res, 5); + if (unlikely(rc != SQLITE_OK)) + goto bind_fail; param++; rc = sqlite3_bind_text(res, 6, family, -1, SQLITE_STATIC); @@ -620,12 +668,14 @@ int sql_store_dimension( int rc; if (unlikely(!db_meta)) { + if (default_rrd_memory_mode != RRD_MEMORY_MODE_DBENGINE) + return 0; error_report("Database has not been initialized"); return 1; } if (unlikely(!res)) { - rc = sqlite3_prepare_v2(db_meta, SQL_STORE_DIMENSION, -1, &res, 0); + rc = prepare_statement(db_meta, SQL_STORE_DIMENSION, &res); if (unlikely(rc != SQLITE_OK)) { error_report("Failed to prepare statement to store dimension, rc = %d", rc); return 1; @@ -733,7 +783,7 @@ void sql_rrdset2json(RRDHOST *host, BUFFER *wb) rc = sqlite3_bind_blob(res_chart, 1, &host->host_uuid, sizeof(host->host_uuid), SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) { error_report("Failed to bind host parameter to fetch archived charts"); - return; + goto failed; } rc = sqlite3_prepare_v2(db_meta, SELECT_DIMENSION, -1, &res_dim, 0); @@ -883,25 +933,41 @@ failed: return; } -#define SELECT_HOST "select host_id, registry_hostname, update_every, os, timezone, tags from host where hostname = @hostname;" +#define SELECT_HOST "select host_id, registry_hostname, update_every, os, timezone, tags from host where hostname = @hostname order by rowid desc;" +#define SELECT_HOST_BY_UUID "select host_id, registry_hostname, update_every, os, timezone, tags from host where host_id = @host_id ;" RRDHOST *sql_create_host_by_uuid(char *hostname) { int rc; RRDHOST *host = NULL; + uuid_t host_uuid; sqlite3_stmt *res = NULL; - rc = sqlite3_prepare_v2(db_meta, SELECT_HOST, -1, &res, 0); - if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to prepare statement to fetch host"); - return NULL; + rc = uuid_parse(hostname, host_uuid); + if (!rc) { + rc = sqlite3_prepare_v2(db_meta, SELECT_HOST_BY_UUID, -1, &res, 0); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to prepare statement to fetch host by uuid"); + return NULL; + } + rc = sqlite3_bind_blob(res, 1, &host_uuid, sizeof(host_uuid), SQLITE_STATIC); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind host_id parameter to fetch host information"); + goto failed; + } } - - rc = sqlite3_bind_text(res, 1, hostname, -1, SQLITE_STATIC); - if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind hostname parameter to fetch host information"); - return NULL; + else { + rc = sqlite3_prepare_v2(db_meta, SELECT_HOST, -1, &res, 0); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to prepare statement to fetch host by hostname"); + return NULL; + } + rc = sqlite3_bind_text(res, 1, hostname, -1, SQLITE_STATIC); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind hostname parameter to fetch host information"); + goto failed; + } } rc = sqlite3_step(res); @@ -916,13 +982,17 @@ RRDHOST *sql_create_host_by_uuid(char *hostname) host = callocz(1, sizeof(RRDHOST)); set_host_properties(host, sqlite3_column_int(res, 2), RRD_MEMORY_MODE_DBENGINE, hostname, - (char *) sqlite3_column_text(res, 1), (const char *) uuid_str, + (char *) sqlite3_column_text(res, 1), (const char *) uuid_str, (char *) sqlite3_column_text(res, 3), (char *) sqlite3_column_text(res, 5), (char *) sqlite3_column_text(res, 4), NULL, NULL); uuid_copy(host->host_uuid, *((uuid_t *) sqlite3_column_blob(res, 0))); - host->system_info = NULL; + host->system_info = callocz(1, sizeof(*host->system_info));; + rrdhost_flag_set(host, RRDHOST_FLAG_ARCHIVED); +#ifdef ENABLE_DBENGINE + host->rrdeng_ctx = &multidb_ctx; +#endif failed: rc = sqlite3_finalize(res); @@ -1020,3 +1090,251 @@ void add_migrated_file(char *path, uint64_t file_size) return; } + +#define SQL_INS_CHART_LABEL "insert or replace into chart_label " \ + "(chart_id, source_type, label_key, label_value, date_created) " \ + "values (@chart, @source, @label, @value, strftime('%s'));" + +void sql_store_chart_label(uuid_t *chart_uuid, int source_type, char *label, char *value) +{ + sqlite3_stmt *res = NULL; + int rc; + + if (unlikely(!db_meta)) { + if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) + error_report("Database has not been initialized"); + return; + } + + rc = sqlite3_prepare_v2(db_meta, SQL_INS_CHART_LABEL, -1, &res, 0); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to prepare statement store chart labels"); + return; + } + + rc = sqlite3_bind_blob(res, 1, chart_uuid, sizeof(*chart_uuid), SQLITE_STATIC); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind chart_id parameter to store label information"); + goto failed; + } + + rc = sqlite3_bind_int(res, 2, source_type); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind type parameter to store label information"); + goto failed; + } + + rc = sqlite3_bind_text(res, 3, label, -1, SQLITE_STATIC); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind label parameter to store label information"); + goto failed; + } + + rc = sqlite3_bind_text(res, 4, value, -1, SQLITE_STATIC); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind value parameter to store label information"); + goto failed; + } + + rc = execute_insert(res); + if (unlikely(rc != SQLITE_DONE)) + error_report("Failed to store chart label entry, rc = %d", rc); + +failed: + if (unlikely(sqlite3_finalize(res) != SQLITE_OK)) + error_report("Failed to finalize the prepared statement when storing chart label information"); + + return; +} + +int find_dimension_first_last_t(char *machine_guid, char *chart_id, char *dim_id, + uuid_t *uuid, time_t *first_entry_t, time_t *last_entry_t, uuid_t *rrdeng_uuid) +{ +#ifdef ENABLE_DBENGINE + int rc; + uuid_t legacy_uuid; + uuid_t multihost_legacy_uuid; + time_t dim_first_entry_t, dim_last_entry_t; + + rc = rrdeng_metric_latest_time_by_uuid(uuid, &dim_first_entry_t, &dim_last_entry_t); + if (unlikely(rc)) { + rrdeng_generate_legacy_uuid(dim_id, chart_id, &legacy_uuid); + rc = rrdeng_metric_latest_time_by_uuid(&legacy_uuid, &dim_first_entry_t, &dim_last_entry_t); + if (likely(rc)) { + rrdeng_convert_legacy_uuid_to_multihost(machine_guid, &legacy_uuid, &multihost_legacy_uuid); + rc = rrdeng_metric_latest_time_by_uuid(&multihost_legacy_uuid, &dim_first_entry_t, &dim_last_entry_t); + if (likely(!rc)) + uuid_copy(*rrdeng_uuid, multihost_legacy_uuid); + } + else + uuid_copy(*rrdeng_uuid, legacy_uuid); + } + else + uuid_copy(*rrdeng_uuid, *uuid); + + if (likely(!rc)) { + *first_entry_t = MIN(*first_entry_t, dim_first_entry_t); + *last_entry_t = MAX(*last_entry_t, dim_last_entry_t); + } + return rc; +#else + UNUSED(machine_guid); + UNUSED(chart_id); + UNUSED(dim_id); + UNUSED(uuid); + UNUSED(first_entry_t); + UNUSED(last_entry_t); + UNUSED(rrdeng_uuid); + return 1; +#endif +} + +#ifdef ENABLE_DBENGINE +static RRDDIM *create_rrdim_entry(RRDSET *st, char *id, char *name, uuid_t *metric_uuid) +{ + RRDDIM *rd = callocz(1, sizeof(*rd)); + rd->rrdset = st; + rd->last_stored_value = NAN; + rrddim_flag_set(rd, RRDDIM_FLAG_NONE); + rd->state = mallocz(sizeof(*rd->state)); + rd->rrd_memory_mode = RRD_MEMORY_MODE_DBENGINE; + rd->state->query_ops.init = rrdeng_load_metric_init; + rd->state->query_ops.next_metric = rrdeng_load_metric_next; + rd->state->query_ops.is_finished = rrdeng_load_metric_is_finished; + rd->state->query_ops.finalize = rrdeng_load_metric_finalize; + rd->state->query_ops.latest_time = rrdeng_metric_latest_time; + rd->state->query_ops.oldest_time = rrdeng_metric_oldest_time; + rd->state->rrdeng_uuid = mallocz(sizeof(uuid_t)); + uuid_copy(*rd->state->rrdeng_uuid, *metric_uuid); + rd->state->metric_uuid = rd->state->rrdeng_uuid; + rd->id = strdupz(id); + rd->name = strdupz(name); + return rd; +} +#endif + +#define SELECT_CHART_CONTEXT "select d.dim_id, d.id, d.name, c.id, c.type, c.name, c.update_every, c.chart_id from chart c, " \ + "dimension d, host h " \ + "where d.chart_id = c.chart_id and c.host_id = h.host_id and c.host_id = @host_id and c.context = @context " \ + "order by c.chart_id asc, c.type||c.id desc;" + +#define SELECT_CHART_SINGLE "select d.dim_id, d.id, d.name, c.id, c.type, c.name, c.update_every, c.chart_id, c.context from chart c, " \ + "dimension d, host h " \ + "where d.chart_id = c.chart_id and c.host_id = h.host_id and c.host_id = @host_id and c.type||'.'||c.id = @chart " \ + "order by c.chart_id asc, c.type||'.'||c.id desc;" + +void sql_build_context_param_list(struct context_param **param_list, RRDHOST *host, char *context, char *chart) +{ +#ifdef ENABLE_DBENGINE + int rc; + + if (unlikely(!param_list) || host->rrd_memory_mode != RRD_MEMORY_MODE_DBENGINE) + return; + + if (unlikely(!(*param_list))) { + *param_list = mallocz(sizeof(struct context_param)); + (*param_list)->first_entry_t = LONG_MAX; + (*param_list)->last_entry_t = 0; + (*param_list)->rd = NULL; + (*param_list)->flags = CONTEXT_FLAGS_ARCHIVE; + if (chart) + (*param_list)->flags |= CONTEXT_FLAGS_CHART; + else + (*param_list)->flags |= CONTEXT_FLAGS_CONTEXT; + } + + sqlite3_stmt *res = NULL; + + if (context) + rc = sqlite3_prepare_v2(db_meta, SELECT_CHART_CONTEXT, -1, &res, 0); + else + rc = sqlite3_prepare_v2(db_meta, SELECT_CHART_SINGLE, -1, &res, 0); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to prepare statement to fetch host archived charts"); + return; + } + + rc = sqlite3_bind_blob(res, 1, &host->host_uuid, sizeof(host->host_uuid), SQLITE_STATIC); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind host parameter to fetch archived charts"); + goto failed; + } + + if (context) + rc = sqlite3_bind_text(res, 2, context, -1, SQLITE_STATIC); + else + rc = sqlite3_bind_text(res, 2, chart, -1, SQLITE_STATIC); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind host parameter to fetch archived charts"); + goto failed; + } + + RRDSET *st = NULL; + char machine_guid[GUID_LEN + 1]; + uuid_unparse_lower(host->host_uuid, machine_guid); + uuid_t rrdeng_uuid; + uuid_t chart_id; + + while (sqlite3_step(res) == SQLITE_ROW) { + char id[512]; + sprintf(id, "%s.%s", sqlite3_column_text(res, 3), sqlite3_column_text(res, 1)); + + if (!st || uuid_compare(*(uuid_t *)sqlite3_column_blob(res, 7), chart_id)) { + if (unlikely(st && !st->counter)) { + freez(st->context); + freez((char *) st->name); + freez(st); + } + st = callocz(1, sizeof(*st)); + char n[RRD_ID_LENGTH_MAX + 1]; + + snprintfz( + n, RRD_ID_LENGTH_MAX, "%s.%s", (char *)sqlite3_column_text(res, 4), + (char *)sqlite3_column_text(res, 3)); + st->name = strdupz(n); + st->update_every = sqlite3_column_int(res, 6); + st->counter = 0; + if (chart) { + st->context = strdupz((char *)sqlite3_column_text(res, 8)); + strncpyz(st->id, chart, RRD_ID_LENGTH_MAX); + } + uuid_copy(chart_id, *(uuid_t *)sqlite3_column_blob(res, 7)); + st->last_entry_t = 0; + st->rrdhost = host; + } + + if (unlikely(find_dimension_first_last_t(machine_guid, (char *)st->name, (char *)sqlite3_column_text(res, 1), + (uuid_t *)sqlite3_column_blob(res, 0), &(*param_list)->first_entry_t, &(*param_list)->last_entry_t, + &rrdeng_uuid))) + continue; + + st->counter++; + st->last_entry_t = MAX(st->last_entry_t, (*param_list)->last_entry_t); + + RRDDIM *rd = create_rrdim_entry(st, (char *)sqlite3_column_text(res, 1), (char *)sqlite3_column_text(res, 2), &rrdeng_uuid); + rd->next = (*param_list)->rd; + (*param_list)->rd = rd; + } + if (st) { + if (!st->counter) { + freez(st->context); + freez((char *)st->name); + freez(st); + } + else + if (!st->context && context) + st->context = strdupz(context); + } + +failed: + rc = sqlite3_finalize(res); + if (unlikely(rc != SQLITE_OK)) + error_report("Failed to finalize the prepared statement when reading archived charts"); +#else + UNUSED(param_list); + UNUSED(host); + UNUSED(context); + UNUSED(chart); +#endif + return; +} diff --git a/database/sqlite/sqlite_functions.h b/database/sqlite/sqlite_functions.h index f0b4b775e..d2bee75d2 100644 --- a/database/sqlite/sqlite_functions.h +++ b/database/sqlite/sqlite_functions.h @@ -58,5 +58,6 @@ extern void add_migrated_file(char *path, uint64_t file_size); extern void db_unlock(void); extern void db_lock(void); extern void delete_dimension_uuid(uuid_t *dimension_uuid); - +extern void sql_store_chart_label(uuid_t *chart_uuid, int source_type, char *label, char *value); +extern void sql_build_context_param_list(struct context_param **param_list, RRDHOST *host, char *context, char *chart); #endif //NETDATA_SQLITE_FUNCTIONS_H diff --git a/docs/Running-behind-apache.md b/docs/Running-behind-apache.md index 8810dc8fc..d1dc0e088 100644 --- a/docs/Running-behind-apache.md +++ b/docs/Running-behind-apache.md @@ -255,10 +255,9 @@ errors while accessing the dashboard. DOSPageCount 30 ``` -Restart Apache with `sudo service apache2 restart`, or the appropriate method to restart services on your system, to +Restart Apache with `sudo systemctl restart apache2`, or the appropriate method to restart services on your system, to reload its configuration with your new values. - ### Virtual host To adjust the `DOSPageCount` for a specific virtual host, open your virtual host config, which can be found at diff --git a/docs/Running-behind-caddy.md b/docs/Running-behind-caddy.md index c1d57504a..a20fd0535 100644 --- a/docs/Running-behind-caddy.md +++ b/docs/Running-behind-caddy.md @@ -5,11 +5,11 @@ custom_edit_url: https://github.com/netdata/netdata/edit/master/docs/Running-beh # Netdata via Caddy -To run Netdata via [Caddy's proxying,](https://caddyserver.com/docs/proxy) set your Caddyfile up like this: +To run Netdata via [Caddy v2 proxying,](https://caddyserver.com/docs/caddyfile/directives/reverse_proxy) set your Caddyfile up like this: ```caddyfile netdata.domain.tld { - proxy / localhost:19999 + reverse_proxy localhost:19999 } ``` @@ -19,8 +19,8 @@ To run Netdata in a subfolder: ```caddyfile netdata.domain.tld { - proxy /netdata/ localhost:19999 { - without /netdata + handle_path /netdata/* { + reverse_proxy localhost:19999 } } ``` diff --git a/docs/anonymous-statistics.md b/docs/anonymous-statistics.md index 70c502d06..27b48e13c 100644 --- a/docs/anonymous-statistics.md +++ b/docs/anonymous-statistics.md @@ -1,13 +1,13 @@ # Anonymous statistics -Starting with v1.12, Netdata collects anonymous usage information by default and sends it to Google Analytics. We use -the statistics gathered from this information for two purposes: +Netdata collects anonymous usage information by default using the open-source product analytics platform [PostHog](https://github.com/PostHog/posthog). We self-host our PostHog instance, which means your data is never sent or processed by any third parties outside of the Netdata infrastructure. We use the statistics gathered from this information for two purposes: 1. **Quality assurance**, to help us understand if Netdata behaves as expected, and to help us classify repeated issues with certain distributions or environments. @@ -15,49 +15,38 @@ the statistics gathered from this information for two purposes: 2. **Usage statistics**, to help us interpret how people use the Netdata agent in real-world environments, and to help us identify how our development/design decisions influence the community. -Netdata sends information to Google Analytics via two different channels: +Netdata collects usage information via two different channels: -- Google Tag Manager fires when you access an agent's dashboard. -- The Netdata daemon executes the [`anonymous-statistics.sh` - script](https://github.com/netdata/netdata/blob/6469cf92724644f5facf343e4bdd76ac0551a418/daemon/anonymous-statistics.sh.in) - when Netdata starts, stops cleanly, or fails. +- **Agent dashboard**: We use the [PostHog JavaScript integration](https://posthog.com/docs/integrations/js-integration) (with sensitive event attributes overwritten to be anonymized) to send product usage events when you access an [Agent's dashboard](/web/gui/README.md). +- **Agent backend**: The `netdata` daemon executes the [`anonymous-statistics.sh`](https://github.com/netdata/netdata/blob/6469cf92724644f5facf343e4bdd76ac0551a418/daemon/anonymous-statistics.sh.in) script when Netdata starts, stops cleanly, or fails. You can opt-out from sending anonymous statistics to Netdata through three different [opt-out mechanisms](#opt-out). -## Google tag manager +## Agent Dashboard - PostHog JavaScript -Google tag manager (GTM) is the recommended way of collecting statistics for new implementations using GA. Unlike the -older API, the logic of when to send information to GA and what information to send is controlled centrally. +When you kick off an Agent dashboard session by visiting `http://NODE:19999`, Netdata will initialiszes a PostHog session and masks various event attributes. -We have configured GTM to trigger the tag only when the variable `anonymous_statistics` is true. The value of this -variable is controlled via the [opt-out mechanism](#opt-out). - -To ensure anonymity of the stored information, we have configured GTM's GA variable "Fields to set" as follows: +_Note_: You can see the relevant code in the [dashboard repository](https://github.com/netdata/dashboard/blob/master/src/domains/global/sagas.ts#L107) where the `window.posthog.register()` call is made. -| Field name | Value | -| -------------- | -------------------------------------------------- | -| page | netdata-dashboard | -| hostname | dashboard.my-netdata.io | -| anonymizeIp | true | -| title | Netdata dashboard | -| campaignSource | {{machine_guid}} | -| campaignMedium | web | -| referrer | | -| Page URL | | -| Page Hostname | | -| Page Path | /netdata-dashboard | -| location | | +```JavaScript +window.posthog.register({ + distinct_id: machineGuid, + $ip: "127.0.0.1", + $current_url: "agent dashboard", + $pathname: "netdata-dashboard", + $host: "dashboard.netdata.io", +}) +``` -In addition, the Netdata-generated unique machine guid is sent to GA via a custom dimension. -You can verify the effect of these settings by examining the GA `collect` request parameters. +In the above snippet a Netdata PostHog session is initialized and the `ip`, `current_url`, `pathname` and `host` attributes are set to constant values for all events that may be sent during the session. This way, information like the IP or hostname of the Agent will not be sent as part of the product usage event data. -The only thing that's impossible for us to prevent from being **sent** is the URL in the "Referrer" Header of the -browser request to GA. However, the settings above ensure that all **stored** URLs and host names are anonymized. +We have configured the dashboard to trigger the PostHog JavaScript code only when the variable `anonymous_statistics` is true. The value of this +variable is controlled via the [opt-out mechanism](#opt-out). -## Anonymous Statistics Script +## Agent Backend - Anonymous Statistics Script Every time the daemon is started or stopped and every time a fatal condition is encountered, Netdata uses the anonymous -statistics script to collect system information and send it to GA via an http call. The information collected for all +statistics script to collect system information and send it to the Netdata PostHog via an http call. The information collected for all events is: - Netdata version @@ -103,7 +92,12 @@ Each of these opt-out processes does the following: - Prevents the daemon from executing the anonymous statistics script. - Forces the anonymous statistics script to exit immediately. -- Stops the Google Tag Manager Javascript snippet, which remains on the dashboard, from firing and sending any data to - Google Analytics. +- Stops the PostHog Javascript snippet, which remains on the dashboard, from firing and sending any data to the Netdata PostHog. + +## Migration from Google Analytics and Google Tag Manager. + +Prior to v1.29.4 we used Google Analytics to capture this information. This led to discomfort with some of our users in sending any product usage data to a third party like Google. It was also not even that useful in terms of generating the insights we needed to help catch bugs early and find opportunities for product improvement as Google Analytics does not allow its users access to the raw underlying data without paying a significant amount of money which would be infeasible for a project like Netdata. + +While we migrate fully away from Google Analytics to PostHog there maybe be a small period of time where we run both in parallel before we remove all Google Analytics related code. This is to ensure we can fully test and validate the Netdata PostHog implementation before fully defaulting to it. [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fdocs%2Fanonymous-statistics&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() diff --git a/docs/collect/enable-configure.md b/docs/collect/enable-configure.md index 33d7a7bb4..584d23715 100644 --- a/docs/collect/enable-configure.md +++ b/docs/collect/enable-configure.md @@ -32,7 +32,8 @@ Within this file, you can either disable the orchestrator entirely (`enabled: ye enable/disable it with `yes` and `no` settings. Uncomment any line you change to ensure the Netdata daemon reads it on start. -After you make your changes, restart the Agent with `service netdata restart`. +After you make your changes, restart the Agent with `sudo systemctl restart netdata`, or the [appropriate +method](/docs/configure/start-stop-restart.md) for your system. ## Configure a collector @@ -51,7 +52,8 @@ according to your needs. In addition, every collector's documentation shows the configure that collector. Uncomment any line you change to ensure the collector's orchestrator or the Netdata daemon read it on start. -After you make your changes, restart the Agent with `service netdata restart`. +After you make your changes, restart the Agent with `sudo systemctl restart netdata`, or the [appropriate +method](/docs/configure/start-stop-restart.md) for your system. ## What's next? diff --git a/docs/collect/system-metrics.md b/docs/collect/system-metrics.md index 72aa5714b..599d6f583 100644 --- a/docs/collect/system-metrics.md +++ b/docs/collect/system-metrics.md @@ -48,8 +48,9 @@ windows_exporter-0.14.0-amd64.exe --collectors.enabled="cpu,memory,net,logical_d Next, [configure the WMI collector](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/wmi#configuration) to point to the URL -and port of your exposed endpoint. Restart Netdata with `service netdata restart` and you'll start seeing Windows system -metrics, such as CPU utilization, memory, bandwidth per NIC, number of processes, and much more. +and port of your exposed endpoint. Restart Netdata with `sudo systemctl restart netdata`, or the [appropriate +method](/docs/configure/start-stop-restart.md) for your system. You'll start seeing Windows system metrics, such as CPU +utilization, memory, bandwidth per NIC, number of processes, and much more. For information about collecting metrics from applications _running on Windows systems_, see the [application metrics doc](/docs/collect/application-metrics.md#collect-metrics-from-applications-running-on-windows). diff --git a/docs/configure/nodes.md b/docs/configure/nodes.md index d0a6fd7a7..c6e58cd8b 100644 --- a/docs/configure/nodes.md +++ b/docs/configure/nodes.md @@ -158,7 +158,7 @@ You can also take what you've learned about node configuration to enable or enha ### Related reference documentation -- [Netdata Agent ยท Daemon](docs/agent/daemon) +- [Netdata Agent ยท Daemon](/daemon/README.md) - [Netdata Agent ยท Health monitoring](/health/README.md) - [Netdata Agent ยท Notifications](/health/notifications/README.md) diff --git a/docs/export/enable-connector.md b/docs/export/enable-connector.md index 9789de2d8..1ddecfd2c 100644 --- a/docs/export/enable-connector.md +++ b/docs/export/enable-connector.md @@ -47,9 +47,10 @@ Use the following configuration as a starting point. Copy and paste it into `exp Replace `my_opentsdb_http_instance` with an instance name of your choice, and change the `destination` setting to the IP address or hostname of your OpenTSDB database. -Restart your Agent with `sudo systemctl restart netdata` to begin exporting to your OpenTSDB database. The Netdata Agent -exports metrics _beginning from the time the process starts_, and because it exports as metrics are collected, you -should start seeing data in your external database after only a few seconds. +Restart your Agent with `sudo systemctl restart netdata`, or the [appropriate +method](/docs/configure/start-stop-restart.md) for your system, to begin exporting to your OpenTSDB database. The +Netdata Agent exports metrics _beginning from the time the process starts_, and because it exports as metrics are +collected, you should start seeing data in your external database after only a few seconds. Any further configuration is optional, based on your needs and the configuration of your OpenTSDB database. See the [OpenTSDB connector doc](/exporting/opentsdb/README.md) and [exporting engine @@ -68,9 +69,10 @@ Use the following configuration as a starting point. Copy and paste it into `exp Replace `my_graphite_instance` with an instance name of your choice, and change the `destination` setting to the IP address or hostname of your Graphite-supported database. -Restart your Agent with `sudo systemctl restart netdata` to begin exporting to your Graphite-supported database. Because -the Agent exports metrics as they're collected, you should start seeing data in your external database after only a few -seconds. +Restart your Agent with `sudo systemctl restart netdata`, or the [appropriate +method](/docs/configure/start-stop-restart.md) for your system, to begin exporting to your Graphite-supported database. +Because the Agent exports metrics as they're collected, you should start seeing data in your external database after +only a few seconds. Any further configuration is optional, based on your needs and the configuration of your Graphite-supported database. See [exporting engine reference](/exporting/README.md#configuration) for details. diff --git a/docs/get/README.md b/docs/get/README.md index f89472de5..4bfc4878e 100644 --- a/docs/get/README.md +++ b/docs/get/README.md @@ -38,7 +38,8 @@ from its source code. Copy the script, paste it into your node's terminal, and hit `Enter`. -Open your favorite browser and navigate to `http://localhost:19999` or `http://REMOTE-HOST:19999` to open the dashboard. +Open your favorite browser and navigate to `http://localhost:19999` or `http://NODE:19999`, replacing `NODE` with the +hostname or IP address of your system, to open the local Agent dashboard.
Watch how the one-line installer works @@ -91,7 +92,7 @@ platform to see specific instructions. Even more options available in our [packaging documentation](/packaging/installer/README.md#alternative-methods). -## Claim your node on Netdata Cloud +## Claim your node to Netdata Cloud You need to [claim](/claim/README.md) your nodes to see them in Netdata Cloud. Claiming establishes a secure TLS connection to Netdata Cloud using the [Agent-Cloud link](/aclk/README.md), and proves you have write and administrative diff --git a/docs/getting-started.md b/docs/getting-started.md index 1ccab4247..e80b80eed 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -211,16 +211,12 @@ You can use these features together or separately—the decision is up to yo When you install Netdata, it's configured to start at boot, and stop and restart/shutdown. You shouldn't need to start or stop Netdata manually, but you will probably need to restart Netdata at some point. -- To **start** Netdata, open a terminal and run `service netdata start`. -- To **stop** Netdata, run `service netdata stop`. -- To **restart** Netdata, run `service netdata restart`. +- To **start** Netdata, open a terminal and run `sudo systemctl start netdata`. +- To **stop** Netdata, run `sudo systemctl stop netdata`. +- To **restart** Netdata, run `sudo systemctl restart netdata`. -The `service` command is a wrapper script that tries to use your system's preferred method of starting or stopping -Netdata based on your system. But, if either of those commands fails, try using the equivalent commands for `systemd` -and `init.d`: - -- **systemd**: `systemctl start netdata`, `systemctl stop netdata`, `systemctl restart netdata` -- **init.d**: `/etc/init.d/netdata start`, `/etc/init.d/netdata stop`, `/etc/init.d/netdata restart` +See our doc on [starting, stopping, and restarting](/docs/configure/start-stop-restart.md) the Netdata Agent for +details. ## What's next? diff --git a/docs/guides/collect-apache-nginx-web-logs.md b/docs/guides/collect-apache-nginx-web-logs.md index 215ced3ef..1ffa9d3b6 100644 --- a/docs/guides/collect-apache-nginx-web-logs.md +++ b/docs/guides/collect-apache-nginx-web-logs.md @@ -52,8 +52,8 @@ Find the `web_log` line, uncomment it, and set it to `web_log: no`. Next, open t Find the `web_log` line again, uncomment it, and set it to `web_log: yes`. -Finally, restart Netdata with `service netdata restart`, or the appropriate method for your system. You should see -metrics in your Netdata dashboard! +Finally, restart Netdata with `sudo systemctl restart netdata`, or the [appropriate +method](/docs/configure/start-stop-restart.md) for your system. You should see metrics in your Netdata dashboard! ![Example of real-time web server log metrics in Netdata's dashboard](https://user-images.githubusercontent.com/1153921/69448130-2980c280-0d15-11ea-9fa5-6dcff25a92c3.png) @@ -120,8 +120,9 @@ jobs: log_type: auto ``` -Restart Netdata with `service netdata restart` or the appropriate method for your system. Netdata should pick up your -web server's access log and begin showing real-time charts! +Restart Netdata with `sudo systemctl restart netdata`, or the [appropriate +method](/docs/configure/start-stop-restart.md) for your system. Netdata should pick up your web server's access log and +begin showing real-time charts! ### Custom log formats and fields diff --git a/docs/guides/collect-unbound-metrics.md b/docs/guides/collect-unbound-metrics.md index 299464745..a6a4de02f 100644 --- a/docs/guides/collect-unbound-metrics.md +++ b/docs/guides/collect-unbound-metrics.md @@ -54,8 +54,9 @@ configuring the collector. You may not need to do any more configuration to have Netdata collect your Unbound metrics. If you followed the steps above to enable `remote-control` and make your Unbound files readable by Netdata, that should -be enough. Restart Netdata with `service netdata restart`, or the appropriate method for your system. You should see -Unbound metrics in your Netdata dashboard! +be enough. Restart Netdata with `sudo systemctl restart netdata`, or the [appropriate +method](/docs/configure/start-stop-restart.md) for your system. You should see Unbound metrics in your Netdata +dashboard! ![Some charts showing Unbound metrics in real-time](https://user-images.githubusercontent.com/1153921/69659974-93160f00-103c-11ea-88e6-27e9efcf8c0d.png) @@ -98,7 +99,8 @@ jobs: Netdata will attempt to read `unbound.conf` to get the appropriate `address`, `cumulative`, `use_tls`, `tls_cert`, and `tls_key` parameters. -Restart Netdata with `service netdata restart`, or the appropriate method for your system. +Restart Netdata with `sudo systemctl restart netdata`, or the [appropriate +method](/docs/configure/start-stop-restart.md) for your system. ### Manual setup for a remote Unbound server diff --git a/docs/guides/export/export-netdata-metrics-graphite.md b/docs/guides/export/export-netdata-metrics-graphite.md index 9a4a4f5ca..ad18980b5 100644 --- a/docs/guides/export/export-netdata-metrics-graphite.md +++ b/docs/guides/export/export-netdata-metrics-graphite.md @@ -115,8 +115,8 @@ the port accordingly. ... ``` -We'll not worry about the rest of the settings for now. Restart the Agent using `sudo service netdata restart`, or the -appropriate method for your system, to spin up the exporting engine. +We'll not worry about the rest of the settings for now. Restart the Agent using `sudo systemctl restart netdata`, or the +[appropriate method](/docs/configure/start-stop-restart.md) for your system, to spin up the exporting engine. ## See and organize Netdata metrics in Graphite diff --git a/docs/guides/monitor-cockroachdb.md b/docs/guides/monitor-cockroachdb.md index fd0e7db64..0ff9f3c77 100644 --- a/docs/guides/monitor-cockroachdb.md +++ b/docs/guides/monitor-cockroachdb.md @@ -30,9 +30,9 @@ configuring CockroachDB. Netdata only needs to regularly query the database's `_ display them on the dashboard. If your CockroachDB instance is accessible through `http://localhost:8080/` or `http://127.0.0.1:8080`, your setup is -complete. Restart Netdata with `service netdata restart`, or use the [appropriate -method](../getting-started.md#start-stop-and-restart-netdata) for your system, and refresh your browser. You should see -CockroachDB metrics in your Netdata dashboard! +complete. Restart Netdata with `sudo systemctl restart netdata`, or the [appropriate +method](/docs/configure/start-stop-restart.md) for your system, and refresh your browser. You should see CockroachDB +metrics in your Netdata dashboard!
CPU utilization charts from a CockroachDB database monitored by Netdata diff --git a/docs/guides/monitor-hadoop-cluster.md b/docs/guides/monitor-hadoop-cluster.md index 1ca2c03e1..dce56b7c7 100644 --- a/docs/guides/monitor-hadoop-cluster.md +++ b/docs/guides/monitor-hadoop-cluster.md @@ -161,10 +161,10 @@ jobs: address : 203.0.113.10:2182 ``` -Finally, restart Netdata. +Finally, [restart Netdata](/docs/configure/start-stop-restart.md). ```sh -sudo service restart netdata +sudo systemctl restart netdata ``` Upon restart, Netdata should recognize your HDFS/Zookeeper servers, enable the HDFS and Zookeeper modules, and begin diff --git a/docs/guides/monitor/anomaly-detection.md b/docs/guides/monitor/anomaly-detection.md index bb9dbc829..2fa4896c6 100644 --- a/docs/guides/monitor/anomaly-detection.md +++ b/docs/guides/monitor/anomaly-detection.md @@ -79,9 +79,10 @@ yourself if it doesn't already exist. Either way, the final result should look l anomalies: yes ``` -[Restart the Agent](/docs/configure/start-stop-restart.md) with `sudo systemctl restart netdata` to start up the -anomalies collector. By default, the model training process runs every 30 minutes, and uses the previous 4 hours of -metrics to establish a baseline for health and performance across the default included charts. +[Restart the Agent](/docs/configure/start-stop-restart.md) with `sudo systemctl restart netdata`, or the [appropriate +method](/docs/configure/start-stop-restart.md) for your system, to start up the anomalies collector. By default, the +model training process runs every 30 minutes, and uses the previous 4 hours of metrics to establish a baseline for +health and performance across the default included charts. > ๐Ÿ’ก The anomaly collector may need 30-60 seconds to finish its initial training and have enough data to start > generating anomaly scores. You may need to refresh your browser tab for the **Anomalies** section to appear in menus @@ -106,7 +107,7 @@ involve tweaking the behavior of the ML training itself. doesn't have historical metrics going back that far, consider [changing the metrics retention policy](/docs/store/change-metrics-storage.md) or reducing this window. - `custom_models`: A way to define custom models that you want anomaly probabilities for, including multi-node or - streaming setups. More on custom models in part 3 of this guide series. + streaming setups. > โš ๏ธ Setting `charts_regex` with many charts or `train_n_secs` to a very large number will have an impact on the > resources and time required to train a model for every chart. The actual performance implications depend on the @@ -172,20 +173,19 @@ example, it's time to apply that knowledge to other mission-critical parts of yo what to monitor next, check out our list of [collectors](/collectors/COLLECTORS.md) to see what kind of metrics Netdata can collect from your systems, containers, and applications. -For a more user-friendly anomaly detection experience, try out the [Metric -Correlations](https://learn.netdata.cloud/docs/cloud/insights/metric-correlations) feature in Netdata Cloud. Metric -Correlations runs only at your requests, removing unrelated charts from the dashboard to help you focus on root cause -analysis. +Keep on moving to [part 2](/docs/guides/monitor/visualize-monitor-anomalies.md), which covers the charts and alarms +Netdata creates for unsupervised anomaly detection. -Stay tuned for the next two parts of this guide, which provide more real-world context for the anomalies collector. -First, maximize the immediate value you get from anomaly detection by tracking preconfigured alarms, visualizing -anomalies in charts, and building a new dashboard tailored to your applications. Then, learn about creating custom ML -models, which help you holistically monitor an application or service by monitoring anomalies across a _cluster of -charts_. +For a different troubleshooting experience, try out the [Metric +Correlations](https://learn.netdata.cloud/docs/cloud/insights/metric-correlations) feature in Netdata Cloud. Metric +Correlations helps you perform faster root cause analysis by narrowing a dashboard to only the charts most likely to be +related to an anomaly. ### Related reference documentation - [Netdata Agent ยท Anomalies collector](/collectors/python.d.plugin/anomalies/README.md) +- [Netdata Agent ยท Nginx collector](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/nginx) +- [Netdata Agent ยท web log collector](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/weblog) - [Netdata Cloud ยท Metric Correlations](https://learn.netdata.cloud/docs/cloud/insights/metric-correlations) [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fdocs%2Fguides%2Fmonitor%2Fanomaly-detectionl&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)](<>) diff --git a/docs/guides/monitor/kubernetes-k8s-netdata.md b/docs/guides/monitor/kubernetes-k8s-netdata.md index 40af0e94e..c5cb2c1bc 100644 --- a/docs/guides/monitor/kubernetes-k8s-netdata.md +++ b/docs/guides/monitor/kubernetes-k8s-netdata.md @@ -1,11 +1,25 @@ -# Monitor a Kubernetes cluster with Netdata +# Kubernetes monitoring with Netdata: Overview and visualizations + +At Netdata, we've built Kubernetes monitoring tools that add visibility without complexity while also helping you +actively troubleshoot anomalies or outages. This guide walks you through each of the visualizations and offers best +practices on how to use them to start Kubernetes monitoring in a matter of minutes, not hours or days. + +Netdata's Kubernetes monitoring solution uses a handful of [complementary tools and +collectors](#related-reference-documentation) for peeling back the many complex layers of a Kubernetes cluster, +_entirely for free_. These methods work together to give you every metric you need to troubleshoot performance or +availability issues across your Kubernetes infrastructure. + +## Challenge While Kubernetes (k8s) might simplify the way you deploy, scale, and load-balance your applications, not all clusters come with "batteries included" when it comes to monitoring. Doubly so for a monitoring stack that helps you actively @@ -18,261 +32,223 @@ customization, or integration with your preferred alerting methods. Without this visibility, it's like you built an entire house and _then_ smashed your way through the finished walls to add windows. -At Netdata, we're working to build Kubernetes monitoring tools that add visibility without complexity while also helping -you actively troubleshoot anomalies or outages. Better yet, this toolkit includes a few complementary collectors that -let you monitor the many layers of a Kubernetes cluster entirely for free. - -We already have a few complementary tools and collectors for monitoring the many layers of a Kubernetes cluster, -_entirely for free_. These methods work together to help you troubleshoot performance or availability issues across -your k8s infrastructure. - -- A [Helm chart](https://github.com/netdata/helmchart), which bootstraps a Netdata Agent pod on every node in your - cluster, plus an additional parent pod for storing metrics and managing alarm notifications. -- A [service discovery plugin](https://github.com/netdata/agent-service-discovery), which discovers and creates - configuration files for [compatible - applications](https://github.com/netdata/helmchart#service-discovery-and-supported-services) and any endpoints - covered by our [generic Prometheus - collector](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/prometheus). With these - configuration files, Netdata collects metrics from any compatible applications as they run _inside_ of a pod. - Service discovery happens without manual intervention as pods are created, destroyed, or moved between nodes. -- A [Kubelet collector](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/k8s_kubelet), which runs - on each node in a k8s cluster to monitor the number of pods/containers, the volume of operations on each container, - and more. -- A [kube-proxy collector](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/k8s_kubeproxy), which - also runs on each node and monitors latency and the volume of HTTP requests to the proxy. -- A [cgroups collector](/collectors/cgroups.plugin/README.md), which collects CPU, memory, and bandwidth metrics for - each container running on your k8s cluster. - -By following this guide, you'll learn how to discover, explore, and take away insights from each of these layers in your -Kubernetes cluster. Let's get started. - -## Prerequisites - -To follow this guide, you need: - -- A working cluster running Kubernetes v1.9 or newer. -- The [kubectl](https://kubernetes.io/docs/reference/kubectl/overview/) command line tool, within [one minor version +## Solution + +In this tutorial, you'll learn how to navigate Netdata's Kubernetes monitoring features, using +[robot-shop](https://github.com/instana/robot-shop) as an example deployment. Deploying robot-shop is purely optional. +You can also follow along with your own Kubernetes deployment if you choose. While the metrics might be different, the +navigation and best practices are the same for every cluster. + +## What you need to get started + +To follow this tutorial, you need: + +- A free Netdata Cloud account. [Sign up](https://app.netdata.cloud/sign-up?cloudRoute=/spaces) if you don't have one + already. +- A working cluster running Kubernetes v1.9 or newer, with a Netdata deployment and claimed parent/child nodes. See + our [Kubernetes deployment process](/packaging/installer/methods/kubernetes.md) for details on deployment and + claiming. +- The [`kubectl`](https://kubernetes.io/docs/reference/kubectl/overview/) command line tool, within [one minor version difference](https://kubernetes.io/docs/tasks/tools/install-kubectl/#before-you-begin) of your cluster, on an administrative system. - The [Helm package manager](https://helm.sh/) v3.0.0 or newer on the same administrative system. -**You need to install the Netdata Helm chart on your cluster** before you proceed. See our [Kubernetes installation -process](/packaging/installer/methods/kubernetes.md) for details. +### Install the `robot-shop` demo (optional) -This guide uses a 3-node cluster, running on Digital Ocean, as an example. This cluster runs CockroachDB, Redis, and -Apache, which we'll use as examples of how to monitor a Kubernetes cluster with Netdata. +Begin by downloading the robot-shop code and using `helm` to create a new deployment. ```bash -kubectl get nodes -NAME STATUS ROLES AGE VERSION -pool-0z7557lfb-3fnbf Ready 51m v1.17.5 -pool-0z7557lfb-3fnbx Ready 51m v1.17.5 -pool-0z7557lfb-3fnby Ready 51m v1.17.5 - -kubectl get pods -NAME READY STATUS RESTARTS AGE -cockroachdb-0 1/1 Running 0 44h -cockroachdb-1 1/1 Running 0 44h -cockroachdb-2 1/1 Running 1 44h -cockroachdb-init-q7mp6 0/1 Completed 0 44h -httpd-6f6cb96d77-4zlc9 1/1 Running 0 2m47s -httpd-6f6cb96d77-d9gs6 1/1 Running 0 2m47s -httpd-6f6cb96d77-xtpwn 1/1 Running 0 11m -netdata-child-5p2m9 2/2 Running 0 42h -netdata-child-92qvf 2/2 Running 0 42h -netdata-child-djc6w 2/2 Running 0 42h -netdata-parent-0 1/1 Running 0 42h -redis-6bb94d4689-6nn6v 1/1 Running 0 73s -redis-6bb94d4689-c2fk2 1/1 Running 0 73s -redis-6bb94d4689-tjcz5 1/1 Running 0 88s +git clone git@github.com:instana/robot-shop.git +cd robot-shop/K8s/helm +kubectl create ns robot-shop +helm install robot-shop --namespace robot-shop . ``` -## Explore Netdata's Kubernetes charts +Running `kubectl get pods` shows both the Netdata and robot-shop deployments. -The Helm chart installs and enables everything you need for visibility into your k8s cluster, including the service -discovery plugin, Kubelet collector, kube-proxy collector, and cgroups collector. - -To get started, open your browser and navigate to your cluster's Netdata dashboard. See our [Kubernetes installation -instructions](/packaging/installer/methods/kubernetes.md) for how to access the dashboard based on your cluster's -configuration. - -You'll see metrics from the parent pod as soon as you navigate to the dashboard: - -![The Netdata dashboard when monitoring a Kubernetes -cluster](https://user-images.githubusercontent.com/1153921/85343043-c6206400-b4a0-11ea-8de6-cf2c6837c456.png) - -Remember that the parent pod is responsible for storing metrics from all the child pods and sending alarms. +```bash +kubectl get pods --all-namespaces +NAMESPACE NAME READY STATUS RESTARTS AGE +default netdata-child-29f9c 2/2 Running 0 10m +default netdata-child-8xphf 2/2 Running 0 10m +default netdata-child-jdvds 2/2 Running 0 11m +default netdata-parent-554c755b7d-qzrx4 1/1 Running 0 11m +kube-system aws-node-jnjv8 1/1 Running 0 17m +kube-system aws-node-svzdb 1/1 Running 0 17m +kube-system aws-node-ts6n2 1/1 Running 0 17m +kube-system coredns-559b5db75d-f58hp 1/1 Running 0 22h +kube-system coredns-559b5db75d-tkzj2 1/1 Running 0 22h +kube-system kube-proxy-9p9cd 1/1 Running 0 17m +kube-system kube-proxy-lt9ss 1/1 Running 0 17m +kube-system kube-proxy-n75t9 1/1 Running 0 17m +robot-shop cart-b4bbc8fff-t57js 1/1 Running 0 14m +robot-shop catalogue-8b5f66c98-mr85z 1/1 Running 0 14m +robot-shop dispatch-67d955c7d8-lnr44 1/1 Running 0 14m +robot-shop mongodb-7f65d86c-dsslc 1/1 Running 0 14m +robot-shop mysql-764c4c5fc7-kkbnf 1/1 Running 0 14m +robot-shop payment-67c87cb7d-5krxv 1/1 Running 0 14m +robot-shop rabbitmq-5bb66bb6c9-6xr5b 1/1 Running 0 14m +robot-shop ratings-94fd9c75b-42wvh 1/1 Running 0 14m +robot-shop redis-0 0/1 Pending 0 14m +robot-shop shipping-7d69cb88b-w7hpj 1/1 Running 0 14m +robot-shop user-79c445b44b-hwnm9 1/1 Running 0 14m +robot-shop web-8bb887476-lkcjx 1/1 Running 0 14m +``` -Take note of the **Replicated Nodes** menu, which shows not only the parent pod, but also the three child pods. This -example cluster has three child pods, but the number of child pods depends entirely on the number of nodes in your -cluster. +## Explore Netdata's Kubernetes monitoring charts -You'll use the links in the **Replicated Nodes** menu to navigate between the various pods in your cluster. Let's do -that now to explore the pod-level Kubernetes monitoring Netdata delivers. +The Netdata Helm chart deploys and enables everything you need for monitoring Kubernetes on every layer. Once you deploy +Netdata and claim your cluster's nodes, you're ready to check out the visualizations **with zero configuration**. -### Pods +To get started, [sign in](https://app.netdata.cloud/sign-in?cloudRoute=/spaces) to your Netdata Cloud account. Head over +to the War Room you claimed your cluster to, if not **General**. -Click on any of the nodes under **netdata-parent-0**. Netdata redirects you to a separate instance of the Netdata -dashboard, run by the Netdata child pod, which visualizes thousands of metrics from that node. +Netdata Cloud is already visualizing your Kubernetes metrics, streamed in real-time from each node, in the +[Overview](https://learn.netdata.cloud/docs/cloud/visualize/overview): -![The Netdata dashboard monitoring a pod in a Kubernetes -cluster](https://user-images.githubusercontent.com/1153921/85348461-85c8e200-b4b0-11ea-85fa-e88046e94719.png) +![Netdata's Kubernetes monitoring +dashboard](https://user-images.githubusercontent.com/1153921/109037415-eafc5500-7687-11eb-8773-9b95941e3328.png) -From this dashboard, you can see all the familiar charts showing the health and performance of an individual node, just -like you would if you installed Netdata on a single physical system. Explore CPU, memory, bandwidth, networking, and -more. +Let's walk through monitoring each layer of a Kubernetes cluster using the Overview as our framework. -You can use the menus on the right-hand side of the dashboard to navigate between different sections of charts and -metrics. +## Cluster and node metrics -For example, click on the **Applications** section to view per-application metrics, collected by -[apps.plugin](/collectors/apps.plugin/README.md). The first chart you see is **Apps CPU Time (100% = 1 core) -(apps.cpu)**, which shows the CPU utilization of various applications running on the node. You shouldn't be surprised to -find Netdata processes (`netdata`, `sd-agent`, and more) alongside Kubernetes processes (`kubelet`, `kube-proxy`, and -`containers`). +The gauges and time-series charts you see right away in the Overview show aggregated metrics from every node in your +cluster. -![Per-application monitoring on a Kubernetes -cluster](https://user-images.githubusercontent.com/1153921/85348852-ad6c7a00-b4b1-11ea-95b4-5952bd0e9d98.png) +For example, the `apps.cpu` chart (in the **Applications** menu item), visualizes the CPU utilization of various +applications/services running on each of the nodes in your cluster. The **X Nodes** dropdown shows which nodes +contribute to the chart and links to jump a single-node dashboard for further investigation. -Beneath the **Applications** section, you'll begin to see sections for **k8s kubelet**, **k8s kubeproxy**, and long -strings that start with **k8s**, which are sections for metrics collected by -[`cgroups.plugin`](/collectors/cgroups.plugin/README.md). Let's skip over those for now and head further down to see -Netdata's service discovery in action. +![Per-application monitoring in a Kubernetes +cluster](https://user-images.githubusercontent.com/1153921/109042169-19c8fa00-768d-11eb-91a7-1a7afc41fea2.png) -### Service discovery (services running inside of pods) +For example, the chart above shows a spike in the CPU utilization from `rabbitmq` every minute or so, along with a +baseline CPU utilization of 10-15% across the cluster. -Thanks to Netdata's service discovery feature, you monitor containerized applications running in k8s pods with zero -configuration or manual intervention. Service discovery is like a watchdog for created or deleted pods, recognizing the -service they run based on the image name and port and immediately attempting to apply a logical default configuration. +Read about the [Overview](https://learn.netdata.cloud/docs/cloud/visualize/overview) and some best practices on [viewing +an overview of your infrastructure](/docs/visualize/overview-infrastructure.md) for details on using composite charts to +drill down into per-node performance metrics. -Service configuration supports [popular -applications](https://github.com/netdata/helmchart#service-discovery-and-supported-services), plus any endpoints covered -by our [generic Prometheus collector](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/prometheus), -which are automatically added or removed from Netdata as soon as the pods are created or destroyed. +## Pod and container metrics -You can find these service discovery sections near the bottom of the menu. The names for these sections follow a -pattern: the name of the detected service, followed by a string of the module name, pod TUID, service type, port -protocol, and port number. See the graphic below to help you identify service discovery sections. +Click on the **Kubernetes xxxxxxx...** section to jump down to Netdata Cloud's unique Kubernetes visualizations for view +real-time resource utilization metrics from your Kubernetes pods and containers. -![Showing the difference between cgroups and service discovery -sections](https://user-images.githubusercontent.com/1153921/85443711-73998300-b546-11ea-9b3b-2dddfe00bdf8.png) +![Navigating to the Kubernetes monitoring +visualizations](https://user-images.githubusercontent.com/1153921/109049195-349f6c80-7695-11eb-8902-52a029dca77f.png) -For example, the first service discovery section shows metrics for a pod running an Apache web server running on port 80 -in a pod named `httpd-6f6cb96d77-xtpwn`. +### Health map -> If you don't see any service discovery sections, it's either because your services are not compatible with service -> discovery or you changed their default configuration, such as the listening port. See the [list of supported -> services](https://github.com/netdata/helmchart#service-discovery-and-supported-services) for details about whether -> your installed services are compatible with service discovery, or read the [configuration -> instructions](/packaging/installer/methods/kubernetes.md#configure-service-discovery) to change how it discovers the -> supported services. +The first visualization is the [health map](https://learn.netdata.cloud/docs/cloud/visualize/kubernetes#health-map), +which places each container into its own box, then varies the intensity of their color to visualize the resource +utilization. By default, the health map shows the **average CPU utilization as a percentage of the configured limit** +for every container in your cluster. -Click on any of these service discovery sections to see metrics from that particular service. For example, click on the -**Apache apache-default httpd-6f6cb96d77-xtpwn httpd tcp 80** section brings you to a series of charts populated by the -[Apache collector](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/apache) itself. +![The Kubernetes health map in Netdata +Cloud](https://user-images.githubusercontent.com/1153921/109050085-3f0e3600-7696-11eb-988f-52cb187f53ea.png) -With service discovery, you can now see valuable metrics like requests, bandwidth, workers, and more for this pod. +Let's explore the most colorful box by hovering over it. -![Apache metrics collected via service -discovery](https://user-images.githubusercontent.com/1153921/85443905-a5aae500-b546-11ea-99f0-be20ba796feb.png) +![Hovering over a +container](https://user-images.githubusercontent.com/1153921/109049544-a8417980-7695-11eb-80a7-109b4a645a27.png) -The same goes for metrics coming from the CockroachDB pod running on this same node. +The **Context** tab shows `rabbitmq-5bb66bb6c9-6xr5b` as the container's image name, which means this container is +running a [RabbitMQ](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/rabbitmq) workload. -![CockroachDB metrics collected via service -discovery](https://user-images.githubusercontent.com/1153921/85444316-0e925d00-b547-11ea-83ba-b834275cb419.png) +Click the **Metrics** tab to see real-time metrics from that container. Unsurprisingly, it shows a spike in CPU +utilization at regular intervals. -Service discovery helps you monitor the health of specific applications running on your Kubernetes cluster, which in -turn gives you a complete resource when troubleshooting your infrastructure's health and performance. +![Viewing real-time container +metrics](https://user-images.githubusercontent.com/1153921/109050482-aa580800-7696-11eb-9e3e-d3bdf0f3eff7.png) -### Kubelet +### Time-series charts -Let's head back up the menu to the **k8s kubelet** section. Kubelet is an agent that runs on every node in a cluster. It -receives a set of PodSpecs from the Kubernetes Control Plane and ensures the pods described there are both running and -healthy. Think of it as a manager for the various pods on that node. +Beneath the health map is a variety of time-series charts that help you visualize resource utilization over time, which +is useful for targeted troubleshooting. -Monitoring each node's Kubelet can be invaluable when diagnosing issues with your Kubernetes cluster. For example, you -can see when the volume of running containers/pods has dropped. +The default is to display metrics grouped by the `k8s_namespace` label, which shows resource utilization based on your +different namespaces. -![Charts showing pod and container removal during a scale -down](https://user-images.githubusercontent.com/1153921/85598613-9ab48b00-b600-11ea-827e-d9ec7779e2d4.png) +![Time-series Kubernetes monitoring in Netdata +Cloud](https://user-images.githubusercontent.com/1153921/109075210-126a1680-76b6-11eb-918d-5acdcdac152d.png) -This drop might signal a fault or crash in a particular Kubernetes service or deployment (see `kubectl get services` or -`kubectl get deployments` for more details). If the number of pods increases, it may be because of something more -benign, like another member of your team scaling up a service with `kubectl scale`. +Each composite chart has a [definition bar](https://learn.netdata.cloud/docs/cloud/visualize/overview#definition-bar) +for complete customization. For example, grouping the top chart by `k8s_container_name` reveals new information. -You can also view charts for the Kubelet API server, the volume of runtime/Docker operations by type, -configuration-related errors, and the actual vs. desired numbers of volumes, plus a lot more. +![Changing time-series charts](https://user-images.githubusercontent.com/1153921/109075212-139b4380-76b6-11eb-836f-939482ae55fc.png) -Kubelet metrics are collected and visualized thanks to the [kubelet -collector](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/k8s_kubelet), which is enabled with -zero configuration on most Kubernetes clusters with standard configurations. +## Service metrics -### kube-proxy +Netdata has a [service discovery plugin](https://github.com/netdata/agent-service-discovery), which discovers and +creates configuration files for [compatible +services](https://github.com/netdata/helmchart#service-discovery-and-supported-services) and any endpoints covered by +our [generic Prometheus collector](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/prometheus). +Netdata uses these files to collect metrics from any compatible application as they run _inside_ of a pod. Service +discovery happens without manual intervention as pods are created, destroyed, or moved between nodes. -Scroll down into the **k8s kubeproxy** section to see metrics about the network proxy that runs on each node in your -Kubernetes cluster. kube-proxy allows for pods to communicate with each other and accept sessions from outside your -cluster. +Service metrics show up on the Overview as well, beneath the **Kubernetes** section, and are labeled according to the +service in question. For example, the **RabbitMQ** section has numerous charts from the [`rabbitmq` +collector](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/rabbitmq): -With Netdata, you can monitor how often your k8s proxies are syncing proxy rules between nodes. Dramatic changes in -these figures could indicate an anomaly in your cluster that's worthy of further investigation. +![Finding service discovery +metrics](https://user-images.githubusercontent.com/1153921/109054511-2eac8a00-769b-11eb-97f1-da93acb4b5fe.png) -kube-proxy metrics are collected and visualized thanks to the [kube-proxy -collector](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/k8s_kubeproxy), which is enabled with -zero configuration on most Kubernetes clusters with standard configurations. +> The robot-shop cluster has more supported services, such as MySQL, which are not visible with zero configuration. This +> is usually because of services running on non-default ports, using non-default names, or required passwords. Read up +> on [configuring service discovery](/packaging/installer/methods/kubernetes.md#configure-service-discovery) to collect +> more service metrics. -### Containers +Service metrics are essential to infrastructure monitoring, as they're the best indicator of the end-user experience, +and key signals for troubleshooting anomalies or issues. -We can finally talk about the final piece of Kubernetes monitoring: containers. Each Kubernetes pod is a set of one or -more cooperating containers, sharing the same namespace, all of which are resourced and tracked by the cgroups feature -of the Linux kernel. Netdata automatically detects and monitors each running container by interfacing with the cgroups -feature itself. +## Kubernetes components -You can find these sections beneath **Users**, **k8s kubelet**, and **k8s kubeproxy**. Below, a number of containers -devoted to running services like CockroachDB, Apache, Redis, and more. +Netdata also automatically collects metrics from two essential Kubernetes processes. -![A number of sections devoted to -containers](https://user-images.githubusercontent.com/1153921/85480217-74e1a480-b574-11ea-9da7-dd975e0fde0c.png) +### kubelet -Let's look at the section devoted to the container that runs the Apache pod named `httpd-6f6cb96d77-xtpwn`, as described -in the previous part on [service discovery](#service-discovery-services-running-inside-of-pods). +The **k8s kubelet** section visualizes metrics from the Kubernetes agent responsible for managing every pod on a given +node. This also happens without any configuration thanks to the [kubelet +collector](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/k8s_kubelet). -![cgroups metrics for an Apache -container/pod](https://user-images.githubusercontent.com/1153921/85480516-03562600-b575-11ea-92ae-dd605bf04106.png) +Monitoring each node's kubelet can be invaluable when diagnosing issues with your Kubernetes cluster. For example, you +can see if the number of running containers/pods has dropped, which could signal a fault or crash in a particular +Kubernetes service or deployment (see `kubectl get services` or `kubectl get deployments` for more details). If the +number of pods increases, it may be because of something more benign, like another team member scaling up a +service with `kubectl scale`. -At first glance, these sections might seem redundant. You might ask, "Why do I need both a service discovery section -_and_ a container section? It's just one pod, after all!" +You can also view charts for the Kubelet API server, the volume of runtime/Docker operations by type, +configuration-related errors, and the actual vs. desired numbers of volumes, plus a lot more. -The difference is that while the service discovery section shows _Apache_ metrics, the equivalent cgroups section shows -that container's CPU, memory, and bandwidth usage. You can use the two sections in conjunction to monitor the health and -performance of your pods and the services they run. +### kube-proxy -For example, let's say you get an alarm notification from `netdata-parent-0` saying the -`ea287694-0f22-4f39-80aa-2ca066caf45a` container (also known as the `httpd-6f6cb96d77-xtpwn` pod) is using 99% of its -available RAM. You can then hop over to the **Apache apache-default httpd-6f6cb96d77-xtpwn httpd tcp 80** section to -further investigate why Apache is using an unexpected amount of RAM. +The **k8s kube-proxy** section displays metrics about the network proxy that runs on each node in your Kubernetes +cluster. kube-proxy lets pods communicate with each other and accept sessions from outside your cluster. Its metrics are +collected by the [kube-proxy +collector](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/k8s_kubeproxy). -All container metrics, whether they're managed by Kubernetes or the Docker service directly, are collected by the -[cgroups collector](/collectors/cgroups.plugin/README.md). Because this collector integrates with the cgroups Linux -kernel feature itself, monitoring containers requires zero configuration on most Kubernetes clusters. +With Netdata, you can monitor how often your k8s proxies are syncing proxy rules between nodes. Dramatic changes in +these figures could indicate an anomaly in your cluster that's worthy of further investigation. ## What's next? -After following this guide, you should have a more comprehensive understanding of how to monitor your Kubernetes cluster -with Netdata. With this setup, you can monitor the health and performance of all your nodes, pods, services, and k8s -agents. Pre-configured alarms will tell you when something goes awry, and this setup gives you every per-second metric -you need to make informed decisions about your cluster. +After reading this guide, you should now be able to monitor any Kubernetes cluster with Netdata, including nodes, pods, +containers, services, and more. -The best part of monitoring a Kubernetes cluster with Netdata is that you don't have to worry about constantly running -complex `kubectl` commands to see hundreds of highly granular metrics from your nodes. And forget about using `kubectl -exec -it pod bash` to start up a shell on a pod to find and diagnose an issue with any given pod on your cluster. +With the health map, time-series charts, and the ability to drill down into individual nodes, you can see hundreds of +per-second metrics with zero configuration and less time remembering all the `kubectl` options. Netdata moves with your +cluster, automatically picking up new nodes or services as your infrastructure scales. And it's entirely free for +clusters of all sizes. -And with service discovery, all your compatible pods will automatically appear and disappear as they scale up, move, or -scale down across your cluster. +### Related reference documentation -To monitor your Kubernetes cluster with Netdata, start by [installing the Helm -chart](/packaging/installer/methods/kubernetes.md) if you haven't already. The Netdata Agent is open source and entirely -free for every cluster and every organization, whether you have 10 or 10,000 pods. A few minutes and one `helm install` -later and you'll have started on the path of building an effective platform for troubleshooting the next performance or -availability issue on your Kubernetes cluster. +- [Netdata Helm chart](https://github.com/netdata/helmchart) +- [Netdata service discovery](https://github.com/netdata/agent-service-discovery) +- [Netdata Agent ยท `kubelet` + collector](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/k8s_kubelet) +- [Netdata Agent ยท `kube-proxy` + collector](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/k8s_kubeproxy) +- [Netdata Agent ยท `cgroups.plugin`](/collectors/cgroups.plugin/README.md) [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fdocs%2Fguides%2Fmonitor%2Fkubernetes-k8s-netdata.md&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)](<>) diff --git a/docs/guides/monitor/lamp-stack.md b/docs/guides/monitor/lamp-stack.md new file mode 100644 index 000000000..f11dfe5bd --- /dev/null +++ b/docs/guides/monitor/lamp-stack.md @@ -0,0 +1,249 @@ + + +# LAMP stack monitoring (Linux, Apache, MySQL, PHP) with Netdata + +The LAMP stack is the "hello world" for deploying dynamic web applications. It's fast, flexible, and reliable, which +means a developer or sysadmin won't go far in their career without interacting with the stack and its services. + +_LAMP_ is an acronym of the core services that make up the web application: **L**inux, **A**pache, **M**ySQL, and +**P**HP. + +- [Linux](https://en.wikipedia.org/wiki/Linux) is the operating system running the whole stack. +- [Apache](https://httpd.apache.org/) is a web server that responds to HTTP requests from users and returns web pages. +- [MySQL](https://www.mysql.com/) is a database that stores and returns information based on queries from the web + application. +- [PHP](https://www.php.net/) is a scripting language used to query the MySQL database and build new pages. + +LAMP stacks are the foundation for tons of end-user applications, with [Wordpress](https://wordpress.org/) being the +most popular. + +## Challenge + +You've already deployed a LAMP stack, either in testing or production. You want to monitor every service's performance +and availability to ensure the best possible experience for your end-users. You might also be particularly interested in +using a free, open-source monitoring tool. + +Depending on your monitoring experience, you may not even know what metrics you're looking for, much less how to build +dashboards using a query language. You need a robust monitoring experience that has the metrics you need without a ton +of required setup. + +## Solution + +In this tutorial, you'll set up robust LAMP stack monitoring with Netdata in just a few minutes. When you're done, +you'll have one dashboard to monitor every part of your web application, including each essential LAMP stack service. + +This dashboard updates every second with new metrics, and pairs those metrics up with preconfigured alarms to keep you +informed of any errors or odd behavior. + +## What you need to get started + +To follow this tutorial, you need: + +- A physical or virtual Linux system, which we'll call a _node_. +- A functional LAMP stack. There's plenty of tutorials for installing a LAMP stack, like [this + one](https://www.digitalocean.com/community/tutorials/how-to-install-linux-apache-mysql-php-lamp-stack-ubuntu-18-04) + from Digital Ocean. +- Optionally, a [Netdata Cloud](https://app.netdata.cloud/sign-up?cloudRoute=/spaces) account, which you can use to view + metrics from multiple nodes in one dashboard, and a whole lot more, for free. + +## Install the Netdata Agent + +If you don't have the free, open-source [Netdata Agent](/docs/get/README.md) installed on your node yet, get started +with a [single kickstart command](/packaging/installer/methods/kickstart.md): + +```bash +bash <(curl -Ss https://my-netdata.io/kickstart.sh) +``` + +The Netdata Agent is now collecting metrics from your node every second. You don't need to jump into the dashboard yet, +but if you're curious, open your favorite browser and navigate to `http://localhost:19999` or `http://NODE:19999`, +replacing `NODE` with the hostname or IP address of your system. + +## Enable hardware and Linux system monitoring + +There's nothing you need to do to enable [system monitoring](/docs/collect/system-metrics.md) and Linux monitoring with +the Netdata Agent, which autodetects metrics from CPUs, memory, disks, networking devices, and Linux processes like +systemd without any configuration. If you're using containers, Netdata automatically collects resource utilization +metrics from each using the [cgroups data collector](/collectors/cgroups.plugin/README.md). + +## Enable Apache monitoring + +Let's begin by configuring Apache to work with Netdata's [Apache data +collector](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/apache). + +Actually, there's nothing for you to do to enable Apache monitoring with Netdata. + +Apache comes with `mod_status` enabled by default these days, and Netdata is smart enough to look for metrics at that +endpoint without you configuring it. Netdata is already collecting [`mod_status` +metrics](https://httpd.apache.org/docs/2.4/mod/mod_status.html), which is just _part_ of your web server monitoring. + +## Enable web log monitoring + +The Netdata Agent also comes with a [web log +collector](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/weblog), which reads Apache's access +log file, procesess each line, and converts them into per-second metrics. On Debian systems, it reads the file at +`/var/log/apache2/access.log`. + +At installation, the Netdata Agent adds itself to the [`adm` +group](https://wiki.debian.org/SystemGroups#Groups_without_an_associated_user), which gives the `netdata` process the +right privileges to read Apache's log files. In other words, you don't need to do anything to enable Apache web log +monitoring. + +## Enable MySQL monitoring + +Because your MySQL database is password-protected, you do need to tell MySQL to allow the `netdata` user to connect to +without a password. Netdata's [MySQL data +collector](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/mysql) collects metrics in _read-only_ +mode, without being able to alter or affect operations in any way. + +First, log into the MySQL shell. Then, run the following three commands, one at a time: + +```mysql +CREATE USER 'netdata'@'localhost'; +GRANT USAGE, REPLICATION CLIENT, PROCESS ON *.* TO 'netdata'@'localhost'; +FLUSH PRIVILEGES; +``` + +Run `sudo systemctl restart netdata`, or the [appropriate alternative for your +system](/docs/configure/start-stop-restart.md), to collect dozens of metrics every second for robust MySQL monitoring. + +## Enable PHP monitoring + +Unlike Apache or MySQL, PHP isn't a service that you can monitor directly, unless you instrument a PHP-based application +with [StatsD](/collectors/statsd.plugin/README.md). + +However, if you use [PHP-FPM](https://php-fpm.org/) in your LAMP stack, you can monitor that process with our [PHP-FPM +data collector](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/phpfpm). + +Open your PHP-FPM configuration for editing, replacing `7.4` with your version of PHP: + +```bash +sudo nano /etc/php/7.4/fpm/pool.d/www.conf +``` + +> Not sure what version of PHP you're using? Run `php -v`. + +Find the line that reads `;pm.status_path = /status` and remove the `;` so it looks like this: + +```conf +pm.status_path = /status +``` + +Next, add a new `/status` endpoint to Apache. Open the Apache configuration file you're using for your LAMP stack. + +```bash +sudo nano /etc/apache2/sites-available/your_lamp_stack.conf +``` + +Add the following to the end of the file, again replacing `7.4` with your version of PHP: + +```apache +ProxyPass "/status" "unix:/run/php/php7.4-fpm.sock|fcgi://localhost" +``` + +Save and close the file. Finally, restart the PHP-FPM, Apache, and Netdata processes. + +```bash +sudo systemctl restart php7.4-fpm.service +sudo systemctl restart apache2 +sudo systemctl restart netdata +``` + +As the Netdata Agent starts up again, it automatically connects to the new `127.0.0.1/status` page and collects +per-second PHP-FPM metrics to get you started with PHP monitoring. + +## View LAMP stack metrics + +If the Netdata Agent isn't already open in your browser, open a new tab and navigate to `http://localhost:19999` or +`http://NODE:19999`, replacing `NODE` with the hostname or IP address of your system. + +> If you [signed up](https://app.netdata.cloud/sign-up?cloudRoute=/spaces) for Netdata Cloud earlier, you can also view +> the exact same LAMP stack metrics there, plus additional features, like drag-and-drop custom dashboards. Be sure to +> [claim your node](/docs/get/README.md#claim-your-node-to-netdata-cloud) to start streaming metrics to your browser +> through Netdata Cloud. + +Netdata automatically organizes all metrics and charts onto a single page for easy navigation. Peek at gauges to see +overall system performance, then scroll down to see more. Click-and-drag with your mouse to pan _all_ charts back and +forth through different time intervals, or hold `SHIFT` and use the scrollwheel (or two-finger scroll) to zoom in and +out. Check out our doc on [interacting with charts](/docs/visualize/interact-dashboards-charts.md) for all the details. + +![The Netdata +dashboard](https://user-images.githubusercontent.com/1153921/109520555-98e17800-7a69-11eb-86ec-16f689da4527.png) + +The **System Overview** section, which you can also see in the right-hand menu, contains key hardware monitoring charts, +including CPU utilization, memory page faults, network monitoring, and much more. The **Applications** section shows you +exactly which Linux processes are using the most system resources. + +Next, let's check out LAMP-specific metrics. You should see four relevant sections: **Apache local**, **MySQL local**, +**PHP-FPM local**, and **web log apache**. Click on any of these to see metrics from each service in your LAMP stack. + +![LAMP stack monitoring in +Netdata](https://user-images.githubusercontent.com/1153921/109516332-49994880-7a65-11eb-807c-3cba045582e6.png) + +### Key LAMP stack monitoring charts + +Here's a quick reference for what charts you might want to focus on after setting up Netdata. + +| Chart name / context | Type | Why? | +|-------------------------------------------------------|---------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| System Load Average (`system.load`) | Hardware monitoring | A good baseline load average is `0.7`, while `1` (on a 1-core system, `2` on a 2-core system, and so on) means resources are "perfectly" utilized. Higher load indicates a bottleneck somewhere in your system. | +| System RAM (`system.ram`) | Hardware monitoring | Look at the `free` dimension. If that drops to `0`, your system will use swap memory and slow down. | +| Uptime (`apache_local.uptime`) | Apache monitoring | This chart should always be "climbing," indicating a continuous uptime. Investigate any drops back to `0`. | +| Requests By Type (`web_log_apache.requests_by_type`) | Apache monitoring | Check for increases in the `error` or `bad` dimensions, which could indicate users arriving at broken pages or PHP returning errors. | +| Queries (`mysql_local.queries`) | MySQL monitoring | Queries is the total number of queries (queries per second, QPS). Check this chart for sudden spikes or drops, which indicate either increases in traffic/demand or bottlenecks in hardware performance. | +| Active Connections (`mysql_local.connections_active`) | MySQL monitoring | If the `active` dimension nears the `limit`, your MySQL database will bottleneck responses. | +| Performance (phpfpm_local.performance) | PHP monitoring | The `slow requests` dimension lets you know if any requests exceed the configured `request_slowlog_timeout`. If so, users might be having a less-than-ideal experience. | + +## Get alarms for LAMP stack errors + +The Netdata Agent comes with hundreds of pre-configured alarms to help you keep tabs on your system, including 19 alarms +designed for smarter LAMP stack monitoring. + +Click the ๐Ÿ”” icon in the top navigation to [see active alarms](/docs/monitor/view-active-alarms.md). The **Active** tabs +shows any alarms currently triggered, while the **All** tab displays a list of _every_ pre-configured alarm. The + +![An example of LAMP stack +alarms](https://user-images.githubusercontent.com/1153921/109524120-5883f900-7a6d-11eb-830e-0e7baaa28163.png) + +[Tweak alarms](/docs/monitor/configure-alarms.md) based on your infrastructure monitoring needs, and to see these alarms +in other places, like your inbox or a Slack channel, [enable a notification +method](/docs/monitor/enable-notifications.md). + +## What's next? + +You've now set up robust monitoring for your entire LAMP stack: Linux, Apache, MySQL, and PHP (-FPM, to be exact). These +metrics will help you keep tabs on the performance and availability of your web application and all its essential +services. The per-second metrics granularity means you have the most accurate information possible for troubleshooting +any LAMP-related issues. + +Another powerful way to monitor the availability of a LAMP stack is the [`httpcheck` +collector](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/httpcheck), which pings a web server at +a regular interval and tells you whether if and how quickly it's responding. The `response_match` option also lets you +monitor when the web server's response isn't what you expect it to be, which might happen if PHP-FPM crashes, for +example. + +The best way to use the `httpcheck` collector is from a separate node from the one running your LAMP stack, which is why +we're not covering it here, but it _does_ work in a single-node setup. Just don't expect it to tell you if your whole +node crashed. + +If you're planning on managing more than one node, or want to take advantage of advanced features, like finding the +source of issues faster with [Metric Correlations](https://learn.netdata.cloud/docs/cloud/insights/metric-correlations), +[sign up](https://app.netdata.cloud/sign-up?cloudRoute=/spaces) for a free Netdata Cloud account. + +### Related reference documentation + +- [Netdata Agent ยท Get Netdata](/docs/get/README.md) +- [Netdata Agent ยท Apache data collector](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/apache) +- [Netdata Agent ยท Web log collector](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/weblog) +- [Netdata Agent ยท MySQL data collector](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/mysql) +- [Netdata Agent ยท PHP-FPM data collector](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/phpfpm) + +[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fdocs%2Fguides%2Fmonitor%2Flamp-stack&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)](<>) \ No newline at end of file diff --git a/docs/guides/monitor/pi-hole-raspberry-pi.md b/docs/guides/monitor/pi-hole-raspberry-pi.md index a180466fb..dc5e0b314 100644 --- a/docs/guides/monitor/pi-hole-raspberry-pi.md +++ b/docs/guides/monitor/pi-hole-raspberry-pi.md @@ -83,9 +83,9 @@ As far as configuring Netdata to monitor Pi-hole metrics, there's nothing you ac collector](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/pihole) will autodetect the new service running on your Raspberry Pi and immediately start collecting metrics every second. -Restart Netdata with `sudo service netdata restart` to start Netdata, which will then recognize that Pi-hole is running -and start a per-second collection job. When you refresh your Netdata dashboard or load it up again in a new tab, you'll -see a new entry in the menu for **Pi-hole** metrics. +Restart Netdata with `sudo systemctl restart netdata`, which will then recognize that Pi-hole is running and start a +per-second collection job. When you refresh your Netdata dashboard or load it up again in a new tab, you'll see a new +entry in the menu for **Pi-hole** metrics. ## Use Netdata to explore and monitor your Raspberry Pi and Pi-hole @@ -119,7 +119,7 @@ cd /etc/netdata sudo ./edit-config charts.d.conf ``` -Uncomment the `sensors=force` line and save the file. Restart Netdata with `sudo service netdata restart` to enable +Uncomment the `sensors=force` line and save the file. Restart Netdata with `sudo systemctl restart netdata` to enable Raspberry Pi temperature sensor monitoring. ### Storing historical metrics on your Raspberry Pi diff --git a/docs/guides/monitor/process.md b/docs/guides/monitor/process.md index 893e6b704..0f7c6861a 100644 --- a/docs/guides/monitor/process.md +++ b/docs/guides/monitor/process.md @@ -169,8 +169,9 @@ postgres: postgres* sql: mariad* postmaster* oracle_* ora_* sqlservr ``` -Restart Netdata with `service netdata restart`, or the appropriate method for your system, to start collecting -utilization metrics from your application. Time to [visualize your process metrics](#visualize-process-metrics). +Restart Netdata with `sudo systemctl restart netdata`, or the [appropriate +method](/docs/configure/start-stop-restart.md) for your system, to start collecting utilization metrics from your +application. Time to [visualize your process metrics](#visualize-process-metrics). ### Custom applications @@ -194,8 +195,9 @@ custom-app: custom-app ... ``` -Restart Netdata with `service netdata restart`, or the appropriate method for your system, to start collecting -utilization metrics from your application. +Restart Netdata with `sudo systemctl restart netdata`, or the [appropriate +method](/docs/configure/start-stop-restart.md) for your system, to start collecting utilization metrics from your +application. ## Visualize process metrics diff --git a/docs/guides/monitor/raspberry-pi-anomaly-detection.md b/docs/guides/monitor/raspberry-pi-anomaly-detection.md new file mode 100644 index 000000000..f5587a89b --- /dev/null +++ b/docs/guides/monitor/raspberry-pi-anomaly-detection.md @@ -0,0 +1,127 @@ + + +# Unsupervised anomaly detection for Raspberry Pi monitoring + +We love IoT and edge at Netdata, we also love machine learning. Even better if we can combine the two to ease the pain +of monitoring increasingly complex systems. + +We recently explored what might be involved in enabling our Python-based [anomalies +collector](/collectors/python.d.plugin/anomalies/README.md) on a Raspberry Pi. To our delight, it's actually quite +straightforward! + +Read on to learn all the steps and enable unsupervised anomaly detection on your on Raspberry Pi(s). + +> Spoiler: It's just a couple of extra commands that will make you feel like a pro. + +## What you need to get started + +- A Raspberry Pi running Raspbian, which we'll call a _node_. +- The [open-source Netdata Agent](https://github.com/netdata/netdata). If you don't have it installed on your node yet, + [get it now](/docs/get/README.md). + +## Install dependencies + +First make sure Netdata is using Python 3 when it runs Python-based data collectors. + +Next, open `netdata.conf` using [`edit-config`](/docs/configure/nodes.md#use-edit-config-to-edit-configuration-files) +from within the [Netdata config directory](/docs/configure/nodes.md#the-netdata-config-directory). Scroll down to the +`[plugin:python.d]` section to pass in the `-ppython3` command option. + +```conf +[plugin:python.d] + # update every = 1 + command options = -ppython3 +``` + +Next, install some of the underlying libraries used by the Python packages the collector depends upon. + +```bash +sudo apt install llvm-9 libatlas3-base libgfortran5 libatlas-base-dev +``` + +Now you're ready to install the Python packages used by the collector itself. First, become the `netdata` user. + +```bash +sudo su -s /bin/bash netdata +``` + +Then pass in the location to find `llvm` as an environment variable for `pip3`. + +```bash +LLVM_CONFIG=llvm-config-9 pip3 install --user llvmlite numpy==1.20.1 netdata-pandas==0.0.32 numba==0.50.1 scikit-learn==0.23.2 pyod==0.8.3 +``` + +## Enable the anomalies collector + +Now you're ready to enable the collector and [restart Netdata](/docs/configure/start-stop-restart.md). + +```bash +sudo ./edit-config python.d.conf +# set `anomalies: no` to `anomalies: yes` + +# restart netdata +sudo systemctl restart netdata +``` + +And that should be it! Wait a minute or two, refresh your Netdata dashboard, you should see the default anomalies +charts under the **Anomalies** section in the dashboard's menu. + +![Anomaly detection on the Raspberry +Pi](https://user-images.githubusercontent.com/1153921/110149717-9d749c00-7d9b-11eb-853c-e041a36f0a41.png) + +## Overhead on system + +Of course one of the most important considerations when trying to do anomaly detection at the edge (as opposed to in a +centralized cloud somewhere) is the resource utilization impact of running a monitoring tool. + +With the default configuration, the anomalies collector uses about 6.5% of CPU at each run. During the retraining step, +CPU utilization jumps to between 20-30% for a few seconds, but you can [configure +retraining](/collectors/python.d.plugin/anomalies/README.md#configuration) to happen less often if you wish. + +![CPU utilization of anomaly detection on the Raspberry +Pi](https://user-images.githubusercontent.com/1153921/110149718-9d749c00-7d9b-11eb-9af8-46e2032cd1d0.png) + +In terms of the runtime of the collector, it was averaging around 250ms during each prediction step, jumping to about +8-10 seconds during a retraining step. This jump equates only to a small gap in the anomaly charts for a few seconds. + +![Execution time of anomaly detection on the Raspberry +Pi](https://user-images.githubusercontent.com/1153921/110149715-9cdc0580-7d9b-11eb-826d-faf6f620621a.png) + +The last consideration then is the amount of RAM the collector needs to store both the models and some of the data +during training. By default, the anomalies collector, along with all other running Python-based collectors, uses about +100MB of system memory. + +![RAM utilization of anomaly detection on the Raspberry +Pi](https://user-images.githubusercontent.com/1153921/110149720-9e0d3280-7d9b-11eb-883d-b1d4d9b9b5e1.png) + +## What's next? + +So, all in all, with a small little bit of extra set up and a small overhead on the Pi itself, the anomalies collector +looks like a potentially useful addition to enable unsupervised anomaly detection on your Pi. + +See our two-part guide series for a more complete picture of configuring the anomalies collector, plus some best +practices on using the charts it automatically generates: + +- [_Detect anomalies in systems and applications_](/docs/guides/monitor/anomaly-detection.md) +- [_Monitor and visualize anomalies with Netdata_](/docs/guides/monitor/visualize-monitor-anomalies.md) + +If you're using your Raspberry Pi for other purposes, like blocking ads/trackers with Pi-hole, check out our companions +Pi guide: [_Monitor Pi-hole (and a Raspberry Pi) with Netdata_](/docs/guides/monitor/pi-hole-raspberry-pi.md). + +Once you've had a chance to give unsupervised anomaly detection a go, share your use cases and let us know of any +feedback on our [community forum](https://community.netdata.cloud/t/anomalies-collector-feedback-megathread/767). + +### Related reference documentation + +- [Netdata Agent ยท Get Netdata](/docs/get/README.md) +- [Netdata Agent ยท Anomalies collector](/collectors/python.d.plugin/anomalies/README.md) + +[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fdocs%2Fguides%2Fmonitor%2Fraspberry-pi-anomaly-detection&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)](<>) diff --git a/docs/guides/monitor/statsd.md b/docs/guides/monitor/statsd.md new file mode 100644 index 000000000..9b1de3047 --- /dev/null +++ b/docs/guides/monitor/statsd.md @@ -0,0 +1,297 @@ + + +# StatsD Guide + +StatsD is a protocol and server implementation, first introduced at Etsy, to aggregate and summarize application metrics. With StatsD, applications are instrumented by developers using the libraries that already exist for the language, without caring about managing the data. The StatsD server is in charge of receiving the metrics, performing some simple processing on them, and then pushing them to the time-series database (TSDB) for long-term storage and visualization. + +Netdata is a fully-functional StatsD server and TSDB implementation, so you can instantly visualize metrics by simply sending them to Netdata using the built-in StatsD server. + +In this guide, we'll go through a scenario of visualizing our data in Netdata in a matter of seconds using [k6](https://k6.io), an open-source tool for automating load testing that outputs metrics to the StatsD format. + +Although we'll use k6 as the use-case, the same principles can be applied to every application that supports the StatsD protocol. Simply enable the StatsD output and point it to the node that runs Netdata, which is `localhost` in this case. + +In general, the process for creating a StatsD collector can be summarized in 2 steps: + +- Run an experiment by sending StatsD metrics to Netdata, without any prior configuration. This will create a chart per metric (called private charts) and will help you verify that everything works as expected from the application side of things. + - Make sure to reload the dashboard tab **after** you start sending data to Netdata. +- Create a configuration file for your app using [edit-config](https://learn.netdata.cloud/guides/step-by-step/step-04): `sudo ./edit-config statsd.d/myapp.conf` + - Each app will have it's own section in the right-hand menu. + +Now, let's see the above process in detail. + +## Prerequisites + +- A node with the [Netdata Agent](https://learn.netdata.cloud/docs/get#install-the-netdata-agent) installed. +- An application to instrument. For this guide, that will be [k6](https://k6.io/docs/getting-started/installation). + +## Understanding the metrics + +The real in instrumenting an application with StatsD for you is to decide what metrics you want to visualize and how you want them grouped. In other words, you need decide which metrics will be grouped in the same charts and how the charts will be grouped on Netdata's dashboard. + +Start with documentation for the particular application that you want to monitor (or the technological stack that you are using). In our case, the [k6 documentation](https://k6.io/docs/using-k6/metrics/) has a whole page dedicated to the metrics output by k6, along with descriptions. + +If you are using StatsD to monitor an existing application, you don't have much control over these metrics. For example, k6 has a type called `trend`, which is identical to timers and histograms. Thus, _k6 is clearly dictating_ which metrics can be used as histograms and simple gauges. + +On the other hand, if you are instrumenting your own code, you will need to not only decide what are the "things" that you want to measure, but also decide which StatsD metric type is the appropriate for each. + +## Use private charts to see all available metrics + +In Netdata, every metric will receive its own chart, called a `private chart`. Although in the final implementation this is something that we will disable, since it can create considerable noise (imagine having 100s of metrics), itโ€™s very handy while building the configuration file. + +You can get a quick visual representation of the metrics and their type (e.g itโ€™s a gauge, a timer, etc.). + +An important thing to notice is that StatsD has different types of metrics, as illustrated in the [Netdata documentation](https://learn.netdata.cloud/docs/agent/collectors/statsd.plugin#metrics-supported-by-netdata). Histograms and timers support mathematical operations to be performed on top of the baseline metric, like reporting the `average` of the value. + +Here are some examples of default private charts. You can see that the histogram private charts will visualize all the available operations. + +**Gauge private chart** + +![Gauge metric example](https://i.imgur.com/Sr5nJEV.png) + +**Histogram private chart** + +![Timer metric example](https://i.imgur.com/P4p0hvq.png) + +## Create a new StatsD configuration file + +Start by creating a new configuration file under the `statsd.d/` folder in the [Netdata config directory](/docs/configure/nodes.md#the-netdata-config-directory). Use [`edit-config`](/docs/configure/nodes.md#use-edit-config-to-edit-configuration-files) to create a new file called `k6.conf`. + +```bash= +sudo ./edit-config statsd.d/k6.conf +``` + +Copy the following configuration into your file as a starting point. + +```conf +[app] + name = k6 + metrics = k6* + private charts = yes + gaps when not collected = no + memory mode = dbengine +``` + +Next, you need is to understand how to organize metrics in Netdataโ€™s StatsD. + +### Synthetic charts + +Netdata lets you group the metrics exposed by your instrumented application with _synthetic charts_. + +First, create a `[dictionary]` section to transform the names of the metrics into human-readable equivalents. `http_req_blocked`, `http_req_connecting`, `http_req_receiving`, and `http_reqs` are all metrics exposed by k6. + +``` +[dictionary] + http_req_blocked = Blocked HTTP Requests + http_req_connecting = Connecting HTTP Requests + http_req_receiving = Receiving HTTP Requests + http_reqs = Total HTTP requests +``` + +Continue this dictionary process with any other metrics you want to collect with Netdata. + +### Families and context + +Families and context are additional ways to group metrics. Families control the submenu at right-hand menu and it's a subcategory of the section. Given the metrics given by K6, we are organizing them in 2 major groups, or `families`: `k6 native metrics` and `http metrics`. + +Context is a second way to group metrics, when the metrics are of the same nature but different origin. In our case, if we ran several different load testing experiments side-by-side, we could define the same app, but different context (e.g `http_requests.experiment1`, `http_requests.experiment2`). + +Find more details about family and context in our [documentation](/web/README.md#families). + +### Dimension + +Now, having decided on how we are going to group the charts, we need to define how we are going to group metrics into different charts. This is particularly important, since we decide: + +- What metrics **not** to show, since they are not useful for our use-case. +- What metrics to consolidate into the same charts, so as to reduce noice and increase visual correlation. + +The dimension option has this syntax: `dimension = [pattern] METRIC NAME TYPE MULTIPLIER DIVIDER OPTIONS` + +- **pattern**: A keyword that tells the StatsD server the `METRIC` string is actually a [simple pattern].(/libnetdata/simple_pattern/README.md). We don't simple patterns in the example, but if we wanted to visualize all the `http_req` metrics, we could have a single dimension: `dimension = pattern 'k6.http_req*' last 1 1`. Find detailed examples with patterns in our [documentation](/collectors/statsd.plugin/README.md#dimension-patterns). +- **METRIC** The id of the metric as it comes from the client. You can easily find this in the private charts above, for example: `k6.http_req_connecting`. +- **NAME**: The name of the dimension. You can use the dictionary to expand this to something more human-readable. +- **TYPE**: + - For all charts: + - `events`: The number of events (data points) received by the StatsD server + - `last`: The last value that the server received + - For histograms and timers: + - `min`, `max`, `sum`, `average`, `percentile`, `median`, `stddev`: This is helpful if you want to see different representations of the same value. You can find an example at the `[iteration_duration]` above. Note that the baseline `metric` is the same, but the `name` of the dimension is different, since we use the baseline, but we perform a computation on it, creating a different final metric for visualization(dimension). +- **MULTIPLIER DIVIDER**: Handy if you want to convert Kilobytes to Megabytes or you want to give negative value. The second is handy for better visualization of send/receive. You can find an example at the **packets** submenu of the **IPv4 Networking Section**. + +> โ• If you define a chart, run Netdata to visualize metrics, and then add or remove a dimension from that chart, this will result in a new chart with the same name, confusing Netdata. If you change the dimensions of the chart, please make sure to also change the `name` of that chart, since it serves as the `id` of that chart in Netdata's storage. (e.g http_req --> http_req_1). + +### Finalize your StatsD configuration file + +It's time to assemble all the pieces together and create the synthetic charts that will consist our application dashboard in Netdata. We can do it in a few simple steps: + +- Decide which metrics we want to use (we have viewed all of them as private charts). For example, we want to use `k6.http_requests`, `k6.vus`, etc. +- Decide how we want organize them in different synthetic charts. For example, we want `k6.http_requests`, `k6.vus` on their own, but `k6.http_req_blocked` and `k6.http_req_connecting` on the same chart. +- For each synthetic chart, we define a **unique** name and a human readable title. +- We decide at which `family` (submenu section) we want each synthetic chart to belong to. For example, here we have defined 2 families: `http requests`, `k6_metrics`. +- If we have multiple instances of the same metric, we can define different contexts, (Optional). +- We define a dimension according to the syntax we highlighted above. +- We define a type for each synthetic chart (line, area, stacked) +- We define the units for each synthetic chart. + +Following the above steps, we append to the `k6.conf` that we defined above, the following configuration: + +``` +[http_req_total] + name = http_req_total + title = Total HTTP Requests + family = http requests + context = k6.http_requests + dimension = k6.http_reqs http_reqs last 1 1 sum + type = line + units = requests/s + +[vus] + name = vus + title = Virtual Active Users + family = k6_metrics + dimension = k6.vus vus last 1 1 + dimension = k6.vus_max vus_max last 1 1 + type = line + unit = vus + +[iteration_duration] + name = iteration_duration_2 + title = Iteration duration + family = k6_metrics + dimension = k6.iteration_duration iteration_duration last 1 1 + dimension = k6.iteration_duration iteration_duration_max max 1 1 + dimension = k6.iteration_duration iteration_duration_min min 1 1 + dimension = k6.iteration_duration iteration_duration_avg avg 1 1 + type = line + unit = s + +[dropped_iterations] + name = dropped_iterations + title = Dropped Iterations + family = k6_metrics + dimension = k6.dropped_iterations dropped_iterations last 1 1 + units = iterations + type = line + +[data] + name = data + title = K6 Data + family = k6_metrics + dimension = k6.data_received data_received last 1 1 + dimension = k6.data_sent data_sent last -1 1 + units = kb/s + type = area + +[http_req_status] + name = http_req_status + title = HTTP Requests Status + family = http requests + dimension = k6.http_req_blocked http_req_blocked last 1 1 + dimension = k6.http_req_connecting http_req_connecting last 1 1 + units = ms + type = line + +[http_req_duration] + name = http_req_duration + title = HTTP requests duration + family = http requests + dimension = k6.http_req_sending http_req_sending last 1 1 + dimension = k6.http_req_waiting http_req_waiting last 1 1 + dimension = k6.http_req_receiving http_req_receiving last 1 1 + units = ms + type = stacked +``` + +> Take note that Netdata will report the rate for metrics and counters, even if k6 or another application sends an _absolute_ number. For example, k6 sends absolute HTTP requests with `http_reqs`, but Netdat visualizes that in `requests/second`. + +To enable this StatsD configuration, [restart Netdata](/docs/configure/start-stop-restart.md). + +## Final touches + +At this point, you have used StatsD to gather metrics for k6, creating a whole new section in your Netdata dashboard in the process. Uil can further customize the icon of the particular section, as well as the description for each chart. + +To edit the section, please follow the Netdata [documentation](https://learn.netdata.cloud/docs/agent/web/gui#customizing-the-local-dashboard). + +While the following configuration will be placed in a new file, as the documentation suggests, it is instructing to use `dashboard_info.js` as a template. Open the file and see how the rest of sections and collectors have been defined. + +```javascript= +netdataDashboard.menu = { + 'k6': { + title: 'K6 Load Testing', + icon: '', + info: 'k6 is an open-source load testing tool and cloud service providing the best developer experience for API performance testing.' + }, + . + . + . +``` + +We can then add a description for each chart. Simply find the following section in `dashboard_info.js` to understand how a chart definitions are used: + +```javascript= +netdataDashboard.context = { + 'system.cpu': { + info: function (os) { + void (os); + return 'Total CPU utilization (all cores). 100% here means there is no CPU idle time at all. You can get per core usage at the CPUs section and per application usage at the Applications Monitoring section.' + + netdataDashboard.sparkline('
Keep an eye on iowait ', 'system.cpu', 'iowait', '%', '. If it is constantly high, your disks are a bottleneck and they slow your system down.') + + netdataDashboard.sparkline('
An important metric worth monitoring, is softirq ', 'system.cpu', 'softirq', '%', '. A constantly high percentage of softirq may indicate network driver issues.'); + }, + valueRange: "[0, 100]" + }, +``` + +Afterwards, you can open your `custom_dashboard_info.js`, as suggested in the documentation linked above, and add something like the following example: + +```javascript= +netdataDashboard.context = { + 'k6.http_req_duration': { + info: "Total time for the request. It's equal to http_req_sending + http_req_waiting + http_req_receiving (i.e. how long did the remote server take to process the request and respond, without the initial DNS lookup/connection times)" + }, + +``` +The chart is identified as ``.``. + +These descriptions can greatly help the Netdata user who is monitoring your application in the midst of an incident. + +The `info` field supports `html`, embedding useful links and instructions in the description. + +## Vendoring a new collector + +After all this hussle, not only did we illustrate how to visualize any data source in Netdata using the StatsD protocol, but we have also created a new collector in the process. + +While using the same underlying collector-StatsD-every new `myapp.conf` file will in essence create a new data source and dashboard section for Netdata. While Netdata will load all the configuration files by default, it will **not** create dashboard sections or charts, unless it start receiving data for that particular data source. This means that we can now share our collector with the rest of the Netdata community. + +If you want to contribute or you need any help in developing your collector, we have a whole [Forum Category](https://community.netdata.cloud/c/agent-development/9) dedicated to contributing to the Netdata Agent. + +### Making a PR to the netdata/netdata repository + +- Make sure you follow the contributing guide and read our Code of Conduct +- Fork the netdata/netdata repository +- Place the configuration file inside `netdata/collectors/statsd.plugin` +- Add a reference in `netdata/collectors/statsd.plugin/Makefile.am`. For example, if we contribute the `k6.conf` file: +```Makefile +dist_statsdconfig_DATA = \ + example.conf \ + k6.conf \ + $(NULL) +``` + +## What's next? + +In this tutorial, you learned how to monitor an application using Netdata's StatsD implementation. + +Netdata allows you easily visualize any StatsD metric without any configuration, since it creates a private metric per chart by default. But to make your implementation more robust, you also learned how to group metrics by family and context, and create multiple dimensions. With these tools, you can quickly instrument any application with StatsD to monitor its performance and availability with per-second metrics. + +### Related reference documentation + +- [Netdata Agent ยท StatsD](/collectors/statsd.plugin/README.md) + +[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fdocs%2Fguides%2Fmonitor%2Fstatsdr&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)](<>) diff --git a/docs/guides/monitor/visualize-monitor-anomalies.md b/docs/guides/monitor/visualize-monitor-anomalies.md index f37dadc62..681ba8390 100644 --- a/docs/guides/monitor/visualize-monitor-anomalies.md +++ b/docs/guides/monitor/visualize-monitor-anomalies.md @@ -136,9 +136,6 @@ unsupervised anomaly detection, or would like to see something added to it. You that works well for monitoring some other popular application, like MySQL, PostgreSQL, Redis, or anything else we [support through collectors](/collectors/COLLECTORS.md). -In part 3 of this series on unsupervised anomaly detection using Netdata, we'll create a custom model to apply -unsupervised anomaly detection to an entire mission-critical application. Stay tuned! - ### Related reference documentation - [Netdata Agent ยท Anomalies collector](/collectors/python.d.plugin/anomalies/README.md) diff --git a/docs/guides/python-collector.md b/docs/guides/python-collector.md new file mode 100644 index 000000000..f327da322 --- /dev/null +++ b/docs/guides/python-collector.md @@ -0,0 +1,486 @@ + + +# Develop a custom data collector in Python + +The Netdata Agent uses [data collectors](/docs/collect/how-collectors-work.md) to fetch metrics from hundreds of system, +container, and service endpoints. While the Netdata team and community has built [powerful +collectors](/collectors/COLLECTORS.md) for most system, container, and service/application endpoints, there are plenty +of custom applications that can't be monitored by default. + +## Problem + +You have a custom application or infrastructure that you need to monitor, but no open-source monitoring tool offers a +prebuilt method for collecting your required metric data. + +## Solution + +In this tutorial, you'll learn how to leverage the [Python programming language](https://www.python.org/) to build a +custom data collector for the Netdata Agent. Follow along with your own dataset, using the techniques and best practices +covered here, or use the included examples for collecting and organizing eithre random or weather data. + +## What you need to get started + +- A physical or virtual Linux system, which we'll call a _node_. +- A working installation of the free, open-source [Netdata Agent](/docs/get/README.md). + +## Jobs and elements of a Python collector + +A Python collector for Netdata is a Python script that gathers data from an external source and transforms these data +into charts to be displayed by Netdata dashboard. The basic jobs of the plugin are: + +- Gather the data from the service/application. +- Create the required charts. +- Parse the data to extract or create the actual data to be represented. +- Assign the correct values to the charts +- Set the order for the charts to be displayed. +- Give the charts data to Netdata for visualization. + +The basic elements of a Netdata collector are: + +- `ORDER[]`: A list containing the charts to be displayed. +- `CHARTS{}`: A dictionary containing the details for the charts to be displayed. +- `data{}`: A dictionary containing the values to be displayed. +- `get_data()`: The basic function of the plugin which will retrun to Netdata the correct values. + +Let's walk through these jobs and elements as independent elements first, then apply them to example Python code. + +### Determine how to gather metrics data + +Netdata can collect data from any program that can print to stdout. Common input sources for collectors can be logfiles, +HTTP requests, executables, and more. While this tutorial will offer some example inputs, your custom application will +have different inputs and metrics. + +A great deal of the work in developing a Netdata collector is investigating the target application and understanding +which metrics it exposes and how to + +### Create charts + +For the data to be represented in the Netdata dashboard, you need to create charts. Charts (in general) are defined by +several characteristics: title, legend, units, type, and presented values. Each chart is represented as a dictionary +entry: + +```python +chart= { + "chart_name": + { + "options": [option_list], + "lines": [ + [dimension_list] + ] + } + } +``` + +Use the `options` field to set the chart's options, which is a list in the form `options: [name, title, units, family, +context, charttype]`, where: + +- `name`: The name of the chart. +- `title` : The title to be displayed in the chart. +- `units` : The units for this chart. +- `family`: An identifier used to group charts together (can be null). +- `context`: An identifier used to group contextually similar charts together. The best practice is to provide a context + that is `A.B`, with `A` being the name of the collector, and `B` being the name of the specific metric. +- `charttype`: Either `line`, `area`, or `stacked`. If null line is the default value. + +You can read more about `family` and `context` in the [web dashboard](/web/README.md#families) doc. + +Once the chart has been defined, you should define the dimensions of the chart. Dimensions are basically the metrics to +be represented in this chart and each chart can have more than one dimension. In order to define the dimensions, the +"lines" list should be filled in with the required dimensions. Each dimension is a list: + +`dimension: [id, name, algorithm, multiplier, divisor]` +- `id` : The id of the dimension. Mandatory unique field (string) required in order to set a value. +- `name`: The name to be presented in the chart. If null id will be used. +- `algorithm`: Can be absolute or incremental. If null absolute is used. Incremental shows the difference from the + previous value. +- `multiplier`: an integer value to divide the collected value, if null, 1 is used +- `divisor`: an integer value to divide the collected value, if null, 1 is used + +The multiplier/divisor fields are used in cases where the value to be displayed should be decimal since Netdata only +gathers integer values. + +### Parse the data to extract or create the actual data to be represented + +Once the data is received, your collector should process it in order to get the values required. If, for example, the +received data is a JSON string, you should parse the data to get the required data to be used for the charts. + +### Assign the correct values to the charts + +Once you have process your data and get the required values, you need to assign those values to the charts you created. +This is done using the `data` dictionary, which is in the form: + +`"data": {dimension_id: value }`, where: +- `dimension_id`: The id of a defined dimension in a created chart. +- `value`: The numerical value to associate with this dimension. + +### Set the order for the charts to be displayed + +Next, set the order of chart appearance with the `ORDER` list, which is in the form: + +`"ORDER": [chart_name_1,chart_name_2, โ€ฆ., chart_name_X]`, where: +- `chart_name_x`: is the chart name to be shown in X order. + +### Give the charts data to Netdata for visualization + +Our plugin should just rerun the data dictionary. If everything is set correctly the charts should be updated with the +correct values. + +## Framework classes + +The `python.d` plugin has a number of framework classes that can be used to speed up the development of your python +collector. Your class can inherit one of these framework classes, which have preconfigured methods. + +For example, the snippet bellow is from the [RabbitMQ +collector](https://github.com/netdata/netdata/blob/91f3268e9615edd393bd43de4ad8068111024cc9/collectors/python.d.plugin/rabbitmq/rabbitmq.chart.py#L273). +This collector uses an HTTP endpoint and uses the `UrlService` framework class, which only needs to define an HTTP +endpoint for data collection. + +```python +class Service(UrlService): + def __init__(self, configuration=None, name=None): + UrlService.__init__(self, configuration=configuration, name=name) + self.order = ORDER + self.definitions = CHARTS + self.url = '{0}://{1}:{2}'.format( + configuration.get('scheme', 'http'), + configuration.get('host', '127.0.0.1'), + configuration.get('port', 15672), + ) + self.node_name = str() + self.vhost = VhostStatsBuilder() + self.collected_vhosts = set() + self.collect_queues_metrics = configuration.get('collect_queues_metrics', False) + self.debug("collect_queues_metrics is {0}".format("enabled" if self.collect_queues_metrics else "disabled")) + if self.collect_queues_metrics: + self.queue = QueueStatsBuilder() + self.collected_queues = set() +``` + +In our use-case, we use the `SimpleService` framework, since there is no framework class that suits our needs. + +You can read more about the [framework classes](/collectors/python.d.plugin/README.md#how-to-write-a-new-module) from +the Netdata documentation. + +## An example collector using weather station data + +Let's build a custom Python collector for visualizing data from a weather monitoring station. + +### Determine how to gather metrics data + +This example assumes you can gather metrics data through HTTP requests to a web server, and that the data provided are +numeric values for temperature, humidity and pressure. It also assumes you can get the `min`, `max`, and `average` +values for these metrics. + +### Chart creation + +First, create a single chart that shows the latest temperature metric: + +```python +CHARTS = { + "temp_current": { + "options": ["my_temp", "Temperature", "Celsius", "TEMP", "weather_station.temperature", "line"], + "lines": [ + ["current_temp_id","current_temperature"] + ] + } +} +``` + +## Parse the data to extract or create the actual data to be represented + +A standard practice would be to either get the data on JSON format or transform them to JSON format. We use a dictionary +to give this format and issue random values to simulate received data. + +The following code iterates through the names of the expected values and creates a dictionary with the name of the value +as `key`, and a random value as `value`. + +```python + weather_data=dict() + weather_metrics=[ + "temp","av_temp","min_temp","max_temp", + "humid","av_humid","min_humid","max_humid", + "pressure","av_pressure","min_pressure","max_pressure", + ] + + def populate_data(self): + for metric in self.weather_metrics: + self.weather_data[metric]=random.randint(0,100) +``` + +### Assign the correct values to the charts + +Our chart has a dimension called `current_temp_id`, which should have the temperature value received. + +```python +data['current_temp_id'] = self.weather_data["temp"] +``` + +### Set the order for the charts to be displayed + +```python +ORDER = [ + "temp_current" +] +``` + +### Give the charts data to Netdata for visualization + +```python +return data +``` + +A snapshot of the chart created by this plugin: + +![A snapshot of the chart created by this plugin](https://i.imgur.com/2tR9KvF.png) + +Here's the current source code for the data collector: + +```python +# -*- coding: utf-8 -*- +# Description: howto weather station netdata python.d module +# Author: Panagiotis Papaioannou (papajohn-uop) +# SPDX-License-Identifier: GPL-3.0-or-later + +from bases.FrameworkServices.SimpleService import SimpleService + +import random + +NETDATA_UPDATE_EVERY=1 +priority = 90000 + +ORDER = [ + "temp_current" +] + +CHARTS = { + "temp_current": { + "options": ["my_temp", "Temperature", "Celsius", "TEMP", "weather_station.temperature", "line"], + "lines": [ + ["current_temperature"] + ] + } +} + +class Service(SimpleService): + def __init__(self, configuration=None, name=None): + SimpleService.__init__(self, configuration=configuration, name=name) + self.order = ORDER + self.definitions = CHARTS + #values to show at graphs + self.values=dict() + + @staticmethod + def check(): + return True + + weather_data=dict() + weather_metrics=[ + "temp","av_temp","min_temp","max_temp", + "humid","av_humid","min_humid","max_humid", + "pressure","av_pressure","min_pressure","max_pressure", + ] + + def logMe(self,msg): + self.debug(msg) + + def populate_data(self): + for metric in self.weather_metrics: + self.weather_data[metric]=random.randint(0,100) + + def get_data(self): + #The data dict is basically all the values to be represented + # The entries are in the format: { "dimension": value} + #And each "dimension" shoudl belong to a chart. + data = dict() + + self.populate_data() + + data['current_temperature'] = self.weather_data["temp"] + + return data +``` + +## Add more charts to the existing weather station collector + +To enrich the example, add another chart the collector which to present the humidity metric. + +Add a new entry in the `CHARTS` dictionary with the definition for the new chart. + +```python +CHARTS = { + 'temp_current': { + 'options': ['my_temp', 'Temperature', 'Celsius', 'TEMP', 'weather_station.temperature', 'line'], + 'lines': [ + ['current_temperature'] + ] + }, + 'humid_current': { + 'options': ['my_humid', 'Humidity', '%', 'HUMIDITY', 'weather_station.humidity', 'line'], + 'lines': [ + ['current_humidity'] + ] + } +} +``` + +The data has already been created and parsed by the `weather_data=dict()` function, so you only need to populate the +`current_humidity` dimension `self.weather_data["humid"]`. + +```python + data['current_temperature'] = self.weather_data["temp"] + data['current_humidity'] = self.weather_data["humid"] +``` + +Next, put the new `humid_current` chart into the `ORDER` list: + +```python +ORDER = [ + 'temp_current', + 'humid_current' +] +``` + +[Restart Netdata](/docs/configure/start-stop-restart.md) with `sudo systemctl restart netdata` to see the new humidity +chart: + +![A snapshot of the modified chart](https://i.imgur.com/XOeCBmg.png) + +Next, time to add one more chart that visualizes the average, minimum, and maximum temperature values. + +Add a new entry in the `CHARTS` dictionary with the definition for the new chart. Since you want three values +represented in this this chart, add three dimensions. You shoudl also use the same `FAMILY` value in the charts (`TEMP`) +so that those two charts are grouped together. + +```python +CHARTS = { + 'temp_current': { + 'options': ['my_temp', 'Temperature', 'Celsius', 'TEMP', 'weather_station.temperature', 'line'], + 'lines': [ + ['current_temperature'] + ] + }, + 'temp_stats': { + 'options': ['stats_temp', 'Temperature', 'Celsius', 'TEMP', 'weather_station.temperature_stats', 'line'], + 'lines': [ + ['min_temperature'], + ['max_temperature'], + ['avg_temperature'] + ] + }, + 'humid_current': { + 'options': ['my_humid', 'Humidity', '%', 'HUMIDITY', 'weather_station.humidity', 'line'], + 'lines': [ + ['current_humidity'] + ] + } + +} +``` + +As before, initiate new dimensions and add data to them: + +```python + data['current_temperature'] = self.weather_data["temp"] + data['min_temperature'] = self.weather_data["min_temp"] + data['max_temperature'] = self.weather_data["max_temp"] + data['avg_temperature`'] = self.weather_data["av_temp"] + data['current_humidity'] = self.weather_data["humid"] +``` + +Finally, set the order for the `temp_stats` chart: + +```python +ORDER = [ + 'temp_current', + โ€˜temp_statsโ€™ + 'humid_current' +] +``` + +[Restart Netdata](/docs/configure/start-stop-restart.md) with `sudo systemctl restart netdata` to see the new +min/max/average temperature chart with multiple dimensions: + +![A snapshot of the modified chart](https://i.imgur.com/g7E8lnG.png) + +## Add a configuration file + +The last piece of the puzzle to create a fully robust Python collector is the configuration file. Python.d uses +configuration in [YAML](https://www.tutorialspoint.com/yaml/yaml_basics.htm) format and is used as follows: + +- Create a configuration file in the same directory as the `.chart.py`. Name it `.conf`. +- Define a `job`, which is an instance of the collector. It is useful when you want to collect data from different + sources with different attributes. For example, we could gather data from 2 different weather stations, which use + different temperature measures: Fahrenheit and Celcius. +- You can define many different jobs with the same name, but with different attributes. Netdata will try each job + serially and will stop at the first job that returns data. If multiple jobs have the same name, only one of them can + run. This enables you to define different "ways" to fetch data from a particular data source so that the collector has + more chances to work out-of-the-box. For example, if the data source supports both `HTTP` and `linux socket`, you can + define 2 jobs named `local`, with each using a different method. +- Check the `postgresql` collector configuration file on + [GitHub](https://github.com/netdata/netdata/blob/master/collectors/python.d.plugin/postgres/postgres.conf) to get a + sense of the structure. + +```yaml +weather_station_1: + name: 'Greece' + endpoint: 'https://endpoint_1.com' + port: 67 + type: 'celcius' +weather_station_2: + name: 'Florida USA' + endpoint: 'https://endpoint_2.com' + port: 67 + type: 'fahrenheit' +``` + +Next, access the above configuration variables in the `__init__` function: + +```python +def __init__(self, configuration=None, name=None): + SimpleService.__init__(self, configuration=configuration, name=name) + self.endpoint = self.configuration.get('endpoint', ) +``` + +Because you initiate the `framework class` (e.g `SimpleService.__init__`), the configuration will be available +throughout the whole `Service` class of your module, as `self.configuration`. Finally, note that the `configuration.get` +function takes 2 arguments, one with the name of the configuration field and one with a default value in case it doesn't +find the configuration field. This allows you to define sane defaults for your collector. + +Moreover, when creating the configuration file, create a large comment section that describes the configuration +variables and inform the user about the defaults. For example, take a look at the `postgresql` collector on +[GitHub](https://github.com/netdata/netdata/blob/master/collectors/python.d.plugin/postgres/postgres.conf). + +You can read more about the configuration file on the [`python.d.plugin` +documentation](https://learn.netdata.cloud/docs/agent/collectors/python.d.plugin). + +## What's next? + +Find the source code for the above examples on [GitHub](https://github.com/papajohn-uop/netdata). + +Now we you ready to start developing our Netdata python Collector and share it with the rest of the Netdata community. + +- If you need help while developing your collector, join our [Netdata + Community](https://community.netdata.cloud/c/agent-development/9) to chat about it. +- Follow the + [checklist](https://learn.netdata.cloud/docs/agent/collectors/python.d.plugin#pull-request-checklist-for-python-plugins) + to contribute the collector to the Netdata Agent [repository](https://github.com/netdata/netdata). +- Check out the [example](https://github.com/netdata/netdata/tree/master/collectors/python.d.plugin/example) Python + collector, which is a minimal example collector you could also use as a starting point. Once comfortable with that, + then browse other [existing collectors](https://github.com/netdata/netdata/tree/master/collectors/python.d.plugin) + that might have similarities to what you want to do. +- If you're developing a proof of concept (PoC), consider migrating the collector in Golang + ([go.d.plugin](https://github.com/netdata/go.d.plugin)) once you validate its value in production. Golang is more + performant, easier to maintain, and simpler for users since it doesn't require a particular runtime on the node to + execute (Python plugins require Python on the machine to be executed). Netdata uses Go as the platform of choice for + production-grade collectors. +- Celebrate! You have contributed to an open-source project with hundreds of thousands of users! + +[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fdocs%2Fguides%2Fpython-collector&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)](<>) diff --git a/docs/guides/step-by-step/step-04.md b/docs/guides/step-by-step/step-04.md index 0495145f4..41431b1d0 100644 --- a/docs/guides/step-by-step/step-04.md +++ b/docs/guides/step-by-step/step-04.md @@ -95,8 +95,8 @@ section and give it the value of `1`. test = 1 ``` -Restart Netdata with `service restart netdata` or the [appropriate -alternative](/docs/getting-started.md#start-stop-and-restart-netdata) for your system. +Restart Netdata with `sudo systemctl restart netdata`, or the [appropriate +method](/docs/configure/start-stop-restart.md) for your system. Now, open up your browser and navigate to `http://HOST:19999/netdata.conf`. You'll see that Netdata has recognized that our fake option isn't valid and added a notice that Netdata will ignore it. diff --git a/docs/guides/step-by-step/step-05.md b/docs/guides/step-by-step/step-05.md index 5e627632d..30ab329cd 100644 --- a/docs/guides/step-by-step/step-05.md +++ b/docs/guides/step-by-step/step-05.md @@ -69,8 +69,8 @@ the `warn` and `crit` lines to the values of your choosing. For example: crit: $this > (($status == $CRITICAL) ? (75) : (85)) ``` -You _can_ [restart Netdata](/docs/getting-started.md#start-stop-and-restart-netdata) to enable your tune, but you can -also reload _only_ the health monitoring component using one of the available [methods](/health/QUICKSTART.md#reload-health-configuration). +You _can_ restart Netdata with `sudo systemctl restart netdata`, to enable your tune, but you can also reload _only_ the +health monitoring component using one of the available [methods](/health/QUICKSTART.md#reload-health-configuration). You can also tune any other aspect of the default alarms. To better understand how each line in a health entity works, read our [health documentation](/health/README.md). diff --git a/docs/guides/step-by-step/step-06.md b/docs/guides/step-by-step/step-06.md index 160b1b1f6..618886076 100644 --- a/docs/guides/step-by-step/step-06.md +++ b/docs/guides/step-by-step/step-06.md @@ -7,8 +7,8 @@ custom_edit_url: https://github.com/netdata/netdata/edit/master/docs/guides/step When Netdata _starts_, it auto-detects dozens of **data sources**, such as database servers, web servers, and more. -To auto-detect and collect metrics from a source you just installed, you need to [restart -Netdata](/docs/getting-started.md#start-stop-and-restart-netdata). +To auto-detect and collect metrics from a source you just installed, you need to restart Netdata using `sudo systemctl +restart netdata`, or the [appropriate method](/docs/configure/start-stop-restart.md) for your system. However, auto-detection only works if you installed the source using its standard installation procedure. If Netdata isn't collecting metrics after a restart, your source probably isn't configured @@ -99,9 +99,9 @@ Next, edit your `/etc/nginx/sites-enabled/default` file to include a `location` } ``` -Restart Netdata using `service netdata restart` or the [correct -alternative](/docs/getting-started.md#start-stop-and-restart-netdata) for your system, and Netdata will auto-detect -metrics from your Nginx web server! +Restart Netdata using `sudo systemctl restart netdata`, or the [appropriate +method](/docs/configure/start-stop-restart.md) for your system, and Netdata will auto-detect metrics from your Nginx web +server! While not necessary for most auto-detection and collection purposes, you can also configure the Nginx collector itself by editing its configuration file: diff --git a/docs/guides/step-by-step/step-09.md b/docs/guides/step-by-step/step-09.md index 636ffea1f..c5b2ecd54 100644 --- a/docs/guides/step-by-step/step-09.md +++ b/docs/guides/step-by-step/step-09.md @@ -62,7 +62,8 @@ metrics your Agent collects, and more. dbengine disk space = 512 ``` -After you've made your changes, [restart Netdata](/docs/getting-started.md#start-stop-and-restart-netdata). +After you've made your changes, restart Netdata using `sudo systemctl restart netdata`, or the [appropriate +method](/docs/configure/start-stop-restart.md) for your system. To confirm the database engine is working, go to your Netdata dashboard and click on the **Netdata Monitoring** menu on the right-hand side. You can find `dbengine` metrics after `queries`. @@ -142,9 +143,10 @@ Add the following section to the file: collection = netdata_metrics ``` -[Restart](/docs/getting-started.md#start-stop-and-restart-netdata) Netdata to enable the MongoDB exporting connector. -Click on the **Netdata Monitoring** menu and check out the **exporting my mongo instance** sub-menu. You should start -seeing these charts fill up with data about the exporting process! +Restart Netdata using `sudo systemctl restart netdata`, or the [appropriate +method](/docs/configure/start-stop-restart.md) for your system, to enable the MongoDB exporting connector. Click on the +**Netdata Monitoring** menu and check out the **exporting my mongo instance** sub-menu. You should start seeing these +charts fill up with data about the exporting process! ![image](https://user-images.githubusercontent.com/1153921/70443852-25171200-1a56-11ea-8be3-494544b1c295.png) diff --git a/docs/guides/troubleshoot/monitor-debug-applications-ebpf.md b/docs/guides/troubleshoot/monitor-debug-applications-ebpf.md index 342193c58..13efa20e8 100644 --- a/docs/guides/troubleshoot/monitor-debug-applications-ebpf.md +++ b/docs/guides/troubleshoot/monitor-debug-applications-ebpf.md @@ -57,8 +57,9 @@ dev: custom-app ... ``` -Restart Netdata with `sudo service netdata restart` or the appropriate method for your system to begin seeing metrics -for this particular group+process. You can also add additional processes to the same group. +Restart Netdata with `sudo systemctl restart netdata`, or the [appropriate +method](/docs/configure/start-stop-restart.md) for your system, to begin seeing metrics for this particular +group+process. You can also add additional processes to the same group. You can set up `apps_groups.conf` to more show more precise eBPF metrics for any application or service running on your system, even if it's a standard package like Redis, Apache, or any other [application/service Netdata collects @@ -105,7 +106,8 @@ Replace `entry` with `return`: network viewer = yes ``` -Restart Netdata with `sudo service netdata restart` or the appropriate method for your system. +Restart Netdata with `sudo systemctl restart netdata`, or the [appropriate +method](/docs/configure/start-stop-restart.md) for your system. ## Get familiar with per-application eBPF metrics and charts diff --git a/docs/monitor/view-active-alarms.md b/docs/monitor/view-active-alarms.md index 8837e48ad..63ddfdde1 100644 --- a/docs/monitor/view-active-alarms.md +++ b/docs/monitor/view-active-alarms.md @@ -9,17 +9,14 @@ custom_edit_url: https://github.com/netdata/netdata/edit/master/docs/monitor/vie Every Netdata Agent comes with hundreds of pre-installed health alarms designed to notify you when an anomaly or performance issue affects your node or the applications it runs. -As soon as you launch a Netdata Agent and [claim it](/docs/get/README.md#claim-your-node-on-netdata-cloud), you can view -active alarms in both the local dashboard and Netdata Cloud. +## Netdata Cloud -## View active alarms in Netdata Cloud - -You can see active alarms from any node in your infrastructure in two ways: Click on the bell ๐Ÿ”” icon in the top -navigation, or click on the first column of any node's row in Nodes. This column's color changes based on the node's -[health status](/health/REFERENCE.md#alarm-statuses): gray is `CLEAR`, yellow is `WARNING`, and red is `CRITICAL`. +A War Room's [alarms indicator](https://learn.netdata.cloud/docs/cloud/war-rooms#indicators) displays the number of active `critical` (red) and +`warning` (yellow) alerts for the nodes in this War Room. Click on either the critical or warning badges to open a +pre-filtered modal displaying only those types of [active alarms](https://learn.netdata.cloud/docs/cloud/monitoring/alarms). ![The Alarms panel in Netdata -Cloud](https://user-images.githubusercontent.com/1153921/93541137-70761f00-f90a-11ea-89ef-7948c6213200.png) +Cloud](https://user-images.githubusercontent.com/1153921/108564747-d2bfbb00-72c0-11eb-97b9-5863ad3324eb.png) The Alarms panel lists all active alarms for nodes within that War Room, and tells you which chart triggered the alarm, what that chart's current value is, the alarm that triggered it, and when the alarm status first began. @@ -35,10 +32,10 @@ The active alarm information gives you details about the alarm that's been trigg configuration, how it calculates warning or critical alarms, and which configuration file you could edit on that node if you want to tweak or disable the alarm to better suit your needs. -![Screenshot from 2020-09-17 -17-21-29](https://user-images.githubusercontent.com/1153921/93541139-710eb580-f90a-11ea-809d-25afe1270108.png) +![Active alarm details in Netdata +Cloud](https://user-images.githubusercontent.com/1153921/108564813-f08d2000-72c0-11eb-80c8-b2af22a751fd.png) -## View active alarms in the Netdata Agent +## Local Netdata Agent dashboard Find the bell ๐Ÿ”” icon in the top navigation to bring up a modal that shows currently raised alarms, all running alarms, and the alarms log. Here is an example of a raised `system.cpu` alarm, followed by the full list and alarm log: diff --git a/docs/quickstart/infrastructure.md b/docs/quickstart/infrastructure.md index 0e355f373..8ec9b75cb 100644 --- a/docs/quickstart/infrastructure.md +++ b/docs/quickstart/infrastructure.md @@ -62,8 +62,8 @@ inviting others, you can better synchronize with your team or colleagues to unde When something goes wrong, you'll be ready to collaboratively troubleshoot complex performance problems from a single pane of glass. -To invite new users, click on **Invite Users** in the left-hand navigation panel beneath your Space's name. Choose which -War Rooms to add this user to, then click **Send**. +To [invite new users](https://learn.netdata.cloud/docs/cloud/manage/invite-your-team), click on **Invite Users** in the +Space management Area. Choose which War Rooms to add this user to, then click **Send**. If your team members have trouble signing in, direct them to the [Netdata Cloud sign in](https://learn.netdata.cloud/docs/cloud/manage/sign-in) doc. @@ -76,7 +76,7 @@ Overview features composite charts, which display aggregated metrics from every are streamed on-demand from individual nodes and composited onto a single, familiar dashboard. ![The War Room -Overview](https://user-images.githubusercontent.com/1153921/102651377-b1f4b100-4129-11eb-8e60-d2995d258c16.png) +Overview](https://user-images.githubusercontent.com/1153921/108732681-09791980-74eb-11eb-9ba2-98cb1b6608de.png) Read more about the Overview in the [infrastructure overview](/docs/visualize/overview-infrastructure.md) doc. @@ -101,7 +101,7 @@ complex issues by aggregating correlated charts from any number of nodes. For ex from every node in your infrastructure on a single dashboard. ![An example system CPU -dashboard](https://user-images.githubusercontent.com/1153921/95915568-2db63400-0d5c-11eb-92cc-3c61cb6519dd.png) +dashboard](https://user-images.githubusercontent.com/1153921/108732974-4b09c480-74eb-11eb-87a2-c67e569c08b6.png) Read more about [creating new dashboards](/docs/visualize/create-dashboards.md) for more details about the process and additional tips on best leveraging the feature to help you troubleshoot complex performance problems. diff --git a/docs/store/change-metrics-storage.md b/docs/store/change-metrics-storage.md index 0e2db1369..6dde22c04 100644 --- a/docs/store/change-metrics-storage.md +++ b/docs/store/change-metrics-storage.md @@ -54,7 +54,8 @@ multihost disk space` setting. Change it to the value recommended above. For exa dbengine multihost disk space = 1024 ``` -Save the file and restart the Agent with `service netdata restart` to change the database engine's size. +Save the file and restart the Agent with `sudo systemctl restart netdata`, or the [appropriate +method](/docs/configure/start-stop-restart.md) for your system, to change the database engine's size. ## What's next? diff --git a/docs/visualize/overview-infrastructure.md b/docs/visualize/overview-infrastructure.md index 675abd745..288bfc515 100644 --- a/docs/visualize/overview-infrastructure.md +++ b/docs/visualize/overview-infrastructure.md @@ -7,22 +7,21 @@ custom_edit_url: https://github.com/netdata/netdata/edit/master/docs/visualize/o # See an overview of your infrastructure In Netdata Cloud, your nodes are organized into War Rooms. One of the two available views for a War Room is the -**Overview**, which uses composite charts to display real-time, aggregated metrics from all the nodes (or a filtered -selection) in a given War Room. +[**Overview**](https://learn.netdata.cloud/docs/cloud/visualize/overview), which uses composite charts to display +real-time, aggregated metrics from all the nodes (or a filtered selection) in a given War Room. With Overview's composite charts, you can see your infrastructure from a single pane of glass, discover trends or anomalies, then drill down with filtering or single-node dashboards to see more. In the screenshot below, each chart visualizes average or sum metrics values from across 5 distributed nodes. +Netdata also supports robust Kubernetes monitoring using the Overview. Read our [deployment +doc](/packaging/installer/methods/kubernetes.md) for details on visualizing Kubernetes metrics in Netdata Cloud. + ![The War Room -Overview](https://user-images.githubusercontent.com/1153921/102651377-b1f4b100-4129-11eb-8e60-d2995d258c16.png) +Overview](https://user-images.githubusercontent.com/1153921/108732681-09791980-74eb-11eb-9ba2-98cb1b6608de.png) ## Using the Overview -> โš ๏ธ In order for nodes to contribute to composite charts, and thus the Overview UI, they must run v1.26.0 or later of -> the Netdata Agent. See our [update docs](/packaging/installer/UPDATE.md) for the preferred update method based on how -> you installed the Agent. - The Overview uses roughly the same interface as local Agent dashboards or single-node dashboards in Netdata Cloud. By showing all available metrics from all your nodes in a single interface, Netdata Cloud helps you visualize the overall health of your infrastructure. Best of all, you don't have to worry about creating your own dashboards just to get @@ -79,9 +78,6 @@ contributing. Click on the link icon next to a given node to quickly _jump to the same chart in that node's single-node dashboard_ in Netdata Cloud. -![Jumping to a single-node dashboard in Netdata -Cloud](https://user-images.githubusercontent.com/1153921/99317327-1e2a7f00-2823-11eb-8fc3-76f260ced86a.gif) - You can use single-node dashboards in Netdata Cloud to drill down on specific issues, scrub backward in time to investigate historical data, and see like metrics presented meaningfully to help you troubleshoot performance problems. All of the familiar [interactions](/docs/visualize/interact-dashboards-charts.md) are available, as is adding any chart @@ -92,7 +88,7 @@ to a [new dashboard](/docs/visualize/create-dashboards.md). You can also use the **Nodes view** to monitor the health status and user-configurable key metrics from multiple nodes in a War Room. Read the [Nodes view doc](https://learn.netdata.cloud/docs/cloud/visualize/nodes) for details. -![The Nodes view](https://user-images.githubusercontent.com/1153921/95909704-cb593580-0d53-11eb-88fa-a3416ab09849.png) +![The Nodes view](https://user-images.githubusercontent.com/1153921/108733066-5fe65800-74eb-11eb-98e0-abaccd36deaf.png) ## What's next? @@ -100,10 +96,16 @@ To troubleshoot complex performance issues using Netdata, you need to understand visualizations. Learn more about [interaction](/docs/visualize/interact-dashboards-charts.md) to see historical metrics, highlight timeframes for targeted analysis, and more. +If you're a Kubernetes user, read about Netdata's [Kubernetes +visualizations](https://learn.netdata.cloud/docs/cloud/visualize/kubernetes) for details about the health map and +time-series k8s charts, and our tutorial, [_Kubernetes monitoring with Netdata: Overview and +visualizations_](/docs/guides/monitor/kubernetes-k8s-netdata.md), for a full walkthrough. + ### Related reference documentation -- [Netdata Cloud ยท War Rooms](https://learn.netdata.cloud/docs/cloud/war-rooms) -- [Netdata Cloud ยท Overview](https://learn.netdata.cloud/docs/cloud/visualize/overview) -- [Netdata Cloud ยท Nodes view](https://learn.netdata.cloud/docs/cloud/visualize/nodes) +- [Netdata Cloud ยท War Rooms](https://learn.netdata.cloud/docs/cloud/war-rooms) +- [Netdata Cloud ยท Overview](https://learn.netdata.cloud/docs/cloud/visualize/overview) +- [Netdata Cloud ยท Nodes view](https://learn.netdata.cloud/docs/cloud/visualize/nodes) +- [Netdata Cloud ยท Kubernetes visualizations](https://learn.netdata.cloud/docs/cloud/visualize/kubernetes) [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fdocs%2Fvisualize%2Foverview-infrastructure&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)](<>) diff --git a/exporting/check_filters.c b/exporting/check_filters.c index cfe0b4ce4..8d70c6f68 100644 --- a/exporting/check_filters.c +++ b/exporting/check_filters.c @@ -64,7 +64,7 @@ int rrdset_is_exportable(struct instance *instance, RRDSET *st) } } - if(unlikely(!rrdset_is_available_for_backends(st))) { + if(unlikely(!rrdset_is_available_for_exporting_and_alarms(st))) { debug(D_BACKEND, "BACKEND: not sending chart '%s' of host '%s', because it is not available for backends.", st->id, host->hostname); return 0; } diff --git a/exporting/prometheus/prometheus.c b/exporting/prometheus/prometheus.c index 371f5a520..c10d94b90 100644 --- a/exporting/prometheus/prometheus.c +++ b/exporting/prometheus/prometheus.c @@ -37,7 +37,7 @@ inline int can_send_rrdset(struct instance *instance, RRDSET *st) } } - if (unlikely(!rrdset_is_available_for_backends(st))) { + if (unlikely(!rrdset_is_available_for_exporting_and_alarms(st))) { debug( D_BACKEND, "EXPORTING: not sending chart '%s' of host '%s', because it is not available for exporting.", @@ -660,7 +660,8 @@ static void rrd_stats_api_v1_charts_allmetrics_prometheus( rd->algorithm == RRD_ALGORITHM_PCENT_OVER_DIFF_TOTAL) { p.type = "counter"; p.relation = "delta gives"; - p.suffix = "_total"; + if (!prometheus_collector) + p.suffix = "_total"; } if (homogeneous) { diff --git a/health/Makefile.am b/health/Makefile.am index 399d6df5a..0802dc750 100644 --- a/health/Makefile.am +++ b/health/Makefile.am @@ -29,7 +29,6 @@ dist_healthconfig_DATA = \ health.d/anomalies.conf \ health.d/apache.conf \ health.d/apcupsd.conf \ - health.d/apps_plugin.conf \ health.d/backend.conf \ health.d/bcache.conf \ health.d/beanstalkd.conf \ diff --git a/health/health.c b/health/health.c index b81361e8a..0793100a6 100644 --- a/health/health.c +++ b/health/health.c @@ -966,12 +966,14 @@ void *health_main(void *ptr) { } else if(unlikely(rc->status == RRDCALC_STATUS_CLEAR)) { if(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUN_ONCE)) { if(rc->old_status == RRDCALC_STATUS_CRITICAL) { - repeat_every = rc->crit_repeat_every; + repeat_every = 1; } else if (rc->old_status == RRDCALC_STATUS_WARNING) { - repeat_every = rc->warn_repeat_every; + repeat_every = 1; } } } + } else { + continue; } if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) { diff --git a/health/health.d/adaptec_raid.conf b/health/health.d/adaptec_raid.conf index a1301ce8a..0753c6e5d 100644 --- a/health/health.d/adaptec_raid.conf +++ b/health/health.d/adaptec_raid.conf @@ -1,24 +1,24 @@ # logical device status check -template: adapter_raid_ld_status - on: adapter_raid.ld_status - lookup: max -5s +template: adaptec_raid_ld_status + on: adaptec_raid.ld_status + lookup: max -10s foreach * units: bool every: 10s crit: $this > 0 delay: down 5m multiplier 1.5 max 1h - info: at least 1 logical device is failed or degraded + info: logical device status is failed or degraded to: sysadmin # physical device state check -template: adapter_raid_pd_state - on: adapter_raid.pd_state - lookup: max -5s +template: adaptec_raid_pd_state + on: adaptec_raid.pd_state + lookup: max -10s foreach * units: bool every: 10s crit: $this > 0 delay: down 5m multiplier 1.5 max 1h - info: at least 1 physical device is not in online state + info: physical device state is not online to: sysadmin diff --git a/health/health.d/anomalies.conf b/health/health.d/anomalies.conf index a2d248efe..c4c96eaf9 100644 --- a/health/health.d/anomalies.conf +++ b/health/health.d/anomalies.conf @@ -1,17 +1,17 @@ # raise a warning alarm if an anomaly probability is consistently above 50% -template: anomaly_probabilities +template: anomalies_anomaly_probabilities on: anomalies.probability lookup: average -2m foreach * every: 1m warn: $this > 50 - info: average anomaly probability > 50% for last 2 minutes + info: average anomaly probability over the last 2 minutes # raise a warning alarm if an anomaly flag is consistently firing -template: anomaly_flags +template: anomalies_anomaly_flags on: anomalies.anomaly lookup: sum -2m foreach * every: 1m warn: $this > 10 - info: count of anomalies > 10 for last 2 minutes + info: number of anomalies in the last 2 minutes diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf index 4f86037ba..12384fac6 100644 --- a/health/health.d/apcupsd.conf +++ b/health/health.d/apcupsd.conf @@ -1,6 +1,6 @@ # you can disable an alarm notification by setting the 'to' line to: silent -template: 10min_ups_load +template: apcupsd_10min_ups_load on: apcupsd.load os: * hosts: * @@ -10,12 +10,12 @@ template: 10min_ups_load warn: $this > (($status >= $WARNING) ? (70) : (80)) crit: $this > (($status == $CRITICAL) ? (85) : (95)) delay: down 10m multiplier 1.5 max 1h - info: average UPS load for the last 10 minutes + info: average UPS load over the last 10 minutes to: sitemgr # Discussion in https://github.com/netdata/netdata/pull/3928: # Fire the alarm as soon as it's going on battery (99% charge) and clear only when full. -template: ups_charge +template: apcupsd_ups_charge on: apcupsd.charge os: * hosts: * @@ -25,7 +25,7 @@ template: ups_charge warn: $this < 100 crit: $this < (($status == $CRITICAL) ? (60) : (50)) delay: down 10m multiplier 1.5 max 1h - info: current UPS charge, averaged over the last 60 seconds to reduce measurement errors + info: average UPS charge over the last minute to: sitemgr template: apcupsd_last_collected_secs diff --git a/health/health.d/apps_plugin.conf b/health/health.d/apps_plugin.conf deleted file mode 100644 index 9a27bc6ba..000000000 --- a/health/health.d/apps_plugin.conf +++ /dev/null @@ -1,15 +0,0 @@ -# you can disable an alarm notification by setting the 'to' line to: silent - -# disabled due to https://github.com/netdata/netdata/issues/10327 -# -# alarm: used_file_descriptors -# on: apps.files -# hosts: * -# calc: $fdperc -# units: % -# every: 5s -# warn: $this > (($status >= $WARNING) ? (75) : (80)) -# crit: $this > (($status == $CRITICAL) ? (85) : (90)) -# delay: down 5m multiplier 1.5 max 1h -# info: Peak percentage of file descriptors used -# to: sysadmin diff --git a/health/health.d/backend.conf b/health/health.d/backend.conf index e51b8aa5f..8089dc94e 100644 --- a/health/health.d/backend.conf +++ b/health/health.d/backend.conf @@ -6,7 +6,7 @@ every: 1m warn: $this > 0 delay: down 5m multiplier 1.5 max 1h - info: The backends subsystem is deprecated and will be removed soon. Migrate your configuration to exporting.conf. + info: the backends subsystem is deprecated and will be removed soon. Migrate your configuration to exporting.conf. to: sysadmin # make sure we are sending data to backend @@ -31,26 +31,3 @@ delay: down 5m multiplier 1.5 max 1h info: percentage of metrics sent to the backend server to: dba - - alarm: backend_metrics_lost - on: netdata.backend_metrics - units: metrics - calc: abs($lost) - every: 10s - crit: ($this != 0) || ($status == $CRITICAL && abs($sent) == 0) - delay: down 5m multiplier 1.5 max 1h - info: number of metrics lost due to repeating failures to contact the backend server - to: dba - - -# this chart has been removed from netdata -# alarm: backend_slow -# on: netdata.backend_latency -# units: % -# calc: $latency * 100 / ($update_every * 1000) -# every: 10s -# warn: $this > 50 -# crit: $this > 100 -# delay: down 5m multiplier 1.5 max 1h -# info: the percentage of time between iterations needed by the backend time to process the data sent by netdata -# to: dba diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf index f0da9ac5e..d5fccf4f7 100644 --- a/health/health.d/bcache.conf +++ b/health/health.d/bcache.conf @@ -1,13 +1,14 @@ template: bcache_cache_errors on: disk.bcache_cache_read_races - lookup: sum -10m unaligned absolute + lookup: sum -1m unaligned absolute units: errors every: 1m warn: $this > 0 - crit: $this > ( ($status >= $CRITICAL) ? (0) : (10) ) - delay: down 1h multiplier 1.5 max 2h - info: the number of times bcache had issues using the cache, during the last 10 mins (this usually means your SSD cache is failing) + delay: up 2m down 1h multiplier 1.5 max 2h + info: number of times data was read from the cache, \ + the bucket was reused and invalidated in the last 10 minutes \ + (when this occurs the data is reread from the backing device) to: sysadmin template: bcache_cache_dirty @@ -16,7 +17,8 @@ template: bcache_cache_dirty units: % every: 1m warn: $this > ( ($status >= $WARNING ) ? ( 70 ) : ( 90 ) ) - crit: $this > ( ($status >= $CRITICAL) ? ( 90 ) : ( 95 ) ) + crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) ) delay: up 1m down 1h multiplier 1.5 max 2h - info: the percentage of cache space used for dirty and metadata (this usually means your SSD cache is too small) + info: percentage of cache space used for dirty data and metadata \ + (this usually means your SSD cache is too small) to: sysadmin diff --git a/health/health.d/beanstalkd.conf b/health/health.d/beanstalkd.conf index 30dc27328..0c428ecbc 100644 --- a/health/health.d/beanstalkd.conf +++ b/health/health.d/beanstalkd.conf @@ -1,6 +1,6 @@ # get the number of buried jobs in all queues -template: server_buried_jobs +template: beanstalk_server_buried_jobs on: beanstalk.current_jobs calc: $buried units: jobs @@ -8,12 +8,14 @@ template: server_buried_jobs warn: $this > 0 crit: $this > 10 delay: up 0 down 5m multiplier 1.2 max 1h - info: the number of buried jobs aggregated across all tubes + info: number of buried jobs across all tubes. \ + You need to manually kick them so they can be processed. \ + Presence of buried jobs in a tube does not affect new jobs. to: sysadmin # get the number of buried jobs per queue -#template: tube_buried_jobs +#template: beanstalk_tube_buried_jobs # on: beanstalk.jobs # calc: $buried # units: jobs @@ -26,7 +28,7 @@ template: server_buried_jobs # get the current number of tubes -#template: number_of_tubes +#template: beanstalk_number_of_tubes # on: beanstalk.current_tubes # calc: $tubes # every: 10s diff --git a/health/health.d/bind_rndc.conf b/health/health.d/bind_rndc.conf index 4145e77cd..5cc7a72f7 100644 --- a/health/health.d/bind_rndc.conf +++ b/health/health.d/bind_rndc.conf @@ -1,9 +1,9 @@ - template: bind_rndc_stats_file_size +template: bind_rndc_stats_file_size on: bind_rndc.stats_size units: megabytes every: 60 calc: $stats_size warn: $this > 512 crit: $this > 1024 - info: Bind stats file is very large! Consider to create logrotate conf file for it! + info: BIND statistics-file size to: sysadmin diff --git a/health/health.d/boinc.conf b/health/health.d/boinc.conf index 43c588db6..25b7f1994 100644 --- a/health/health.d/boinc.conf +++ b/health/health.d/boinc.conf @@ -12,7 +12,7 @@ families: * warn: $this > 0 crit: $this > 1 delay: up 1m down 5m multiplier 1.5 max 1h - info: the total number of compute errors over the past 10 minutes + info: average number of compute errors over the last 10 minutes to: sysadmin # Warn on lots of upload errors @@ -27,7 +27,7 @@ families: * warn: $this > 0 crit: $this > 1 delay: up 1m down 5m multiplier 1.5 max 1h - info: the average number of failed uploads over the past 10 minutes + info: average number of failed uploads over the last 10 minutes to: sysadmin # Warn on the task queue being empty @@ -42,7 +42,7 @@ families: * warn: $this < 1 crit: $this < 0.1 delay: up 5m down 10m multiplier 1.5 max 1h - info: the total number of locally available tasks + info: average number of total tasks over the last 10 minutes to: sysadmin # Warn on no active tasks with a non-empty queue @@ -58,5 +58,5 @@ families: * warn: $this < 1 crit: $this < 0.1 delay: up 5m down 10m multiplier 1.5 max 1h - info: the total number of active tasks + info: average number of active tasks over the last 10 minutes to: sysadmin diff --git a/health/health.d/btrfs.conf b/health/health.d/btrfs.conf index b27aa544f..93ab8748a 100644 --- a/health/health.d/btrfs.conf +++ b/health/health.d/btrfs.conf @@ -10,7 +10,7 @@ families: * warn: $this > (($status >= $WARNING) ? (90) : (95)) crit: $this > (($status == $CRITICAL) ? (95) : (98)) delay: up 1m down 15m multiplier 1.5 max 1h - info: the percentage of allocated BTRFS physical disk space + info: percentage of allocated BTRFS physical disk space to: sysadmin template: btrfs_data @@ -24,7 +24,7 @@ families: * warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98 crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98 delay: up 1m down 15m multiplier 1.5 max 1h - info: the percentage of used BTRFS data space + info: utilization of BTRFS data space to: sysadmin template: btrfs_metadata @@ -38,7 +38,7 @@ families: * warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98 crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98 delay: up 1m down 15m multiplier 1.5 max 1h - info: the percentage of used BTRFS metadata space + info: utilization of BTRFS metadata space to: sysadmin template: btrfs_system @@ -52,6 +52,5 @@ families: * warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98 crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98 delay: up 1m down 15m multiplier 1.5 max 1h - info: the percentage of used BTRFS system space + info: utilization of BTRFS system space to: sysadmin - diff --git a/health/health.d/ceph.conf b/health/health.d/ceph.conf index de16f7b6f..cdbab0f67 100644 --- a/health/health.d/ceph.conf +++ b/health/health.d/ceph.conf @@ -1,13 +1,12 @@ # low ceph disk available -template: cluster_space_usage +template: ceph_cluster_space_usage on: ceph.general_usage - calc: $avail * 100 / ($avail + $used) + calc: $used * 100 / ($used + $avail) units: % - every: 10s - warn: $this < 10 - crit: $this < 1 + every: 1m + warn: $this > (($status >= $WARNING ) ? (85) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) delay: down 5m multiplier 1.2 max 1h - info: ceph disk usage is almost full + info: cluster disk space utilization to: sysadmin - diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf index 79ece53f9..c0a16f154 100644 --- a/health/health.d/cgroups.conf +++ b/health/health.d/cgroups.conf @@ -11,7 +11,7 @@ template: cgroup_10min_cpu_usage warn: $this > (($status >= $WARNING) ? (75) : (85)) crit: $this > (($status == $CRITICAL) ? (85) : (95)) delay: down 15m multiplier 1.5 max 1h - info: cpu utilization for the last 10 minutes + info: average cgroup CPU utilization over the last 10 minutes to: sysadmin template: cgroup_ram_in_use @@ -24,18 +24,5 @@ template: cgroup_ram_in_use warn: $this > (($status >= $WARNING) ? (80) : (90)) crit: $this > (($status == $CRITICAL) ? (90) : (98)) delay: down 15m multiplier 1.5 max 1h - info: RAM used by cgroup - to: sysadmin - -template: cgroup_ram_and_swap_in_use - on: cgroup.mem_usage - os: linux - hosts: * - calc: ($ram + $swap) * 100 / $memory_and_swap_limit - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (98)) - delay: down 15m multiplier 1.5 max 1h - info: RAM and Swap used by cgroup + info: cgroup memory utilization to: sysadmin diff --git a/health/health.d/cockroachdb.conf b/health/health.d/cockroachdb.conf index 8ab2c9d0f..47773d04c 100644 --- a/health/health.d/cockroachdb.conf +++ b/health/health.d/cockroachdb.conf @@ -22,7 +22,7 @@ template: cockroachdb_used_storage_capacity warn: $this > (($status >= $WARNING) ? (80) : (85)) crit: $this > (($status == $CRITICAL) ? (85) : (95)) delay: down 15m multiplier 1.5 max 1h - info: entire disk usage percentage + info: storage capacity utilization to: dba template: cockroachdb_used_usable_storage_capacity @@ -33,7 +33,7 @@ template: cockroachdb_used_usable_storage_capacity warn: $this > (($status >= $WARNING) ? (80) : (85)) crit: $this > (($status == $CRITICAL) ? (85) : (95)) delay: down 15m multiplier 1.5 max 1h - info: usable space usage percentage + info: storage usable space utilization to: dba # Replication @@ -67,7 +67,7 @@ template: cockroachdb_open_file_descriptors_limit every: 10s warn: $this > 80 delay: down 15m multiplier 1.5 max 1h - info: open file descriptors usage percentage + info: open file descriptors utilization (against softlimit) to: dba # SQL diff --git a/health/health.d/cpu.conf b/health/health.d/cpu.conf index fa8189856..32c69f8f5 100644 --- a/health/health.d/cpu.conf +++ b/health/health.d/cpu.conf @@ -11,7 +11,7 @@ template: 10min_cpu_usage warn: $this > (($status >= $WARNING) ? (75) : (85)) crit: $this > (($status == $CRITICAL) ? (85) : (95)) delay: down 15m multiplier 1.5 max 1h - info: average cpu utilization for the last 10 minutes (excluding iowait, nice and steal) + info: average CPU utilization over the last 10 minutes (excluding iowait, nice and steal) to: sysadmin template: 10min_cpu_iowait @@ -24,7 +24,7 @@ template: 10min_cpu_iowait warn: $this > (($status >= $WARNING) ? (20) : (40)) crit: $this > (($status == $CRITICAL) ? (40) : (50)) delay: down 15m multiplier 1.5 max 1h - info: average CPU wait I/O for the last 10 minutes + info: average CPU iowait time over the last 10 minutes to: sysadmin template: 20min_steal_cpu @@ -37,7 +37,7 @@ template: 20min_steal_cpu warn: $this > (($status >= $WARNING) ? (5) : (10)) crit: $this > (($status == $CRITICAL) ? (20) : (30)) delay: down 1h multiplier 1.5 max 2h - info: average CPU steal time for the last 20 minutes + info: average CPU steal time over the last 20 minutes to: sysadmin ## FreeBSD @@ -51,5 +51,5 @@ template: 10min_cpu_usage warn: $this > (($status >= $WARNING) ? (75) : (85)) crit: $this > (($status == $CRITICAL) ? (85) : (95)) delay: down 15m multiplier 1.5 max 1h - info: average cpu utilization for the last 10 minutes (excluding nice) + info: average CPU utilization over the last 10 minutes (excluding nice) to: sysadmin diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf index 274673e3e..3e51d37ec 100644 --- a/health/health.d/dbengine.conf +++ b/health/health.d/dbengine.conf @@ -10,7 +10,7 @@ lookup: sum -10m unaligned of fs_errors every: 10s crit: $this > 0 delay: down 15m multiplier 1.5 max 1h - info: number of File-System errors dbengine came across the last 10 minutes (too many open files, wrong permissions etc) + info: number of filesystem errors in the last 10 minutes (too many open files, wrong permissions, etc) to: sysadmin alarm: 10min_dbengine_global_io_errors @@ -22,7 +22,7 @@ lookup: sum -10m unaligned of io_errors every: 10s crit: $this > 0 delay: down 1h multiplier 1.5 max 3h - info: number of IO errors dbengine came across the last 10 minutes (CRC errors, out of space, bad disk etc) + info: number of IO errors in the last 10 minutes (CRC errors, out of space, bad disk, etc) to: sysadmin alarm: 10min_dbengine_global_flushing_warnings @@ -34,7 +34,8 @@ lookup: sum -10m unaligned of pg_cache_over_half_dirty_events every: 10s warn: $this > 0 delay: down 1h multiplier 1.5 max 3h - info: number of times in the last 10 minutes that dbengine dirty pages were over 50% of the instance's page cache, metric data at risk of not being stored in the database, please reduce disk load or use faster disks + info: number of times when dbengine dirty pages were over 50% of the instance's page cache in the last 10 minutes. \ + Metric data are at risk of not being stored in the database. To remedy, reduce disk load or use faster disks. to: sysadmin alarm: 10min_dbengine_global_flushing_errors @@ -46,5 +47,6 @@ lookup: sum -10m unaligned of flushing_pressure_deletions every: 10s crit: $this != 0 delay: down 1h multiplier 1.5 max 3h - info: number of pages deleted due to failure to flush data to disk in the last 10 minutes, metric data were lost to unblock data collection, please reduce disk load or use faster disks + info: number of pages deleted due to failure to flush data to disk in the last 10 minutes. \ + Metric data were lost to unblock data collection. To fix, reduce disk load or use faster disks. to: sysadmin diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf index 9c194ced2..d0cd60cfc 100644 --- a/health/health.d/disks.conf +++ b/health/health.d/disks.conf @@ -20,7 +20,7 @@ families: !/dev !/dev/* !/run !/run/* * warn: $this > (($status >= $WARNING ) ? (80) : (90)) crit: $this > (($status == $CRITICAL) ? (90) : (98)) delay: up 1m down 15m multiplier 1.5 max 1h - info: current disk space usage + info: disk space utilization to: sysadmin template: disk_inode_usage @@ -34,7 +34,7 @@ families: !/dev !/dev/* !/run !/run/* * warn: $this > (($status >= $WARNING) ? (80) : (90)) crit: $this > (($status == $CRITICAL) ? (90) : (98)) delay: up 1m down 15m multiplier 1.5 max 1h - info: current disk inode usage + info: disk inode utilization to: sysadmin @@ -49,35 +49,35 @@ families: !/dev !/dev/* !/run !/run/* * # we will use it in the next template to find # the hours remaining -template: disk_fill_rate - on: disk.space - os: linux freebsd - hosts: * -families: * - lookup: min -10m at -50m unaligned of avail - calc: ($this - $avail) / (($now - $after) / 3600) - every: 1m - units: GB/hour - info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour +# template: disk_fill_rate +# on: disk.space +# os: linux freebsd +# hosts: * +# families: * +# lookup: min -10m at -50m unaligned of avail +# calc: ($this - $avail) / (($now - $after) / 3600) +# every: 1m +# units: GB/hour +# info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour # calculate the hours remaining # if the disk continues to fill # in this rate -template: out_of_disk_space_time - on: disk.space - os: linux freebsd - hosts: * -families: * - calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf) - units: hours - every: 10s - warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) - crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2)) - delay: down 15m multiplier 1.2 max 1h - info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour - to: sysadmin +# template: out_of_disk_space_time +# on: disk.space +# os: linux freebsd +# hosts: * +# families: * +# calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf) +# units: hours +# every: 10s +# warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) +# crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2)) +# delay: down 15m multiplier 1.2 max 1h +# info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour +# to: sysadmin # ----------------------------------------------------------------------------- @@ -91,34 +91,34 @@ families: * # we will use it in the next template to find # the hours remaining -template: disk_inode_rate - on: disk.inodes - os: linux freebsd - hosts: * -families: * - lookup: min -10m at -50m unaligned of avail - calc: ($this - $avail) / (($now - $after) / 3600) - every: 1m - units: inodes/hour - info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour +# template: disk_inode_rate +# on: disk.inodes +# os: linux freebsd +# hosts: * +# families: * +# lookup: min -10m at -50m unaligned of avail +# calc: ($this - $avail) / (($now - $after) / 3600) +# every: 1m +# units: inodes/hour +# info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour # calculate the hours remaining # if the disk inodes are allocated # in this rate -template: out_of_disk_inodes_time - on: disk.inodes - os: linux freebsd - hosts: * -families: * - calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf) - units: hours - every: 10s - warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) - crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2)) - delay: down 15m multiplier 1.2 max 1h - info: estimated time the disk will run out of inodes, if the system continues to allocate inodes with the rate of the last hour - to: sysadmin +# template: out_of_disk_inodes_time +# on: disk.inodes +# os: linux freebsd +# hosts: * +# families: * +# calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf) +# units: hours +# every: 10s +# warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) +# crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2)) +# delay: down 15m multiplier 1.2 max 1h +# info: estimated time the disk will run out of inodes, if the system continues to allocate inodes with the rate of the last hour +# to: sysadmin # ----------------------------------------------------------------------------- @@ -141,8 +141,8 @@ families: * warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1)) crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1)) delay: down 15m multiplier 1.2 max 1h - info: the percentage of time the disk was busy, during the last 10 minutes - to: sysadmin + info: average percentage of time the disk was busy over the last 10 minutes + to: silent # raise an alarm if the disk backlog @@ -163,5 +163,5 @@ families: * warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1)) crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1)) delay: down 15m multiplier 1.2 max 1h - info: average of the kernel estimated disk backlog, for the last 10 minutes - to: sysadmin + info: average disk backlog size over the last 10 minutes + to: silent diff --git a/health/health.d/dns_query.conf b/health/health.d/dns_query.conf index 113c950e6..64770b986 100644 --- a/health/health.d/dns_query.conf +++ b/health/health.d/dns_query.conf @@ -8,5 +8,5 @@ template: dns_query_time_query_time every: 10s warn: $this == nan delay: up 20s down 5m multiplier 1.5 max 1h - info: query round trip time + info: average DNS query round trip time over the last 10 seconds to: sysadmin diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf index ecf3b84a8..dff1f07d4 100644 --- a/health/health.d/dnsmasq_dhcp.conf +++ b/health/health.d/dnsmasq_dhcp.conf @@ -6,7 +6,7 @@ template: dnsmasq_dhcp_dhcp_range_utilization units: % calc: $used warn: $this > ( ($status >= $WARNING ) ? ( 80 ) : ( 90 ) ) - crit: $this > ( ($status >= $CRITICAL) ? ( 90 ) : ( 95 ) ) + crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) ) delay: down 5m - info: dhcp-range utilization above threshold! + info: DHCP range utilization to: sysadmin diff --git a/health/health.d/dockerd.conf b/health/health.d/dockerd.conf index 729906cdb..122d82b8a 100644 --- a/health/health.d/dockerd.conf +++ b/health/health.d/dockerd.conf @@ -4,5 +4,5 @@ template: docker_unhealthy_containers every: 10s lookup: average -10s crit: $this > 0 - info: number of unhealthy containers + info: average number of unhealthy docker containers over the last 10 seconds to: sysadmin diff --git a/health/health.d/entropy.conf b/health/health.d/entropy.conf index 66d44ec13..0be9d45ba 100644 --- a/health/health.d/entropy.conf +++ b/health/health.d/entropy.conf @@ -7,10 +7,10 @@ on: system.entropy os: linux hosts: * - lookup: min -10m unaligned + lookup: min -5m unaligned units: entries every: 5m warn: $this < (($status >= $WARNING) ? (200) : (100)) delay: down 1h multiplier 1.5 max 2h - info: minimum entries in the random numbers pool in the last 10 minutes + info: minimum number of entries in the random numbers pool in the last 5 minutes to: silent diff --git a/health/health.d/exporting.conf b/health/health.d/exporting.conf index 506cb0cf7..735fb5ae7 100644 --- a/health/health.d/exporting.conf +++ b/health/health.d/exporting.conf @@ -21,14 +21,3 @@ families: * delay: down 5m multiplier 1.5 max 1h info: percentage of metrics sent to the external database server to: dba - -template: exporting_metrics_lost -families: * - on: exporting_data_size - units: metrics - calc: abs($lost) - every: 10s - crit: ($this != 0) || ($status == $CRITICAL && abs($sent) == 0) - delay: down 5m multiplier 1.5 max 1h - info: number of metrics lost due to repeating failures to contact the external database server - to: dba diff --git a/health/health.d/fping.conf b/health/health.d/fping.conf index 43658fef6..92c1525bd 100644 --- a/health/health.d/fping.conf +++ b/health/health.d/fping.conf @@ -11,18 +11,18 @@ families: * info: number of seconds since the last successful data collection to: sysadmin -template: host_reachable +template: fping_host_reachable families: * on: fping.latency calc: $average != nan units: up/down every: 10s crit: $this == 0 - info: states if the remote host is reachable delay: down 30m multiplier 1.5 max 2h + info: reachability status of the network host (0: unreachable, 1: reachable) to: sysadmin -template: host_latency +template: fping_host_latency families: * on: fping.latency lookup: average -10s unaligned of average @@ -32,11 +32,11 @@ families: * red: 1000 warn: $this > $green OR $max > $red crit: $this > $red - info: average round trip delay during the last 10 seconds delay: down 30m multiplier 1.5 max 2h + info: average latency to the network host over the last 10 seconds to: sysadmin -template: packet_loss +template: fping_packet_loss families: * on: fping.quality lookup: average -10m unaligned of returned @@ -47,7 +47,6 @@ families: * every: 10s warn: $this > $green crit: $this > $red - info: packet loss percentage delay: down 30m multiplier 1.5 max 2h + info: packet loss ratio to the network host over the last 10 minutes to: sysadmin - diff --git a/health/health.d/gearman.conf b/health/health.d/gearman.conf index e3863ae5e..d148f7b7c 100644 --- a/health/health.d/gearman.conf +++ b/health/health.d/gearman.conf @@ -18,5 +18,5 @@ template: gearman_workers_queued warn: $this > 30000 crit: $this > 100000 delay: down 5m multiplier 1.5 max 1h - info: number of queued jobs - to: sysadmin \ No newline at end of file + info: average number of queued jobs over the last 10 minutes + to: sysadmin diff --git a/health/health.d/haproxy.conf b/health/health.d/haproxy.conf index e49c70d48..9cd070668 100644 --- a/health/health.d/haproxy.conf +++ b/health/health.d/haproxy.conf @@ -4,7 +4,7 @@ template: haproxy_backend_server_status every: 10s lookup: average -10s crit: $this > 0 - info: number of failed haproxy backend servers + info: average number of failed haproxy backend servers over the last 10 seconds to: sysadmin template: haproxy_backend_status @@ -13,7 +13,7 @@ template: haproxy_backend_status every: 10s lookup: average -10s crit: $this > 0 - info: number of failed haproxy backends + info: average number of failed haproxy backends over the last 10 seconds to: sysadmin template: haproxy_last_collected diff --git a/health/health.d/hdfs.conf b/health/health.d/hdfs.conf index 678faab4c..7345df4d2 100644 --- a/health/health.d/hdfs.conf +++ b/health/health.d/hdfs.conf @@ -23,7 +23,7 @@ template: hdfs_capacity_usage warn: $this > (($status >= $WARNING) ? (70) : (80)) crit: $this > (($status == $CRITICAL) ? (80) : (98)) delay: down 15m multiplier 1.5 max 1h - info: used capacity + info: summary datanodes space capacity utilization to: sysadmin @@ -36,7 +36,7 @@ template: hdfs_missing_blocks every: 10s warn: $this > 0 delay: down 15m multiplier 1.5 max 1h - info: missing blocks + info: number of missing blocks to: sysadmin @@ -47,7 +47,7 @@ template: hdfs_stale_nodes every: 10s warn: $this > 0 delay: down 15m multiplier 1.5 max 1h - info: stale data nodes + info: number of datanodes marked stale due to delayed heartbeat to: sysadmin @@ -58,7 +58,7 @@ template: hdfs_dead_nodes every: 10s crit: $this > 0 delay: down 15m multiplier 1.5 max 1h - info: dead data nodes + info: number of datanodes which are currently dead to: sysadmin @@ -71,5 +71,5 @@ template: hdfs_num_failed_volumes every: 10s warn: $this > 0 delay: down 15m multiplier 1.5 max 1h - info: failed volumes + info: number of failed volumes to: sysadmin diff --git a/health/health.d/httpcheck.conf b/health/health.d/httpcheck.conf index 0ddf35eab..0158f63eb 100644 --- a/health/health.d/httpcheck.conf +++ b/health/health.d/httpcheck.conf @@ -11,17 +11,17 @@ families: * to: sysadmin # This is a fast-reacting no-notification alarm ideal for custom dashboards or badges -template: web_service_up +template: httpcheck_web_service_up families: * on: httpcheck.status lookup: average -1m unaligned percentage of success calc: ($this < 75) ? (0) : ($this) every: 5s units: up/down - info: at least 75% verified responses during last 60 seconds, ideal for badges + info: average ratio of successful HTTP requests over the last minute (at least 75%) to: silent -template: web_service_bad_content +template: httpcheck_web_service_bad_content families: * on: httpcheck.status lookup: average -5m unaligned percentage of bad_content @@ -30,11 +30,11 @@ families: * warn: $this >= 10 AND $this < 40 crit: $this >= 40 delay: down 5m multiplier 1.5 max 1h - info: average of unexpected http response content during the last 5 minutes + info: average ratio of HTTP responses with unexpected content over the last 5 minutes options: no-clear-notification to: webmaster -template: web_service_bad_status +template: httpcheck_web_service_bad_status families: * on: httpcheck.status lookup: average -5m unaligned percentage of bad_status @@ -43,57 +43,57 @@ families: * warn: $this >= 10 AND $this < 40 crit: $this >= 40 delay: down 5m multiplier 1.5 max 1h - info: average of unexpected http status during the last 5 minutes + info: average ratio of HTTP responses with unexpected status over the last 5 minutes options: no-clear-notification to: webmaster -template: web_service_timeouts +template: httpcheck_web_service_timeouts families: * on: httpcheck.status lookup: average -5m unaligned percentage of timeout every: 10s units: % - info: average of timeouts during the last 5 minutes + info: average ratio of HTTP request timeouts over the last 5 minutes -template: no_web_service_connections +template: httpcheck_no_web_service_connections families: * on: httpcheck.status lookup: average -5m unaligned percentage of no_connection every: 10s units: % - info: average of failed requests during the last 5 minutes + info: average ratio of failed requests during the last 5 minutes # combined timeout & no connection alarm -template: web_service_unreachable +template: httpcheck_web_service_unreachable families: * on: httpcheck.status - calc: ($no_web_service_connections >= $web_service_timeouts) ? ($no_web_service_connections) : ($web_service_timeouts) + calc: ($httpcheck_no_web_service_connections >= $httpcheck_web_service_timeouts) ? ($httpcheck_no_web_service_connections) : ($httpcheck_web_service_timeouts) units: % every: 10s - warn: ($no_web_service_connections >= 10 OR $web_service_timeouts >= 10) AND ($no_web_service_connections < 40 OR $web_service_timeouts < 40) - crit: $no_web_service_connections >= 40 OR $web_service_timeouts >= 40 + warn: ($httpcheck_no_web_service_connections >= 10 OR $httpcheck_web_service_timeouts >= 10) AND ($httpcheck_no_web_service_connections < 40 OR $httpcheck_web_service_timeouts < 40) + crit: $httpcheck_no_web_service_connections >= 40 OR $httpcheck_web_service_timeouts >= 40 delay: down 5m multiplier 1.5 max 1h - info: average of failed requests either due to timeouts or no connection during the last 5 minutes + info: ratio of failed requests either due to timeouts or no connection over the last 5 minutes options: no-clear-notification to: webmaster -template: 1h_web_service_response_time +template: httpcheck_1h_web_service_response_time families: * on: httpcheck.responsetime lookup: average -1h unaligned of time every: 30s units: ms - info: average response time over the last hour + info: average HTTP response time over the last hour -template: web_service_slow +template: httpcheck_web_service_slow families: * on: httpcheck.responsetime lookup: average -3m unaligned of time units: ms every: 10s - warn: ($this > ($1h_web_service_response_time * 2) ) - crit: ($this > ($1h_web_service_response_time * 3) ) - info: average response time over the last 3 minutes, compared to the average over the last hour + warn: ($this > ($httpcheck_1h_web_service_response_time * 2) ) + crit: ($this > ($httpcheck_1h_web_service_response_time * 3) ) delay: down 5m multiplier 1.5 max 1h + info: average HTTP response time over the last 3 minutes, compared to the average over the last hour options: no-clear-notification to: webmaster diff --git a/health/health.d/ioping.conf b/health/health.d/ioping.conf index 59a5c8edc..fa0196ef8 100644 --- a/health/health.d/ioping.conf +++ b/health/health.d/ioping.conf @@ -1,4 +1,4 @@ -template: disk_latency +template: ioping_disk_latency families: * on: ioping.latency lookup: average -10s unaligned of average @@ -8,6 +8,6 @@ families: * red: 1000 warn: $this > $green OR $max > $red crit: $this > $red - info: average round trip delay during the last 10 seconds delay: down 30m multiplier 1.5 max 2h + info: average I/O latency over the last 10 seconds to: sysadmin diff --git a/health/health.d/ipc.conf b/health/health.d/ipc.conf index 989d6e912..f4a0f56da 100644 --- a/health/health.d/ipc.conf +++ b/health/health.d/ipc.conf @@ -11,7 +11,7 @@ warn: $this > (($status >= $WARNING) ? (70) : (80)) crit: $this > (($status == $CRITICAL) ? (70) : (90)) delay: down 5m multiplier 1.5 max 1h - info: the percentage of IPC semaphores used + info: IPC semaphore utilization to: sysadmin alarm: semaphore_arrays_used @@ -24,5 +24,5 @@ warn: $this > (($status >= $WARNING) ? (70) : (80)) crit: $this > (($status == $CRITICAL) ? (70) : (90)) delay: down 5m multiplier 1.5 max 1h - info: the percentage of IPC semaphore arrays used + info: IPC semaphore arrays utilization to: sysadmin diff --git a/health/health.d/ipfs.conf b/health/health.d/ipfs.conf index 3f77572d6..fd53c2c46 100644 --- a/health/health.d/ipfs.conf +++ b/health/health.d/ipfs.conf @@ -7,5 +7,5 @@ template: ipfs_datastore_usage warn: $this > (($status >= $WARNING) ? (80) : (90)) crit: $this > (($status == $CRITICAL) ? (90) : (98)) delay: down 15m multiplier 1.5 max 1h - info: ipfs Datastore close to running out of space + info: IPFS datastore utilization to: sysadmin diff --git a/health/health.d/ipmi.conf b/health/health.d/ipmi.conf index c25581964..563d7a7ea 100644 --- a/health/health.d/ipmi.conf +++ b/health/health.d/ipmi.conf @@ -6,7 +6,7 @@ warn: $this > 0 crit: $critical > 0 delay: up 5m down 15m multiplier 1.5 max 1h - info: the number IPMI sensors in non-nominal state + info: number of IPMI sensors in non-nominal state to: sysadmin alarm: ipmi_events @@ -16,5 +16,5 @@ every: 10s warn: $this > 0 delay: up 5m down 15m multiplier 1.5 max 1h - info: the number of events in the IPMI System Event Log (SEL) + info: number of events in the IPMI System Event Log (SEL) to: sysadmin diff --git a/health/health.d/isc_dhcpd.conf b/health/health.d/isc_dhcpd.conf index 8054656ff..d1f93969a 100644 --- a/health/health.d/isc_dhcpd.conf +++ b/health/health.d/isc_dhcpd.conf @@ -1,10 +1,10 @@ - template: isc_dhcpd_leases_size - on: isc_dhcpd.leases_total - units: KB - every: 60 - calc: $leases_size - warn: $this > 3072 - crit: $this > 6144 - delay: up 2m down 5m - info: dhcpd.leases file too big! Module can slow down your server. - to: sysadmin +# template: isc_dhcpd_leases_size +# on: isc_dhcpd.leases_total +# units: KB +# every: 60 +# calc: $leases_size +# warn: $this > 3072 +# crit: $this > 6144 +# delay: up 2m down 5m +# info: dhcpd.leases file too big! Module can slow down your server. +# to: sysadmin diff --git a/health/health.d/kubelet.conf b/health/health.d/kubelet.conf index d2ef24b58..5eda59b2c 100644 --- a/health/health.d/kubelet.conf +++ b/health/health.d/kubelet.conf @@ -4,26 +4,26 @@ # True (1) if the node is experiencing a configuration-related error, false (0) otherwise. - template: node_config_error + template: kubelet_node_config_error on: k8s_kubelet.kubelet_node_config_error calc: $kubelet_node_config_error units: bool every: 10s warn: $this == 1 delay: down 1m multiplier 1.5 max 2h - info: the node is experiencing a configuration-related error + info: the node is experiencing a configuration-related error (0: false, 1: true) to: sysadmin # Failed Token() requests to the alternate token source - template: token_requests + template: kubelet_token_requests lookup: sum -10s of token_fail_count on: k8s_kubelet.kubelet_token_requests units: failed requests every: 10s warn: $this > 0 delay: down 1m multiplier 1.5 max 2h - info: failed token requests to alternate token source + info: number of failed Token() requests to the alternate token source to: sysadmin # Docker and runtime operation errors @@ -35,7 +35,7 @@ every: 10s warn: $this > (($status >= $WARNING) ? (0) : (20)) delay: up 30s down 1m multiplier 1.5 max 2h - info: operations error + info: number of Docker or runtime operation errors to: sysadmin # ----------------------------------------------------------------------------- @@ -53,63 +53,66 @@ # quantile 0.5 -template: 1m_kubelet_pleg_relist_latency_quantile_05 +template: kubelet_1m_pleg_relist_latency_quantile_05 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds lookup: average -1m unaligned of kubelet_pleg_relist_latency_05 units: microseconds every: 10s - info: the average value of pleg relisting latency during the last minute (quantile 0.5) + info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.5) -template: 10s_kubelet_pleg_relist_latency_quantile_05 +template: kubelet_10s_pleg_relist_latency_quantile_05 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds lookup: average -10s unaligned of kubelet_pleg_relist_latency_05 - calc: $this * 100 / (($1m_kubelet_pleg_relist_latency_quantile_05 < 1000)?(1000):($1m_kubelet_pleg_relist_latency_quantile_05)) + calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_05 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_05)) every: 10s units: % warn: $this > (($status >= $WARNING)?(100):(200)) crit: $this > (($status >= $WARNING)?(200):(400)) delay: down 1m multiplier 1.5 max 2h - info: the % of the pleg relisting latency in the last 10 seconds, compared to the last minute (quantile 0.5) + info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \ + compared to the last minute (quantile 0.5) to: sysadmin # quantile 0.9 -template: 1m_kubelet_pleg_relist_latency_quantile_09 +template: kubelet_1m_pleg_relist_latency_quantile_09 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds lookup: average -1m unaligned of kubelet_pleg_relist_latency_09 units: microseconds every: 10s - info: the average value of pleg relisting latency during the last minute (quantile 0.9) + info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.9) -template: 10s_kubelet_pleg_relist_latency_quantile_09 +template: kubelet_10s_pleg_relist_latency_quantile_09 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds lookup: average -10s unaligned of kubelet_pleg_relist_latency_09 - calc: $this * 100 / (($1m_kubelet_pleg_relist_latency_quantile_09 < 1000)?(1000):($1m_kubelet_pleg_relist_latency_quantile_09)) + calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_09 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_09)) every: 10s units: % warn: $this > (($status >= $WARNING)?(200):(400)) crit: $this > (($status >= $WARNING)?(400):(800)) delay: down 1m multiplier 1.5 max 2h - info: the % of the pleg relisting latency in the last 10 seconds, compared to the last minute (quantile 0.9) + info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \ + compared to the last minute (quantile 0.9) to: sysadmin # quantile 0.99 -template: 1m_kubelet_pleg_relist_latency_quantile_099 +template: kubelet_1m_pleg_relist_latency_quantile_099 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds lookup: average -1m unaligned of kubelet_pleg_relist_latency_099 units: microseconds every: 10s - info: the average value of pleg relisting latency during the last minute (quantile 0.99) + info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.99) -template: 10s_kubelet_pleg_relist_latency_quantile_099 +template: kubelet_10s_pleg_relist_latency_quantile_099 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds lookup: average -10s unaligned of kubelet_pleg_relist_latency_099 - calc: $this * 100 / (($1m_kubelet_pleg_relist_latency_quantile_099 < 1000)?(1000):($1m_kubelet_pleg_relist_latency_quantile_099)) + calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_099 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_099)) every: 10s units: % warn: $this > (($status >= $WARNING)?(400):(800)) crit: $this > (($status >= $WARNING)?(800):(1200)) delay: down 1m multiplier 1.5 max 2h - info: the % of the pleg relisting latency in the last 10 seconds, compared to the last minute (quantile 0.99) + info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \ + compared to the last minute (quantile 0.99) to: sysadmin diff --git a/health/health.d/linux_power_supply.conf b/health/health.d/linux_power_supply.conf index 38727be2f..a27ea0722 100644 --- a/health/health.d/linux_power_supply.conf +++ b/health/health.d/linux_power_supply.conf @@ -8,5 +8,5 @@ template: linux_power_supply_capacity warn: $this < 10 crit: $this < 5 delay: up 30s down 5m multiplier 1.2 max 1h - info: the percentage remaining capacity of the power supply + info: percentage of remaining power supply capacity to: sysadmin diff --git a/health/health.d/load.conf b/health/health.d/load.conf index ee0c54b8e..ffaea1723 100644 --- a/health/health.d/load.conf +++ b/health/health.d/load.conf @@ -4,18 +4,19 @@ # Calculate the base trigger point for the load average alarms. # This is the maximum number of CPU's in the system over the past 1 # minute, with a special case for a single CPU of setting the trigger at 2. - alarm: load_trigger + alarm: load_cpu_number on: system.load os: linux hosts: * calc: ($active_processors == nan or $active_processors == inf or $active_processors < 2) ? ( 2 ) : ( $active_processors ) units: cpus every: 1m - info: trigger point for load average alarms + info: number of active CPU cores in the system # Send alarms if the load average is unusually high. # These intentionally _do not_ calculate the average over the sampled # time period because the values being checked already are averages. + alarm: load_average_15 on: system.load os: linux @@ -23,10 +24,9 @@ lookup: max -1m unaligned of load15 units: load every: 1m - warn: $this > (($status >= $WARNING) ? (1.75 * $load_trigger) : (2 * $load_trigger)) - crit: $this > (($status == $CRITICAL) ? (3.5 * $load_trigger) : (4 * $load_trigger)) + warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 175 : 200) delay: down 15m multiplier 1.5 max 1h - info: fifteen-minute load average + info: system fifteen-minute load average to: sysadmin alarm: load_average_5 @@ -36,10 +36,9 @@ lookup: max -1m unaligned of load5 units: load every: 1m - warn: $this > (($status >= $WARNING) ? (3.5 * $load_trigger) : (4 * $load_trigger)) - crit: $this > (($status == $CRITICAL) ? (7 * $load_trigger) : (8 * $load_trigger)) + warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 350 : 400) delay: down 15m multiplier 1.5 max 1h - info: five-minute load average + info: system five-minute load average to: sysadmin alarm: load_average_1 @@ -49,8 +48,7 @@ lookup: max -1m unaligned of load1 units: load every: 1m - warn: $this > (($status >= $WARNING) ? (7 * $load_trigger) : (8 * $load_trigger)) - crit: $this > (($status == $CRITICAL) ? (14 * $load_trigger) : (16 * $load_trigger)) + warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 700 : 800) delay: down 15m multiplier 1.5 max 1h - info: one-minute load average + info: system one-minute load average to: sysadmin diff --git a/health/health.d/mdstat.conf b/health/health.d/mdstat.conf index 2f906e187..ca2d0d9fb 100644 --- a/health/health.d/mdstat.conf +++ b/health/health.d/mdstat.conf @@ -14,7 +14,8 @@ template: mdstat_disks every: 10s calc: $down crit: $this > 0 - info: Array is degraded! + info: number of devices in the down state. \ + Any number > 0 indicates that the array is degraded. to: sysadmin template: mdstat_mismatch_cnt @@ -24,7 +25,7 @@ template: mdstat_mismatch_cnt every: 60s warn: $this > 1024 delay: up 30m - info: Mismatch count! + info: number of unsynchronized blocks to: sysadmin template: mdstat_nonredundant_last_collected @@ -35,4 +36,4 @@ template: mdstat_nonredundant_last_collected warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) info: number of seconds since the last successful data collection - to: sysadmin \ No newline at end of file + to: sysadmin diff --git a/health/health.d/megacli.conf b/health/health.d/megacli.conf index 6e81a2a0e..f861765d2 100644 --- a/health/health.d/megacli.conf +++ b/health/health.d/megacli.conf @@ -1,48 +1,56 @@ -template: adapter_state + +## Adapters (controllers) + +template: megacli_adapter_state on: megacli.adapter_degraded - units: is degraded - lookup: sum -10s + lookup: max -10s foreach * + units: boolean every: 10s crit: $this > 0 - info: adapter state + delay: down 5m multiplier 2 max 10m + info: adapter is in the degraded state (0: false, 1: true) + to: sysadmin + +## Physical Disks + +template: megacli_pd_predictive_failures + on: megacli.pd_predictive_failure + lookup: sum -10s foreach * + units: predictive failures + every: 10s + warn: $this > 0 + delay: up 1m down 5m multiplier 2 max 10m + info: number of physical drive predictive failures + to: sysadmin + +template: megacli_pd_media_errors + on: megacli.pd_media_error + lookup: sum -10s foreach * + units: media errors + every: 10s + warn: $this > 0 + delay: up 1m down 5m multiplier 2 max 10m + info: number of physical drive media errors to: sysadmin -template: bbu_relative_charge +## Battery Backup Units (BBU) + +template: megacli_bbu_relative_charge on: megacli.bbu_relative_charge - units: percent lookup: average -10s + units: percent every: 10s warn: $this <= (($status >= $WARNING) ? (85) : (80)) crit: $this <= (($status == $CRITICAL) ? (50) : (40)) - info: BBU relative state of charge + info: average battery backup unit (BBU) relative state of charge over the last 10 seconds to: sysadmin -template: bbu_cycle_count +template: megacli_bbu_cycle_count on: megacli.bbu_cycle_count - units: cycle count lookup: average -10s + units: cycles every: 10s warn: $this >= 100 crit: $this >= 500 - info: BBU cycle count - to: sysadmin - -template: pd_media_errors - on: megacli.pd_media_error - units: media errors - lookup: sum -10s - every: 10s - warn: $this > 0 - delay: down 1m multiplier 2 max 10m - info: physical drive media errors - to: sysadmin - -template: pd_predictive_failures - on: megacli.pd_predictive_failure - units: predictive failures - lookup: sum -10s - every: 10s - warn: $this > 0 - delay: down 1m multiplier 2 max 10m - info: physical drive predictive failures + info: average battery backup unit (BBU) charge cycles count over the last 10 seconds to: sysadmin diff --git a/health/health.d/memcached.conf b/health/health.d/memcached.conf index d248ef57a..e610f181f 100644 --- a/health/health.d/memcached.conf +++ b/health/health.d/memcached.conf @@ -23,30 +23,31 @@ template: memcached_cache_memory_usage warn: $this > (($status >= $WARNING) ? (70) : (80)) crit: $this > (($status == $CRITICAL) ? (80) : (90)) delay: up 0 down 15m multiplier 1.5 max 1h - info: current cache memory usage + info: cache memory utilization to: dba # find the rate memcached cache is filling -template: cache_fill_rate +template: memcached_cache_fill_rate on: memcached.cache lookup: min -10m at -50m unaligned of available calc: ($this - $available) / (($now - $after) / 3600) units: KB/hour every: 1m - info: average rate the cache fills up (positive), or frees up (negative) space, for the last hour + info: average rate the cache fills up (positive), or frees up (negative) space over the last hour # find the hours remaining until memcached cache is full -template: out_of_cache_space_time +template: memcached_out_of_cache_space_time on: memcached.cache - calc: ($cache_fill_rate > 0) ? ($available / $cache_fill_rate) : (inf) + calc: ($memcached_cache_fill_rate > 0) ? ($available / $memcached_cache_fill_rate) : (inf) units: hours every: 10s warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2)) delay: down 15m multiplier 1.5 max 1h - info: estimated time the cache will run out of space, if the system continues to add data with the rate of the last hour + info: estimated time the cache will run out of space \ + if the system continues to add data at the same rate as the past hour to: dba diff --git a/health/health.d/memory.conf b/health/health.d/memory.conf index 4a0e6e522..e95c0aad8 100644 --- a/health/health.d/memory.conf +++ b/health/health.d/memory.conf @@ -10,7 +10,7 @@ every: 1m warn: $this > 0 delay: down 1h multiplier 1.5 max 1h - info: number of ECC correctable errors during the last hour + info: number of ECC correctable errors in the last 10 minutes to: sysadmin alarm: 1hour_ecc_memory_uncorrectable @@ -22,7 +22,7 @@ every: 1m crit: $this > 0 delay: down 1h multiplier 1.5 max 1h - info: number of ECC uncorrectable errors during the last hour + info: number of ECC uncorrectable errors in the last 10 minutes to: sysadmin alarm: 1hour_memory_hw_corrupted diff --git a/health/health.d/mysql.conf b/health/health.d/mysql.conf index 62cef5a2e..7451b3f4d 100644 --- a/health/health.d/mysql.conf +++ b/health/health.d/mysql.conf @@ -24,7 +24,7 @@ template: mysql_10s_slow_queries warn: $this > (($status >= $WARNING) ? (5) : (10)) crit: $this > (($status == $CRITICAL) ? (10) : (20)) delay: down 5m multiplier 1.5 max 1h - info: number of mysql slow queries over the last 10 seconds + info: number of slow queries in the last 10 seconds to: dba @@ -36,7 +36,7 @@ template: mysql_10s_table_locks_immediate lookup: sum -10s absolute of immediate units: immediate locks every: 10s - info: number of table immediate locks over the last 10 seconds + info: number of table immediate locks in the last 10 seconds to: dba template: mysql_10s_table_locks_waited @@ -44,7 +44,7 @@ template: mysql_10s_table_locks_waited lookup: sum -10s absolute of waited units: waited locks every: 10s - info: number of table waited locks over the last 10 seconds + info: number of table waited locks in the last 10 seconds to: dba template: mysql_10s_waited_locks_ratio @@ -55,7 +55,7 @@ template: mysql_10s_waited_locks_ratio warn: $this > (($status >= $WARNING) ? (10) : (25)) crit: $this > (($status == $CRITICAL) ? (25) : (50)) delay: down 30m multiplier 1.5 max 1h - info: the ratio of mysql waited table locks, for the last 10 seconds + info: ratio of waited table locks over the last 10 seconds to: dba @@ -70,7 +70,7 @@ template: mysql_connections warn: $this > (($status >= $WARNING) ? (60) : (70)) crit: $this > (($status == $CRITICAL) ? (80) : (90)) delay: down 15m multiplier 1.5 max 1h - info: the ratio of current active connections vs the maximum possible number of connections + info: client connections utilization to: dba @@ -84,7 +84,7 @@ template: mysql_replication every: 10s crit: $this == 0 delay: down 5m multiplier 1.5 max 1h - info: checks if mysql replication has stopped + info: replication status (0: stopped, 1: working) to: dba template: mysql_replication_lag @@ -95,7 +95,8 @@ template: mysql_replication_lag warn: $this > (($status >= $WARNING) ? (5) : (10)) crit: $this > (($status == $CRITICAL) ? (10) : (30)) delay: down 15m multiplier 1.5 max 1h - info: the number of seconds mysql replication is behind this master + info: difference between the timestamp of the latest transaction processed by the SQL thread and \ + the timestamp of the same transaction when it was processed on the master to: dba @@ -107,7 +108,7 @@ template: mysql_galera_cluster_size_max_2m lookup: max -2m absolute units: nodes every: 10s - info: max cluster size 2 minute + info: maximum galera cluster size in the last 2 minutes to: dba template: mysql_galera_cluster_size @@ -118,7 +119,7 @@ template: mysql_galera_cluster_size warn: $this > $mysql_galera_cluster_size_max_2m crit: $this < $mysql_galera_cluster_size_max_2m delay: up 20s down 5m multiplier 1.5 max 1h - info: cluster size has changed + info: current galera cluster size, compared to the maximum size in the last 2 minutes to: dba # galera node state @@ -130,7 +131,8 @@ template: mysql_galera_cluster_state warn: $this < 4 crit: $this < 2 delay: up 30s down 5m multiplier 1.5 max 1h - info: node state (0: undefined, 1: joining, 2: donor/desynced, 3: joined, 4: synced) + info: galera node state \ + (0: undefined, 1: joining, 2: donor/desynced, 3: joined, 4: synced) to: dba @@ -142,5 +144,7 @@ template: mysql_galera_cluster_status every: 10s crit: $mysql_galera_cluster_state != nan AND $this != 0 delay: up 30s down 5m multiplier 1.5 max 1h - info: node and cluster status (-1: unknown, 0: primary/quorum present, 1: non-primary/quorum lost, 2: disconnected) + info: galera node cluster component status \ + (-1: unknown, 0: primary/quorum present, 1: non-primary/quorum lost, 2: disconnected). \ + Any other value than primary indicates that the node is part of a nonoperational component. to: dba diff --git a/health/health.d/net.conf b/health/health.d/net.conf index 261290e51..33202421f 100644 --- a/health/health.d/net.conf +++ b/health/health.d/net.conf @@ -12,7 +12,7 @@ calc: ( $nic_speed_max > 0 ) ? ( $nic_speed_max) : ( nan ) units: Mbit every: 10s - info: The current speed of the physical network interface + info: network interface current speed template: 1m_received_traffic_overflow on: net.net @@ -20,13 +20,12 @@ hosts: * families: * lookup: average -1m unaligned absolute of received - calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan ) + calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed)) : ( nan ) units: % every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (85)) - crit: $this > (($status == $CRITICAL) ? (85) : (90)) - delay: down 1m multiplier 1.5 max 1h - info: interface received bandwidth usage over net device speed max + warn: $this > (($status >= $WARNING) ? (85) : (90)) + delay: up 1m down 1m multiplier 1.5 max 1h + info: average inbound utilization for the network interface over the last minute to: sysadmin template: 1m_sent_traffic_overflow @@ -35,13 +34,12 @@ hosts: * families: * lookup: average -1m unaligned absolute of sent - calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan ) + calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed)) : ( nan ) units: % every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (85)) - crit: $this > (($status == $CRITICAL) ? (85) : (90)) - delay: down 1m multiplier 1.5 max 1h - info: interface sent bandwidth usage over net device speed max + warn: $this > (($status >= $WARNING) ? (85) : (90)) + delay: up 1m down 1m multiplier 1.5 max 1h + info: average outbound utilization for the network interface over the last minute to: sysadmin # ----------------------------------------------------------------------------- @@ -58,56 +56,76 @@ template: inbound_packets_dropped on: net.drops os: linux hosts: * -families: * +families: !net* * lookup: sum -10m unaligned absolute of inbound units: packets every: 1m - warn: $this >= 5 - delay: down 1h multiplier 1.5 max 2h - info: interface inbound dropped packets in the last 10 minutes - to: sysadmin + info: number of inbound dropped packets for the network interface in the last 10 minutes template: outbound_packets_dropped on: net.drops os: linux hosts: * -families: * +families: !net* * lookup: sum -10m unaligned absolute of outbound units: packets every: 1m - warn: $this >= 5 - delay: down 1h multiplier 1.5 max 2h - info: interface outbound dropped packets in the last 10 minutes - to: sysadmin + info: number of outbound dropped packets for the network interface in the last 10 minutes template: inbound_packets_dropped_ratio on: net.packets os: linux hosts: * -families: * +families: !net* !wl* * lookup: sum -10m unaligned absolute of received - calc: (($inbound_packets_dropped != nan AND $this > 0) ? ($inbound_packets_dropped * 100 / $this) : (0)) + calc: (($inbound_packets_dropped != nan AND $this > 1000) ? ($inbound_packets_dropped * 100 / $this) : (0)) units: % every: 1m - warn: $this >= 0.1 - crit: $this >= 2 - delay: down 1h multiplier 1.5 max 2h - info: the ratio of inbound dropped packets vs the total number of received packets of the network interface, during the last 10 minutes + warn: $this >= 2 + delay: up 1m down 1h multiplier 1.5 max 2h + info: ratio of inbound dropped packets for the network interface over the last 10 minutes to: sysadmin template: outbound_packets_dropped_ratio on: net.packets os: linux hosts: * -families: * +families: !net* !wl* * lookup: sum -10m unaligned absolute of sent - calc: (($outbound_packets_dropped != nan AND $this > 0) ? ($outbound_packets_dropped * 100 / $this) : (0)) + calc: (($outbound_packets_dropped != nan AND $this > 1000) ? ($outbound_packets_dropped * 100 / $this) : (0)) units: % every: 1m - warn: $this >= 0.1 - crit: $this >= 2 - delay: down 1h multiplier 1.5 max 2h - info: the ratio of outbound dropped packets vs the total number of sent packets of the network interface, during the last 10 minutes + warn: $this >= 2 + delay: up 1m down 1h multiplier 1.5 max 2h + info: ratio of outbound dropped packets for the network interface over the last 10 minutes + to: sysadmin + +template: wifi_inbound_packets_dropped_ratio + on: net.packets + os: linux + hosts: * +families: wl* + lookup: sum -10m unaligned absolute of received + calc: (($inbound_packets_dropped != nan AND $this > 1000) ? ($inbound_packets_dropped * 100 / $this) : (0)) + units: % + every: 1m + warn: $this >= 10 + delay: up 1m down 1h multiplier 1.5 max 2h + info: ratio of inbound dropped packets for the network interface over the last 10 minutes + to: sysadmin + +template: wifi_outbound_packets_dropped_ratio + on: net.packets + os: linux + hosts: * +families: wl* + lookup: sum -10m unaligned absolute of sent + calc: (($outbound_packets_dropped != nan AND $this > 1000) ? ($outbound_packets_dropped * 100 / $this) : (0)) + units: % + every: 1m + warn: $this >= 10 + delay: up 1m down 1h multiplier 1.5 max 2h + info: ratio of outbound dropped packets for the network interface over the last 10 minutes to: sysadmin # ----------------------------------------------------------------------------- @@ -123,7 +141,7 @@ families: * every: 1m warn: $this >= 5 delay: down 1h multiplier 1.5 max 2h - info: interface inbound errors in the last 10 minutes + info: number of inbound errors for the network interface in the last 10 minutes to: sysadmin template: interface_outbound_errors @@ -136,7 +154,7 @@ families: * every: 1m warn: $this >= 5 delay: down 1h multiplier 1.5 max 2h - info: interface outbound errors in the last 10 minutes + info: number of outbound errors for the network interface in the last 10 minutes to: sysadmin # ----------------------------------------------------------------------------- @@ -157,7 +175,7 @@ families: * every: 1m warn: $this > 0 delay: down 1h multiplier 1.5 max 2h - info: interface fifo errors in the last 10 minutes + info: number of FIFO errors for the network interface in the last 10 minutes to: sysadmin # ----------------------------------------------------------------------------- @@ -177,7 +195,7 @@ families: * lookup: average -1m unaligned of received units: packets every: 10s - info: the average number of packets received during the last minute + info: average number of packets received by the network interface over the last minute template: 10s_received_packets_storm on: net.packets @@ -189,7 +207,8 @@ families: * every: 10s units: % warn: $this > (($status >= $WARNING)?(200):(5000)) - crit: $this > (($status >= $WARNING)?(5000):(6000)) + crit: $this > (($status == $CRITICAL)?(5000):(6000)) options: no-clear-notification - info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute (clear notification for this alarm will not be sent) + info: ratio of average number of received packets for the network interface over the last 10 seconds, \ + compared to the rate over the last minute to: sysadmin diff --git a/health/health.d/netfilter.conf b/health/health.d/netfilter.conf index 1d07752cc..f827d8e46 100644 --- a/health/health.d/netfilter.conf +++ b/health/health.d/netfilter.conf @@ -1,19 +1,6 @@ # you can disable an alarm notification by setting the 'to' line to: silent - alarm: netfilter_last_collected_secs - on: netfilter.conntrack_sockets - os: linux - hosts: * - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin - alarm: netfilter_conntrack_full on: netfilter.conntrack_sockets os: linux @@ -22,8 +9,8 @@ calc: $this * 100 / $netfilter_conntrack_max units: % every: 10s - warn: $this > (($status >= $WARNING) ? (70) : (80)) - crit: $this > (($status == $CRITICAL) ? (80) : (90)) + warn: $this > (($status >= $WARNING) ? (85) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (95)) delay: down 5m multiplier 1.5 max 1h - info: the number of connections tracked by the netfilter connection tracker, as a percentage of the connection tracker table size + info: netfilter connection tracker table size utilization to: sysadmin diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf index b255d35f9..f450b7122 100644 --- a/health/health.d/pihole.conf +++ b/health/health.d/pihole.conf @@ -20,9 +20,9 @@ template: pihole_blocked_queries units: % calc: $blocked warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) ) - crit: $this > ( ($status >= $CRITICAL) ? ( 55 ) : ( 75 ) ) + crit: $this > ( ($status == $CRITICAL) ? ( 55 ) : ( 75 ) ) delay: up 2m down 5m - info: percentage of blocked dns queries for the last 24 hour + info: percentage of blocked dns queries over the last 24 hour to: sysadmin @@ -36,7 +36,7 @@ template: pihole_blocklist_last_update calc: $ago warn: $this > 60 * 60 * 24 * 8 crit: $this > 60 * 60 * 24 * 8 * 2 - info: blocklist last update time + info: gravity.list (blocklist) file last update time to: sysadmin # Gravity file check (gravity.list). @@ -48,7 +48,7 @@ template: pihole_blocklist_gravity_file calc: $file_exists crit: $this != 1 delay: up 2m down 5m - info: gravity file existence + info: gravity.list (blocklist) file existence state (0: exists, 1: not-exists) to: sysadmin # Pi-hole's ability to block unwanted domains. @@ -61,5 +61,5 @@ template: pihole_status calc: $enabled warn: $this != 1 delay: up 2m down 5m - info: unwanted domains blocking status + info: unwanted domains blocking status (0: enabled, 1: disabled) to: sysadmin diff --git a/health/health.d/portcheck.conf b/health/health.d/portcheck.conf index 696333fd8..29dcebbc7 100644 --- a/health/health.d/portcheck.conf +++ b/health/health.d/portcheck.conf @@ -11,17 +11,17 @@ families: * to: sysadmin # This is a fast-reacting no-notification alarm ideal for custom dashboards or badges -template: service_reachable +template: portcheck_service_reachable families: * on: portcheck.status lookup: average -1m unaligned percentage of success calc: ($this < 75) ? (0) : ($this) every: 5s units: up/down - info: at least 75% successful connections during last 60 seconds, ideal for badges + info: average ratio of successful connections over the last minute (at least 75%) to: silent -template: connection_timeouts +template: portcheck_connection_timeouts families: * on: portcheck.status lookup: average -5m unaligned percentage of timeout @@ -30,10 +30,10 @@ families: * warn: $this >= 10 AND $this < 40 crit: $this >= 40 delay: down 5m multiplier 1.5 max 1h - info: average of timeouts during the last 5 minutes + info: average ratio of timeouts over the last 5 minutes to: sysadmin -template: connection_fails +template: portcheck_connection_fails families: * on: portcheck.status lookup: average -5m unaligned percentage of no_connection,failed @@ -42,5 +42,5 @@ families: * warn: $this >= 10 AND $this < 40 crit: $this >= 40 delay: down 5m multiplier 1.5 max 1h - info: average of failed connections during the last 5 minutes + info: average ratio of failed connections over the last 5 minutes to: sysadmin diff --git a/health/health.d/processes.conf b/health/health.d/processes.conf index 293f1aa0d..b464d8f64 100644 --- a/health/health.d/processes.conf +++ b/health/health.d/processes.conf @@ -6,8 +6,8 @@ calc: $active * 100 / $pidmax units: % every: 5s - warn: $this > (($status >= $WARNING) ? (75) : (80)) - crit: $this > (($status == $CRITICAL) ? (85) : (90)) + warn: $this > (($status >= $WARNING) ? (85) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (95)) delay: down 5m multiplier 1.5 max 1h - info: the percentage of active processes + info: system process IDs (PID) space utilization to: sysadmin diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf index 0a71dac84..2daecc489 100644 --- a/health/health.d/ram.conf +++ b/health/health.d/ram.conf @@ -7,7 +7,8 @@ hosts: * calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz - $zfs.arc_size.min) every: 10s - info: the amount of memory that is reported as used, but it is actually capable for resizing itself based on the system needs (eg. ZFS ARC) + info: amount of memory reported as used, \ + but it is actually capable for resizing itself based on the system needs (eg. ZFS ARC) alarm: ram_in_use on: system.ram @@ -20,7 +21,7 @@ warn: $this > (($status >= $WARNING) ? (80) : (90)) crit: $this > (($status == $CRITICAL) ? (90) : (98)) delay: down 15m multiplier 1.5 max 1h - info: system RAM used + info: system memory utilization to: sysadmin alarm: ram_available @@ -33,7 +34,7 @@ warn: $this < (($status >= $WARNING) ? (15) : (10)) crit: $this < (($status == $CRITICAL) ? (10) : ( 5)) delay: down 15m multiplier 1.5 max 1h - info: estimated amount of RAM available for userspace processes, without causing swapping + info: percentage of estimated amount of RAM available for userspace processes, without causing swapping to: sysadmin ## FreeBSD @@ -47,7 +48,7 @@ warn: $this > (($status >= $WARNING) ? (80) : (90)) crit: $this > (($status == $CRITICAL) ? (90) : (98)) delay: down 15m multiplier 1.5 max 1h - info: system RAM usage + info: system memory utilization to: sysadmin alarm: ram_available @@ -60,5 +61,5 @@ warn: $this < (($status >= $WARNING) ? (15) : (10)) crit: $this < (($status == $CRITICAL) ? (10) : ( 5)) delay: down 15m multiplier 1.5 max 1h - info: estimated amount of RAM available for userspace processes, without causing swapping + info: percentage of estimated amount of RAM available for userspace processes, without causing swapping to: sysadmin diff --git a/health/health.d/redis.conf b/health/health.d/redis.conf index c08a884a6..43f98a1d4 100644 --- a/health/health.d/redis.conf +++ b/health/health.d/redis.conf @@ -18,7 +18,7 @@ families: * every: 10s crit: $rdb_last_bgsave_status != 0 units: ok/failed - info: states if redis bgsave is working + info: status of the last RDB save operation (0: ok, 1: error) delay: down 5m multiplier 1.5 max 1h to: dba @@ -29,6 +29,6 @@ families: * warn: $rdb_bgsave_in_progress > 600 crit: $rdb_bgsave_in_progress > 1200 units: seconds - info: the time redis needs to save its database + info: duration of the on-going RDB save operation delay: down 5m multiplier 1.5 max 1h to: dba diff --git a/health/health.d/retroshare.conf b/health/health.d/retroshare.conf index 2344b60ec..51b1deb4c 100644 --- a/health/health.d/retroshare.conf +++ b/health/health.d/retroshare.conf @@ -21,5 +21,5 @@ template: retroshare_dht_working warn: $this < (($status >= $WARNING) ? (120) : (100)) crit: $this < (($status == $CRITICAL) ? (10) : (1)) delay: up 0 down 15m multiplier 1.5 max 1h - info: Checks if the DHT has enough peers to operate + info: number of DHT peers to: sysadmin diff --git a/health/health.d/riakkv.conf b/health/health.d/riakkv.conf index 745302778..d63460264 100644 --- a/health/health.d/riakkv.conf +++ b/health/health.d/riakkv.conf @@ -1,5 +1,5 @@ # Ensure that Riak is running. template: riak_last_collected_secs -template: riak_last_collected_secs +template: riakkv_last_collected_secs on: riak.kv.throughput calc: $now - $last_collected_t units: seconds ago @@ -11,7 +11,7 @@ template: riak_last_collected_secs to: dba # Warn if a list keys operation is running. -template: riak_list_keys_active +template: riakkv_list_keys_active on: riak.core.fsm_active calc: $list_fsm_active units: state machines @@ -23,44 +23,50 @@ template: riak_list_keys_active ## Timing healthchecks # KV GET -template: 1h_kv_get_mean_latency +template: riakkv_1h_kv_get_mean_latency on: riak.kv.latency.get calc: $node_get_fsm_time_mean lookup: average -1h unaligned of time every: 30s units: ms - info: mean average KV GET latency over the last hour + info: average time between reception of client GET request and \ + subsequent response to client over the last hour -template: riak_kv_get_slow +template: riakkv_kv_get_slow on: riak.kv.latency.get calc: $mean lookup: average -3m unaligned of time units: ms every: 10s - warn: ($this > ($1h_kv_get_mean_latency * 2) ) - crit: ($this > ($1h_kv_get_mean_latency * 3) ) - info: average KV GET time over the last 3 minutes, compared to the average over the last hour + warn: ($this > ($riakkv_1h_kv_get_mean_latency * 2) ) + crit: ($this > ($riakkv_1h_kv_get_mean_latency * 3) ) + info: average time between reception of client GET request and \ + subsequent response to the client over the last 3 minutes, \ + compared to the average over the last hour delay: down 5m multiplier 1.5 max 1h to: dba # KV PUT -template: 1h_kv_put_mean_latency +template: riakkv_1h_kv_put_mean_latency on: riak.kv.latency.put calc: $node_put_fsm_time_mean lookup: average -1h unaligned of time every: 30s units: ms - info: mean average KV PUT latency over the last hour + info: average time between reception of client PUT request and \ + subsequent response to the client over the last hour -template: riak_kv_put_slow +template: riakkv_kv_put_slow on: riak.kv.latency.put calc: $mean lookup: average -3m unaligned of time units: ms every: 10s - warn: ($this > ($1h_kv_put_mean_latency * 2) ) - crit: ($this > ($1h_kv_put_mean_latency * 3) ) - info: average KV PUT time over the last 3 minutes, compared to the average over the last hour + warn: ($this > ($riakkv_1h_kv_put_mean_latency * 2) ) + crit: ($this > ($riakkv_1h_kv_put_mean_latency * 3) ) + info: average time between reception of client PUT request and \ + subsequent response to the client over the last 3 minutes, \ + compared to the average over the last hour delay: down 5m multiplier 1.5 max 1h to: dba @@ -69,12 +75,12 @@ template: riak_kv_put_slow # Default Erlang VM process limit: 262144 # On systems observed, this is < 2000, but may grow depending on load. -template: riak_vm_high_process_count +template: riakkv_vm_high_process_count on: riak.vm calc: $sys_process_count units: processes every: 10s warn: $this > 10000 crit: $this > 100000 - info: number of processes running in the Erlang VM (the default limit on ERTS 10.2.4 is 262144) + info: number of processes running in the Erlang VM to: dba diff --git a/health/health.d/scaleio.conf b/health/health.d/scaleio.conf index 1a3088a2a..ab9771bb4 100644 --- a/health/health.d/scaleio.conf +++ b/health/health.d/scaleio.conf @@ -22,7 +22,7 @@ template: scaleio_storage_pool_capacity_utilization warn: $this > (($status >= $WARNING) ? (80) : (90)) crit: $this > (($status == $CRITICAL) ? (90) : (98)) delay: down 15m multiplier 1.5 max 1h - info: Storage Pool capacity utilization + info: storage pool capacity utilization to: sysadmin @@ -34,5 +34,5 @@ template: scaleio_sdc_mdm_connection_state every: 10s warn: $this != 1 delay: up 30s down 5m multiplier 1.5 max 1h - info: Sdc connection to MDM state + info: Data Client (SDC) to Metadata Manager (MDM) connection state (0: disconnected, 1: connected) to: sysadmin diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf index f835f2aee..f761e4a01 100644 --- a/health/health.d/softnet.conf +++ b/health/health.d/softnet.conf @@ -12,7 +12,8 @@ every: 10s warn: $this > (($status >= $WARNING) ? (0) : (10)) delay: down 1h multiplier 1.5 max 2h - info: average number of packets dropped in the last 1min, because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets) + info: average number of dropped packets in the last minute \ + due to exceeded net.core.netdev_max_backlog to: sysadmin alarm: 1min_netdev_budget_ran_outs @@ -24,7 +25,9 @@ every: 10s warn: $this > (($status >= $WARNING) ? (0) : (10)) delay: down 1h multiplier 1.5 max 2h - info: average number of times, during the last 1min, ksoftirq ran out of sysctl net.core.netdev_budget or net.core.netdev_budget_usecs, with work remaining (this can be a cause for dropped packets) + info: average number of times ksoftirq ran out of sysctl net.core.netdev_budget or \ + net.core.netdev_budget_usecs with work remaining over the last minute \ + (this can be a cause for dropped packets) to: silent alarm: 10min_netisr_backlog_exceeded @@ -34,7 +37,9 @@ lookup: average -1m unaligned absolute of qdrops units: packets every: 10s - warn: $this > (($status >+ $WARNING) ? (0) : (10)) + warn: $this > (($status >= $WARNING) ? (0) : (10)) delay: down 1h multiplier 1.5 max 2h - info: average number of drops in the last 1min, because sysctl net.route.netisr_maxqlen was exceeded (this can be a cause for dropped packets) + info: average number of drops in the last minute \ + due to exceeded sysctl net.route.netisr_maxqlen \ + (this can be a cause for dropped packets) to: sysadmin diff --git a/health/health.d/swap.conf b/health/health.d/swap.conf index f920b0807..66c36c13c 100644 --- a/health/health.d/swap.conf +++ b/health/health.d/swap.conf @@ -10,23 +10,9 @@ calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free ) units: % of RAM every: 1m - warn: $this > (($status >= $WARNING) ? (10) : (20)) - crit: $this > (($status == $CRITICAL) ? (20) : (30)) - delay: up 0 down 15m multiplier 1.5 max 1h - info: the amount of memory swapped in the last 30 minutes, as a percentage of the system RAM - to: sysadmin - - alarm: ram_in_swap - on: system.swap - os: linux - hosts: * - calc: $used * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free ) - units: % of RAM - every: 10s - warn: $this > (($status >= $WARNING) ? (15) : (20)) - crit: $this > (($status == $CRITICAL) ? (40) : (50)) - delay: up 30s down 15m multiplier 1.5 max 1h - info: the swap memory used, as a percentage of the system RAM + warn: $this > (($status >= $WARNING) ? (20) : (30)) + delay: down 15m multiplier 1.5 max 1h + info: percentage of the system RAM swapped in the last 30 minutes to: sysadmin alarm: used_swap @@ -39,5 +25,5 @@ warn: $this > (($status >= $WARNING) ? (80) : (90)) crit: $this > (($status == $CRITICAL) ? (90) : (98)) delay: up 30s down 15m multiplier 1.5 max 1h - info: the percentage of swap memory used + info: swap memory utilization to: sysadmin diff --git a/health/health.d/synchronization.conf b/health/health.d/synchronization.conf new file mode 100644 index 000000000..417624adb --- /dev/null +++ b/health/health.d/synchronization.conf @@ -0,0 +1,12 @@ + alarm: sync_freq + on: mem.sync + lookup: sum -1m of sync + units: calls + plugin: ebpf.plugin + every: 1m + warn: $this > 6 + delay: up 1m down 10m multiplier 1.5 max 1h + info: number of sync() system calls. \ + Every call causes all pending modifications to filesystem metadata and \ + cached file data to be written to the underlying filesystems. + to: sysadmin diff --git a/health/health.d/tcp_conn.conf b/health/health.d/tcp_conn.conf index 7aa9a9800..38b1062dc 100644 --- a/health/health.d/tcp_conn.conf +++ b/health/health.d/tcp_conn.conf @@ -13,7 +13,7 @@ units: % every: 10s warn: $this > (($status >= $WARNING ) ? ( 60 ) : ( 80 )) - crit: $this > (($status >= $CRITICAL) ? ( 80 ) : ( 90 )) + crit: $this > (($status == $CRITICAL) ? ( 80 ) : ( 90 )) delay: up 0 down 5m multiplier 1.5 max 1h - info: the percentage of IPv4 TCP connections over the max allowed + info: IPv4 TCP connections utilization to: sysadmin diff --git a/health/health.d/tcp_listen.conf b/health/health.d/tcp_listen.conf index 3b3072577..dad462ebf 100644 --- a/health/health.d/tcp_listen.conf +++ b/health/health.d/tcp_listen.conf @@ -28,7 +28,7 @@ warn: $this > 1 crit: $this > (($status == $CRITICAL) ? (1) : (5)) delay: up 0 down 5m multiplier 1.5 max 1h - info: the average number of times the TCP accept queue of the kernel overflown, during the last minute + info: average number of overflows in the TCP accept queue over the last minute to: sysadmin # THIS IS TOO GENERIC @@ -43,7 +43,7 @@ warn: $this > 1 crit: $this > (($status == $CRITICAL) ? (1) : (5)) delay: up 0 down 5m multiplier 1.5 max 1h - info: the average number of times the TCP accept queue of the kernel dropped packets, during the last minute (includes bogus packets received) + info: average number of dropped packets in the TCP accept queue over the last minute to: sysadmin @@ -65,7 +65,8 @@ warn: $this > 1 crit: $this > (($status == $CRITICAL) ? (0) : (5)) delay: up 10 down 5m multiplier 1.5 max 1h - info: the number of times the TCP SYN queue of the kernel was full and dropped packets, during the last minute + info: average number of SYN requests was dropped due to the full TCP SYN queue over the last minute \ + (SYN cookies were not enabled) to: sysadmin alarm: 1m_tcp_syn_queue_cookies @@ -78,6 +79,6 @@ warn: $this > 1 crit: $this > (($status == $CRITICAL) ? (0) : (5)) delay: up 10 down 5m multiplier 1.5 max 1h - info: the number of times the TCP SYN queue of the kernel was full and sent SYN cookies, during the last minute + info: average number of sent SYN cookies due to the full TCP SYN queue over the last minute to: sysadmin diff --git a/health/health.d/tcp_mem.conf b/health/health.d/tcp_mem.conf index 6927d5765..29d4ad68b 100644 --- a/health/health.d/tcp_mem.conf +++ b/health/health.d/tcp_mem.conf @@ -14,7 +14,7 @@ units: % every: 10s warn: ${mem} > (($status >= $WARNING ) ? ( ${tcp_mem_pressure} * 0.8 ) : ( ${tcp_mem_pressure} )) - crit: ${mem} > (($status >= $CRITICAL ) ? ( ${tcp_mem_pressure} ) : ( ${tcp_mem_high} * 0.9 )) + crit: ${mem} > (($status == $CRITICAL ) ? ( ${tcp_mem_pressure} ) : ( ${tcp_mem_high} * 0.9 )) delay: up 0 down 5m multiplier 1.5 max 1h - info: the amount of TCP memory as a percentage of its max memory limit + info: TCP memory utilization to: sysadmin diff --git a/health/health.d/tcp_orphans.conf b/health/health.d/tcp_orphans.conf index 280d6590f..17ff7a956 100644 --- a/health/health.d/tcp_orphans.conf +++ b/health/health.d/tcp_orphans.conf @@ -15,7 +15,7 @@ units: % every: 10s warn: $this > (($status >= $WARNING ) ? ( 20 ) : ( 25 )) - crit: $this > (($status >= $CRITICAL) ? ( 25 ) : ( 50 )) + crit: $this > (($status == $CRITICAL) ? ( 25 ) : ( 50 )) delay: up 0 down 5m multiplier 1.5 max 1h - info: the percentage of orphan IPv4 TCP sockets over the max allowed (this may lead to too-many-orphans errors) + info: orphan IPv4 TCP sockets utilization to: sysadmin diff --git a/health/health.d/tcp_resets.conf b/health/health.d/tcp_resets.conf index 36a550a5d..af2a75252 100644 --- a/health/health.d/tcp_resets.conf +++ b/health/health.d/tcp_resets.conf @@ -1,21 +1,6 @@ # you can disable an alarm notification by setting the 'to' line to: silent -# ----------------------------------------------------------------------------- - - alarm: ipv4_tcphandshake_last_collected_secs - on: ipv4.tcphandshake - os: linux freebsd - hosts: * - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: up 0 down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin - # ----------------------------------------------------------------------------- # tcp resets this host sends @@ -26,7 +11,7 @@ lookup: average -1m at -10s unaligned absolute of OutRsts units: tcp resets/s every: 10s - info: average TCP RESETS this host is sending, over the last minute + info: average number of sent TCP RESETS over the last minute alarm: 10s_ipv4_tcp_resets_sent on: ipv4.tcphandshake @@ -38,7 +23,10 @@ warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (20))) delay: up 20s down 60m multiplier 1.2 max 2h options: no-clear-notification - info: average TCP RESETS this host is sending, over the last 10 seconds (this can be an indication that a port scan is made, or that a service running on this host has crashed; clear notification for this alarm will not be sent) + info: average number of sent TCP RESETS over the last 10 seconds. \ + This can indicate a port scan, \ + or that a service running on this host has crashed. \ + Netdata will not send a clear notification for this alarm. to: sysadmin # ----------------------------------------------------------------------------- @@ -51,7 +39,7 @@ lookup: average -1m at -10s unaligned absolute of AttemptFails units: tcp resets/s every: 10s - info: average TCP RESETS this host is sending, over the last minute + info: average number of received TCP RESETS over the last minute alarm: 10s_ipv4_tcp_resets_received on: ipv4.tcphandshake @@ -63,5 +51,7 @@ warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING) ? (1) : (10))) delay: up 20s down 60m multiplier 1.2 max 2h options: no-clear-notification - info: average TCP RESETS this host is receiving, over the last 10 seconds (this can be an indication that a service this host needs, has crashed; clear notification for this alarm will not be sent) + info: average number of received TCP RESETS over the last 10 seconds. \ + This can be an indication that a service this host needs has crashed. \ + Netdata will not send a clear notification for this alarm. to: sysadmin diff --git a/health/health.d/udp_errors.conf b/health/health.d/udp_errors.conf index 1e47b5c8b..4836d6310 100644 --- a/health/health.d/udp_errors.conf +++ b/health/health.d/udp_errors.conf @@ -1,21 +1,6 @@ # you can disable an alarm notification by setting the 'to' line to: silent -# ----------------------------------------------------------------------------- - - alarm: ipv4_udperrors_last_collected_secs - on: ipv4.udperrors - os: linux freebsd - hosts: * - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: up 0 down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin - # ----------------------------------------------------------------------------- # UDP receive buffer errors @@ -26,10 +11,9 @@ lookup: average -1m unaligned absolute of RcvbufErrors units: errors every: 10s - warn: $this > 1 - crit: $this > (($status == $CRITICAL) ? (0) : (10)) - info: average number of UDP receive buffer errors during the last minute - delay: up 0 down 60m multiplier 1.2 max 2h + warn: $this > (($status >= $WARNING) ? (0) : (10)) + info: average number of UDP receive buffer errors over the last minute + delay: up 1m down 60m multiplier 1.2 max 2h to: sysadmin # ----------------------------------------------------------------------------- @@ -42,8 +26,7 @@ lookup: average -1m unaligned absolute of SndbufErrors units: errors every: 10s - warn: $this > 1 - crit: $this > (($status == $CRITICAL) ? (0) : (10)) - info: number of UDP send buffer errors during the last minute - delay: up 0 down 60m multiplier 1.2 max 2h + warn: $this > (($status >= $WARNING) ? (0) : (10)) + info: average number of UDP send buffer errors over the last minute + delay: up 1m down 60m multiplier 1.2 max 2h to: sysadmin diff --git a/health/health.d/unbound.conf b/health/health.d/unbound.conf index bdedc11a0..567baf188 100644 --- a/health/health.d/unbound.conf +++ b/health/health.d/unbound.conf @@ -21,7 +21,7 @@ template: unbound_request_list_overwritten every: 10s warn: $this > 5 delay: up 10 down 5m multiplier 1.5 max 1h - info: the number of overwritten queries in the request-list + info: number of overwritten queries in the request-list to: sysadmin template: unbound_request_list_dropped @@ -31,5 +31,5 @@ template: unbound_request_list_dropped every: 10s warn: $this > 0 delay: up 10 down 5m multiplier 1.5 max 1h - info: the number of dropped queries in the request-list + info: number of dropped queries in the request-list to: sysadmin diff --git a/health/health.d/vcsa.conf b/health/health.d/vcsa.conf index 7bb98a9ba..f4b03d4cf 100644 --- a/health/health.d/vcsa.conf +++ b/health/health.d/vcsa.conf @@ -27,7 +27,8 @@ template: vcsa_system_health warn: ($this == 1) || ($this == 2) crit: $this == 3 delay: down 1m multiplier 1.5 max 1h - info: overall system health status + info: overall system health status \ + (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey) to: sysadmin # Components health: @@ -45,7 +46,8 @@ template: vcsa_swap_health warn: $this == 1 crit: ($this == 2) || ($this == 3) delay: down 1m multiplier 1.5 max 1h - info: swap health status + info: swap health status \ + (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey) to: sysadmin template: vcsa_storage_health @@ -56,7 +58,8 @@ template: vcsa_storage_health warn: $this == 1 crit: ($this == 2) || ($this == 3) delay: down 1m multiplier 1.5 max 1h - info: storage health status + info: storage health status \ + (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey) to: sysadmin template: vcsa_mem_health @@ -67,7 +70,8 @@ template: vcsa_mem_health warn: $this == 1 crit: ($this == 2) || ($this == 3) delay: down 1m multiplier 1.5 max 1h - info: mem health status + info: memory health status \ + (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey) to: sysadmin template: vcsa_load_health @@ -78,7 +82,8 @@ template: vcsa_load_health warn: $this == 1 crit: ($this == 2) || ($this == 3) delay: down 1m multiplier 1.5 max 1h - info: load health status + info: load health status \ + (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey) to: sysadmin template: vcsa_database_storage_health @@ -89,7 +94,8 @@ template: vcsa_database_storage_health warn: $this == 1 crit: ($this == 2) || ($this == 3) delay: down 1m multiplier 1.5 max 1h - info: database storage health status + info: database storage health status \ + (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey) to: sysadmin template: vcsa_applmgmt_health @@ -100,7 +106,8 @@ template: vcsa_applmgmt_health warn: $this == 1 crit: ($this == 2) || ($this == 3) delay: down 1m multiplier 1.5 max 1h - info: appl mgmt health status + info: applmgmt health status \ + (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey) to: sysadmin @@ -118,5 +125,6 @@ template: vcsa_software_updates_health warn: $this == 4 crit: $this == 3 delay: down 1m multiplier 1.5 max 1h - info: software packages health status + info: software updates availability status \ + (-1: unknown, 0: green, 2: orange, 3: red, 4: grey) to: sysadmin diff --git a/health/health.d/vernemq.conf b/health/health.d/vernemq.conf index 36bbaf82b..9598dd39c 100644 --- a/health/health.d/vernemq.conf +++ b/health/health.d/vernemq.conf @@ -18,10 +18,10 @@ template: vernemq_socket_errors on: vernemq.socket_errors lookup: sum -1m unaligned absolute of socket_error units: errors - every: 10s - warn: $this > (($status == $WARNING) ? (0) : (5)) - delay: down 5m multiplier 1.5 max 2h - info: socket errors in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + info: number of socket errors in the last minute to: sysadmin # Queues dropped/expired/unhandled PUBLISH messages @@ -30,30 +30,30 @@ template: vernemq_queue_message_drop on: vernemq.queue_undelivered_messages lookup: sum -1m unaligned absolute of queue_message_drop units: dropped messages - every: 10s - warn: $this > (($status == $WARNING) ? (0) : (5)) - delay: down 5m multiplier 1.5 max 2h - info: dropped messaged due to full queues in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of dropped messaged due to full queues in the last minute to: sysadmin template: vernemq_queue_message_expired on: vernemq.queue_undelivered_messages lookup: sum -1m unaligned absolute of queue_message_expired units: expired messages - every: 10s - warn: $this > (($status == $WARNING) ? (0) : (15)) - delay: down 5m multiplier 1.5 max 2h - info: messages which expired before delivery in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (15)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of messages which expired before delivery in the last minute to: sysadmin template: vernemq_queue_message_unhandled on: vernemq.queue_undelivered_messages lookup: sum -1m unaligned absolute of queue_message_unhandled units: unhandled messages - every: 10s - warn: $this > (($status == $WARNING) ? (0) : (5)) - delay: down 5m multiplier 1.5 max 2h - info: unhandled messages (connections with clean session=true) in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of unhandled messages (connections with clean session=true) in the last minute to: sysadmin # Erlang VM @@ -66,19 +66,19 @@ template: vernemq_average_scheduler_utilization warn: $this > (($status >= $WARNING) ? (75) : (85)) crit: $this > (($status == $CRITICAL) ? (85) : (95)) delay: down 15m multiplier 1.5 max 1h - info: average scheduler utilization for the last 10 minutes + info: average scheduler utilization over the last 10 minutes to: sysadmin # Cluster communication and netsplits template: vernemq_cluster_dropped on: vernemq.cluster_dropped - lookup: average -1m unaligned - units: KiB/s - every: 10s + lookup: sum -1m unaligned + units: KiB + every: 1m warn: $this > 0 - delay: down 5m multiplier 1.5 max 1h - info: the amount of traffic dropped during communication with the cluster nodes in the last minute + delay: up 5m down 5m multiplier 1.5 max 1h + info: amount of traffic dropped during communication with the cluster nodes in the last minute to: sysadmin template: vernemq_netsplits @@ -88,68 +88,41 @@ template: vernemq_netsplits every: 10s warn: $this > 0 delay: down 5m multiplier 1.5 max 2h - info: detected netsplits in the last minute + info: number of detected netsplits (split brain situation) in the last minute to: sysadmin # Unsuccessful CONNACK -template: vernemq_mqtt_connack_sent_reason_success - on: vernemq.mqtt_connack_sent_reason - lookup: sum -1m unaligned absolute match-names of success - units: packets - every: 10s - info: successful v3/v5 CONNACK sent in the last minute - to: sysadmin - template: vernemq_mqtt_connack_sent_reason_unsuccessful on: vernemq.mqtt_connack_sent_reason - lookup: sum -1m unaligned absolute - calc: $this - $vernemq_mqtt_connack_sent_reason_success + lookup: sum -1m unaligned absolute match-names of !success,* units: packets - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: unsuccessful v3/v5 CONNACK sent in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of sent unsuccessful v3/v5 CONNACK packets in the last minute to: sysadmin # Not normal DISCONNECT -template: vernemq_mqtt_disconnect_received_reason_normal_disconnect - on: vernemq.mqtt_disconnect_received_reason - lookup: sum -1m unaligned absolute match-names of normal_disconnect - units: packets - every: 10s - info: normal v5 DISCONNECT received in the last minute - to: sysadmin - -template: vernemq_mqtt_disconnect_sent_reason_normal_disconnect - on: vernemq.mqtt_disconnect_sent_reason - lookup: sum -1m unaligned absolute match-names of normal_disconnect - units: packets - every: 10s - info: normal v5 DISCONNECT sent in the last minute - to: sysadmin - template: vernemq_mqtt_disconnect_received_reason_not_normal on: vernemq.mqtt_disconnect_received_reason - lookup: sum -1m unaligned absolute - calc: $this - $vernemq_mqtt_disconnect_received_reason_normal_disconnect + lookup: sum -1m unaligned absolute match-names of !normal_disconnect,* units: packets - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: not normal v5 DISCONNECT received in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of received not normal v5 DISCONNECT packets in the last minute to: sysadmin template: vernemq_mqtt_disconnect_sent_reason_not_normal on: vernemq.mqtt_disconnect_sent_reason - lookup: sum -1m unaligned absolute - calc: $this - $vernemq_mqtt_disconnect_sent_reason_normal_disconnect + lookup: sum -1m unaligned absolute match-names of !normal_disconnect,* units: packets - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: not normal v5 DISCONNECT sent in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of sent not normal v5 DISCONNECT packets in the last minute to: sysadmin # SUBSCRIBE errors and unauthorized attempts @@ -158,20 +131,20 @@ template: vernemq_mqtt_subscribe_error on: vernemq.mqtt_subscribe_error lookup: sum -1m unaligned absolute units: failed ops - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: failed v3/v5 SUBSCRIBE operations in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of failed v3/v5 SUBSCRIBE operations in the last minute to: sysadmin template: vernemq_mqtt_subscribe_auth_error on: vernemq.mqtt_subscribe_auth_error lookup: sum -1m unaligned absolute units: attempts - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: unauthorized v3/v5 SUBSCRIBE attempts in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of unauthorized v3/v5 SUBSCRIBE attempts in the last minute to: sysadmin # UNSUBSCRIBE errors @@ -180,10 +153,10 @@ template: vernemq_mqtt_unsubscribe_error on: vernemq.mqtt_unsubscribe_error lookup: sum -1m unaligned absolute units: failed ops - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: failed v3/v5 UNSUBSCRIBE operations in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of failed v3/v5 UNSUBSCRIBE operations in the last minute to: sysadmin # PUBLISH errors and unauthorized attempts @@ -192,208 +165,136 @@ template: vernemq_mqtt_publish_errors on: vernemq.mqtt_publish_errors lookup: sum -1m unaligned absolute units: failed ops - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: failed v3/v5 PUBLISH operations in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of failed v3/v5 PUBLISH operations in the last minute to: sysadmin template: vernemq_mqtt_publish_auth_errors on: vernemq.mqtt_publish_auth_errors lookup: sum -1m unaligned absolute units: attempts - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: unauthorized v3/v5 PUBLISH attempts in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of unauthorized v3/v5 PUBLISH attempts in the last minute to: sysadmin # Unsuccessful and unexpected PUBACK -template: vernemq_mqtt_puback_received_reason_success - on: vernemq.mqtt_puback_received_reason - lookup: sum -1m unaligned absolute match-names of success - units: packets - every: 10s - info: successful v5 PUBACK received in the last minute - to: sysadmin - -template: vernemq_mqtt_puback_sent_reason_success - on: vernemq.mqtt_puback_sent_reason - lookup: sum -1m unaligned absolute match-names of success - units: packets - every: 10s - info: successful v5 PUBACK sent in the last minute - to: sysadmin - template: vernemq_mqtt_puback_received_reason_unsuccessful on: vernemq.mqtt_puback_received_reason - lookup: sum -1m unaligned absolute - calc: $this - $vernemq_mqtt_puback_received_reason_success + lookup: sum -1m unaligned absolute match-names of !success,* units: packets - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: unsuccessful v5 PUBACK received in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of received unsuccessful v5 PUBACK packets in the last minute to: sysadmin template: vernemq_mqtt_puback_sent_reason_unsuccessful on: vernemq.mqtt_puback_sent_reason - lookup: sum -1m unaligned absolute - calc: $this - $vernemq_mqtt_puback_sent_reason_success + lookup: sum -1m unaligned absolute match-names of !success,* units: packets - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: unsuccessful v5 PUBACK sent in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of sent unsuccessful v5 PUBACK packets in the last minute to: sysadmin template: vernemq_mqtt_puback_unexpected on: vernemq.mqtt_puback_invalid_error lookup: sum -1m unaligned absolute units: messages - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: unexpected v3/v5 PUBACK received in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of received unexpected v3/v5 PUBACK packets in the last minute to: sysadmin # Unsuccessful and unexpected PUBREC -template: vernemq_mqtt_pubrec_received_reason_success - on: vernemq.mqtt_pubrec_received_reason - lookup: sum -1m unaligned absolute match-names of success - units: packets - every: 10s - info: successful v5 PUBREC received in the last minute - to: sysadmin - -template: vernemq_mqtt_pubrec_sent_reason_success - on: vernemq.mqtt_pubrec_sent_reason - lookup: sum -1m unaligned absolute match-names of success - units: packets - every: 10s - info: successful v5 PUBREC sent in the last minute - to: sysadmin - template: vernemq_mqtt_pubrec_received_reason_unsuccessful on: vernemq.mqtt_pubrec_received_reason - lookup: sum -1m unaligned absolute - calc: $this - $vernemq_mqtt_pubrec_received_reason_success + lookup: sum -1m unaligned absolute match-names of !success,* units: packets - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: unsuccessful v5 PUBREC received in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of received unsuccessful v5 PUBREC packets in the last minute to: sysadmin template: vernemq_mqtt_pubrec_sent_reason_unsuccessful on: vernemq.mqtt_pubrec_sent_reason - lookup: sum -1m unaligned absolute - calc: $this - $vernemq_mqtt_pubrec_sent_reason_success + lookup: sum -1m unaligned absolute match-names of !success,* units: packets - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: unsuccessful v5 PUBREC sent in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of sent unsuccessful v5 PUBREC packets in the last minute to: sysadmin template: vernemq_mqtt_pubrec_invalid_error on: vernemq.mqtt_pubrec_invalid_error lookup: sum -1m unaligned absolute units: messages - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: unexpected v3 PUBREC received in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of received unexpected v3 PUBREC packets in the last minute to: sysadmin # Unsuccessful PUBREL -template: vernemq_mqtt_pubrel_received_reason_success - on: vernemq.mqtt_pubrel_received_reason - lookup: sum -1m unaligned absolute match-names of success - units: packets - every: 10s - info: successful v5 PUBREL received in the last minute - to: sysadmin - -template: vernemq_mqtt_pubrel_sent_reason_success - on: vernemq.mqtt_pubrel_sent_reason - lookup: sum -1m unaligned absolute match-names of success - units: packets - every: 10s - info: successful v5 PUBREL sent in the last minute - to: sysadmin - template: vernemq_mqtt_pubrel_received_reason_unsuccessful on: vernemq.mqtt_pubrel_received_reason - lookup: sum -1m unaligned absolute - calc: $this - $vernemq_mqtt_pubrel_received_reason_success + lookup: sum -1m unaligned absolute match-names of !success,* units: packets - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: unsuccessful v5 PUBREL received in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of received unsuccessful v5 PUBREL packets in the last minute to: sysadmin template: vernemq_mqtt_pubrel_sent_reason_unsuccessful on: vernemq.mqtt_pubrel_sent_reason - lookup: sum -1m unaligned absolute - calc: $this - $vernemq_mqtt_pubrel_sent_reason_success + lookup: sum -1m unaligned absolute match-names of !success,* units: packets - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: unsuccessful v5 PUBREL sent in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of sent unsuccessful v5 PUBREL packets in the last minute to: sysadmin # Unsuccessful and unexpected PUBCOMP -template: vernemq_mqtt_pubcomp_received_reason_success - on: vernemq.mqtt_pubcomp_received_reason - lookup: sum -1m unaligned absolute match-names of success - units: packets - every: 10s - info: successful v5 PUBCOMP received in the last minute - to: sysadmin - -template: vernemq_mqtt_pubcomp_sent_reason_success - on: vernemq.mqtt_pubcomp_sent_reason - lookup: sum -1m unaligned absolute match-names of success - units: packets - every: 10s - info: successful v5 PUBCOMP sent in the last minute - to: sysadmin - template: vernemq_mqtt_pubcomp_received_reason_unsuccessful on: vernemq.mqtt_pubcomp_received_reason - lookup: sum -1m unaligned absolute - calc: $this - $vernemq_mqtt_pubcomp_received_reason_success + lookup: sum -1m unaligned absolute match-names of !success,* units: packets - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: unsuccessful v5 PUBCOMP received in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of received unsuccessful v5 PUBCOMP packets in the last minute to: sysadmin template: vernemq_mqtt_pubcomp_sent_reason_unsuccessful on: vernemq.mqtt_pubcomp_sent_reason - lookup: sum -1m unaligned absolute - calc: $this - $vernemq_mqtt_pubcomp_sent_reason_success + lookup: sum -1m unaligned absolute match-names of !success,* units: packets - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: unsuccessful v5 PUBCOMP sent in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of sent unsuccessful v5 PUBCOMP packets in the last minute to: sysadmin template: vernemq_mqtt_pubcomp_unexpected on: vernemq.mqtt_pubcomp_invalid_error lookup: sum -1m unaligned absolute units: messages - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: unexpected v3/v5 PUBCOMP received in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of received unexpected v3/v5 PUBCOMP packets in the last minute to: sysadmin diff --git a/health/health.d/vsphere.conf b/health/health.d/vsphere.conf index d8b2be190..3e1414c16 100644 --- a/health/health.d/vsphere.conf +++ b/health/health.d/vsphere.conf @@ -13,7 +13,7 @@ template: vsphere_vm_mem_usage warn: $this > (($status >= $WARNING) ? (80) : (90)) crit: $this > (($status == $CRITICAL) ? (90) : (98)) delay: down 15m multiplier 1.5 max 1h - info: used RAM + info: virtual machine memory utilization # -----------------------------------------------HOST Specific---------------------------------------------------------- # Memory @@ -27,7 +27,7 @@ template: vsphere_host_mem_usage warn: $this > (($status >= $WARNING) ? (80) : (90)) crit: $this > (($status == $CRITICAL) ? (90) : (98)) delay: down 15m multiplier 1.5 max 1h - info: used RAM + info: host memory utilization # Network errors @@ -38,10 +38,7 @@ families: * lookup: sum -10m unaligned absolute match-names of rx units: packets every: 1m - warn: $this >= 5 - delay: down 1h multiplier 1.5 max 2h - info: interface inbound dropped packets in the last 10 minutes - to: sysadmin + info: number of inbound errors for the network interface in the last 10 minutes template: vsphere_outbound_packets_errors on: vsphere.net_errors_total @@ -50,10 +47,7 @@ families: * lookup: sum -10m unaligned absolute match-names of tx units: packets every: 1m - warn: $this >= 5 - delay: down 1h multiplier 1.5 max 2h - info: interface outbound dropped packets in the last 10 minutes - to: sysadmin + info: number of outbound errors for the network interface in the last 10 minutes # Network errors ratio @@ -62,13 +56,12 @@ template: vsphere_inbound_packets_errors_ratio hosts: * families: * lookup: sum -10m unaligned absolute match-names of rx - calc: (($vsphere_inbound_packets_errors != nan AND $this > 0) ? ($vsphere_inbound_packets_errors * 100 / $this) : (0)) + calc: (($vsphere_inbound_packets_errors != nan AND $this > 1000) ? ($vsphere_inbound_packets_errors * 100 / $this) : (0)) units: % every: 1m - warn: $this >= 0.1 - crit: $this >= 2 - delay: down 1h multiplier 1.5 max 2h - info: the ratio of inbound errors vs the total number of received packets of the network interface, during the last 10 minutes + warn: $this >= 2 + delay: up 1m down 1h multiplier 1.5 max 2h + info: ratio of inbound errors for the network interface over the last 10 minutes to: sysadmin template: vsphere_outbound_packets_errors_ratio @@ -76,13 +69,12 @@ template: vsphere_outbound_packets_errors_ratio hosts: * families: * lookup: sum -10m unaligned absolute match-names of tx - calc: (($vsphere_outbound_packets_errors != nan AND $this > 0) ? ($vsphere_outbound_packets_errors * 100 / $this) : (0)) + calc: (($vsphere_outbound_packets_errors != nan AND $this > 1000) ? ($vsphere_outbound_packets_errors * 100 / $this) : (0)) units: % every: 1m - warn: $this >= 0.1 - crit: $this >= 2 - delay: down 1h multiplier 1.5 max 2h - info: the ratio of outbound errors vs the total number of sent packets of the network interface, during the last 10 minutes + warn: $this >= 2 + delay: up 1m down 1h multiplier 1.5 max 2h + info: ratio of outbound errors for the network interface over the last 10 minutes to: sysadmin # -----------------------------------------------Common------------------------------------------------------------------- @@ -97,7 +89,7 @@ template: vsphere_cpu_usage warn: $this > (($status >= $WARNING) ? (75) : (85)) crit: $this > (($status == $CRITICAL) ? (85) : (95)) delay: down 15m multiplier 1.5 max 1h - info: cpu utilization for the last 10 minutes + info: average CPU utilization to: sysadmin # Network drops @@ -109,10 +101,7 @@ families: * lookup: sum -10m unaligned absolute match-names of rx units: packets every: 1m - warn: $this >= 5 - delay: down 1h multiplier 1.5 max 2h - info: interface inbound dropped packets in the last 10 minutes - to: sysadmin + info: number of inbound dropped packets for the network interface in the last 10 minutes template: vsphere_outbound_packets_dropped on: vsphere.net_drops_total @@ -121,10 +110,7 @@ families: * lookup: sum -10m unaligned absolute match-names of tx units: packets every: 1m - warn: $this >= 5 - delay: down 1h multiplier 1.5 max 2h - info: interface outbound dropped packets in the last 10 minutes - to: sysadmin + info: number of outbound dropped packets for the network interface in the last 10 minutes # Network drops ratio @@ -133,13 +119,12 @@ template: vsphere_inbound_packets_dropped_ratio hosts: * families: * lookup: sum -10m unaligned absolute match-names of rx - calc: (($vsphere_inbound_packets_dropped != nan AND $this > 0) ? ($vsphere_inbound_packets_dropped * 100 / $this) : (0)) + calc: (($vsphere_inbound_packets_dropped != nan AND $this > 1000) ? ($vsphere_inbound_packets_dropped * 100 / $this) : (0)) units: % every: 1m - warn: $this >= 0.1 - crit: $this >= 2 - delay: down 1h multiplier 1.5 max 2h - info: the ratio of inbound dropped packets vs the total number of received packets of the network interface, during the last 10 minutes + warn: $this >= 2 + delay: up 1m down 1h multiplier 1.5 max 2h + info: ratio of inbound dropped packets for the network interface over the last 10 minutes to: sysadmin template: vsphere_outbound_packets_dropped_ratio @@ -147,11 +132,10 @@ template: vsphere_outbound_packets_dropped_ratio hosts: * families: * lookup: sum -10m unaligned absolute match-names of tx - calc: (($vsphere_outbound_packets_dropped != nan AND $this > 0) ? ($vsphere_outbound_packets_dropped * 100 / $this) : (0)) + calc: (($vsphere_outbound_packets_dropped != nan AND $this > 1000) ? ($vsphere_outbound_packets_dropped * 100 / $this) : (0)) units: % every: 1m - warn: $this >= 0.1 - crit: $this >= 2 - delay: down 1h multiplier 1.5 max 2h - info: the ratio of outbound dropped packets vs the total number of sent packets of the network interface, during the last 10 minutes + warn: $this >= 2 + delay: up 1m down 1h multiplier 1.5 max 2h + info: ratio of outbound dropped packets for the network interface over the last 10 minutes to: sysadmin diff --git a/health/health.d/web_log.conf b/health/health.d/web_log.conf index 44de38a48..0b01990cb 100644 --- a/health/health.d/web_log.conf +++ b/health/health.d/web_log.conf @@ -31,7 +31,7 @@ families: * calc: ($this == 0)?(1):($this) units: requests every: 10s - info: the sum of all HTTP requests over the last minute + info: number of HTTP requests in the last minute template: 1m_successful on: web_log.response_statuses @@ -43,7 +43,7 @@ families: * warn: ($1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 ) crit: ($1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 ) delay: up 2m down 15m multiplier 1.5 max 1h - info: the ratio of successful HTTP responses (1xx, 2xx, 304, 401) over the last minute + info: ratio of successful HTTP requests over the last minute (1xx, 2xx, 304, 401) to: webmaster template: 1m_redirects @@ -56,7 +56,7 @@ families: * warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 ) crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 ) delay: up 2m down 15m multiplier 1.5 max 1h - info: the ratio of HTTP redirects (3xx except 304) over the last minute + info: ratio of redirection HTTP requests over the last minute (3xx except 304) to: webmaster template: 1m_bad_requests @@ -69,7 +69,7 @@ families: * warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 ) crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 ) delay: up 2m down 15m multiplier 1.5 max 1h - info: the ratio of HTTP bad requests (4xx except 401) over the last minute + info: ratio of client error HTTP requests over the last minute (4xx except 401) to: webmaster template: 1m_internal_errors @@ -82,7 +82,7 @@ families: * warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 1 ) : ( 2 )) ) : ( 0 ) crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 ) delay: up 2m down 15m multiplier 1.5 max 1h - info: the ratio of HTTP internal server errors (5xx), over the last minute + info: ratio of server error HTTP requests over the last minute (5xx) to: webmaster # unmatched lines @@ -101,10 +101,10 @@ families: * calc: ($this == 0)?(1):($this) units: requests every: 10s - info: the sum of all HTTP requests over the last minute + info: number of HTTP requests over the last minute template: 1m_unmatched -on: web_log.response_codes + on: web_log.response_codes families: * lookup: sum -1m unaligned of unmatched calc: $this * 100 / $1m_total_requests @@ -112,7 +112,7 @@ families: * every: 10s warn: ($1m_total_requests > 120) ? ($this > 1) : ( 0 ) delay: up 1m down 5m multiplier 1.5 max 1h - info: the ratio of unmatched lines, over the last minute + info: percentage of unparsed log lines over the last minute to: webmaster # ----------------------------------------------------------------------------- @@ -131,7 +131,7 @@ families: * lookup: average -10m unaligned of avg units: ms every: 30s - info: the average time to respond to HTTP requests, over the last 10 minutes + info: average HTTP response time over the last 10 minutes template: web_slow on: web_log.response_time @@ -144,7 +144,7 @@ families: * warn: ($1m_requests > 120) ? ($this > $green && $this > ($10m_response_time * 2) ) : ( 0 ) crit: ($1m_requests > 120) ? ($this > $red && $this > ($10m_response_time * 4) ) : ( 0 ) delay: down 15m multiplier 1.5 max 1h - info: the average time to respond to HTTP requests, over the last 1 minute + info: average HTTP response time over the last minute options: no-clear-notification to: webmaster @@ -165,7 +165,7 @@ families: * lookup: average -5m at -5m unaligned of successful_requests units: requests/s every: 30s - info: average rate of successful HTTP requests over the last 5 minutes + info: average number of successful HTTP requests for the 5 minutes starting 10 minutes ago template: 5m_successful on: web_log.response_statuses @@ -173,7 +173,7 @@ families: * lookup: average -5m unaligned of successful_requests units: requests/s every: 30s - info: average successful HTTP requests over the last 5 minutes + info: average number of successful HTTP requests over the last 5 minutes template: 5m_requests_ratio on: web_log.response_codes @@ -185,7 +185,7 @@ families: * crit: ($5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0) delay: down 15m multiplier 1.5 max 1h options: no-clear-notification - info: the percentage of successful web requests over the last 5 minutes, \ + info: ratio of successful HTTP requests over the last 5 minutes, \ compared with the previous 5 minutes \ (clear notification for this alarm will not be sent) to: webmaster @@ -224,7 +224,7 @@ families: * calc: ($this == 0)?(1):($this) units: requests every: 10s - info: the sum of all HTTP requests over the last minute + info: number of HTTP requests in the last minute template: web_log_1m_unmatched on: web_log.excluded_requests @@ -235,7 +235,7 @@ families: * every: 10s warn: ($web_log_1m_total_requests > 120) ? ($this > 1) : ( 0 ) delay: up 1m down 5m multiplier 1.5 max 1h - info: the ratio of unmatched lines, over the last minute + info: percentage of unparsed log lines over the last minute to: webmaster # ----------------------------------------------------------------------------- @@ -255,7 +255,7 @@ families: * calc: ($this == 0)?(1):($this) units: requests every: 10s - info: the sum of all HTTP requests over the last minute + info: number of HTTP requests in the last minute template: web_log_1m_successful on: web_log.type_requests @@ -267,7 +267,7 @@ families: * warn: ($web_log_1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 ) crit: ($web_log_1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 ) delay: up 2m down 15m multiplier 1.5 max 1h - info: the ratio of successful HTTP responses (1xx, 2xx, 304, 401) over the last minute + info: ratio of successful HTTP requests over the last minute (1xx, 2xx, 304, 401) to: webmaster template: web_log_1m_redirects @@ -280,7 +280,7 @@ families: * warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 ) crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 ) delay: up 2m down 15m multiplier 1.5 max 1h - info: the ratio of HTTP redirects (3xx except 304) over the last minute + info: ratio of redirection HTTP requests over the last minute (3xx except 304) to: webmaster template: web_log_1m_bad_requests @@ -293,7 +293,7 @@ families: * warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 ) crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 ) delay: up 2m down 15m multiplier 1.5 max 1h - info: the ratio of HTTP bad requests (4xx except 401) over the last minute + info: ratio of client error HTTP requests over the last minute (4xx except 401) to: webmaster template: web_log_1m_internal_errors @@ -306,7 +306,7 @@ families: * warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 1 ) : ( 2 )) ) : ( 0 ) crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 ) delay: up 2m down 15m multiplier 1.5 max 1h - info: the ratio of HTTP internal server errors (5xx), over the last minute + info: ratio of server error HTTP requests over the last minute (5xx) to: webmaster # ----------------------------------------------------------------------------- @@ -325,7 +325,7 @@ families: * lookup: average -10m unaligned of avg units: ms every: 30s - info: the average time to respond to HTTP requests, over the last 10 minutes + info: average HTTP response time over the last 10 minutes template: web_log_web_slow on: web_log.request_processing_time @@ -338,7 +338,7 @@ families: * warn: ($web_log_1m_requests > 120) ? ($this > $green && $this > ($web_log_10m_response_time * 2) ) : ( 0 ) crit: ($web_log_1m_requests > 120) ? ($this > $red && $this > ($web_log_10m_response_time * 4) ) : ( 0 ) delay: down 15m multiplier 1.5 max 1h - info: the average time to respond to HTTP requests, over the last 1 minute + info: average HTTP response time over the last 1 minute options: no-clear-notification to: webmaster @@ -359,7 +359,7 @@ families: * lookup: average -5m at -5m unaligned of success units: requests/s every: 30s - info: average rate of successful HTTP requests over the last 5 minutes + info: average number of successful HTTP requests for the 5 minutes starting 10 minutes ago template: web_log_5m_successful on: web_log.type_requests @@ -367,7 +367,7 @@ families: * lookup: average -5m unaligned of success units: requests/s every: 30s - info: average successful HTTP requests over the last 5 minutes + info: average number of successful HTTP requests over the last 5 minutes template: web_log_5m_requests_ratio on: web_log.type_requests @@ -379,7 +379,7 @@ families: * crit: ($web_log_5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0) delay: down 15m multiplier 1.5 max 1h options: no-clear-notification - info: the percentage of successful web requests over the last 5 minutes, \ + info: ratio of successful HTTP requests over over the last 5 minutes, \ compared with the previous 5 minutes \ (clear notification for this alarm will not be sent) to: webmaster diff --git a/health/health.d/whoisquery.conf b/health/health.d/whoisquery.conf index 275e11dd9..36ae02fa2 100644 --- a/health/health.d/whoisquery.conf +++ b/health/health.d/whoisquery.conf @@ -20,5 +20,5 @@ template: whoisquery_days_until_expiration every: 60s warn: $this < $days_until_expiration_warning*24*60*60 crit: $this < $days_until_expiration_critical*24*60*60 - info: domain time until expiration + info: time until the domain name registration expires to: webmaster diff --git a/health/health.d/wmi.conf b/health/health.d/wmi.conf index 0441fc1f3..f1f71a606 100644 --- a/health/health.d/wmi.conf +++ b/health/health.d/wmi.conf @@ -26,7 +26,7 @@ template: wmi_10min_cpu_usage warn: $this > (($status >= $WARNING) ? (75) : (85)) crit: $this > (($status == $CRITICAL) ? (85) : (95)) delay: down 15m multiplier 1.5 max 1h - info: cpu utilization for the last 10 minutes + info: average CPU utilization over the last 10 minutes to: sysadmin @@ -42,7 +42,7 @@ template: wmi_ram_in_use warn: $this > (($status >= $WARNING) ? (80) : (90)) crit: $this > (($status == $CRITICAL) ? (90) : (98)) delay: down 15m multiplier 1.5 max 1h - info: used RAM + info: memory utilization to: sysadmin template: wmi_swap_in_use @@ -55,13 +55,13 @@ template: wmi_swap_in_use warn: $this > (($status >= $WARNING) ? (80) : (90)) crit: $this > (($status == $CRITICAL) ? (90) : (98)) delay: down 15m multiplier 1.5 max 1h - info: used Swap + info: swap memory utilization to: sysadmin ## Network -template: inbound_packets_discarded +template: wmi_inbound_packets_discarded on: wmi.net_discarded os: linux hosts: * @@ -71,10 +71,10 @@ families: * every: 1m warn: $this >= 5 delay: down 1h multiplier 1.5 max 2h - info: interface inbound discarded packets in the last 10 minutes + info: number of inbound discarded packets for the network interface in the last 10 minutes to: sysadmin -template: outbound_packets_discarded +template: wmi_outbound_packets_discarded on: wmi.net_discarded os: linux hosts: * @@ -84,10 +84,10 @@ families: * every: 1m warn: $this >= 5 delay: down 1h multiplier 1.5 max 2h - info: interface outbound discarded packets in the last 10 minutes + info: number of outbound discarded packets for the network interface in the last 10 minutes to: sysadmin -template: inbound_packets_errors +template: wmi_inbound_packets_errors on: wmi.net_errors os: linux hosts: * @@ -97,10 +97,10 @@ families: * every: 1m warn: $this >= 5 delay: down 1h multiplier 1.5 max 2h - info: interface inbound errors in the last 10 minutes + info: number of inbound errors for the network interface in the last 10 minutes to: sysadmin -template: outbound_packets_errors +template: wmi_outbound_packets_errors on: wmi.net_errors os: linux hosts: * @@ -110,7 +110,7 @@ families: * every: 1m warn: $this >= 5 delay: down 1h multiplier 1.5 max 2h - info: interface outbound errors in the last 10 minutes + info: number of outbound errors for the network interface in the last 10 minutes to: sysadmin @@ -126,5 +126,5 @@ template: wmi_disk_in_use warn: $this > (($status >= $WARNING) ? (80) : (90)) crit: $this > (($status == $CRITICAL) ? (90) : (98)) delay: down 15m multiplier 1.5 max 1h - info: used disk space + info: disk space utilization to: sysadmin diff --git a/health/health.d/x509check.conf b/health/health.d/x509check.conf index dfca37706..f2e4a050d 100644 --- a/health/health.d/x509check.conf +++ b/health/health.d/x509check.conf @@ -20,7 +20,7 @@ template: x509check_days_until_expiration every: 60s warn: $this < $days_until_expiration_warning*24*60*60 crit: $this < $days_until_expiration_critical*24*60*60 - info: certificate time until expiration + info: time until x509 certificate expires to: webmaster template: x509check_revocation_status @@ -28,5 +28,5 @@ template: x509check_revocation_status calc: $revoked every: 60s crit: $this != nan AND $this != 0 - info: certificate revocation status + info: x509 certificate revocation status (0: revoked, 1: valid) to: webmaster diff --git a/health/health.d/zfs.conf b/health/health.d/zfs.conf index af73824e6..74f96dd32 100644 --- a/health/health.d/zfs.conf +++ b/health/health.d/zfs.conf @@ -6,5 +6,5 @@ every: 1m warn: $this > 0 delay: down 1h multiplier 1.5 max 2h - info: the number of times ZFS had to limit the ARC growth in the last 10 minutes + info: number of times ZFS had to limit the ARC growth in the last 10 minutes to: sysadmin diff --git a/health/health.h b/health/health.h index 5281e16e3..07ce1311e 100644 --- a/health/health.h +++ b/health/health.h @@ -64,7 +64,7 @@ extern int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC * extern void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* context, RRDCALC_STATUS status); extern void health_alarms2json(RRDHOST *host, BUFFER *wb, int all); extern void health_alarms_values2json(RRDHOST *host, BUFFER *wb, int all); -extern void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after); +extern void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart); void health_api_v1_chart_variables2json(RRDSET *st, BUFFER *buf); void health_api_v1_chart_custom_variables2json(RRDSET *st, BUFFER *buf); diff --git a/health/health_config.c b/health/health_config.c index 1acf36933..e24acf77c 100644 --- a/health/health_config.c +++ b/health/health_config.c @@ -384,7 +384,7 @@ static inline int health_parse_db_lookup( } // sane defaults - *every = abs(*after); + *every = ABS(*after); // now we may have optional parameters while(*s) { diff --git a/health/health_json.c b/health/health_json.c index 7b5a1e3cb..2a81d1c02 100644 --- a/health/health_json.c +++ b/health/health_json.c @@ -2,7 +2,7 @@ #include "health.h" -static inline void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix) { +void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix) { if(value && *value) { buffer_sprintf(wb, "%s\"%s\":\"", prefix, label); buffer_strcat_htmlescape(wb, value); @@ -13,7 +13,7 @@ static inline void health_string2json(BUFFER *wb, const char *prefix, const char buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix); } -inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) { +void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) { buffer_sprintf(wb, "\n\t{\n" "\t\t\"hostname\": \"%s\",\n" @@ -93,18 +93,22 @@ inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST buffer_strcat(wb, "\t}"); } -void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after) { +void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart) { netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock); buffer_strcat(wb, "["); unsigned int max = host->health_log.max; unsigned int count = 0; + uint32_t hash_chart = 0; + if (chart) hash_chart = simple_hash(chart); ALARM_ENTRY *ae; - for(ae = host->health_log.alarms; ae && count < max ; count++, ae = ae->next) { - if(ae->unique_id > after) { - if(likely(count)) buffer_strcat(wb, ","); + for (ae = host->health_log.alarms; ae && count < max; ae = ae->next) { + if ((ae->unique_id > after) && (!chart || (ae->hash_chart == hash_chart && !strcmp(ae->chart, chart)))) { + if (likely(count)) + buffer_strcat(wb, ","); health_alarm_entry2json_nolock(wb, ae, host); + count++; } } @@ -298,6 +302,9 @@ static void health_alarms2json_fill_alarms(RRDHOST *host, BUFFER *wb, int all, v if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec)) continue; + if (unlikely(!rrdset_is_available_for_exporting_and_alarms(rc->rrdset))) + continue; + if(likely(!all && !(rc->status == RRDCALC_STATUS_WARNING || rc->status == RRDCALC_STATUS_CRITICAL))) continue; diff --git a/health/health_log.c b/health/health_log.c index 8c0bc5c34..3205f5920 100644 --- a/health/health_log.c +++ b/health/health_log.c @@ -213,8 +213,8 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char if(likely(*pointers[0] == 'U' || *pointers[0] == 'A')) { ALARM_ENTRY *ae = NULL; - if(entries < 26) { - error("HEALTH [%s]: line %zu of file '%s' should have at least 26 entries, but it has %d. Ignoring it.", host->hostname, line, filename, entries); + if(entries < 27) { + error("HEALTH [%s]: line %zu of file '%s' should have at least 27 entries, but it has %d. Ignoring it.", host->hostname, line, filename, entries); errored++; continue; } @@ -243,7 +243,7 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char RRDCALC *rc = alarm_max_last_repeat(host, alarm_name,simple_hash(alarm_name)); if (!rc) { for(rc = host->alarms; rc ; rc = rc->next) { - RRDCALC *rdcmp = (RRDCALC *) avl_insert_lock(&(host)->alarms_idx_name, (avl *)rc); + RRDCALC *rdcmp = (RRDCALC *) avl_insert_lock(&(host)->alarms_idx_name, (avl_t *)rc); if(rdcmp != rc) { error("Cannot insert the alarm index ID using log %s", rc->name); } diff --git a/health/notifications/alarm-notify.sh.in b/health/notifications/alarm-notify.sh.in index 3bf8db5f6..bf6c02816 100755 --- a/health/notifications/alarm-notify.sh.in +++ b/health/notifications/alarm-notify.sh.in @@ -209,6 +209,9 @@ if [[ ${1} = "unittest" ]]; then cfgfile="${3}" # the location of the config file to use for unit testing status="${4}" # the current status : REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL old_status="${5}" # the previous status: REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL +elif [[ ${1} = "dump_methods" ]]; then + dump_methods=1 + status="WARNING" else roles="${1}" # the roles that should be notified for this event args_host="${2}" # the host generated this event @@ -372,6 +375,7 @@ EMAIL_PLAINTEXT_ONLY= IRC_NICKNAME= IRC_REALNAME= IRC_NETWORK= +IRC_PORT=6667 # hangouts configs declare -A HANGOUTS_WEBHOOK_URI @@ -549,6 +553,15 @@ filter_recipient_by_criticality() { # check stackpulse [ -z "${STACKPULSE_WEBHOOK}" ] && SEND_STACKPULSE="NO" +# check msteam +[ -z "${MSTEAM_WEBHOOK_URL}" ] && SEND_MSTEAM="NO" + +# check pd +[ -z "${DEFAULT_RECIPIENT_PD}" ] && SEND_PD="NO" + +# check prowl +[ -z "${DEFAULT_RECIPIENT_PROWL}" ] && SEND_PROWL="NO" + if [ "${SEND_PUSHOVER}" = "YES" ] || [ "${SEND_SLACK}" = "YES" ] || [ "${SEND_ROCKETCHAT}" = "YES" ] || @@ -639,6 +652,15 @@ if [ "${SEND_AWSSNS}" = "YES" ] && [ -z "${aws}" ]; then fi fi +if [ ${dump_methods} ]; then + for name in "${!SEND_@}"; do + if [ "${!name}" = "YES" ]; then + echo "$name" + fi + done + exit +fi + # ----------------------------------------------------------------------------- # find the recipients' addresses per method @@ -864,14 +886,15 @@ send_email() { echo >&2 "--- END sendmail command ---" fi - "${sendmail}" -t "${opts[@]}" + local cmd_output + cmd_output=$("${sendmail}" -t "${opts[@]}" 2>&1) ret=$? if [ ${ret} -eq 0 ]; then info "sent email notification for: ${host} ${chart}.${name} is ${status} to '${to_email}'" return 0 else - error "failed to send email notification for: ${host} ${chart}.${name} is ${status} to '${to_email}' with error code ${ret}." + error "failed to send email notification for: ${host} ${chart}.${name} is ${status} to '${to_email}' with error code ${ret} (${cmd_output})." return 1 fi fi @@ -1722,9 +1745,9 @@ send_prowl() { # irc sender send_irc() { - local NICKNAME="${1}" REALNAME="${2}" CHANNELS="${3}" NETWORK="${4}" SERVERNAME="${5}" MESSAGE="${6}" sent=0 channel color send_alarm reply_codes error + local NICKNAME="${1}" REALNAME="${2}" CHANNELS="${3}" NETWORK="${4}" PORT="${5}" SERVERNAME="${6}" MESSAGE="${7}" sent=0 channel color send_alarm reply_codes error - if [ "${SEND_IRC}" = "YES" ] && [ -n "${NICKNAME}" ] && [ -n "${REALNAME}" ] && [ -n "${CHANNELS}" ] && [ -n "${NETWORK}" ] && [ -n "${SERVERNAME}" ]; then + if [ "${SEND_IRC}" = "YES" ] && [ -n "${NICKNAME}" ] && [ -n "${REALNAME}" ] && [ -n "${CHANNELS}" ] && [ -n "${NETWORK}" ] && [ -n "${SERVERNAME}" ] && [ -n "${PORT}" ]; then case "${status}" in WARNING) color="warning" ;; CRITICAL) color="danger" ;; @@ -1735,7 +1758,7 @@ send_irc() { SNDMESSAGE="${MESSAGE//$'\n'/", "}" for CHANNEL in ${CHANNELS}; do error=0 - send_alarm=$(echo -e "USER ${NICKNAME} guest ${REALNAME} ${SERVERNAME}\\nNICK ${NICKNAME}\\nJOIN ${CHANNEL}\\nPRIVMSG ${CHANNEL} :${SNDMESSAGE}\\nQUIT\\n" \ | nc "${NETWORK}" 6667) + send_alarm=$(echo -e "USER ${NICKNAME} guest ${REALNAME} ${SERVERNAME}\\nNICK ${NICKNAME}\\nJOIN ${CHANNEL}\\nPRIVMSG ${CHANNEL} :${SNDMESSAGE}\\nQUIT\\n" \ | nc "${NETWORK}" "${PORT}") reply_codes=$(echo "${send_alarm}" | cut -d ' ' -f 2 | grep -o '[0-9]*') for code in ${reply_codes}; do if [ "${code}" -ge 400 ] && [ "${code}" -le 599 ]; then @@ -2465,7 +2488,7 @@ SENT_PROWL=$? # ----------------------------------------------------------------------------- # send the irc message -send_irc "${IRC_NICKNAME}" "${IRC_REALNAME}" "${to_irc}" "${IRC_NETWORK}" "${host}" "${host} ${status_message} - ${name//_/ } - ${chart} ----- ${alarm} +send_irc "${IRC_NICKNAME}" "${IRC_REALNAME}" "${to_irc}" "${IRC_NETWORK}" "${IRC_PORT}" "${host}" "${host} ${status_message} - ${name//_/ } - ${chart} ----- ${alarm} Severity: ${severity} Chart: ${chart} Family: ${family} diff --git a/health/notifications/email/README.md b/health/notifications/email/README.md index 827a9c0be..ebd7f4b8c 100644 --- a/health/notifications/email/README.md +++ b/health/notifications/email/README.md @@ -43,7 +43,7 @@ You can always find the location of the alarm-notify.sh script in `netdata.conf` If you want an alternative to `sendmail` in order to have a simple MTA configuration for sending emails and auth to an existing SMTP server, you can do the following: - Install `msmtp`. -- Modify the `sendmail` path in `health_alarm_notify.conf` to point to the location of `mstmp`: +- Modify the `sendmail` path in `health_alarm_notify.conf` to point to the location of `msmtp`: ``` # The full path to the sendmail command. # If empty, the system $PATH will be searched for it. diff --git a/health/notifications/health_alarm_notify.conf b/health/notifications/health_alarm_notify.conf index be669e135..2dab1d489 100755 --- a/health/notifications/health_alarm_notify.conf +++ b/health/notifications/health_alarm_notify.conf @@ -676,6 +676,10 @@ DEFAULT_RECIPIENT_IRC="" # e.g. "irc.freenode.net" IRC_NETWORK="" +# The irc port to which a connection will occur. +# e.g. 6667 (the default one), 6697 (a TLS/SSL one) +IRC_PORT=6667 + # The irc nickname which is required to send the notification. It must not be # an already registered name as the connection's MODE is defined as a 'guest'. IRC_NICKNAME="" diff --git a/health/notifications/stackpulse/README.md b/health/notifications/stackpulse/README.md index 13d2f7235..4c44954ab 100644 --- a/health/notifications/stackpulse/README.md +++ b/health/notifications/stackpulse/README.md @@ -39,8 +39,9 @@ SEND_STACKPULSE="YES" STACKPULSE_WEBHOOK="https://hooks.stackpulse.io/v1/webhooks/YOUR_UNIQUE_ID" ``` -4. Now [restart Netdata](/docs/getting-started.md#start-stop-and-restart-netdata). When your node creates an alarm, you - can see the associated notification on your StackPulse Administration Portal +4. Now restart Netdata using `sudo systemctl restart netdata`, or the [appropriate + method](/docs/configure/start-stop-restart.md) for your system. When your node creates an alarm, you can see the + associated notification on your StackPulse Administration Portal ## React to alarms with playbooks diff --git a/libnetdata/avl/avl.c b/libnetdata/avl/avl.c index 521985189..b05b97acb 100644 --- a/libnetdata/avl/avl.c +++ b/libnetdata/avl/avl.c @@ -17,8 +17,8 @@ /* Search |tree| for an item matching |item|, and return it if found. Otherwise return |NULL|. */ -avl *avl_search(avl_tree_type *tree, avl *item) { - avl *p; +avl_t *avl_search(avl_tree_type *tree, avl_t *item) { + avl_t *p; // assert (tree != NULL && item != NULL); @@ -40,11 +40,11 @@ avl *avl_search(avl_tree_type *tree, avl *item) { If a duplicate item is found in the tree, returns a pointer to the duplicate without inserting |item|. */ -avl *avl_insert(avl_tree_type *tree, avl *item) { - avl *y, *z; /* Top node to update balance factor, and parent. */ - avl *p, *q; /* Iterator, and parent. */ - avl *n; /* Newly inserted node. */ - avl *w; /* New root of rebalanced subtree. */ +avl_t *avl_insert(avl_tree_type *tree, avl_t *item) { + avl_t *y, *z; /* Top node to update balance factor, and parent. */ + avl_t *p, *q; /* Iterator, and parent. */ + avl_t *n; /* Newly inserted node. */ + avl_t *w; /* New root of rebalanced subtree. */ unsigned char dir; /* Direction to descend. */ unsigned char da[AVL_MAX_HEIGHT]; /* Cached comparison results. */ @@ -52,7 +52,7 @@ avl *avl_insert(avl_tree_type *tree, avl *item) { // assert(tree != NULL && item != NULL); - z = (avl *) &tree->root; + z = (avl_t *) &tree->root; y = tree->root; dir = 0; for (q = z, p = y; p != NULL; q = p, p = p->avl_link[dir]) { @@ -79,7 +79,7 @@ avl *avl_insert(avl_tree_type *tree, avl *item) { p->avl_balance++; if (y->avl_balance == -2) { - avl *x = y->avl_link[0]; + avl_t *x = y->avl_link[0]; if (x->avl_balance == -1) { w = x; y->avl_link[0] = x->avl_link[1]; @@ -103,7 +103,7 @@ avl *avl_insert(avl_tree_type *tree, avl *item) { } } else if (y->avl_balance == +2) { - avl *x = y->avl_link[1]; + avl_t *x = y->avl_link[1]; if (x->avl_balance == +1) { w = x; y->avl_link[1] = x->avl_link[0]; @@ -136,19 +136,19 @@ avl *avl_insert(avl_tree_type *tree, avl *item) { /* Deletes from |tree| and returns an item matching |item|. Returns a null pointer if no matching item found. */ -avl *avl_remove(avl_tree_type *tree, avl *item) { +avl_t *avl_remove(avl_tree_type *tree, avl_t *item) { /* Stack of nodes. */ - avl *pa[AVL_MAX_HEIGHT]; /* Nodes. */ + avl_t *pa[AVL_MAX_HEIGHT]; /* Nodes. */ unsigned char da[AVL_MAX_HEIGHT]; /* |avl_link[]| indexes. */ int k; /* Stack pointer. */ - avl *p; /* Traverses tree to find node to delete. */ + avl_t *p; /* Traverses tree to find node to delete. */ int cmp; /* Result of comparison between |item| and |p|. */ // assert (tree != NULL && item != NULL); k = 0; - p = (avl *) &tree->root; + p = (avl_t *) &tree->root; for(cmp = -1; cmp != 0; cmp = tree->compar(item, p)) { unsigned char dir = (unsigned char)(cmp > 0); @@ -164,7 +164,7 @@ avl *avl_remove(avl_tree_type *tree, avl *item) { if (p->avl_link[1] == NULL) pa[k - 1]->avl_link[da[k - 1]] = p->avl_link[0]; else { - avl *r = p->avl_link[1]; + avl_t *r = p->avl_link[1]; if (r->avl_link[0] == NULL) { r->avl_link[0] = p->avl_link[0]; r->avl_balance = p->avl_balance; @@ -173,7 +173,7 @@ avl *avl_remove(avl_tree_type *tree, avl *item) { pa[k++] = r; } else { - avl *s; + avl_t *s; int j = k++; for (;;) { @@ -198,15 +198,15 @@ avl *avl_remove(avl_tree_type *tree, avl *item) { // assert (k > 0); while (--k > 0) { - avl *y = pa[k]; + avl_t *y = pa[k]; if (da[k] == 0) { y->avl_balance++; if (y->avl_balance == +1) break; else if (y->avl_balance == +2) { - avl *x = y->avl_link[1]; + avl_t *x = y->avl_link[1]; if (x->avl_balance == -1) { - avl *w; + avl_t *w; // assert (x->avl_balance == -1); w = x->avl_link[0]; x->avl_link[0] = w->avl_link[1]; @@ -240,9 +240,9 @@ avl *avl_remove(avl_tree_type *tree, avl *item) { y->avl_balance--; if (y->avl_balance == -1) break; else if (y->avl_balance == -2) { - avl *x = y->avl_link[0]; + avl_t *x = y->avl_link[0]; if (x->avl_balance == +1) { - avl *w; + avl_t *w; // assert (x->avl_balance == +1); w = x->avl_link[1]; x->avl_link[1] = w->avl_link[0]; @@ -284,7 +284,7 @@ avl *avl_remove(avl_tree_type *tree, avl *item) { // --------------------------- // traversing -int avl_walker(avl *node, int (*callback)(void * /*entry*/, void * /*data*/), void *data) { +int avl_walker(avl_t *node, int (*callback)(void * /*entry*/, void * /*data*/), void *data) { int total = 0, ret = 0; if(node->avl_link[0]) { @@ -383,23 +383,23 @@ void avl_destroy_lock(avl_tree_lock *tree) { #endif /* AVL_WITHOUT_PTHREADS */ } -avl *avl_search_lock(avl_tree_lock *tree, avl *item) { +avl_t *avl_search_lock(avl_tree_lock *tree, avl_t *item) { avl_read_lock(tree); - avl *ret = avl_search(&tree->avl_tree, item); + avl_t *ret = avl_search(&tree->avl_tree, item); avl_unlock(tree); return ret; } -avl * avl_remove_lock(avl_tree_lock *tree, avl *item) { +avl_t * avl_remove_lock(avl_tree_lock *tree, avl_t *item) { avl_write_lock(tree); - avl *ret = avl_remove(&tree->avl_tree, item); + avl_t *ret = avl_remove(&tree->avl_tree, item); avl_unlock(tree); return ret; } -avl *avl_insert_lock(avl_tree_lock *tree, avl *item) { +avl_t *avl_insert_lock(avl_tree_lock *tree, avl_t *item) { avl_write_lock(tree); - avl * ret = avl_insert(&tree->avl_tree, item); + avl_t * ret = avl_insert(&tree->avl_tree, item); avl_unlock(tree); return ret; } diff --git a/libnetdata/avl/avl.h b/libnetdata/avl/avl.h index 32e3f27a8..eba967fd0 100644 --- a/libnetdata/avl/avl.h +++ b/libnetdata/avl/avl.h @@ -28,14 +28,14 @@ /* Data structures */ /* One element of the AVL tree */ -typedef struct avl { - struct avl *avl_link[2]; /* Subtrees. */ +typedef struct avl_element { + struct avl_element *avl_link[2]; /* Subtrees. */ signed char avl_balance; /* Balance factor. */ -} avl; +} avl_t; /* An AVL tree */ typedef struct avl_tree_type { - avl *root; + avl_t *root; int (*compar)(void *a, void *b); } avl_tree_type; @@ -59,23 +59,23 @@ typedef struct avl_tree_lock { * a is linked directly to the tree, so it has to * be properly allocated by the caller. */ -avl *avl_insert_lock(avl_tree_lock *tree, avl *item) NEVERNULL WARNUNUSED; -avl *avl_insert(avl_tree_type *tree, avl *item) NEVERNULL WARNUNUSED; +avl_t *avl_insert_lock(avl_tree_lock *tree, avl_t *item) NEVERNULL WARNUNUSED; +avl_t *avl_insert(avl_tree_type *tree, avl_t *item) NEVERNULL WARNUNUSED; /* Remove an element a from the AVL tree t * returns a pointer to the removed element * or NULL if an element equal to a is not found * (equal as returned by t->compar()) */ -avl *avl_remove_lock(avl_tree_lock *tree, avl *item) WARNUNUSED; -avl *avl_remove(avl_tree_type *tree, avl *item) WARNUNUSED; +avl_t *avl_remove_lock(avl_tree_lock *tree, avl_t *item) WARNUNUSED; +avl_t *avl_remove(avl_tree_type *tree, avl_t *item) WARNUNUSED; /* Find the element into the tree that equal to a * (equal as returned by t->compar()) * returns NULL is no element is equal to a */ -avl *avl_search_lock(avl_tree_lock *tree, avl *item); -avl *avl_search(avl_tree_type *tree, avl *item); +avl_t *avl_search_lock(avl_tree_lock *tree, avl_t *item); +avl_t *avl_search(avl_tree_type *tree, avl_t *item); /* Initialize the avl_tree_lock */ diff --git a/libnetdata/config/appconfig.c b/libnetdata/config/appconfig.c index d9dcde71a..f570f32d7 100644 --- a/libnetdata/config/appconfig.c +++ b/libnetdata/config/appconfig.c @@ -123,15 +123,15 @@ static int appconfig_option_compare(void *a, void *b) { else return strcmp(((struct config_option *)a)->name, ((struct config_option *)b)->name); } -#define appconfig_option_index_add(co, cv) (struct config_option *)avl_insert_lock(&((co)->values_index), (avl *)(cv)) -#define appconfig_option_index_del(co, cv) (struct config_option *)avl_remove_lock(&((co)->values_index), (avl *)(cv)) +#define appconfig_option_index_add(co, cv) (struct config_option *)avl_insert_lock(&((co)->values_index), (avl_t *)(cv)) +#define appconfig_option_index_del(co, cv) (struct config_option *)avl_remove_lock(&((co)->values_index), (avl_t *)(cv)) static struct config_option *appconfig_option_index_find(struct section *co, const char *name, uint32_t hash) { struct config_option tmp; tmp.hash = (hash)?hash:simple_hash(name); tmp.name = (char *)name; - return (struct config_option *)avl_search_lock(&(co->values_index), (avl *) &tmp); + return (struct config_option *)avl_search_lock(&(co->values_index), (avl_t *) &tmp); } @@ -144,15 +144,15 @@ int appconfig_section_compare(void *a, void *b) { else return strcmp(((struct section *)a)->name, ((struct section *)b)->name); } -#define appconfig_index_add(root, cfg) (struct section *)avl_insert_lock(&(root)->index, (avl *)(cfg)) -#define appconfig_index_del(root, cfg) (struct section *)avl_remove_lock(&(root)->index, (avl *)(cfg)) +#define appconfig_index_add(root, cfg) (struct section *)avl_insert_lock(&(root)->index, (avl_t *)(cfg)) +#define appconfig_index_del(root, cfg) (struct section *)avl_remove_lock(&(root)->index, (avl_t *)(cfg)) static struct section *appconfig_index_find(struct config *root, const char *name, uint32_t hash) { struct section tmp; tmp.hash = (hash)?hash:simple_hash(name); tmp.name = (char *)name; - return (struct section *)avl_search_lock(&root->index, (avl *) &tmp); + return (struct section *)avl_search_lock(&root->index, (avl_t *) &tmp); } diff --git a/libnetdata/config/appconfig.h b/libnetdata/config/appconfig.h index 9d02e4ada..f405eeb09 100644 --- a/libnetdata/config/appconfig.h +++ b/libnetdata/config/appconfig.h @@ -111,7 +111,7 @@ #define CONFIG_VALUE_CHECKED 0x08 // has been checked if the value is different from the default struct config_option { - avl avl_node; // the index entry of this entry - this has to be first! + avl_t avl_node; // the index entry of this entry - this has to be first! uint8_t flags; uint32_t hash; // a simple hash to speed up searching @@ -124,7 +124,7 @@ struct config_option { }; struct section { - avl avl_node; // the index entry of this section - this has to be first! + avl_t avl_node; // the index entry of this section - this has to be first! uint32_t hash; // a simple hash to speed up searching // we first compare hashes, and only if the hashes are equal we do string comparisons diff --git a/libnetdata/dictionary/dictionary.c b/libnetdata/dictionary/dictionary.c index cfcf1fbab..b3dc3f371 100644 --- a/libnetdata/dictionary/dictionary.c +++ b/libnetdata/dictionary/dictionary.c @@ -67,7 +67,7 @@ static inline NAME_VALUE *dictionary_name_value_index_find_nolock(DICTIONARY *di tmp.name = (char *)name; NETDATA_DICTIONARY_STATS_SEARCHES_PLUS1(dict); - return (NAME_VALUE *)avl_search(&(dict->values_index), (avl *) &tmp); + return (NAME_VALUE *)avl_search(&(dict->values_index), (avl_t *) &tmp); } // ---------------------------------------------------------------------------- @@ -95,7 +95,7 @@ static NAME_VALUE *dictionary_name_value_create_nolock(DICTIONARY *dict, const c // index it NETDATA_DICTIONARY_STATS_INSERTS_PLUS1(dict); - if(unlikely(avl_insert(&((dict)->values_index), (avl *)(nv)) != (avl *)nv)) + if(unlikely(avl_insert(&((dict)->values_index), (avl_t *)(nv)) != (avl_t *)nv)) error("dictionary: INTERNAL ERROR: duplicate insertion to dictionary."); NETDATA_DICTIONARY_STATS_ENTRIES_PLUS1(dict); @@ -107,7 +107,7 @@ static void dictionary_name_value_destroy_nolock(DICTIONARY *dict, NAME_VALUE *n debug(D_DICTIONARY, "Destroying name value entry for name '%s'.", nv->name); NETDATA_DICTIONARY_STATS_DELETES_PLUS1(dict); - if(unlikely(avl_remove(&(dict->values_index), (avl *)(nv)) != (avl *)nv)) + if(unlikely(avl_remove(&(dict->values_index), (avl_t *)(nv)) != (avl_t *)nv)) error("dictionary: INTERNAL ERROR: dictionary invalid removal of node."); NETDATA_DICTIONARY_STATS_ENTRIES_MINUS1(dict); @@ -258,7 +258,7 @@ int dictionary_del(DICTIONARY *dict, const char *name) { // the dictionary is locked for reading while this happens // do not user other dictionary calls while walking the dictionary - deadlock! -static int dictionary_walker(avl *a, int (*callback)(void *entry, void *data), void *data) { +static int dictionary_walker(avl_t *a, int (*callback)(void *entry, void *data), void *data) { int total = 0, ret = 0; if(a->avl_link[0]) { @@ -293,7 +293,7 @@ int dictionary_get_all(DICTIONARY *dict, int (*callback)(void *entry, void *data return ret; } -static int dictionary_walker_name_value(avl *a, int (*callback)(char *name, void *entry, void *data), void *data) { +static int dictionary_walker_name_value(avl_t *a, int (*callback)(char *name, void *entry, void *data), void *data) { int total = 0, ret = 0; if(a->avl_link[0]) { diff --git a/libnetdata/dictionary/dictionary.h b/libnetdata/dictionary/dictionary.h index fc24ec2ec..76213887e 100644 --- a/libnetdata/dictionary/dictionary.h +++ b/libnetdata/dictionary/dictionary.h @@ -13,7 +13,7 @@ struct dictionary_stats { }; typedef struct name_value { - avl avl_node; // the index - this has to be first! + avl_t avl_node; // the index - this has to be first! uint32_t hash; // a simple hash to speed up searching // we first compare hashes, and only if the hashes are equal we do string comparisons diff --git a/libnetdata/ebpf/ebpf.c b/libnetdata/ebpf/ebpf.c index a9ff21f69..8619ae26f 100644 --- a/libnetdata/ebpf/ebpf.c +++ b/libnetdata/ebpf/ebpf.c @@ -8,6 +8,9 @@ #include "../libnetdata.h" +char *ebpf_user_config_dir = CONFIG_DIR; +char *ebpf_stock_config_dir = LIBCONFIG_DIR; + /* static int clean_kprobe_event(FILE *out, char *filename, char *father_pid, netdata_ebpf_events_t *ptr) { @@ -97,7 +100,7 @@ int get_kernel_version(char *out, int size) return -1; move = patch; - while (*version && *version != '\n') + while (*version && *version != '\n' && *version != '-') *move++ = *version++; *move = '\0'; @@ -182,20 +185,26 @@ static int kernel_is_rejected() } char filename[FILENAME_MAX + 1]; - snprintfz(filename, FILENAME_MAX, "%s/%s", config_dir, EBPF_KERNEL_REJECT_LIST_FILE); + snprintfz(filename, FILENAME_MAX, "%s/ebpf.d/%s", config_dir, EBPF_KERNEL_REJECT_LIST_FILE); FILE *kernel_reject_list = fopen(filename, "r"); if (!kernel_reject_list) { - config_dir = getenv("NETDATA_STOCK_CONFIG_DIR"); - if (config_dir == NULL) { - config_dir = LIBCONFIG_DIR; - } - + // Keep this to have compatibility with old versions snprintfz(filename, FILENAME_MAX, "%s/%s", config_dir, EBPF_KERNEL_REJECT_LIST_FILE); kernel_reject_list = fopen(filename, "r"); - if (!kernel_reject_list) - return 0; + if (!kernel_reject_list) { + config_dir = getenv("NETDATA_STOCK_CONFIG_DIR"); + if (config_dir == NULL) { + config_dir = LIBCONFIG_DIR; + } + + snprintfz(filename, FILENAME_MAX, "%s/ebpf.d/%s", config_dir, EBPF_KERNEL_REJECT_LIST_FILE); + kernel_reject_list = fopen(filename, "r"); + + if (!kernel_reject_list) + return 0; + } } // Find if the kernel is in the reject list @@ -246,7 +255,9 @@ char *ebpf_kernel_suffix(int version, int isrh) else return "3.10"; } else { - if (version >= NETDATA_EBPF_KERNEL_5_10) + if (version >= NETDATA_EBPF_KERNEL_5_11) + return "5.11"; + else if (version >= NETDATA_EBPF_KERNEL_5_10) return "5.10"; else if (version >= NETDATA_EBPF_KERNEL_4_17) return "5.4"; @@ -294,8 +305,10 @@ struct bpf_link **ebpf_load_program(char *plugins_dir, ebpf_module_t *em, char * if (test < 0 || test > 127) return NULL; - snprintf(lpath, 4096, "%s/%s", plugins_dir, lname); - if (bpf_prog_load(lpath, BPF_PROG_TYPE_KPROBE, obj, &prog_fd)) { + snprintf(lpath, 4096, "%s/ebpf.d/%s", plugins_dir, lname); + // We are using BPF_PROG_TYPE_UNSPEC instead a specific type for bpf_prog_load to define the type + // according the eBPF program loaded + if (bpf_prog_load(lpath, BPF_PROG_TYPE_UNSPEC, obj, &prog_fd)) { em->enabled = CONFIG_BOOLEAN_NO; info("Cannot load program: %s", lpath); return NULL; @@ -322,3 +335,65 @@ struct bpf_link **ebpf_load_program(char *plugins_dir, ebpf_module_t *em, char * return links; } + +//---------------------------------------------------------------------------------------------------------------------- + +void ebpf_mount_config_name(char *filename, size_t length, char *path, char *config) +{ + snprintf(filename, length, "%s/ebpf.d/%s", path, config); +} + +int ebpf_load_config(struct config *config, char *filename) +{ + return appconfig_load(config, filename, 0, NULL); +} + + +static netdata_run_mode_t ebpf_select_mode(char *mode) +{ + if (!strcasecmp(mode, "return")) + return MODE_RETURN; + else if (!strcasecmp(mode, "dev")) + return MODE_DEVMODE; + + return MODE_ENTRY; +} + +void ebpf_update_module_using_config(ebpf_module_t *modules, struct config *cfg) +{ + char *mode = appconfig_get(cfg, EBPF_GLOBAL_SECTION, EBPF_CFG_LOAD_MODE, EBPF_CFG_LOAD_MODE_DEFAULT); + modules->mode = ebpf_select_mode(mode); + + modules->update_time = (int)appconfig_get_number(cfg, EBPF_GLOBAL_SECTION, EBPF_CFG_UPDATE_EVERY, 1); + + modules->apps_charts = appconfig_get_boolean(cfg, EBPF_GLOBAL_SECTION, EBPF_CFG_APPLICATION, + CONFIG_BOOLEAN_YES); +} + + +/** + * Update module + * + * When this function is called, it will load the configuration file and after this + * it updates the global information of ebpf_module. + * If the module has specific configuration, this function will load it, but it will not + * update the variables. + * + * @param em the module structure + * @param cfg the configuration structure + * @param cfg_file the filename to load + */ +void ebpf_update_module(ebpf_module_t *em, struct config *cfg, char *cfg_file) +{ + char filename[FILENAME_MAX+1]; + ebpf_mount_config_name(filename, FILENAME_MAX, ebpf_user_config_dir, cfg_file); + if (!ebpf_load_config(cfg, filename)) { + ebpf_mount_config_name(filename, FILENAME_MAX, ebpf_stock_config_dir, cfg_file); + if (!ebpf_load_config(cfg, filename)) { + error("Cannot load the ebpf configuration file %s", cfg_file); + return; + } + } + + ebpf_update_module_using_config(em, cfg); +} diff --git a/libnetdata/ebpf/ebpf.h b/libnetdata/ebpf/ebpf.h index d4faccffd..ac3a1a2fc 100644 --- a/libnetdata/ebpf/ebpf.h +++ b/libnetdata/ebpf/ebpf.h @@ -8,6 +8,15 @@ #define NETDATA_DEBUGFS "/sys/kernel/debug/tracing/" +// Config files +#define EBPF_GLOBAL_SECTION "global" +#define EBPF_CFG_LOAD_MODE "ebpf load mode" +#define EBPF_CFG_LOAD_MODE_DEFAULT "entry" +#define EBPF_CFG_LOAD_MODE_RETURN "return" + +#define EBPF_CFG_UPDATE_EVERY "update every" +#define EBPF_CFG_APPLICATION "apps" + /** * The next magic number is got doing the following math: * 294960 = 4*65536 + 11*256 + 0 @@ -31,6 +40,13 @@ */ #define NETDATA_RH_8 2048 +/** + * Kernel 5.11 + * + * 330240 = 5*65536 + 11*256 + */ +#define NETDATA_EBPF_KERNEL_5_11 330496 + /** * Kernel 5.10 * @@ -62,6 +78,9 @@ #define VERSION_STRING_LEN 256 #define EBPF_KERNEL_REJECT_LIST_FILE "ebpf_kernel_reject_list.txt" +extern char *ebpf_user_config_dir; +extern char *ebpf_stock_config_dir; + typedef struct ebpf_data { int *map_fd; @@ -87,6 +106,7 @@ typedef struct ebpf_module { netdata_run_mode_t mode; uint32_t thread_id; int optional; + void (*apps_routine)(struct ebpf_module *em, void *ptr); } ebpf_module_t; #define NETDATA_MAX_PROBES 64 @@ -102,4 +122,9 @@ extern struct bpf_link **ebpf_load_program(char *plugins_dir, struct bpf_object **obj, int *map_fd); +extern void ebpf_mount_config_name(char *filename, size_t length, char *path, char *config); +extern int ebpf_load_config(struct config *config, char *filename); +extern void ebpf_update_module_using_config(ebpf_module_t *modules, struct config *cfg); +extern void ebpf_update_module(ebpf_module_t *em, struct config *cfg, char *cfg_file); + #endif /* NETDATA_EBPF_H */ diff --git a/libnetdata/eval/eval.c b/libnetdata/eval/eval.c index b53b07039..7ca45882f 100644 --- a/libnetdata/eval/eval.c +++ b/libnetdata/eval/eval.c @@ -296,7 +296,7 @@ calculated_number eval_abs(EVAL_EXPRESSION *exp, EVAL_NODE *op, int *error) { calculated_number n1 = eval_value(exp, &op->ops[0], error); if(isnan(n1)) return NAN; if(isinf(n1)) return INFINITY; - return abs(n1); + return ABS(n1); } calculated_number eval_if_then_else(EVAL_EXPRESSION *exp, EVAL_NODE *op, int *error) { if(is_true(eval_value(exp, &op->ops[0], error))) diff --git a/libnetdata/libnetdata.c b/libnetdata/libnetdata.c index 325df3f76..6ccb61ede 100644 --- a/libnetdata/libnetdata.c +++ b/libnetdata/libnetdata.c @@ -1406,45 +1406,46 @@ void recursive_config_double_dir_load(const char *user_path, const char *stock_p if (!dir) { error("CONFIG cannot open stock config directory '%s'.", sdir); } - else if (strcmp(udir, sdir)) { - struct dirent *de = NULL; - while((de = readdir(dir))) { - if(de->d_type == DT_DIR || de->d_type == DT_LNK) { - if( !de->d_name[0] || - (de->d_name[0] == '.' && de->d_name[1] == '\0') || - (de->d_name[0] == '.' && de->d_name[1] == '.' && de->d_name[2] == '\0') + else { + if (strcmp(udir, sdir)) { + struct dirent *de = NULL; + while((de = readdir(dir))) { + if(de->d_type == DT_DIR || de->d_type == DT_LNK) { + if( !de->d_name[0] || + (de->d_name[0] == '.' && de->d_name[1] == '\0') || + (de->d_name[0] == '.' && de->d_name[1] == '.' && de->d_name[2] == '\0') ) { - debug(D_HEALTH, "CONFIG ignoring stock config directory '%s/%s'", sdir, de->d_name); - continue; - } + debug(D_HEALTH, "CONFIG ignoring stock config directory '%s/%s'", sdir, de->d_name); + continue; + } - if(path_is_dir(sdir, de->d_name)) { - // we recurse in stock subdirectory, only when there is no corresponding - // user subdirectory - to avoid reading the files twice + if(path_is_dir(sdir, de->d_name)) { + // we recurse in stock subdirectory, only when there is no corresponding + // user subdirectory - to avoid reading the files twice - if(!path_is_dir(udir, de->d_name)) - recursive_config_double_dir_load(udir, sdir, de->d_name, callback, data, depth + 1); + if(!path_is_dir(udir, de->d_name)) + recursive_config_double_dir_load(udir, sdir, de->d_name, callback, data, depth + 1); - continue; + continue; + } } - } - if(de->d_type == DT_UNKNOWN || de->d_type == DT_REG || de->d_type == DT_LNK) { - size_t len = strlen(de->d_name); - if(path_is_file(sdir, de->d_name) && !path_is_file(udir, de->d_name) && - len > 5 && !strcmp(&de->d_name[len - 5], ".conf")) { - char *filename = strdupz_path_subpath(sdir, de->d_name); - debug(D_HEALTH, "CONFIG calling callback for stock file '%s'", filename); - callback(filename, data); - freez(filename); - continue; + if(de->d_type == DT_UNKNOWN || de->d_type == DT_REG || de->d_type == DT_LNK) { + size_t len = strlen(de->d_name); + if(path_is_file(sdir, de->d_name) && !path_is_file(udir, de->d_name) && + len > 5 && !strcmp(&de->d_name[len - 5], ".conf")) { + char *filename = strdupz_path_subpath(sdir, de->d_name); + debug(D_HEALTH, "CONFIG calling callback for stock file '%s'", filename); + callback(filename, data); + freez(filename); + continue; + } + } + debug(D_HEALTH, "CONFIG ignoring stock-config file '%s/%s' of type %d", udir, de->d_name, (int)de->d_type); } - - debug(D_HEALTH, "CONFIG ignoring stock-config file '%s/%s' of type %d", udir, de->d_name, (int)de->d_type); } - closedir(dir); } diff --git a/libnetdata/libnetdata.h b/libnetdata/libnetdata.h index 50568b5bc..212273870 100644 --- a/libnetdata/libnetdata.h +++ b/libnetdata/libnetdata.h @@ -205,11 +205,7 @@ extern "C" { #define WARNUNUSED #endif -#ifdef abs -#undef abs -#endif -#define abs(x) (((x) < 0)? (-(x)) : (x)) - +#define ABS(x) (((x) < 0)? (-(x)) : (x)) #define MIN(a,b) (((a)<(b))?(a):(b)) #define MAX(a,b) (((a)>(b))?(a):(b)) @@ -291,7 +287,8 @@ extern char *read_by_filename(char *filename, long *file_size); #endif #endif -#define BITS_IN_A_KILOBIT 1000 +#define BITS_IN_A_KILOBIT 1000 +#define KILOBITS_IN_A_MEGABIT 1000 /* misc. */ #define UNUSED(x) (void)(x) @@ -319,7 +316,7 @@ extern char *netdata_configured_host_prefix; #include "log/log.h" #include "procfile/procfile.h" #include "dictionary/dictionary.h" -#ifdef HAVE_LIBBPF +#if defined(HAVE_LIBBPF) && !defined(__cplusplus) #include "ebpf/ebpf.h" #endif #include "eval/eval.h" diff --git a/libnetdata/tests/test_str2ld.c b/libnetdata/tests/test_str2ld.c index 9d59f6c0e..01d8677f0 100644 --- a/libnetdata/tests/test_str2ld.c +++ b/libnetdata/tests/test_str2ld.c @@ -32,7 +32,7 @@ static void test_str2ld(void **state) else if (isinf(mine)) assert_true(isinf(sys)); else if (mine != sys) - assert_false(abs(mine - sys) > 0.000001); + assert_false(ABS(mine - sys) > 0.000001); assert_ptr_equal(e_mine, e_sys); } diff --git a/netdata-installer.sh b/netdata-installer.sh index c11e1a7ec..c7867243a 100755 --- a/netdata-installer.sh +++ b/netdata-installer.sh @@ -215,6 +215,7 @@ USAGE: ${PROGRAM} [options] --disable-ebpf Disable eBPF Kernel plugin (Default: enabled) --disable-cloud Disable all Netdata Cloud functionality. --require-cloud Fail the install if it can't build Netdata Cloud support. + --aclk-ng Forces build of ACLK Next Generation which is fallback by default. --enable-plugin-freeipmi Enable the FreeIPMI plugin. Default: enable it when libipmimonitoring is available. --disable-plugin-freeipmi --disable-https Explicitly disable TLS support @@ -319,6 +320,10 @@ while [ -n "${1}" ]; do "--disable-go") NETDATA_DISABLE_GO=1 ;; "--enable-ebpf") NETDATA_DISABLE_EBPF=0 ;; "--disable-ebpf") NETDATA_DISABLE_EBPF=1 NETDATA_CONFIGURE_OPTIONS="${NETDATA_CONFIGURE_OPTIONS//--disable-ebpf/} --disable-ebpf" ;; + "--aclk-ng") + NETDATA_ACLK_NG=1 + NETDATA_CONFIGURE_OPTIONS="${NETDATA_CONFIGURE_OPTIONS//--with-aclk-ng/} --with-aclk-ng" + ;; "--disable-cloud") if [ -n "${NETDATA_REQUIRE_CLOUD}" ]; then echo "Cloud explicitly enabled, ignoring --disable-cloud." @@ -567,8 +572,8 @@ copy_libmosquitto() { } bundle_libmosquitto() { - if [ -n "${NETDATA_DISABLE_CLOUD}" ]; then - echo "Skipping cloud" + if [ -n "${NETDATA_DISABLE_CLOUD}" ] || [ -n "${NETDATA_ACLK_NG}" ]; then + echo "Skipping libmosquitto" return 0 fi @@ -664,7 +669,7 @@ copy_libwebsockets() { } bundle_libwebsockets() { - if [ -n "${NETDATA_DISABLE_CLOUD}" ] || [ -n "${USE_SYSTEM_LWS}" ]; then + if [ -n "${NETDATA_DISABLE_CLOUD}" ] || [ -n "${USE_SYSTEM_LWS}" ] || [ -n "${NETDATA_ACLK_NG}" ]; then return 0 fi @@ -1313,12 +1318,14 @@ if [ "${UID}" -eq 0 ]; then if [ -f "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/perf.plugin" ]; then run chown root:${NETDATA_GROUP} "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/perf.plugin" - run chmod 4750 "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/perf.plugin" + run chmod 0750 "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/perf.plugin" + run sh -c "setcap cap_perfmon+ep \"${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/perf.plugin\" || setcap cap_sys_admin+ep \"${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/perf.plugin\"" fi if [ -f "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/slabinfo.plugin" ]; then run chown root:${NETDATA_GROUP} "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/slabinfo.plugin" - run chmod 4750 "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/slabinfo.plugin" + run chmod 0750 "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/slabinfo.plugin" + run setcap cap_dac_read_search+ep "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/slabinfo.plugin" fi if [ -f "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/ioping" ]; then @@ -1611,11 +1618,23 @@ remove_old_ebpf() { # Added to remove eBPF programs with name pattern: NAME_VERSION.SUBVERSION.PATCH if [ -f "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/pnetdata_ebpf_process.3.10.0.o" ]; then - echo >&2 "Removing old eBPF programs" + echo >&2 "Removing old eBPF programs with patch." rm -f "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/rnetdata_ebpf"*.?.*.*.o rm -f "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/pnetdata_ebpf"*.?.*.*.o fi + # Remove old eBPF program to store new eBPF program inside subdirectory + if [ -f "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/pnetdata_ebpf_process.3.10.o" ]; then + echo >&2 "Removing old eBPF programs installed in old directory." + rm -f "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/rnetdata_ebpf"*.?.*.o + rm -f "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/pnetdata_ebpf"*.?.*.o + fi + + # Remove old reject list from previous directory + if [ -f "${NETDATA_PREFIX}/usr/lib/netdata/conf.d/ebpf_kernel_reject_list.txt" ]; then + echo >&2 "Removing old ebpf_kernel_reject_list.txt." + rm -f "${NETDATA_PREFIX}/usr/lib/netdata/conf.d/ebpf_kernel_reject_list.txt" + fi } install_ebpf() { @@ -1652,7 +1671,16 @@ install_ebpf() { # chown everything to root:netdata before we start copying out of our package run chown -R root:netdata "${tmp}" - run cp -a -v "${tmp}"/*netdata_ebpf_*.o "${NETDATA_PREFIX}"/usr/libexec/netdata/plugins.d + if [ ! -d "${NETDATA_PREFIX}"/usr/libexec/netdata/plugins.d/ebpf.d ]; then + mkdir "${NETDATA_PREFIX}"/usr/libexec/netdata/plugins.d/ebpf.d + RET=$? + if [ "${RET}" != "0" ]; then + rm -rf "${tmp}" + return 1 + fi + fi + + run cp -a -v "${tmp}"/*netdata_ebpf_*.o "${NETDATA_PREFIX}"/usr/libexec/netdata/plugins.d/ebpf.d rm -rf "${tmp}" diff --git a/netdata.spec.in b/netdata.spec.in index f55dcfe3d..6a0a005e5 100644 --- a/netdata.spec.in +++ b/netdata.spec.in @@ -474,18 +474,20 @@ rm -rf "${RPM_BUILD_ROOT}" # cgroup-network detects the network interfaces of CGROUPs # it must be able to use setns() and run cgroup-network-helper.sh as root # the helper script reads /proc/PID/fdinfo/* files, runs virsh, etc. -%caps(cap_setuid=ep) %attr(4750,root,netdata) %{_libexecdir}/%{name}/plugins.d/cgroup-network +%attr(4750,root,netdata) %{_libexecdir}/%{name}/plugins.d/cgroup-network %attr(0750,root,netdata) %{_libexecdir}/%{name}/plugins.d/cgroup-network-helper.sh %endif # perf plugin -%caps(cap_setuid=ep) %attr(4750,root,netdata) %{_libexecdir}/%{name}/plugins.d/perf.plugin +# This should be CAP_PERFMON once RPM finally learns about it, but needs to be CAP_SYS_ADMIN for now. +# %caps(cap_perfmon=ep) %attr(0750,root,netdata) %{_libexecdir}/%{name}/plugins.d/perf.plugin +%caps(cap_sys_admin=ep) %attr(0750,root,netdata) %{_libexecdir}/%{name}/plugins.d/perf.plugin # perf plugin -%caps(cap_setuid=ep) %attr(4750,root,netdata) %{_libexecdir}/%{name}/plugins.d/slabinfo.plugin +%caps(cap_dac_read_search=ep) %attr(0750,root,netdata) %{_libexecdir}/%{name}/plugins.d/slabinfo.plugin # freeipmi files -%caps(cap_setuid=ep) %attr(4750,root,netdata) %{_libexecdir}/%{name}/plugins.d/freeipmi.plugin +%attr(4750,root,netdata) %{_libexecdir}/%{name}/plugins.d/freeipmi.plugin # Enforce 0644 for files and 0755 for directories # for the netdata web directory diff --git a/packaging/Dockerfile.packager b/packaging/Dockerfile.packager index 14f711ca6..4c90f14fd 100644 --- a/packaging/Dockerfile.packager +++ b/packaging/Dockerfile.packager @@ -1,41 +1,42 @@ ARG ARCH=amd64 ARG DISTRO=debian +ARG TEST_BASE=debian ARG DISTRO_VERSION=10 -ARG VERSION=0.1 +ARG PKG_VERSION=0.1 FROM netdata/package-builders:${DISTRO}${DISTRO_VERSION} AS build ARG ARCH ARG DISTRO ARG DISTRO_VERSION -ARG VERSION +ARG PKG_VERSION ENV ARCH=$ARCH ENV DISTRO=$DISTRO ENV DISTRO_VERSION=$DISTRO_VERSION -ENV VERSION=$VERSION +ENV VERSION=$PKG_VERSION WORKDIR /netdata COPY . . RUN /build.sh -FROM ${DISTRO}:${DISTRO_VERSION} AS runtime +FROM ${TEST_BASE}:${DISTRO_VERSION} AS runtime ARG ARCH ARG DISTRO ARG DISTRO_VERSION -ARG VERSION +ARG PKG_VERSION ENV ARCH=$ARCH ENV DISTRO=$DISTRO ENV DISTRO_VERSION=$DISTRO_VERSION -ENV VERSION=$VERSION +ENV VERSION=$PKG_VERSION COPY ./packaging/scripts/install.sh /install.sh COPY ./packaging/scripts/test.sh /test.sh -COPY --from=build /netdata/artifacts /artifacts +COPY --from=build /netdata/artifacts /packages RUN /install.sh diff --git a/packaging/dashboard.checksums b/packaging/dashboard.checksums index f343ef57e..26a32da7f 100644 --- a/packaging/dashboard.checksums +++ b/packaging/dashboard.checksums @@ -1 +1 @@ -804e4610477ab64726f62cf6093613197be3ccf0140959364426065871075309 dashboard.tar.gz +3edf0957252cbb107d6d16928c3a1167d3b4ae54e45f8bd7972a81f266781f4a dashboard.tar.gz diff --git a/packaging/dashboard.version b/packaging/dashboard.version index 3ef97df69..8e6d5ffc3 100644 --- a/packaging/dashboard.version +++ b/packaging/dashboard.version @@ -1 +1 @@ -v2.13.6 +v2.13.28_ diff --git a/packaging/docker/README.md b/packaging/docker/README.md index e97cb0c6f..a960897d8 100644 --- a/packaging/docker/README.md +++ b/packaging/docker/README.md @@ -94,6 +94,22 @@ volumes: netdatacache: ``` +## Docker tags + +The official `netdata/netdata` Docker image provides the following named tags: + +* `stable`: The `stable` tag will always point to the most recently published stable build. +* `edge`: The `edge` tag will always point ot the most recently published nightly build. In most cases, this is + updated daily at around 01:00 UTC. +* `latest`: The `latest` tag will always point to the most recently published build, whether itโ€™s a stable build + or a nightly build. This is what Docker will use by default if you do not specify a tag. + +Additionally, for each stable release, three tags are pushed, one with the full version of the release (for example, +`v1.30.0`), one with just the major and minor version (for example, `v1.30`), and one with just the major version +(for example, `v1`). The tags for the minor versions and major versions are updated whenever a release is published +that would match that tag (for example, if `v1.30.1` were to be published, the `v1.30` tag would be updated to +point to that instead of `v1.30.0`). + ## Health Checks Our Docker image provides integrated support for health checks through the standard Docker interfaces. diff --git a/packaging/docker/run.sh b/packaging/docker/run.sh index 432d19907..c04be9ff8 100755 --- a/packaging/docker/run.sh +++ b/packaging/docker/run.sh @@ -20,11 +20,12 @@ if [ -n "${PGID}" ]; then usermod -a -G "${PGID}" "${DOCKER_USR}" || echo >&2 "Could not add netdata user to group docker with ID ${PGID}" fi -if [ -n "${NETDATA_CLAIM_URL}" ] && [ -n "${NETDATA_CLAIM_TOKEN}" ] && [ ! -f /var/lib/netdata/claim.d/claimed_id ]; then - /usr/sbin/netdata-claim.sh -token "${NETDATA_CLAIM_TOKEN}" \ - -url "${NETDATA_CLAIM_URL}" \ - ${NETDATA_CLAIM_ROOMS:+-rooms "${NETDATA_CLAIM_ROOMS}"} \ - ${NETDATA_CLAIM_PROXY:+-proxy "${NETDATA_CLAIM_PROXY}"} +if [ -n "${NETDATA_CLAIM_URL}" ] && [ -n "${NETDATA_CLAIM_TOKEN}" ] && [ ! -f /var/lib/netdata/cloud.d/claimed_id ]; then + /usr/sbin/netdata-claim.sh -token="${NETDATA_CLAIM_TOKEN}" \ + -url="${NETDATA_CLAIM_URL}" \ + ${NETDATA_CLAIM_ROOMS:+-rooms="${NETDATA_CLAIM_ROOMS}"} \ + ${NETDATA_CLAIM_PROXY:+-proxy="${NETDATA_CLAIM_PROXY}"} \ + -daemon-not-running fi exec /usr/sbin/netdata -u "${DOCKER_USR}" -D -s /host -p "${NETDATA_LISTENER_PORT}" -W set web "web files group" root -W set web "web files owner" root "$@" diff --git a/packaging/ebpf.checksums b/packaging/ebpf.checksums index e35fa1f17..d8fdbb699 100644 --- a/packaging/ebpf.checksums +++ b/packaging/ebpf.checksums @@ -1,3 +1,3 @@ -d9c1c81fe3a8b9af7fc1174a28c16ddb24e2f3ff79e6beb1b2eb184bf0d2e8c0 netdata-kernel-collector-glibc-v0.5.5.tar.xz -0e1dd5e12a58dda53576b2dab963cd26fa26fe2084d84c51becb9238d1055fc1 netdata-kernel-collector-musl-v0.5.5.tar.xz -d6d65e5f40a83880aa7dd740829a7ffe6a0805637e1616805aebdff088a3fcb0 netdata-kernel-collector-static-v0.5.5.tar.xz +380e31fe143e7b53bcebaaf03a04d143ae82e13318b264461ebb5d3ac9026ae5 netdata-kernel-collector-glibc-v0.6.1.tar.xz +5a196ab8a00d307a4f6a5c213178bd62e5720173f433afc6e77dfa911fb6ca56 netdata-kernel-collector-musl-v0.6.1.tar.xz +683e6676c1eee0cd4a7da5be953e94052e780de1ca375146a488d62593220c46 netdata-kernel-collector-static-v0.6.1.tar.xz diff --git a/packaging/ebpf.version b/packaging/ebpf.version index 12aa8c541..14909610e 100644 --- a/packaging/ebpf.version +++ b/packaging/ebpf.version @@ -1 +1 @@ -v0.5.5 +v0.6.1 diff --git a/packaging/go.d.checksums b/packaging/go.d.checksums index 19a2af870..3de8797d6 100644 --- a/packaging/go.d.checksums +++ b/packaging/go.d.checksums @@ -1,16 +1,16 @@ -aac2f7e67d1231c68cf5a5500801617f7c8bc9543a9e7d80c81448da50225ab9 *config.tar.gz -0248ced58c9eaba9815ff4c4cd5498436b3a2db93f50a71b8d602cf6688d66b0 *go.d.plugin-v0.27.0.darwin-amd64.tar.gz -79cbb731e2f57ef4af1f8c8c204a488008cd212853a0c9bb552504e261e17e5b *go.d.plugin-v0.27.0.freebsd-386.tar.gz -b0f4151ec40bc92ef6624e8e148074de5a3b3d47c4df55bb858d04e68d557cc1 *go.d.plugin-v0.27.0.freebsd-amd64.tar.gz -73c0e6b994ef1ade6323163bca7636455d823db34d28e349f6ea9346e1320602 *go.d.plugin-v0.27.0.freebsd-arm.tar.gz -283e2b3d86221c9208f26c979a86257e6a99d7812cdb6ef1c7989d847ec3cfda *go.d.plugin-v0.27.0.freebsd-arm64.tar.gz -528cd446a6cfdc5f62e66806e71d6ea1fab4e48fd1e61ab0425cb5df45676430 *go.d.plugin-v0.27.0.linux-386.tar.gz -b68d0ed812658c175ed6f74cdb29944d006fe92a8958b06aa281d435f728a412 *go.d.plugin-v0.27.0.linux-amd64.tar.gz -3a70a07081056414ed5c76370e3ec61c0bac7bc852a15266489fe27f5ba9601e *go.d.plugin-v0.27.0.linux-arm.tar.gz -0cc30f7ea11c7ffce5ac0831845adcea63a76cd56016c066fde9362201f0e9b3 *go.d.plugin-v0.27.0.linux-arm64.tar.gz -54fab477fb5b80a3e0fc28a1983f7bdf21717a3e7b0c64c1f6bd82b572076f15 *go.d.plugin-v0.27.0.linux-mips.tar.gz -831e5fb6e38683d3da58219af6c0fbd1495766fafde6c00550c97b086d1bee0f *go.d.plugin-v0.27.0.linux-mips64.tar.gz -45626fe15fbde8ff18b1f1d167957be664674ebb55d120edf4e193698dcd07f9 *go.d.plugin-v0.27.0.linux-mips64le.tar.gz -3468a8e95313926ca7712bdaa4838d38279fa4dffde66e1576af44afdc9f0562 *go.d.plugin-v0.27.0.linux-mipsle.tar.gz -1b23454071500d58dd935c977e9c742744dd201ac654f2862b2d545e1ab212da *go.d.plugin-v0.27.0.linux-ppc64.tar.gz -8d3feb67d036a9fdc337dea1118074591b85624fc17d235cf0979e3b93dd2009 *go.d.plugin-v0.27.0.linux-ppc64le.tar.gz +ebfdec0f2363b395b135a540a7ce2ef72414fa0948a35dae6d50c7a6c8050e75 *config.tar.gz +0f19be07c9359cbde99536e46d7499ea2f50920ee4413ec289797ca6f62232a4 *go.d.plugin-v0.28.1.darwin-amd64.tar.gz +f5154a45038609dac8d3d86eeaadc2a2c61ed564caa01589e401896d9a4b5897 *go.d.plugin-v0.28.1.freebsd-386.tar.gz +3fc5780801b4b08205773c0f0bdf556fdee364c5feb3e060aa710d8a1e491a2b *go.d.plugin-v0.28.1.freebsd-amd64.tar.gz +ae32c3a6434a392215eda9e0564188b645cc35e4a7c5844407da716742d01c5b *go.d.plugin-v0.28.1.freebsd-arm.tar.gz +7e1fa2404195701b8726efafc09fc8df3884a292fca33ac94539f48508de36dc *go.d.plugin-v0.28.1.freebsd-arm64.tar.gz +e24da1e441cb567450ef99397066dbfa73181873920d7b909a4dfe5901c38c64 *go.d.plugin-v0.28.1.linux-386.tar.gz +6735e94e2842182d5a0b8bbb419e222596de451698e8f890384d93e2a1a2d950 *go.d.plugin-v0.28.1.linux-amd64.tar.gz +d4cfe21f1716b531329d57afe39ffc863bba94a884ab988d5f6e1b0cfbf2a1eb *go.d.plugin-v0.28.1.linux-arm.tar.gz +d01386cb7ac1c0046430365383315affcc54ad9e7be73e6e7892214d8e32e030 *go.d.plugin-v0.28.1.linux-arm64.tar.gz +57d1db26c7815f7fec3f6773df0ac704846be03e221882c3eac4fc72ddbf4198 *go.d.plugin-v0.28.1.linux-mips.tar.gz +2dea3359b6613d7a557a1893f2b652796d2862030fd32a90107e59ccc2db398d *go.d.plugin-v0.28.1.linux-mips64.tar.gz +920215d7cf5dd40344d18033c8e3861fe1e381a7897f361a601b16dafd90ab62 *go.d.plugin-v0.28.1.linux-mips64le.tar.gz +bb9b560e497a238e090649f61126cd4b0c7953db4c2f4b4e5400b1f1b4c5f56d *go.d.plugin-v0.28.1.linux-mipsle.tar.gz +523f3197622675cdc3ceb4a54ac01a98c2b89f9e942e31917bc8e6f905e01d66 *go.d.plugin-v0.28.1.linux-ppc64.tar.gz +e72506cb5d0d5f4227eb53bff05fe19ea23fce8025da106284a34b2dc44435d6 *go.d.plugin-v0.28.1.linux-ppc64le.tar.gz diff --git a/packaging/go.d.version b/packaging/go.d.version index 0a8bf80d6..244df55dd 100644 --- a/packaging/go.d.version +++ b/packaging/go.d.version @@ -1 +1 @@ -v0.27.0 +v0.28.1 diff --git a/packaging/installer/README.md b/packaging/installer/README.md index 317ac6380..d5a69aa6a 100644 --- a/packaging/installer/README.md +++ b/packaging/installer/README.md @@ -232,8 +232,8 @@ the lines to match the output from `ls -la` above and uncomment them if necessar web files group = netdata ``` -Save the file, [restart the Netdata Agent](/docs/getting-started.md#start-stop-and-restart-netdata), and try accessing -the dashboard again. +Save the file, restart Netdata using `sudo systemctl restart netdata`, or the [appropriate +method](/docs/configure/start-stop-restart.md) for your system, and try accessing the dashboard again. ### Multiple versions of OpenSSL diff --git a/packaging/installer/kickstart-static64.sh b/packaging/installer/kickstart-static64.sh index 3c45d9efb..a86a62fcd 100755 --- a/packaging/installer/kickstart-static64.sh +++ b/packaging/installer/kickstart-static64.sh @@ -12,6 +12,10 @@ # --local-files Use a manually provided tarball for the installation # --allow-duplicate-install do not bail if we detect a duplicate install # --reinstall if an existing install would be updated, reinstall instead +# --claim-token specify a token to use for claiming the newly installed instance +# --claim-url specify a URL to use for claiming the newly installed isntance +# --claim-rooms specify a list of rooms to claim the newly installed instance to +# --claim-proxy specify a proxy to use while claiming the newly installed instance # # Environment options: # @@ -224,56 +228,81 @@ NETDATA_INSTALLER_OPTIONS="" NETDATA_UPDATES="--auto-update" RELEASE_CHANNEL="nightly" while [ -n "${1}" ]; do - if [ "${1}" = "--dont-wait" ] || [ "${1}" = "--non-interactive" ] || [ "${1}" = "--accept" ]; then - opts="${opts} --accept" - shift 1 - elif [ "${1}" = "--dont-start-it" ]; then - NETDATA_INSTALLER_OPTIONS="${NETDATA_INSTALLER_OPTIONS:+${NETDATA_INSTALLER_OPTIONS} }${1}" - shift 1 - elif [ "${1}" = "--no-updates" ]; then - NETDATA_UPDATES="" - shift 1 - elif [ "${1}" = "--auto-update" ]; then - true # This is the default behaviour, so ignore it. - shift 1 - elif [ "${1}" = "--stable-channel" ]; then - RELEASE_CHANNEL="stable" - NETDATA_INSTALLER_OPTIONS="${NETDATA_INSTALLER_OPTIONS:+${NETDATA_INSTALLER_OPTIONS} }${1}" - shift 1 - elif [ "${1}" = "--disable-telemetry" ]; then - NETDATA_INSTALLER_OPTIONS="${NETDATA_INSTALLER_OPTIONS:+${NETDATA_INSTALLER_OPTIONS} }${1}" - shift 1 - elif [ "${1}" = "--local-files" ]; then - NETDATA_UPDATES="" # Disable autoupdates if using pre-downloaded files. - shift 1 - if [ -z "${1}" ]; then - fatal "Option --local-files requires extra information. The desired tarball full filename is needed" - fi + case "${1}" in + "--dont-wait") opts="${opts} --accept" ;; + "--non-interactive") opts="${opts} --accept" ;; + "--accept") opts="${opts} --accept" ;; + "--dont-start-it") + NETDATA_INSTALLER_OPTIONS="${NETDATA_INSTALLER_OPTIONS:+${NETDATA_INSTALLER_OPTIONS} }${1}" + NETDATA_CLAIM_EXTRA="${NETDATA_CLAIM_EXTRA} -daemon-not-running" + ;; + "--no-updates") NETDATA_UPDATES="" ;; + "--stable-channel") + RELEASE_CHANNEL="stable" + NETDATA_INSTALLER_OPTIONS="${NETDATA_INSTALLER_OPTIONS:+${NETDATA_INSTALLER_OPTIONS} }${1}" + ;; + "--disable-telemetry") NETDATA_INSTALLER_OPTIONS="${NETDATA_INSTALLER_OPTIONS:+${NETDATA_INSTALLER_OPTIONS} }${1}";; + "--local-files") + NETDATA_UPDATES="" # Disable autoupdates if using pre-downloaded files. + if [ -z "${2}" ]; then + fatal "Option --local-files requires extra information. The desired tarball full filename is needed" + fi - NETDATA_LOCAL_TARBALL_OVERRIDE="${1}" - shift 1 - if [ -z "${1}" ]; then - fatal "Option --local-files requires a pair of the tarball source and the checksum file" - fi + NETDATA_LOCAL_TARBALL_OVERRIDE="${2}" - NETDATA_LOCAL_TARBALL_OVERRIDE_CHECKSUM="${1}" - shift 1 - elif [ "${1}" = "--allow-duplicate-install" ]; then - NETDATA_ALLOW_DUPLICATE_INSTALL=1 - shift 1 - elif [ "${1}" = "--reinstall" ]; then - NETDATA_REINSTALL=1 - shift 1 - else - echo >&2 "Unknown option '${1}' or invalid number of arguments. Please check the README for the available arguments of ${0} and try again" - exit 1 - fi + if [ -z "${3}" ]; then + fatal "Option --local-files requires a pair of the tarball source and the checksum file" + fi + + NETDATA_LOCAL_TARBALL_OVERRIDE_CHECKSUM="${3}" + shift 2 + ;; + "--allow-duplicate-install") NETDATA_ALLOW_DUPLICATE_INSTALL=1 ;; + "--reinstall") NETDATA_REINSTALL=1 ;; + "--claim-token") + NETDATA_CLAIM_TOKEN="${2}" + shift 1 + ;; + "--claim-rooms") + NETDATA_CLAIM_ROOMS="${2}" + shift 1 + ;; + "--claim-url") + NETDATA_CLAIM_URL="${2}" + shift 1 + ;; + "--claim-proxy") + NETDATA_CLAIM_EXTRA="${NETDATA_CLAIM_EXTRA} -proxy ${2}" + shift 1 + ;; + *) + echo >&2 "Unknown option '${1}' or invalid number of arguments. Please check the README for the available arguments of ${0} and try again" + exit 1 + esac + shift 1 done if [ ! "${DO_NOT_TRACK:-0}" -eq 0 ] || [ -n "$DO_NOT_TRACK" ]; then NETDATA_INSTALLER_OPTIONS="${NETDATA_INSTALLER_OPTIONS:+${NETDATA_INSTALLER_OPTIONS} }--disable-telemtry" fi +if [ -n "${NETDATA_DISABLE_CLOUD}" ]; then + if [ -n "${NETDATA_CLAIM_TOKEN}" ] || [ -n "${NETDATA_CLAIM_ROOMS}" ] || [ -n "${NETDATA_CLAIM_URL}" ]; then + run_failed "Cloud explicitly disabled but automatic claiming requested." + run_failed "Either enable Netdata Cloud, or remove the --claim-* options." + exit 1 + fi +fi + +# shellcheck disable=SC2235,SC2030 +if ( [ -z "${NETDATA_CLAIM_TOKEN}" ] && [ -n "${NETDATA_CLAIM_URL}" ] ) || ( [ -n "${NETDATA_CLAIM_TOKEN}" ] && [ -z "${NETDATA_CLAIM_URL}" ] ); then + run_failed "Invalid claiming options, both a claiming token and URL must be specified." + exit 1 +elif [ -z "${NETDATA_CLAIM_TOKEN}" ] && [ -n "${NETDATA_CLAIM_ROOMS}" ]; then + run_failed "Invalid claiming options, claim rooms may only be specified when a token and URL are specified." + exit 1 +fi + # Netdata Tarball Base URL (defaults to our Google Storage Bucket) [ -z "$NETDATA_TARBALL_BASEURL" ] && NETDATA_TARBALL_BASEURL=https://storage.googleapis.com/netdata-nightlies @@ -365,4 +394,18 @@ if [ $? -eq 0 ]; then fi else echo >&2 "NOTE: did not remove: ${TMPDIR}/netdata-latest.gz.run" + exit 1 +fi + +# -------------------------------------------------------------------------------------------------------------------- + +if [ -n "${NETDATA_CLAIM_TOKEN}" ]; then + progress "Attempting to claim agent to ${NETDATA_CLAIM_URL}" + NETDATA_CLAIM_PATH=/opt/netdata/bin/netdata-claim.sh + + if "${NETDATA_CLAIM_PATH}" -token=${NETDATA_CLAIM_TOKEN} -rooms=${NETDATA_CLAIM_ROOMS} -url=${NETDATA_CLAIM_URL} ${NETDATA_CLAIM_EXTRA}; then + progress "Successfully claimed node" + else + run_failed "Unable to claim node, you must do so manually." + fi fi diff --git a/packaging/installer/kickstart.sh b/packaging/installer/kickstart.sh index c97587c18..bd18b7db5 100755 --- a/packaging/installer/kickstart.sh +++ b/packaging/installer/kickstart.sh @@ -16,6 +16,10 @@ # --local-files set the full path of the desired tarball to run install with # --allow-duplicate-install do not bail if we detect a duplicate install # --reinstall if an existing install would be updated, reinstall instead +# --claim-token specify a token to use for claiming the newly installed instance +# --claim-url specify a URL to use for claiming the newly installed isntance +# --claim-rooms specify a list of rooms to claim the newly installed instance to +# --claim-proxy specify a proxy to use while claiming the newly installed instance # # Environment options: # @@ -317,66 +321,83 @@ NETDATA_INSTALLER_OPTIONS="" NETDATA_UPDATES="--auto-update" RELEASE_CHANNEL="nightly" while [ -n "${1}" ]; do - if [ "${1}" = "all" ]; then - PACKAGES_INSTALLER_OPTIONS="netdata-all" - shift 1 - elif [ "${1}" = "--dont-wait" ] || [ "${1}" = "--non-interactive" ]; then - INTERACTIVE=0 - shift 1 - elif [ "${1}" = "--no-updates" ]; then - # echo >&2 "netdata will not auto-update" - NETDATA_UPDATES= - shift 1 - elif [ "${1}" = "--stable-channel" ]; then - RELEASE_CHANNEL="stable" - NETDATA_INSTALLER_OPTIONS="$NETDATA_INSTALLER_OPTIONS --stable-channel" - shift 1 - elif [ "${1}" = "--allow-duplicate-install" ]; then - NETDATA_ALLOW_DUPLICATE_INSTALL=1 - shift 1 - elif [ "${1}" = "--reinstall" ]; then - NETDATA_REINSTALL=1 - shift 1 - elif [ "${1}" = "--local-files" ]; then - shift 1 - if [ -z "${1}" ]; then - fatal "Missing netdata: Option --local-files requires extra information. The desired tarball for netdata, the checksum, the go.d plugin tarball , the go.d plugin config tarball and the dependency management script, in this particular order" - fi + case "${1}" in + "all") PACKAGES_INSTALLER_OPTIONS="netdata-all" ;; + "--dont-wait") INTERACTIVE=0 ;; + "--non-interactive") INTERACTIVE=0 ;; + "--no-updates") NETDATA_UPDATES= ;; + "--stable-channel") + RELEASE_CHANNEL="stable" + NETDATA_INSTALLER_OPTIONS="$NETDATA_INSTALLER_OPTIONS --stable-channel" + ;; + "--allow-duplicate-install") NETDATA_ALLOW_DUPLICATE_INSTALL=1 ;; + "--reinstall") NETDATA_REINSTALL=1 ;; + "--claim-token") + NETDATA_CLAIM_TOKEN="${2}" + shift 1 + ;; + "--claim-rooms") + NETDATA_CLAIM_ROOMS="${2}" + shift 1 + ;; + "--claim-url") + NETDATA_CLAIM_URL="${2}" + shift 1 + ;; + "--claim-proxy") + NETDATA_CLAIM_EXTRA="${NETDATA_CLAIM_EXTRA} -proxy ${2}" + shift 1 + ;; + "--dont-start-it") + NETDATA_CLAIM_EXTRA="${NETDATA_CLAIM_EXTRA} -daemon-not-running" + NETDATA_INSTALLER_OPTIONS="${NETDATA_INSTALLER_OPTIONS} --dont-start-it" + ;; + "--install") + NETDATA_INSTALLER_OPTIONS="${NETDATA_INSTALLER_OPTIONS} --install ${2}" + NETDATA_PREFIX="${2}" + shift 1 + ;; + "--disable-cloud") + NETDATA_INSTALLER_OPTIONS="${NETDATA_INSTALLER_OPTIONS} --disable-cloud" + NETDATA_DISABLE_CLOUD=1 + ;; + "--local-files") + if [ -z "${2}" ]; then + fatal "Missing netdata: Option --local-files requires extra information. The desired tarball for netdata, the checksum, the go.d plugin tarball , the go.d plugin config tarball and the dependency management script, in this particular order" + fi - export NETDATA_LOCAL_TARBALL_OVERRIDE="${1}" - shift 1 + export NETDATA_LOCAL_TARBALL_OVERRIDE="${2}" - if [ -z "${1}" ]; then - fatal "Missing checksum file: Option --local-files requires extra information. The desired tarball for netdata, the checksum, the go.d plugin tarball , the go.d plugin config tarball and the dependency management script, in this particular order" - fi + if [ -z "${3}" ]; then + fatal "Missing checksum file: Option --local-files requires extra information. The desired tarball for netdata, the checksum, the go.d plugin tarball , the go.d plugin config tarball and the dependency management script, in this particular order" + fi - export NETDATA_LOCAL_TARBALL_OVERRIDE_CHECKSUM="${1}" - shift 1 + export NETDATA_LOCAL_TARBALL_OVERRIDE_CHECKSUM="${3}" - if [ -z "${1}" ]; then - fatal "Missing go.d tarball: Option --local-files requires extra information. The desired tarball for netdata, the checksum, the go.d plugin tarball , the go.d plugin config tarball and the dependency management script, in this particular order" - fi + if [ -z "${4}" ]; then + fatal "Missing go.d tarball: Option --local-files requires extra information. The desired tarball for netdata, the checksum, the go.d plugin tarball , the go.d plugin config tarball and the dependency management script, in this particular order" + fi - export NETDATA_LOCAL_TARBALL_OVERRIDE_GO_PLUGIN="${1}" - shift 1 + export NETDATA_LOCAL_TARBALL_OVERRIDE_GO_PLUGIN="${4}" - if [ -z "${1}" ]; then - fatal "Missing go.d config tarball: Option --local-files requires extra information. The desired tarball for netdata, the checksum, the go.d plugin tarball , the go.d plugin config tarball and the dependency management script, in this particular order" - fi + if [ -z "${5}" ]; then + fatal "Missing go.d config tarball: Option --local-files requires extra information. The desired tarball for netdata, the checksum, the go.d plugin tarball , the go.d plugin config tarball and the dependency management script, in this particular order" + fi - export NETDATA_LOCAL_TARBALL_OVERRIDE_GO_PLUGIN_CONFIG="${1}" - shift 1 + export NETDATA_LOCAL_TARBALL_OVERRIDE_GO_PLUGIN_CONFIG="${5}" - if [ -z "${1}" ]; then - fatal "Missing dependencies management scriptlet: Option --local-files requires extra information. The desired tarball for netdata, the checksum, the go.d plugin tarball , the go.d plugin config tarball and the dependency management script, in this particular order" - fi + if [ -z "${6}" ]; then + fatal "Missing dependencies management scriptlet: Option --local-files requires extra information. The desired tarball for netdata, the checksum, the go.d plugin tarball , the go.d plugin config tarball and the dependency management script, in this particular order" + fi - export NETDATA_LOCAL_TARBALL_OVERRIDE_DEPS_SCRIPT="${1}" - shift 1 - else - NETDATA_INSTALLER_OPTIONS="$NETDATA_INSTALLER_OPTIONS ${1}" - shift 1 - fi + export NETDATA_LOCAL_TARBALL_OVERRIDE_DEPS_SCRIPT="${6}" + shift 5 + ;; + *) + NETDATA_INSTALLER_OPTIONS="${NETDATA_INSTALLER_OPTIONS} ${1}" + ;; + esac + shift 1 done if [ "${INTERACTIVE}" = "0" ]; then @@ -384,6 +405,23 @@ if [ "${INTERACTIVE}" = "0" ]; then NETDATA_INSTALLER_OPTIONS="$NETDATA_INSTALLER_OPTIONS --dont-wait" fi +if [ -n "${NETDATA_DISABLE_CLOUD}" ]; then + if [ -n "${NETDATA_CLAIM_TOKEN}" ] || [ -n "${NETDATA_CLAIM_ROOMS}" ] || [ -n "${NETDATA_CLAIM_URL}" ]; then + run_failed "Cloud explicitly disabled but automatic claiming requested." + run_failed "Either enable Netdata Cloud, or remove the --claim-* options." + exit 1 + fi +fi + +# shellcheck disable=SC2235,SC2030 +if ( [ -z "${NETDATA_CLAIM_TOKEN}" ] && [ -n "${NETDATA_CLAIM_URL}" ] ) || ( [ -n "${NETDATA_CLAIM_TOKEN}" ] && [ -z "${NETDATA_CLAIM_URL}" ] ); then + run_failed "Invalid claiming options, both a claiming token and URL must be specified." + exit 1 +elif [ -z "${NETDATA_CLAIM_TOKEN}" ] && [ -n "${NETDATA_CLAIM_ROOMS}" ]; then + run_failed "Invalid claiming options, claim rooms may only be specified when a token and URL are specified." + exit 1 +fi + # --------------------------------------------------------------------------------------------------------------------- # look for an existing install and try to update that instead if it exists @@ -487,5 +525,23 @@ else cd "$(find . -mindepth 1 -maxdepth 1 -type d)" && install "$@" else fatal "Cannot install netdata from source (the source directory does not include netdata-installer.sh). Leaving all files in ${ndtmpdir}" + exit 1 + fi +fi + +# -------------------------------------------------------------------------------------------------------------------- + +if [ -n "${NETDATA_CLAIM_TOKEN}" ]; then + progress "Attempting to claim agent to ${NETDATA_CLAIM_URL}" + if [ -z "${NETDATA_PREFIX}" ] ; then + NETDATA_CLAIM_PATH=/usr/sbin/netdata-claim.sh + else + NETDATA_CLAIM_PATH="${NETDATA_PREFIX}/bin/netdata-claim.sh" + fi + + if "${NETDATA_CLAIM_PATH}" -token=${NETDATA_CLAIM_TOKEN} -rooms=${NETDATA_CLAIM_ROOMS} -url=${NETDATA_CLAIM_URL} ${NETDATA_CLAIM_EXTRA}; then + progress "Successfully claimed node" + else + run_failed "Unable to claim node, you must do so manually." fi fi diff --git a/packaging/installer/methods/kickstart-64.md b/packaging/installer/methods/kickstart-64.md index 120cc9e11..dbb0cd464 100644 --- a/packaging/installer/methods/kickstart-64.md +++ b/packaging/installer/methods/kickstart-64.md @@ -77,7 +77,7 @@ To use `md5sum` to verify the integrity of the `kickstart-static64.sh` script yo command above, run the following: ```bash -[ "047c86a7c8905955bee39b6980a28e30" = "$(curl -Ss https://my-netdata.io/kickstart-static64.sh | md5sum | cut -d ' ' -f 1)" ] && echo "OK, VALID" || echo "FAILED, INVALID" +[ "33ed36d80c7db0e501b68c4c0c3ceb02" = "$(curl -Ss https://my-netdata.io/kickstart-static64.sh | md5sum | cut -d ' ' -f 1)" ] && echo "OK, VALID" || echo "FAILED, INVALID" ``` If the script is valid, this command will return `OK, VALID`. diff --git a/packaging/installer/methods/kickstart.md b/packaging/installer/methods/kickstart.md index f825f808e..8ad99dc4a 100644 --- a/packaging/installer/methods/kickstart.md +++ b/packaging/installer/methods/kickstart.md @@ -60,7 +60,7 @@ To use `md5sum` to verify the integrity of the `kickstart.sh` script you will do run the following: ```bash -[ "8df7a45b2abb336c84507b7c107bcba3" = "$(curl -Ss https://my-netdata.io/kickstart.sh | md5sum | cut -d ' ' -f 1)" ] && echo "OK, VALID" || echo "FAILED, INVALID" +[ "35e92cd3fd8a29621e23962ac5626dfc" = "$(curl -Ss https://my-netdata.io/kickstart.sh | md5sum | cut -d ' ' -f 1)" ] && echo "OK, VALID" || echo "FAILED, INVALID" ``` If the script is valid, this command will return `OK, VALID`. diff --git a/packaging/installer/methods/kubernetes.md b/packaging/installer/methods/kubernetes.md index 3e85928e9..f593765fc 100644 --- a/packaging/installer/methods/kubernetes.md +++ b/packaging/installer/methods/kubernetes.md @@ -1,25 +1,21 @@ -# Install Netdata on a Kubernetes cluster +# Deploy Kubernetes monitoring with Netdata This document details how to install Netdata on an existing Kubernetes (k8s) cluster. By following these directions, you -will use Netdata's [Helm chart](https://github.com/netdata/helmchart) to bootstrap a Netdata deployment on your cluster. -The Helm chart installs one parent pod for storing metrics and managing alarm notifications plus an additional child pod -for every node in the cluster. +will use Netdata's [Helm chart](https://github.com/netdata/helmchart) to create a Kubernetes monitoring deployment on +your cluster. -Each child pod will collect metrics from the node it runs on, in addition to [compatible -applications](https://github.com/netdata/helmchart#service-discovery-and-supported-services), plus any endpoints covered -by our [generic Prometheus collector](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/prometheus), -via [service discovery](https://github.com/netdata/agent-service-discovery/). Each child pod will also collect -[cgroups](/collectors/cgroups.plugin/README.md), -[Kubelet](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/k8s_kubelet), and -[kube-proxy](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/k8s_kubeproxy) metrics from its node. +The Helm chart installs one `parent` pod for storing metrics and managing alarm notifications, plus an additional +`child` pod for every node in the cluster, responsible for collecting metrics from the node, Kubernetes control planes, +pods/containers, and [supported application-specific +metrics](https://github.com/netdata/helmchart#service-discovery-and-supported-services). -To install Netdata on a Kubernetes cluster, you need: +To deploy Kubernetes monitoring with Netdata, you need: - A working cluster running Kubernetes v1.9 or newer. - The [kubectl](https://kubernetes.io/docs/reference/kubectl/overview/) command line tool, within [one minor version @@ -27,12 +23,6 @@ To install Netdata on a Kubernetes cluster, you need: administrative system. - The [Helm package manager](https://helm.sh/) v3.0.0 or newer on the same administrative system. -The default configuration creates one `parent` pod, installed on one of your cluster's nodes, and a DaemonSet for -additional `child` pods. This DaemonSet ensures that every node in your k8s cluster also runs a `child` pod, including -the node that also runs `parent`. The `child` pods collect metrics and stream the information to the `parent` pod, which -uses two persistent volumes to store metrics and alarms. The `parent` pod also handles alarm notifications and enables -the Netdata dashboard using an ingress controller. - ## Install the Netdata Helm chart We recommend you install the Helm chart using our Helm repository. In the `helm install` command, replace `netdata` with @@ -43,147 +33,125 @@ helm repo add netdata https://netdata.github.io/helmchart/ helm install netdata netdata/netdata ``` -> You can also install the Netdata Helm chart by cloning the -> [repository](https://artifacthub.io/packages/helm/netdata/netdata#install-by-cloning-the-repository) and manually -> running Helm against the included chart. - -### Post-installation - Run `kubectl get services` and `kubectl get pods` to confirm that your cluster now runs a `netdata` service, one -`parent` pod, and three `child` pods. +parent pod, and multiple child pods. -You've now installed Netdata on your Kubernetes cluster. See how to [access the Netdata -dashboard](#access-the-netdata-dashboard) to confirm it's working as expected, or see the next section to [configure the -Helm chart](#configure-the-netdata-helm-chart) to suit your cluster's particular setup. +You've now installed Netdata on your Kubernetes cluster. Next, it's time to opt-in and enable the powerful Kubernetes +dashboards available in Netdata Cloud. -## Configure the Netdata Helm chart +## Claim your Kubernetes cluster to Netdata Cloud -Read up on the various configuration options in the [Helm chart -documentation](https://github.com/netdata/helmchart#configuration) to see if you need to change any of the options based -on your cluster's setup. +To start [Kubernetes monitoring](https://learn.netdata.cloud/docs/cloud/visualize/kubernetes/), you must first +[claim](/claim/README.md) your Kubernetes cluster to [Netdata Cloud](https://app.netdata.cloud). Claiming securely +connects your Kubernetes cluster to stream metrics data to Netdata Cloud, enabling Kubernetes-specific visualizations +like the health map and time-series composite charts. -To change a setting, use the `--set` or `--values` arguments with `helm install`, for the initial deployment, or `helm upgrade` to upgrade an existing deployment. +First, find your claiming script in Netdata Cloud by clicking on your Space's dropdown, then **Manage your Space**. +Click the **Nodes** tab to reveal the `netdata-claim.sh` script for your Space in Netdata Cloud. You need the `TOKEN` +and `ROOM` values. -```bash -helm install --set a.b.c=xyz netdata netdata/netdata -helm upgrade --set a.b.c=xyz netdata netdata/netdata -``` - -For example, to change the size of the persistent metrics volume on the parent node: +Next, create a file called `override.yml`. ```bash -helm install --set parent.database.volumesize=4Gi netdata netdata/netdata -helm upgrade --set parent.database.volumesize=4Gi netdata netdata/netdata +touch override.yml ``` -### Configure service discovery - -As mentioned in the introduction, Netdata has a [service discovery -plugin](https://github.com/netdata/agent-service-discovery/#service-discovery) to identify compatible pods and collect -metrics from the service they run. The Netdata Helm chart installs this service discovery plugin into your k8s cluster. - -Service discovery scans your cluster for pods exposed on certain ports and with certain image names. By default, it -looks for its supported services on the ports they most commonly listen on, and using default image names. Service -discovery currently supports [popular -applications](https://github.com/netdata/helmchart#service-discovery-and-supported-services), plus any endpoints covered -by our [generic Prometheus collector](https://learn.netdata.cloud/docs/agent/collectors/go.d.plugin/modules/prometheus). - -If you haven't changed listening ports, image names, or other defaults, service discovery should find your pods, create -the proper configurations based on the service that pod runs, and begin monitoring them immediately after deployment. +Paste the following into your `override.yml` file, replacing instances of `ROOM` and `TOKEN` with those from the +claiming script from Netdata Cloud. These settings claim your `parent`/`child` nodes to Netdata Cloud and store more +metrics in the nodes' time-series databases. + +```yaml +parent: + claiming: + enabled: true + token: "TOKEN" + rooms: "ROOM" + +child: + claiming: + enabled: true + token: "TOKEN" + rooms: "ROOM" + configs: + netdata: + data: | + [global] + memory mode = ram + history = 3600 + [health] + enabled = no +``` -However, if you have changed some of these defaults, you need to copy a file from the Netdata Helm chart repository, -make your edits, and pass the changed file to `helm install`/`helm upgrade`. +> โ— These override settings, along with the Helm chart's defaults, will retain an hour's worth of metrics (`history = +> 3600`, or `3600 seconds`) on each child node. Based on your metrics retention needs, and the resources available on +> your cluster, you may want to increase the `history` setting. -First, copy the file to your administrative system. +Apply these new settings: ```bash -curl https://raw.githubusercontent.com/netdata/helmchart/master/charts/netdata/sdconfig/child.yml -o child.yml +helm upgrade -f override.yml netdata netdata/netdata ``` -Edit the new `child.yml` file according to your needs. See the [Helm chart -configuration](https://github.com/netdata/helmchart#configuration) and the file itself for details. +The cluster terminates the old pods and creates new ones with the proper persistence and claiming configuration. You'll +see your nodes, containers, and pods appear in Netdata Cloud in a few seconds. -You can then run `helm install`/`helm upgrade` with the `--set-file` argument to use your configured `child.yml` file -instead of the default, changing the path if you copied it elsewhere. +![Netdata's Kubernetes monitoring +visualizations](https://user-images.githubusercontent.com/1153921/107801491-5dcb0f00-6d1d-11eb-9ab1-876c39f556e2.png) -```bash -helm install --set-file sd.child.configmap.from.value=./child.yml netdata netdata/netdata -helm upgrade --set-file sd.child.configmap.from.value=./child.yml netdata netdata/netdata -``` +If you don't need to configure your Netdata deployment, [skip down](#whats-next) to see how Kubernetes monitoring works +in Netdata, in addition to more guides and resources. -Your configured service discovery is now pushed to your cluster. +## Configure your Netdata monitoring deployment -## Access the Netdata dashboard +Read up on the various configuration options in the [Helm chart +documentation](https://github.com/netdata/helmchart#configuration) if you need to tweak your Kubernetes monitoring. -Accessing the Netdata dashboard itself depends on how you set up your k8s cluster and the Netdata Helm chart. If you -installed the Helm chart with the default `service.type=ClusterIP`, you will need to forward a port to the parent pod. +Your first option is to create an `override.yml` file, if you haven't created one already for +[claiming](#claim-your-kubernetes-cluster-to-netdata-cloud), then apply the new configuration to your cluster with `helm +upgrade`. ```bash -kubectl port-forward netdata-parent-0 19999:19999 +helm upgrade -f override.yml netdata netdata/netdata ``` -You can now access the dashboard at `http://CLUSTER:19999`, replacing `CLUSTER` with the IP address or hostname of your -k8s cluster. - -If you set up the Netdata Helm chart with `service.type=LoadBalancer`, you can find the external IP for the load -balancer with `kubectl get services`, under the `EXTERNAL-IP` column. +If you want to change only a single setting, use the `--set` argument with `helm upgrade`. For example, to change the +size of the persistent metrics volume on the parent node: ```bash -kubectl get services -NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE -cockroachdb ClusterIP None 26257/TCP,8080/TCP 46h -cockroachdb-public ClusterIP 10.245.148.233 26257/TCP,8080/TCP 46h -kubernetes ClusterIP 10.245.0.1 443/TCP 47h -netdata LoadBalancer 10.245.160.131 203.0.113.0 19999:32231/TCP 74m +helm upgrade --set parent.database.volumesize=4Gi netdata netdata/netdata ``` -In the above example, access the dashboard by navigating to `http://203.0.113.0:19999`. - -## Claim a Kubernetes cluster's parent pod - -You can [claim](/claim/README.md) a cluster's parent Netdata pod to see its real-time metrics alongside any other nodes -you monitor using [Netdata Cloud](https://app.netdata.cloud). +### Configure service discovery -> Netdata Cloud does not currently support claiming child nodes because the Helm chart does not allocate a persistent -> volume for them. +Netdata's [service discovery](https://github.com/netdata/agent-service-discovery/#service-discovery), installed as part +of the Helm chart installation, finds what services are running in a cluster's containers and automatically collects +service-level metrics from them. -Ensure persistence is enabled on the parent pod by running the following `helm upgrade` command. +Service discovery supports [popular applications](https://github.com/netdata/helmchart#applications) and [Prometheus +endpoints](https://github.com/netdata/helmchart#prometheus-endpoints). -```bash -helm upgrade \ - --set parent.database.persistence=true \ - --set parent.alarms.persistence=true \ - netdata netdata/netdata -``` +If your cluster runs services on non-default ports or uses non-default names, you may need to configure service +discovery to start collecting metrics from your services. You have to edit the default ConfigMap that is shipped with +the Helmchart and deploy that to your cluster. -Next, find your claiming script in Netdata Cloud by clicking on your Space's dropdown, then **Manage your Space**. Click -the **Nodes** tab. Netdata Cloud shows a script similar to the following: +First, copy the default file to your administrative system. ```bash -sudo netdata-claim.sh -token=TOKEN -rooms=ROOM1,ROOM2 -url=https://app.netdata.cloud +curl https://raw.githubusercontent.com/netdata/helmchart/master/charts/netdata/sdconfig/child.yml -o child.yml ``` -You will need the values of `TOKEN` and `ROOM1,ROOM2` for the command, which sets `parent.claiming.enabled`, -`parent.claiming.token`, and `parent.claiming.rooms` to complete the parent pod claiming process. +Edit the new `child.yml` file according to your needs. See the [Helm chart +configuration](https://github.com/netdata/helmchart#configuration) and the file itself for details. -Run the following `helm upgrade` command after replacing `TOKEN` and `ROOM1,ROOM2` with the values found in the claiming -script from Netdata Cloud. The quotations are required. +You can then run `helm upgrade` with the `--set-file` argument to use your configured `child.yml` file instead of the +default, changing the path if you copied it elsewhere. ```bash -helm upgrade \ - --set parent.claiming.enabled=true \ - --set parent.claiming.token="TOKEN" \ - --set parent.claiming.rooms="ROOM1,ROOM2" \ - netdata netdata/netdata +helm upgrade --set-file sd.child.configmap.from.value=./child.yml netdata netdata/netdata ``` -The cluster terminates the old parent pod and creates a new one with the proper claiming configuration. You can see your -parent pod in Netdata Cloud after a few moments. You can now [build new -dashboards](https://learn.netdata.cloud/docs/cloud/visualize/dashboards) using the parent pod's metrics or run [Metric -Correlations](https://learn.netdata.cloud/docs/cloud/insights/metric-correlations) to troubleshoot anomalies. - -![A parent Netdata pod in Netdata -Cloud](https://user-images.githubusercontent.com/1153921/94497340-c1f49880-01ab-11eb-97b2-6044537565af.png) +Now that you pushed an edited ConfigMap to your cluster, service discovery should find and set up metrics collection +from your non-default service. ## Update/reinstall the Netdata Helm chart @@ -194,16 +162,27 @@ with the name of the release, if you changed it upon installation: helm upgrade netdata netdata/netdata ``` +To update Netdata's Helm chart to the latest version, run `helm repo update`, then deploy `upgrade` it`: + +```bash +helm repo update +helm upgrade netdata netdata/netdata +``` + ## What's next? -Read the [monitoring a Kubernetes cluster guide](/docs/guides/monitor/kubernetes-k8s-netdata.md) for details on the -various metrics and charts created by the Helm chart and some best practices on real-time troubleshooting using Netdata. +[Start Kubernetes monitoring](https://learn.netdata.cloud/docs/cloud/visualize/kubernetes/) in Netdata Cloud, which +comes with meaningful visualizations out of the box. + +Read our guide, [_Kubernetes monitoring with Netdata: Overview and +visualizations_](/docs/guides/monitor/kubernetes-k8s-netdata.md), for a complete walkthrough of Netdata's Kubernetes +monitoring capabilities, including a health map of every container in your infrastructure, aggregated resource +utilization metrics, and application metrics. -Check out our [infrastructure](/docs/quickstart/infrastructure.md) for details about additional k8s monitoring features, -and learn more about [configuring the Netdata Agent](/docs/configure/nodes.md) to better understand the settings you -might be interested in changing. +### Related reference documentation -To further configure Netdata for your cluster, see our [Helm chart repository](https://github.com/netdata/helmchart) and -the [service discovery repository](https://github.com/netdata/agent-service-discovery/). +- [Netdata Cloud ยท Kubernetes monitoring](https://learn.netdata.cloud/docs/cloud/visualize/kubernetes/) +- [Netdata Helm chart](https://github.com/netdata/helmchart) +- [Netdata service discovery](https://github.com/netdata/agent-service-discovery/) [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fpackaging%2Finstaller%2Fmethods%2Fkubernetes&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)](<>) diff --git a/packaging/installer/netdata-updater.sh b/packaging/installer/netdata-updater.sh index 10835cd00..2b833d79c 100755 --- a/packaging/installer/netdata-updater.sh +++ b/packaging/installer/netdata-updater.sh @@ -116,7 +116,7 @@ _cannot_use_tmpdir() { create_tmp_directory() { if [ -n "${NETDATA_TMPDIR_PATH}" ]; then - TMPDIR="${NETDATA_TMPDIR_PATH}" + echo "${NETDATA_TMPDIR_PATH}" else if [ -z "${NETDATA_TMPDIR}" ] || _cannot_use_tmpdir "${NETDATA_TMPDIR}" ; then if [ -z "${TMPDIR}" ] || _cannot_use_tmpdir "${TMPDIR}" ; then @@ -204,7 +204,7 @@ self_update() { if _safe_download "https://raw.githubusercontent.com/netdata/netdata/master/packaging/installer/netdata-updater.sh" ./netdata-updater.sh; then chmod +x ./netdata-updater.sh || exit 1 export ENVIRONMENT_FILE="${ENVIRONMENT_FILE}" - exec ./netdata-updater.sh --not-running-from-cron --no-self-update --tmpdir-path "$(pwd)" + exec ./netdata-updater.sh --not-running-from-cron --no-updater-self-update --tmpdir-path "$(pwd)" else echo >&3 "Failed to download newest version of updater script, continuing with current version." fi @@ -314,8 +314,10 @@ update() { do_not_start="--dont-start-it" fi + env="TMPDIR='${TMPDIR}'" + if [ -n "${NETDATA_SELECTED_DASHBOARD}" ]; then - env="NETDATA_SELECTED_DASHBOARD=${NETDATA_SELECTED_DASHBOARD}" + env="${env} NETDATA_SELECTED_DASHBOARD=${NETDATA_SELECTED_DASHBOARD}" fi if [ ! -x ./netdata-installer.sh ]; then diff --git a/packaging/scripts/install.sh b/packaging/scripts/install.sh index db8d4a67f..c591b0bee 100755 --- a/packaging/scripts/install.sh +++ b/packaging/scripts/install.sh @@ -7,7 +7,7 @@ install_debian_like() { apt-get update # Install NetData - apt-get install -y "/artifacts/netdata_${VERSION}_${ARCH}.deb" + apt-get install -y "/packages/netdata_${VERSION}_${ARCH}.deb" # Install testing tools apt-get install -y --no-install-recommends \ @@ -20,8 +20,28 @@ install_fedora_like() { PKGMGR="$( (command -v dnf > /dev/null && echo "dnf") || echo "yum")" + pkg_version="$(echo "${VERSION}" | tr - .)" + # Install NetData - "$PKGMGR" install -y /artifacts/netdata-"${VERSION}"-*.rpm + "$PKGMGR" install -y /packages/netdata-"${pkg_version}"-*.rpm + + # Install testing tools + "$PKGMGR" install -y curl nc jq +} + +install_centos() { + # Using a glob pattern here because I can't reliably determine what the + # resulting package name will be (TODO: There must be a better way!) + + PKGMGR="$( (command -v dnf > /dev/null && echo "dnf") || echo "yum")" + + pkg_version="$(echo "${VERSION}" | tr - .)" + + # Install EPEL (needed for `jq` + "$PKGMGR" install -y epel-release + + # Install NetData + "$PKGMGR" install -y /packages/netdata-"${pkg_version}"-*.rpm # Install testing tools "$PKGMGR" install -y curl nc jq @@ -31,23 +51,28 @@ install_suse_like() { # Using a glob pattern here because I can't reliably determine what the # resulting package name will be (TODO: There must be a better way!) + pkg_version="$(echo "${VERSION}" | tr - .)" + # Install NetData # FIXME: Allow unsigned packages (for now) #7773 zypper install -y --allow-unsigned-rpm \ - /artifacts/netdata-"${VERSION}"-*.rpm + /packages/netdata-"${pkg_version}"-*.rpm # Install testing tools zypper install -y --no-recommends \ - curl netcat jq + curl gnu-netcat jq } case "${DISTRO}" in debian | ubuntu) install_debian_like ;; - fedora | centos) + fedora) install_fedora_like ;; + centos) + install_centos + ;; opensuse) install_suse_like ;; diff --git a/packaging/scripts/test.sh b/packaging/scripts/test.sh index 24ba2966f..c39082622 100755 --- a/packaging/scripts/test.sh +++ b/packaging/scripts/test.sh @@ -1,15 +1,32 @@ #!/bin/sh +dump_log() { + cat ./netdata.log +} + +trap dump_log EXIT + wait_for() { host="${1}" port="${2}" name="${3}" - timeout="${4:-30}" + timeout="30" + + if command -v nc > /dev/null ; then + netcat="nc" + elif command -v netcat > /dev/null ; then + netcat="netcat" + else + printf "Unable to find a usable netcat command.\n" + return 1 + fi printf "Waiting for %s on %s:%s ... " "${name}" "${host}" "${port}" + sleep 30 + i=0 - while ! nc -z "${host}" "${port}"; do + while ! ${netcat} -z "${host}" "${port}"; do sleep 1 if [ "$i" -gt "$timeout" ]; then printf "Timed out!\n" @@ -20,8 +37,16 @@ wait_for() { printf "OK\n" } -netdata -D > netdata.log 2>&1 & +/usr/sbin/netdata -D > ./netdata.log 2>&1 & + +wait_for localhost 19999 netdata || exit 1 + +curl -sS http://127.0.0.1:19999/api/v1/info > ./response || exit 1 + +cat ./response + +jq '.version' ./response || exit 1 -wait_for localhost 19999 netdata +trap - EXIT -curl -sS http://127.0.0.1:19999/api/v1/info | jq '.version' +cp -a /packages/* /artifacts diff --git a/packaging/version b/packaging/version index 9bfaa7892..c8c2eaea5 100644 --- a/packaging/version +++ b/packaging/version @@ -1 +1 @@ -v1.29.3 +v1.30.0 diff --git a/registry/README.md b/registry/README.md index 968292c0b..1544a57d1 100644 --- a/registry/README.md +++ b/registry/README.md @@ -176,6 +176,17 @@ There can be up to 2 files: Both files are machine readable text files. +### How can I disable the SameSite and Secure cookies? + +Beginning with `v1.30.0`, when the Netdata Agent's web server processes a request, it delivers the `SameSite=none` +and `Secure` cookies. If you have problems accessing the local Agent dashboard or Netdata Cloud, disable these +cookies by [editing `netdata.conf`](/docs/configure/nodes.md#use-edit-config-to-edit-configuration-files): + +```conf +[registry] + enable cookies SameSite and Secure = no +``` + ## The future The registry opens a whole world of new possibilities for Netdata. Check here what we think: diff --git a/registry/registry.c b/registry/registry.c index b14f4ee4a..8148745fe 100644 --- a/registry/registry.c +++ b/registry/registry.c @@ -23,7 +23,7 @@ static inline void registry_unlock(void) { // COOKIES static void registry_set_cookie(struct web_client *w, const char *guid) { - char edate[100]; + char edate[100], domain[512]; time_t et = now_realtime_sec() + registry.persons_expiration; struct tm etmbuf, *etm = gmtime_r(&et, &etmbuf); strftime(edate, sizeof(edate), "%a, %d %b %Y %H:%M:%S %Z", etm); @@ -31,7 +31,22 @@ static void registry_set_cookie(struct web_client *w, const char *guid) { snprintfz(w->cookie1, NETDATA_WEB_REQUEST_COOKIE_SIZE, NETDATA_REGISTRY_COOKIE_NAME "=%s; Expires=%s", guid, edate); if(registry.registry_domain && registry.registry_domain[0]) - snprintfz(w->cookie2, NETDATA_WEB_REQUEST_COOKIE_SIZE, NETDATA_REGISTRY_COOKIE_NAME "=%s; Domain=%s; Expires=%s", guid, registry.registry_domain, edate); + snprintfz(domain, 511, "Domain=%s", registry.registry_domain); + else + domain[0]='\0'; + + int length = snprintfz(w->cookie2, NETDATA_WEB_REQUEST_COOKIE_SIZE, + NETDATA_REGISTRY_COOKIE_NAME "=%s; Expires=%s; %s", + guid, edate, domain); + + size_t remaining_length = NETDATA_WEB_REQUEST_COOKIE_SIZE - length; + // 25 is the necessary length to add new cookies + if (registry.enable_cookies_samesite_secure) { + if (length > 0 && remaining_length > 25) + snprintfz(&w->cookie2[length], remaining_length, "; SameSite=None; Secure"); + else + error("Netdata does not have enough space to store cookies SameSite and Secure"); + } } static inline void registry_set_person_cookie(struct web_client *w, REGISTRY_PERSON *p) { diff --git a/registry/registry_init.c b/registry/registry_init.c index ffdb83f3a..36673ff0f 100644 --- a/registry/registry_init.c +++ b/registry/registry_init.c @@ -39,6 +39,7 @@ int registry_init(void) { registry.registry_to_announce = config_get(CONFIG_SECTION_REGISTRY, "registry to announce", "https://registry.my-netdata.io"); registry.hostname = config_get(CONFIG_SECTION_REGISTRY, "registry hostname", netdata_configured_hostname); registry.verify_cookies_redirects = config_get_boolean(CONFIG_SECTION_REGISTRY, "verify browser cookies support", 1); + registry.enable_cookies_samesite_secure = config_get_boolean(CONFIG_SECTION_REGISTRY, "enable cookies SameSite and Secure", 1); registry_update_cloud_base_url(); setenv("NETDATA_REGISTRY_HOSTNAME", registry.hostname, 1); diff --git a/registry/registry_internals.h b/registry/registry_internals.h index 0eb83a436..3caf0aad8 100644 --- a/registry/registry_internals.h +++ b/registry/registry_internals.h @@ -40,6 +40,7 @@ struct registry { char *cloud_base_url; time_t persons_expiration; // seconds to expire idle persons int verify_cookies_redirects; + int enable_cookies_samesite_secure; size_t max_url_length; size_t max_name_length; diff --git a/registry/registry_person.c b/registry/registry_person.c index 268b0bd13..fae1520c4 100644 --- a/registry/registry_person.c +++ b/registry/registry_person.c @@ -32,7 +32,7 @@ inline REGISTRY_PERSON_URL *registry_person_url_index_find(REGISTRY_PERSON *p, c inline REGISTRY_PERSON_URL *registry_person_url_index_add(REGISTRY_PERSON *p, REGISTRY_PERSON_URL *pu) { debug(D_REGISTRY, "Registry: registry_person_url_index_add('%s', '%s')", p->guid, pu->url->url); - REGISTRY_PERSON_URL *tpu = (REGISTRY_PERSON_URL *)avl_insert(&(p->person_urls), (avl *)(pu)); + REGISTRY_PERSON_URL *tpu = (REGISTRY_PERSON_URL *)avl_insert(&(p->person_urls), (avl_t *)(pu)); if(tpu != pu) error("Registry: registry_person_url_index_add('%s', '%s') already exists as '%s'", p->guid, pu->url->url, tpu->url->url); @@ -41,7 +41,7 @@ inline REGISTRY_PERSON_URL *registry_person_url_index_add(REGISTRY_PERSON *p, RE inline REGISTRY_PERSON_URL *registry_person_url_index_del(REGISTRY_PERSON *p, REGISTRY_PERSON_URL *pu) { debug(D_REGISTRY, "Registry: registry_person_url_index_del('%s', '%s')", p->guid, pu->url->url); - REGISTRY_PERSON_URL *tpu = (REGISTRY_PERSON_URL *)avl_remove(&(p->person_urls), (avl *)(pu)); + REGISTRY_PERSON_URL *tpu = (REGISTRY_PERSON_URL *)avl_remove(&(p->person_urls), (avl_t *)(pu)); if(!tpu) error("Registry: registry_person_url_index_del('%s', '%s') deleted nothing", p->guid, pu->url->url); else if(tpu != pu) diff --git a/registry/registry_person.h b/registry/registry_person.h index 9a4aa959b..42419bfe9 100644 --- a/registry/registry_person.h +++ b/registry/registry_person.h @@ -10,7 +10,7 @@ // for each PERSON-URL pair we keep this struct registry_person_url { - avl avl; // binary tree node + avl_t avl; // binary tree node REGISTRY_URL *url; // de-duplicated URL REGISTRY_MACHINE *machine; // link the MACHINE of this URL diff --git a/registry/registry_url.c b/registry/registry_url.c index 9ac3ce10c..559799d8f 100644 --- a/registry/registry_url.c +++ b/registry/registry_url.c @@ -13,11 +13,11 @@ int registry_url_compare(void *a, void *b) { } inline REGISTRY_URL *registry_url_index_add(REGISTRY_URL *u) { - return (REGISTRY_URL *)avl_insert(&(registry.registry_urls_root_index), (avl *)(u)); + return (REGISTRY_URL *)avl_insert(&(registry.registry_urls_root_index), (avl_t *)(u)); } inline REGISTRY_URL *registry_url_index_del(REGISTRY_URL *u) { - return (REGISTRY_URL *)avl_remove(&(registry.registry_urls_root_index), (avl *)(u)); + return (REGISTRY_URL *)avl_remove(&(registry.registry_urls_root_index), (avl_t *)(u)); } REGISTRY_URL *registry_url_get(const char *url, size_t urllen) { @@ -33,7 +33,7 @@ REGISTRY_URL *registry_url_get(const char *url, size_t urllen) { strncpyz(n->url, url, n->len); n->hash = simple_hash(n->url); - REGISTRY_URL *u = (REGISTRY_URL *)avl_search(&(registry.registry_urls_root_index), (avl *)n); + REGISTRY_URL *u = (REGISTRY_URL *)avl_search(&(registry.registry_urls_root_index), (avl_t *)n); if(!u) { debug(D_REGISTRY, "Registry: registry_url_get('%s', %zu): allocating %zu bytes", url, urllen, sizeof(REGISTRY_URL) + urllen); u = callocz(1, sizeof(REGISTRY_URL) + urllen); // no need for +1, 1 is already in REGISTRY_URL diff --git a/registry/registry_url.h b/registry/registry_url.h index c684f1c35..0cc364fdf 100644 --- a/registry/registry_url.h +++ b/registry/registry_url.h @@ -12,7 +12,7 @@ // we store them here and we keep pointers elsewhere struct registry_url { - avl avl; + avl_t avl; uint32_t hash; // the index hash uint32_t links; // the number of links to this URL - when none is left, we free it diff --git a/spawn/spawn.c b/spawn/spawn.c index 256c04679..017ba7f39 100644 --- a/spawn/spawn.c +++ b/spawn/spawn.c @@ -62,7 +62,7 @@ uint64_t spawn_enq_cmd(char *command_to_run) { unsigned queue_size; uint64_t serial; - avl *avl_ret; + avl_t *avl_ret; struct spawn_cmd_info *cmdinfo; cmdinfo = create_spawn_cmd(command_to_run); @@ -79,8 +79,8 @@ uint64_t spawn_enq_cmd(char *command_to_run) cmdinfo->serial = serial; /* No need to take the cmd mutex since it is unreachable at the moment */ /* enqueue command */ - avl_ret = avl_insert(&spawn_cmd_queue.cmd_tree, (avl *)cmdinfo); - fatal_assert(avl_ret == (avl *)cmdinfo); + avl_ret = avl_insert(&spawn_cmd_queue.cmd_tree, (avl_t *)cmdinfo); + fatal_assert(avl_ret == (avl_t *)cmdinfo); uv_mutex_unlock(&spawn_cmd_queue.mutex); /* wake up event loop */ @@ -93,13 +93,13 @@ uint64_t spawn_enq_cmd(char *command_to_run) */ void spawn_wait_cmd(uint64_t serial, int *exit_status, time_t *exec_run_timestamp) { - avl *avl_ret; + avl_t *avl_ret; struct spawn_cmd_info tmp, *cmdinfo; tmp.serial = serial; uv_mutex_lock(&spawn_cmd_queue.mutex); - avl_ret = avl_search(&spawn_cmd_queue.cmd_tree, (avl *)&tmp); + avl_ret = avl_search(&spawn_cmd_queue.cmd_tree, (avl_t *)&tmp); uv_mutex_unlock(&spawn_cmd_queue.mutex); fatal_assert(avl_ret); /* Could be NULL if more than 1 threads wait for the command */ @@ -122,13 +122,13 @@ void spawn_wait_cmd(uint64_t serial, int *exit_status, time_t *exec_run_timestam void spawn_deq_cmd(struct spawn_cmd_info *cmdinfo) { unsigned queue_size; - avl *avl_ret; + avl_t *avl_ret; uv_mutex_lock(&spawn_cmd_queue.mutex); queue_size = spawn_cmd_queue.size; fatal_assert(queue_size); /* dequeue command */ - avl_ret = avl_remove(&spawn_cmd_queue.cmd_tree, (avl *)cmdinfo); + avl_ret = avl_remove(&spawn_cmd_queue.cmd_tree, (avl_t *)cmdinfo); fatal_assert(avl_ret); spawn_cmd_queue.size = queue_size - 1; diff --git a/spawn/spawn.h b/spawn/spawn.h index 34b2632e0..6a4414338 100644 --- a/spawn/spawn.h +++ b/spawn/spawn.h @@ -42,7 +42,7 @@ struct spawn_prot_header { #define SPAWN_CMD_DONE 0x00000008 struct spawn_cmd_info { - avl avl; + avl_t avl; /* concurrency control per command */ uv_mutex_t mutex; diff --git a/spawn/spawn_server.c b/spawn/spawn_server.c index f84fab1ca..57bcdf99a 100644 --- a/spawn/spawn_server.c +++ b/spawn/spawn_server.c @@ -16,7 +16,7 @@ static char prot_buffer[MAX_COMMAND_LENGTH]; static unsigned prot_buffer_len = 0; struct spawn_execution_info { - avl avl; + avl_t avl; void *handle; int exit_status; @@ -106,7 +106,7 @@ static void wait_children(void *arg) { siginfo_t i; struct spawn_execution_info tmp, *exec_info; - avl *ret_avl; + avl_t *ret_avl; (void)arg; while (!server_shutdown) { @@ -133,7 +133,7 @@ static void wait_children(void *arg) #endif fatal_assert(CLD_EXITED == i.si_code); tmp.pid = (pid_t)i.si_pid; - while (NULL == (ret_avl = avl_remove_lock(&spawn_outstanding_exec_tree, (avl *)&tmp))) { + while (NULL == (ret_avl = avl_remove_lock(&spawn_outstanding_exec_tree, (avl_t *)&tmp))) { fprintf(stderr, "SPAWN: race condition detected, waiting for child process %d to be indexed.\n", (int)tmp.pid); @@ -153,7 +153,7 @@ void spawn_protocol_execute_command(void *handle, char *command_to_run, uint16_t { uv_buf_t writebuf[2]; int ret; - avl *avl_ret; + avl_t *avl_ret; struct spawn_execution_info *exec_info; struct write_context *write_ctx; @@ -174,8 +174,8 @@ void spawn_protocol_execute_command(void *handle, char *command_to_run, uint16_t exec_info = mallocz(sizeof(*exec_info)); exec_info->handle = handle; exec_info->pid = write_ctx->spawn_result.exec_pid; - avl_ret = avl_insert_lock(&spawn_outstanding_exec_tree, (avl *)exec_info); - fatal_assert(avl_ret == (avl *)exec_info); + avl_ret = avl_insert_lock(&spawn_outstanding_exec_tree, (avl_t *)exec_info); + fatal_assert(avl_ret == (avl_t *)exec_info); /* wake up the thread that blocks waiting for processes to exit */ uv_mutex_lock(&wait_children_mutex); diff --git a/streaming/receiver.c b/streaming/receiver.c index 3ea15806d..03954064a 100644 --- a/streaming/receiver.c +++ b/streaming/receiver.c @@ -261,6 +261,14 @@ static int rrdpush_receive(struct receiver_state *rpt) mode = rrd_memory_mode_id(appconfig_get(&stream_config, rpt->key, "default memory mode", rrd_memory_mode_name(mode))); mode = rrd_memory_mode_id(appconfig_get(&stream_config, rpt->machine_guid, "memory mode", rrd_memory_mode_name(mode))); +#ifndef ENABLE_DBENGINE + if (unlikely(mode == RRD_MEMORY_MODE_DBENGINE)) { + close(rpt->fd); + log_stream_connection(rpt->client_ip, rpt->client_port, rpt->key, rpt->machine_guid, rpt->hostname, "REJECTED -- DBENGINE MEMORY MODE NOT SUPPORTED"); + return 1; + } +#endif + health_enabled = appconfig_get_boolean_ondemand(&stream_config, rpt->key, "health enabled by default", health_enabled); health_enabled = appconfig_get_boolean_ondemand(&stream_config, rpt->machine_guid, "health enabled", health_enabled); @@ -440,7 +448,7 @@ static int rrdpush_receive(struct receiver_state *rpt) cd.version = rpt->stream_version; -#ifdef ENABLE_ACLK +#if defined(ENABLE_ACLK) && !defined(ACLK_NG) // in case we have cloud connection we inform cloud // new slave connected if (netdata_cloud_setting) @@ -454,7 +462,7 @@ static int rrdpush_receive(struct receiver_state *rpt) error("STREAM %s [receive from [%s]:%s]: disconnected (completed %zu updates).", rpt->hostname, rpt->client_ip, rpt->client_port, count); -#ifdef ENABLE_ACLK +#if defined(ENABLE_ACLK) && !defined(ACLK_NG) // in case we have cloud connection we inform cloud // new slave connected if (netdata_cloud_setting) diff --git a/web/api/exporters/shell/README.md b/web/api/exporters/shell/README.md index b919045f2..9d44a3707 100644 --- a/web/api/exporters/shell/README.md +++ b/web/api/exporters/shell/README.md @@ -38,14 +38,12 @@ echo ${NETDATA_SYSTEM_CPU_VISIBLETOTAL} # what about alarms? set | grep "^NETDATA_ALARM_SYSTEM_SWAP_" -NETDATA_ALARM_SYSTEM_SWAP_RAM_IN_SWAP_STATUS=CRITICAL -NETDATA_ALARM_SYSTEM_SWAP_RAM_IN_SWAP_VALUE=53 NETDATA_ALARM_SYSTEM_SWAP_USED_SWAP_STATUS=CLEAR NETDATA_ALARM_SYSTEM_SWAP_USED_SWAP_VALUE=51 -# let's get the current status of the alarm 'ram in swap' -echo ${NETDATA_ALARM_SYSTEM_SWAP_RAM_IN_SWAP_STATUS} -CRITICAL +# let's get the current status of the alarm 'used swap' +echo ${NETDATA_ALARM_SYSTEM_SWAP_USED_SWAP_STATUS} +CLEAR # is it fast? time curl -s 'http://localhost:19999/api/v1/allmetrics' >/dev/null diff --git a/web/api/formatters/json/json.c b/web/api/formatters/json/json.c index f28eb5738..bf311e22c 100644 --- a/web/api/formatters/json/json.c +++ b/web/api/formatters/json/json.c @@ -5,8 +5,14 @@ #define JSON_DATES_JS 1 #define JSON_DATES_TIMESTAMP 2 -void rrdr2json(RRDR *r, BUFFER *wb, RRDR_OPTIONS options, int datatable, RRDDIM *temp_rd) { - rrdset_check_rdlock(r->st); +void rrdr2json(RRDR *r, BUFFER *wb, RRDR_OPTIONS options, int datatable, struct context_param *context_param_list) +{ + RRDDIM *temp_rd = context_param_list ? context_param_list->rd : NULL; + + int should_lock = (!context_param_list || !(context_param_list->flags & CONTEXT_FLAGS_ARCHIVE)); + + if (should_lock) + rrdset_check_rdlock(r->st); //info("RRD2JSON(): %s: BEGIN", r->st->id); int row_annotations = 0, dates, dates_with_new = 0; diff --git a/web/api/formatters/json/json.h b/web/api/formatters/json/json.h index 6d73a3ffe..5c4e11371 100644 --- a/web/api/formatters/json/json.h +++ b/web/api/formatters/json/json.h @@ -5,6 +5,6 @@ #include "../rrd2json.h" -extern void rrdr2json(RRDR *r, BUFFER *wb, RRDR_OPTIONS options, int datatable, RRDDIM *temp_rd); +extern void rrdr2json(RRDR *r, BUFFER *wb, RRDR_OPTIONS options, int datatable, struct context_param *context_param_list); #endif //NETDATA_API_FORMATTER_JSON_H diff --git a/web/api/formatters/json_wrapper.c b/web/api/formatters/json_wrapper.c index cf4f1099a..1d9c2472a 100644 --- a/web/api/formatters/json_wrapper.c +++ b/web/api/formatters/json_wrapper.c @@ -2,8 +2,16 @@ #include "json_wrapper.h" -void rrdr_json_wrapper_begin(RRDR *r, BUFFER *wb, uint32_t format, RRDR_OPTIONS options, int string_value, RRDDIM *temp_rd, char *chart_label_key) { - rrdset_check_rdlock(r->st); +void rrdr_json_wrapper_begin(RRDR *r, BUFFER *wb, uint32_t format, RRDR_OPTIONS options, int string_value, + struct context_param *context_param_list, char *chart_label_key) +{ + + RRDDIM *temp_rd = context_param_list ? context_param_list->rd : NULL; + int should_lock = (!context_param_list || !(context_param_list->flags & CONTEXT_FLAGS_ARCHIVE)); + uint8_t context_mode = (!context_param_list || (context_param_list->flags & CONTEXT_FLAGS_CONTEXT)); + + if (should_lock) + rrdset_check_rdlock(r->st); long rows = rrdr_rows(r); long c, i; @@ -22,7 +30,8 @@ void rrdr_json_wrapper_begin(RRDR *r, BUFFER *wb, uint32_t format, RRDR_OPTIONS sq[0] = '"'; } - rrdset_rdlock(r->st); + if (should_lock) + rrdset_rdlock(r->st); buffer_sprintf(wb, "{\n" " %sapi%s: 1,\n" " %sid%s: %s%s%s,\n" @@ -35,16 +44,17 @@ void rrdr_json_wrapper_begin(RRDR *r, BUFFER *wb, uint32_t format, RRDR_OPTIONS " %safter%s: %u,\n" " %sdimension_names%s: [" , kq, kq - , kq, kq, sq, temp_rd?r->st->context:r->st->id, sq - , kq, kq, sq, temp_rd?r->st->context:r->st->name, sq + , kq, kq, sq, context_mode && temp_rd?r->st->context:r->st->id, sq + , kq, kq, sq, context_mode && temp_rd?r->st->context:r->st->name, sq , kq, kq, r->update_every , kq, kq, r->st->update_every - , kq, kq, (uint32_t)rrdset_first_entry_t_nolock(r->st) - , kq, kq, (uint32_t)rrdset_last_entry_t_nolock(r->st) + , kq, kq, (uint32_t) (context_param_list ? context_param_list->first_entry_t : rrdset_first_entry_t_nolock(r->st)) + , kq, kq, (uint32_t) (context_param_list ? context_param_list->last_entry_t : rrdset_last_entry_t_nolock(r->st)) , kq, kq, (uint32_t)r->before , kq, kq, (uint32_t)r->after , kq, kq); - rrdset_unlock(r->st); + if (should_lock) + rrdset_unlock(r->st); for(c = 0, i = 0, rd = temp_rd?temp_rd:r->st->dimensions; rd && c < r->d ;c++, rd = rd->next) { if(unlikely(r->od[c] & RRDR_DIMENSION_HIDDEN)) continue; @@ -89,7 +99,7 @@ void rrdr_json_wrapper_begin(RRDR *r, BUFFER *wb, uint32_t format, RRDR_OPTIONS buffer_strcat(wb, "],\n"); // Composite charts - if (temp_rd) { + if (context_mode && temp_rd) { buffer_sprintf( wb, " %schart_ids%s: [", diff --git a/web/api/formatters/json_wrapper.h b/web/api/formatters/json_wrapper.h index d48d5d1ae..14662db74 100644 --- a/web/api/formatters/json_wrapper.h +++ b/web/api/formatters/json_wrapper.h @@ -5,7 +5,7 @@ #include "rrd2json.h" -extern void rrdr_json_wrapper_begin(RRDR *r, BUFFER *wb, uint32_t format, RRDR_OPTIONS options, int string_value, RRDDIM *temp_rd, char *chart_key); +extern void rrdr_json_wrapper_begin(RRDR *r, BUFFER *wb, uint32_t format, RRDR_OPTIONS options, int string_value, struct context_param *context_param_list, char *chart_key); extern void rrdr_json_wrapper_end(RRDR *r, BUFFER *wb, uint32_t format, uint32_t options, int string_value); #endif //NETDATA_API_FORMATTER_JSON_WRAPPER_H diff --git a/web/api/formatters/rrd2json.c b/web/api/formatters/rrd2json.c index d8e248066..5b12c89ba 100644 --- a/web/api/formatters/rrd2json.c +++ b/web/api/formatters/rrd2json.c @@ -2,7 +2,28 @@ #include "web/api/web_api_v1.h" -static inline void free_temp_rrddim(RRDDIM *temp_rd) +static inline void free_single_rrdrim(RRDDIM *temp_rd, int archive_mode) +{ + if (unlikely(!temp_rd)) + return; + + freez((char *)temp_rd->id); + freez((char *)temp_rd->name); + + if (unlikely(archive_mode)) { + temp_rd->rrdset->counter--; + if (!temp_rd->rrdset->counter) { + freez((char *)temp_rd->rrdset->name); + freez(temp_rd->rrdset->context); + freez(temp_rd->rrdset); + } + } + freez(temp_rd->state->metric_uuid); + freez(temp_rd->state); + freez(temp_rd); +} + +static inline void free_rrddim_list(RRDDIM *temp_rd, int archive_mode) { if (unlikely(!temp_rd)) return; @@ -10,14 +31,7 @@ static inline void free_temp_rrddim(RRDDIM *temp_rd) RRDDIM *t; while (temp_rd) { t = temp_rd->next; - freez((char *)temp_rd->id); - freez((char *)temp_rd->name); -#ifdef ENABLE_DBENGINE - if (temp_rd->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) - freez(temp_rd->state->metric_uuid); -#endif - freez(temp_rd->state); - freez(temp_rd); + free_single_rrdrim(temp_rd, archive_mode); temp_rd = t; } } @@ -27,7 +41,7 @@ void free_context_param_list(struct context_param **param_list) if (unlikely(!param_list || !*param_list)) return; - free_temp_rrddim(((*param_list)->rd)); + free_rrddim_list(((*param_list)->rd), (*param_list)->flags & CONTEXT_FLAGS_ARCHIVE); freez((*param_list)); *param_list = NULL; } @@ -36,21 +50,17 @@ void rebuild_context_param_list(struct context_param *context_param_list, time_t { RRDDIM *temp_rd = context_param_list->rd; RRDDIM *new_rd_list = NULL, *t; + int is_archived = (context_param_list->flags & CONTEXT_FLAGS_ARCHIVE); while (temp_rd) { t = temp_rd->next; - if (rrdset_last_entry_t(temp_rd->rrdset) >= after_requested) { + RRDSET *st = temp_rd->rrdset; + time_t last_entry_t = is_archived ? st->last_entry_t : rrdset_last_entry_t(st); + + if (last_entry_t >= after_requested) { temp_rd->next = new_rd_list; new_rd_list = temp_rd; - } else { - freez((char *)temp_rd->id); - freez((char *)temp_rd->name); -#ifdef ENABLE_DBENGINE - if (temp_rd->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) - freez(temp_rd->state->metric_uuid); -#endif - freez(temp_rd->state); - freez(temp_rd); - } + } else + free_single_rrdrim(temp_rd, is_archived); temp_rd = t; } context_param_list->rd = new_rd_list; @@ -65,6 +75,7 @@ void build_context_param_list(struct context_param **param_list, RRDSET *st) *param_list = mallocz(sizeof(struct context_param)); (*param_list)->first_entry_t = LONG_MAX; (*param_list)->last_entry_t = 0; + (*param_list)->flags = CONTEXT_FLAGS_CONTEXT; (*param_list)->rd = NULL; } @@ -214,9 +225,9 @@ int rrdset2anything_api_v1( , struct context_param *context_param_list , char *chart_label_key ) { - time_t last_accessed_time = now_realtime_sec(); - st->last_accessed_time = last_accessed_time; + if (context_param_list && !(context_param_list->flags & CONTEXT_FLAGS_ARCHIVE)) + st->last_accessed_time = now_realtime_sec(); RRDR *r = rrd2rrdr(st, points, after, before, group_method, group_time, options, dimensions?buffer_tostring(dimensions):NULL, context_param_list); if(!r) { @@ -238,7 +249,7 @@ int rrdset2anything_api_v1( case DATASOURCE_SSV: if(options & RRDR_OPTION_JSON_WRAP) { wb->contenttype = CT_APPLICATION_JSON; - rrdr_json_wrapper_begin(r, wb, format, options, 1, temp_rd, chart_label_key); + rrdr_json_wrapper_begin(r, wb, format, options, 1, context_param_list, chart_label_key); rrdr2ssv(r, wb, options, "", " ", ""); rrdr_json_wrapper_end(r, wb, format, options, 1); } @@ -251,7 +262,7 @@ int rrdset2anything_api_v1( case DATASOURCE_SSV_COMMA: if(options & RRDR_OPTION_JSON_WRAP) { wb->contenttype = CT_APPLICATION_JSON; - rrdr_json_wrapper_begin(r, wb, format, options, 1, temp_rd, chart_label_key); + rrdr_json_wrapper_begin(r, wb, format, options, 1, context_param_list, chart_label_key); rrdr2ssv(r, wb, options, "", ",", ""); rrdr_json_wrapper_end(r, wb, format, options, 1); } @@ -264,7 +275,7 @@ int rrdset2anything_api_v1( case DATASOURCE_JS_ARRAY: if(options & RRDR_OPTION_JSON_WRAP) { wb->contenttype = CT_APPLICATION_JSON; - rrdr_json_wrapper_begin(r, wb, format, options, 0, temp_rd, chart_label_key); + rrdr_json_wrapper_begin(r, wb, format, options, 0, context_param_list, chart_label_key); rrdr2ssv(r, wb, options, "[", ",", "]"); rrdr_json_wrapper_end(r, wb, format, options, 0); } @@ -277,7 +288,7 @@ int rrdset2anything_api_v1( case DATASOURCE_CSV: if(options & RRDR_OPTION_JSON_WRAP) { wb->contenttype = CT_APPLICATION_JSON; - rrdr_json_wrapper_begin(r, wb, format, options, 1, temp_rd, chart_label_key); + rrdr_json_wrapper_begin(r, wb, format, options, 1, context_param_list, chart_label_key); rrdr2csv(r, wb, format, options, "", ",", "\\n", "", temp_rd); rrdr_json_wrapper_end(r, wb, format, options, 1); } @@ -290,7 +301,7 @@ int rrdset2anything_api_v1( case DATASOURCE_CSV_MARKDOWN: if(options & RRDR_OPTION_JSON_WRAP) { wb->contenttype = CT_APPLICATION_JSON; - rrdr_json_wrapper_begin(r, wb, format, options, 1, temp_rd, chart_label_key); + rrdr_json_wrapper_begin(r, wb, format, options, 1, context_param_list, chart_label_key); rrdr2csv(r, wb, format, options, "", "|", "\\n", "", temp_rd); rrdr_json_wrapper_end(r, wb, format, options, 1); } @@ -303,7 +314,7 @@ int rrdset2anything_api_v1( case DATASOURCE_CSV_JSON_ARRAY: wb->contenttype = CT_APPLICATION_JSON; if(options & RRDR_OPTION_JSON_WRAP) { - rrdr_json_wrapper_begin(r, wb, format, options, 0, temp_rd, chart_label_key); + rrdr_json_wrapper_begin(r, wb, format, options, 0, context_param_list, chart_label_key); buffer_strcat(wb, "[\n"); rrdr2csv(r, wb, format, options + RRDR_OPTION_LABEL_QUOTES, "[", ",", "]", ",\n", temp_rd); buffer_strcat(wb, "\n]"); @@ -320,7 +331,7 @@ int rrdset2anything_api_v1( case DATASOURCE_TSV: if(options & RRDR_OPTION_JSON_WRAP) { wb->contenttype = CT_APPLICATION_JSON; - rrdr_json_wrapper_begin(r, wb, format, options, 1, temp_rd, chart_label_key); + rrdr_json_wrapper_begin(r, wb, format, options, 1, context_param_list, chart_label_key); rrdr2csv(r, wb, format, options, "", "\t", "\\n", "", temp_rd); rrdr_json_wrapper_end(r, wb, format, options, 1); } @@ -333,7 +344,7 @@ int rrdset2anything_api_v1( case DATASOURCE_HTML: if(options & RRDR_OPTION_JSON_WRAP) { wb->contenttype = CT_APPLICATION_JSON; - rrdr_json_wrapper_begin(r, wb, format, options, 1, temp_rd, chart_label_key); + rrdr_json_wrapper_begin(r, wb, format, options, 1, context_param_list, chart_label_key); buffer_strcat(wb, "\\n
\\n\\n"); rrdr2csv(r, wb, format, options, "\\n", "", temp_rd); buffer_strcat(wb, "
", "", "
\\n
\\n\\n"); @@ -351,9 +362,9 @@ int rrdset2anything_api_v1( wb->contenttype = CT_APPLICATION_X_JAVASCRIPT; if(options & RRDR_OPTION_JSON_WRAP) - rrdr_json_wrapper_begin(r, wb, format, options, 0, temp_rd, chart_label_key); + rrdr_json_wrapper_begin(r, wb, format, options, 0, context_param_list, chart_label_key); - rrdr2json(r, wb, options, 1, temp_rd); + rrdr2json(r, wb, options, 1, context_param_list); if(options & RRDR_OPTION_JSON_WRAP) rrdr_json_wrapper_end(r, wb, format, options, 0); @@ -363,9 +374,9 @@ int rrdset2anything_api_v1( wb->contenttype = CT_APPLICATION_JSON; if(options & RRDR_OPTION_JSON_WRAP) - rrdr_json_wrapper_begin(r, wb, format, options, 0, temp_rd, chart_label_key); + rrdr_json_wrapper_begin(r, wb, format, options, 0, context_param_list, chart_label_key); - rrdr2json(r, wb, options, 1, temp_rd); + rrdr2json(r, wb, options, 1, context_param_list); if(options & RRDR_OPTION_JSON_WRAP) rrdr_json_wrapper_end(r, wb, format, options, 0); @@ -374,9 +385,9 @@ int rrdset2anything_api_v1( case DATASOURCE_JSONP: wb->contenttype = CT_APPLICATION_X_JAVASCRIPT; if(options & RRDR_OPTION_JSON_WRAP) - rrdr_json_wrapper_begin(r, wb, format, options, 0, temp_rd, chart_label_key); + rrdr_json_wrapper_begin(r, wb, format, options, 0, context_param_list, chart_label_key); - rrdr2json(r, wb, options, 0, temp_rd); + rrdr2json(r, wb, options, 0, context_param_list); if(options & RRDR_OPTION_JSON_WRAP) rrdr_json_wrapper_end(r, wb, format, options, 0); @@ -387,9 +398,9 @@ int rrdset2anything_api_v1( wb->contenttype = CT_APPLICATION_JSON; if(options & RRDR_OPTION_JSON_WRAP) - rrdr_json_wrapper_begin(r, wb, format, options, 0, temp_rd, chart_label_key); + rrdr_json_wrapper_begin(r, wb, format, options, 0, context_param_list, chart_label_key); - rrdr2json(r, wb, options, 0, temp_rd); + rrdr2json(r, wb, options, 0, context_param_list); if(options & RRDR_OPTION_JSON_WRAP) rrdr_json_wrapper_end(r, wb, format, options, 0); diff --git a/web/api/formatters/value/value.c b/web/api/formatters/value/value.c index aea6c162b..a69af6165 100644 --- a/web/api/formatters/value/value.c +++ b/web/api/formatters/value/value.c @@ -4,7 +4,8 @@ inline calculated_number rrdr2value(RRDR *r, long i, RRDR_OPTIONS options, int *all_values_are_null) { - rrdset_check_rdlock(r->st); + if (r->st_needs_lock) + rrdset_check_rdlock(r->st); long c; RRDDIM *d; diff --git a/web/api/queries/query.c b/web/api/queries/query.c index 663e4bd14..2a27a94fc 100644 --- a/web/api/queries/query.c +++ b/web/api/queries/query.c @@ -281,8 +281,14 @@ RRDR_GROUPING web_client_api_request_v1_data_group(const char *name, RRDR_GROUPI // ---------------------------------------------------------------------------- -static void rrdr_disable_not_selected_dimensions(RRDR *r, RRDR_OPTIONS options, const char *dims, RRDDIM *temp_rd) { - rrdset_check_rdlock(r->st); +static void rrdr_disable_not_selected_dimensions(RRDR *r, RRDR_OPTIONS options, const char *dims, + struct context_param *context_param_list) +{ + RRDDIM *temp_rd = context_param_list ? context_param_list->rd : NULL; + int should_lock = (!context_param_list || !(context_param_list->flags & CONTEXT_FLAGS_ARCHIVE)); + + if (should_lock) + rrdset_check_rdlock(r->st); if(unlikely(!dims || !*dims || (dims[0] == '*' && dims[1] == '\0'))) return; @@ -758,8 +764,8 @@ static int rrdr_convert_before_after_to_absolute( } // allow relative for before (smaller than API_RELATIVE_TIME_MAX) - if(abs(before_requested) <= API_RELATIVE_TIME_MAX) { - if(abs(before_requested) % update_every) { + if(ABS(before_requested) <= API_RELATIVE_TIME_MAX) { + if(ABS(before_requested) % update_every) { // make sure it is multiple of st->update_every if(before_requested < 0) before_requested = before_requested - update_every - before_requested % update_every; @@ -772,9 +778,9 @@ static int rrdr_convert_before_after_to_absolute( } // allow relative for after (smaller than API_RELATIVE_TIME_MAX) - if(abs(after_requested) <= API_RELATIVE_TIME_MAX) { + if(ABS(after_requested) <= API_RELATIVE_TIME_MAX) { if(after_requested == 0) after_requested = -update_every; - if(abs(after_requested) % update_every) { + if(ABS(after_requested) % update_every) { // make sure it is multiple of st->update_every if(after_requested < 0) after_requested = after_requested - update_every - after_requested % update_every; else after_requested = after_requested + update_every - after_requested % update_every; @@ -1060,10 +1066,11 @@ static RRDR *rrd2rrdr_fixedstep( // ------------------------------------------------------------------------- // disable the not-wanted dimensions - rrdset_check_rdlock(st); + if (context_param_list && !(context_param_list->flags & CONTEXT_FLAGS_ARCHIVE)) + rrdset_check_rdlock(st); if(dimensions) - rrdr_disable_not_selected_dimensions(r, options, dimensions, temp_rd); + rrdr_disable_not_selected_dimensions(r, options, dimensions, context_param_list); // ------------------------------------------------------------------------- @@ -1435,11 +1442,11 @@ static RRDR *rrd2rrdr_variablestep( // ------------------------------------------------------------------------- // disable the not-wanted dimensions - - rrdset_check_rdlock(st); + if (context_param_list && !(context_param_list->flags & CONTEXT_FLAGS_ARCHIVE)) + rrdset_check_rdlock(st); if(dimensions) - rrdr_disable_not_selected_dimensions(r, options, dimensions, temp_rd); + rrdr_disable_not_selected_dimensions(r, options, dimensions, context_param_list); // ------------------------------------------------------------------------- @@ -1591,8 +1598,12 @@ RRDR *rrd2rrdr( if (first_entry_t > after_requested) first_entry_t = after_requested; - if (context_param_list) + if (context_param_list && !(context_param_list->flags & CONTEXT_FLAGS_ARCHIVE)) { rebuild_context_param_list(context_param_list, after_requested); + st = context_param_list->rd ? context_param_list->rd->rrdset : NULL; + if (unlikely(!st)) + return NULL; + } #ifdef ENABLE_DBENGINE if (st->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) { diff --git a/web/api/queries/rrdr.c b/web/api/queries/rrdr.c index ef237fa02..b64868222 100644 --- a/web/api/queries/rrdr.c +++ b/web/api/queries/rrdr.c @@ -100,7 +100,7 @@ inline void rrdr_free(RRDR *r) RRDR *rrdr_create(struct rrdset *st, long n, struct context_param *context_param_list) { - if(unlikely(!st)) { + if (unlikely(!st)) { error("NULL value given!"); return NULL; } @@ -108,7 +108,10 @@ RRDR *rrdr_create(struct rrdset *st, long n, struct context_param *context_param RRDR *r = callocz(1, sizeof(RRDR)); r->st = st; - rrdr_lock_rrdset(r); + if (!context_param_list || !(context_param_list->flags & CONTEXT_FLAGS_ARCHIVE)) { + rrdr_lock_rrdset(r); + r->st_needs_lock = 1; + } RRDDIM *temp_rd = context_param_list ? context_param_list->rd : NULL; RRDDIM *rd; diff --git a/web/api/queries/rrdr.h b/web/api/queries/rrdr.h index 4d349c305..d95c10857 100644 --- a/web/api/queries/rrdr.h +++ b/web/api/queries/rrdr.h @@ -73,6 +73,7 @@ typedef struct rrdresult { time_t after; int has_st_lock; // if st is read locked by us + uint8_t st_needs_lock; // if ST should be locked // internal rrd2rrdr() members below this point struct { diff --git a/web/api/web_api_v1.c b/web/api/web_api_v1.c index 73ac15d30..1d8217bbd 100644 --- a/web/api/web_api_v1.c +++ b/web/api/web_api_v1.c @@ -276,6 +276,7 @@ inline int web_client_api_request_v1_alarm_count(RRDHOST *host, struct web_clien inline int web_client_api_request_v1_alarm_log(RRDHOST *host, struct web_client *w, char *url) { uint32_t after = 0; + char *chart = NULL; while(url) { char *value = mystrsep(&url, "&"); @@ -285,12 +286,13 @@ inline int web_client_api_request_v1_alarm_log(RRDHOST *host, struct web_client if(!name || !*name) continue; if(!value || !*value) continue; - if(!strcmp(name, "after")) after = (uint32_t)strtoul(value, NULL, 0); + if (!strcmp(name, "after")) after = (uint32_t)strtoul(value, NULL, 0); + else if (!strcmp(name, "chart")) chart = value; } buffer_flush(w->response.data); w->response.data->contenttype = CT_APPLICATION_JSON; - health_alarm_log2json(host, w->response.data, after); + health_alarm_log2json(host, w->response.data, after, chart); return HTTP_RESP_OK; } @@ -511,6 +513,10 @@ inline int web_client_api_request_v1_data(RRDHOST *host, struct web_client *w, c rrdhost_unlock(host); if (likely(context_param_list && context_param_list->rd)) // Just set the first one st = context_param_list->rd->rrdset; + else { + if (!chart_label_key) + sql_build_context_param_list(&context_param_list, host, context, NULL); + } } else { st = rrdset_find(host, chart); @@ -518,6 +524,17 @@ inline int web_client_api_request_v1_data(RRDHOST *host, struct web_client *w, c st = rrdset_find_byname(host, chart); if (likely(st)) st->last_accessed_time = now_realtime_sec(); + else + sql_build_context_param_list(&context_param_list, host, NULL, chart); + } + + if (!st) { + if (likely(context_param_list && context_param_list->rd && context_param_list->rd->rrdset)) + st = context_param_list->rd->rrdset; + else { + free_context_param_list(&context_param_list); + context_param_list = NULL; + } } if (!st && !context_param_list) { @@ -961,6 +978,11 @@ inline int web_client_api_request_v1_info_fill_buffer(RRDHOST *host, BUFFER *wb) #ifdef ENABLE_ACLK buffer_strcat(wb, "\t\"cloud-available\": true,\n"); +#ifdef ACLK_NG + buffer_strcat(wb, "\t\"aclk-implementation\": \"Next Generation\",\n"); +#else + buffer_strcat(wb, "\t\"aclk-implementation\": \"legacy\",\n"); +#endif #else buffer_strcat(wb, "\t\"cloud-available\": false,\n"); #endif diff --git a/web/gui/dashboard_info.js b/web/gui/dashboard_info.js index df15a6372..d5d7c693c 100644 --- a/web/gui/dashboard_info.js +++ b/web/gui/dashboard_info.js @@ -582,11 +582,19 @@ netdataDashboard.menu = { icon: '', info: 'Charts showing alarm status over time. More details here.' }, + 'statsd': { title: 'StatsD', icon: '', info:'StatsD is an industry-standard technology stack for monitoring applications and instrumenting any piece of software to deliver custom metrics. Netdata allows the user to organize the metrics in different charts and visualize any application metric easily. Read more on Netdata Learn.' - } + }, + + 'supervisord': { + title: 'Supervisord', + icon: '', + info: 'Detailed statistics for each group of processes controlled by Supervisor. ' + + 'Netdata collects these metrics using getAllProcessInfo method.' + }, }; @@ -1010,6 +1018,38 @@ netdataDashboard.context = { info: 'Transparent HugePages (THP) is backing virtual memory with huge pages, supporting automatic promotion and demotion of page sizes. It works for all applications for anonymous memory mappings and tmpfs/shmem.' }, + 'mem.cachestat_ratio': { + info: 'When the processor needs to read or write a location in main memory, it checks for a corresponding entry in the page cache. If the entry is there, a page cache hit has occurred and the read is from the cache. If the entry is not there, a page cache miss has occurred and the kernel allocates a new entry and copies in data from the disk. Netdata calculates the percentage of accessed files that are cached on memory. The ratio is calculated counting the accessed cached pages (without counting dirty pages and pages added because of read misses) divided by total access without dirty pages. The algorithm will not plot data when ratio is zero and our dashboard will interpolate the plot. ' + }, + + 'mem.cachestat_dirties': { + info: 'Number of dirty(modified) pages cache. Pages in the page cache modified after being brought in are called dirty pages. Since non-dirty pages in the page cache have identical copies in secondary storage (e.g. hard disk drive or solid-state drive), discarding and reusing their space is much quicker than paging out application memory, and is often preferred over flushing the dirty pages into secondary storage and reusing their space.' + }, + + 'mem.cachestat_hits': { + info: 'When the processor needs to read or write a location in main memory, it checks for a corresponding entry in the page cache. If the entry is there, a page cache hit has occurred and the read is from the cache. Hits show pages accessed that were not modified (we are excluding dirty pages), this counting also excludes the recent pages inserted for read.' + }, + + 'mem.cachestat_misses': { + info: 'When the processor needs to read or write a location in main memory, it checks for a corresponding entry in the page cache. If the entry is not there, a page cache miss has occurred and the cache allocates a new entry and copies in data for the main memory. Misses count page insertions to the memory not related to writing.' + }, + + 'mem.sync': { + info: 'System calls for sync() and syncfs() which flush the file system buffers to storage devices. Performance perturbations might be caused by these calls. The sync() calls are based on the eBPF syncsnoop from BCC tools.' + }, + + 'mem.file_sync': { + info: 'System calls for fsync() and fdatasync() transfer all modified page caches for the files on disk devices. These calls block until the device reports that the transfer has been completed.' + }, + + 'mem.memory_map': { + info: 'System calls for msync() which flushes changes made to the in-core copy of a file that was mapped.' + }, + + 'mem.file_segment': { + info: 'System calls for sync_file_range() permits fine control when synchronizing the open file referred to by the file descriptor fd with disk. This system call is extremely dangerous and should not be used in portable programs.' + }, + // ------------------------------------------------------------------------ // network interfaces @@ -1017,6 +1057,18 @@ netdataDashboard.context = { info: 'Packets that have been dropped at the network interface level. These are the same counters reported by ifconfig as RX dropped (inbound) and TX dropped (outbound). inbound packets can be dropped at the network interface level due to softnet backlog overflow, bad / unintented VLAN tags, unknown or unregistered protocols, IPv6 frames when the server is not configured for IPv6. Check this document for more information.' }, + 'net.duplex': { + info: 'State map: 0 - unknown, 1 - half duplex, 2 - full duplex' + }, + + 'net.operstate': { + info: 'State map: 0 - unknown, 1 - notpresent, 2 - down, 3 - lowerlayerdown, 4 - testing, 5 - dormant, 6 - up' + }, + + 'net.carrier': { + info: 'State map: 0 - down, 1 - up' + }, + // ------------------------------------------------------------------------ // IP @@ -1119,12 +1171,12 @@ netdataDashboard.context = { }, 'apps.file_closed': { - info: 'Calls to the internal function __close_fd, which is called from' + + info: 'Calls to the internal function __close_fd or close_fd according to your kernel version, which is called from' + ' close(2). ' }, 'apps.file_close_error': { - info: 'Failed calls to the internal function __close_fd.' + info: 'Failed calls to the internal function __close_fd or close_fd according to your kernel version.' }, 'apps.file_deleted': { @@ -1342,6 +1394,11 @@ netdataDashboard.context = { info: 'Disk Utilization measures the amount of time the disk was busy with something. This is not related to its performance. 100% means that the system always had an outstanding operation on the disk. Keep in mind that depending on the underlying technology of the disk, 100% here may or may not be an indication of congestion.' }, + 'disk.busy': { + colors: '#FF5588', + info: 'Disk Busy Time measures the amount of time the disk was busy with something.' + }, + 'disk.backlog': { colors: '#0099CC', info: 'Backlog is an indication of the duration of pending disk operations. On every I/O event the system is multiplying the time spent doing I/O since the last update of this field with the number of pending operations. While not accurate, this metric can provide an indication of the expected completion time of the operations in progress.' @@ -2306,7 +2363,7 @@ netdataDashboard.context = { }, 'web_log.squid_transport_errors': { - info: 'These tags are optional and describe some error conditions which occured during response delivery (if any). ' + + info: 'These tags are optional and describe some error conditions which occurred during response delivery (if any). ' + 'ABORTED when the response was not completed due to the connection being aborted (usually by the client). ' + 'TIMEOUT, when the response was not completed due to a connection timeout.' }, @@ -3063,7 +3120,7 @@ netdataDashboard.context = { }, 'squidlog.cache_code_error_tag_requests': { - info: 'These tags are optional and describe some error conditions which occured during response delivery.
' + + info: 'These tags are optional and describe some error conditions which occurred during response delivery.
' + '
    ' + '
  • ABORTED the response was not completed due to the connection being aborted (usually by the client).
  • ' + '
  • TIMEOUT the response was not completed due to a connection timeout.
  • ' + @@ -3304,7 +3361,7 @@ netdataDashboard.context = { info: 'Calls for internal functions on Linux kernel. The open dimension is attached to the kernel internal function do_sys_open ( For kernels newer than 5.5.19 we add a kprobe to do_sys_openat2. ), which is the common function called from'+ ' open(2) ' + ' and openat(2). ' + - ' The close dimension is attached to the function __close_fd, which is called from system call' + + ' The close dimension is attached to the function __close_fd or close_fd according to your kernel version, which is called from system call' + ' close(2). ' }, @@ -3313,7 +3370,7 @@ netdataDashboard.context = { info: 'Failed calls to the kernel internal function do_sys_open ( For kernels newer than 5.5.19 we add a kprobe to do_sys_openat2. ), which is the common function called from'+ ' open(2) ' + ' and openat(2). ' + - ' The close dimension is attached to the function __close_fd, which is called from system call' + + ' The close dimension is attached to the function __close_fd or close_fd according to your kernel version, which is called from system call' + ' close(2). ' }, @@ -3707,6 +3764,7 @@ netdataDashboard.context = { + ' data-chart-library="easypiechart"' + ' data-title="Fan Speed"' + ' data-units="percentage"' + + ' data-easypiechart-max-value="100"' + ' data-gauge-adjust="width"' + ' data-width="12%"' + ' data-before="0"' @@ -3778,4 +3836,13 @@ netdataDashboard.context = { } ] }, + + // ------------------------------------------------------------------------ + // Supervisor + + 'supervisord.process_state_code': { + info: 'Process states map: ' + + '0 - stopped, 10 - starting, 20 - running, 30 - backoff,' + + '40 - stopping, 100 - exited, 200 - fatal, 1000 - unknown.' + }, }; diff --git a/web/gui/main.js b/web/gui/main.js index 5bf11e5f7..dc9a5f7f0 100644 --- a/web/gui/main.js +++ b/web/gui/main.js @@ -668,13 +668,13 @@ function renderMachines(machinesArray) { if (machines) { html += ( `` ) } else { html += ( `` ) } @@ -812,7 +812,7 @@ function renderMyNetdataMenu(machinesArray) { ` ) @@ -1815,8 +1815,8 @@ function renderPage(menus, data) { const isMemoryModeDbEngine = data.memory_mode === "dbengine"; - sidebar += '
  • Add more charts
  • '; - sidebar += '
  • Add more alarms
  • '; + sidebar += '
  • Add more charts
  • '; + sidebar += '
  • Add more alarms
  • '; sidebar += '
  • Every ' + ((data.update_every === 1) ? 'second' : data.update_every.toString() + ' seconds') + ', ' + 'Netdata collects ' + data.dimensions_count.toLocaleString() + ' metrics on ' + @@ -1828,7 +1828,7 @@ function renderPage(menus, data) { if (!isMemoryModeDbEngine) { sidebar += '
     
    Get more history by ' + - 'configuring Netdata\'s history or using the DB engine.'; + 'configuring Netdata\'s history or using the DB engine.'; } sidebar += '
     
    netdata
    ' + data.version.toString() + '
  • '; @@ -3052,7 +3052,7 @@ function notifyForUpdate(force) { versionLog('

    You already have the latest netdata!

    No update yet?
    We probably need some motivation to keep going on!

    If you haven\'t already, give netdata a at its github page.

    '); } else { save = true; - var compare = 'https://docs.netdata.cloud/changelog/'; + var compare = 'https://learn.netdata.cloud/docs/agent/changelog/'; versionLog('

    New version of netdata available!

    Latest version: ' + sha2 + '

    Click here for the changes log and
    click here for directions on updating your netdata installation.

    We suggest to review the changes log for new features you may be interested, or important bug fixes you may need.
    Keeping your netdata updated is generally a good idea.

    '); document.getElementById('update_badge').innerHTML = '!'; diff --git a/web/server/web_client.c b/web/server/web_client.c index f0856fb17..5e3de38d5 100644 --- a/web/server/web_client.c +++ b/web/server/web_client.c @@ -55,7 +55,7 @@ static inline int web_client_uncrock_socket(struct web_client *w) { return 0; } -static inline char *strip_control_characters(char *url) { +char *strip_control_characters(char *url) { char *s = url; if(!s) return ""; @@ -1374,34 +1374,25 @@ static inline int web_client_switch_host(RRDHOST *host, struct web_client *w, ch uint32_t hash = simple_hash(tok); host = rrdhost_find_by_hostname(tok, hash); - if(!host) host = rrdhost_find_by_guid(tok, hash); - -#ifdef ENABLE_DBENGINE - int release_host = 0; + if (!host) + host = rrdhost_find_by_guid(tok, hash); if (!host) { host = sql_create_host_by_uuid(tok); if (likely(host)) { - rrdhost_flag_set(host, RRDHOST_FLAG_ARCHIVED); - release_host = 1; - } - } - if(host) { - int rc = web_client_process_url(host, w, url); - if (release_host) { + int rc = web_client_process_url(host, w, url); freez(host->hostname); - freez((char *) host->os); - freez((char *) host->tags); - freez((char *) host->timezone); + freez((char *)host->os); + freez((char *)host->tags); + freez((char *)host->timezone); freez(host->program_name); freez(host->program_version); freez(host->registry_hostname); + freez(host->system_info); freez(host); + return rc; } - return rc; } -#else if (host) return web_client_process_url(host, w, url); -#endif } buffer_flush(w->response.data); diff --git a/web/server/web_client.h b/web/server/web_client.h index 48bf1ac86..4580b9749 100644 --- a/web/server/web_client.h +++ b/web/server/web_client.h @@ -211,6 +211,7 @@ extern void buffer_data_options2string(BUFFER *wb, uint32_t options); extern int mysendfile(struct web_client *w, char *filename); extern void web_client_build_http_header(struct web_client *w); +extern char *strip_control_characters(char *url); #include "daemon/common.h" -- cgit v1.2.3