diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-05 11:19:16 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-05 12:07:37 +0000 |
commit | b485aab7e71c1625cfc27e0f92c9509f42378458 (patch) | |
tree | ae9abe108601079d1679194de237c9a435ae5b55 /collectors/cgroups.plugin | |
parent | Adding upstream version 1.44.3. (diff) | |
download | netdata-upstream.tar.xz netdata-upstream.zip |
Adding upstream version 1.45.3+dfsg. (refs: upstream/1.45.3+dfsg, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r-- | collectors/cgroups.plugin/Makefile.am | 23 | ||||
-rw-r--r-- | collectors/cgroups.plugin/README.md | 302 | ||||
-rw-r--r-- | collectors/cgroups.plugin/metadata.yaml | 1022 | ||||
-rw-r--r-- | src/collectors/cgroups.plugin/cgroup-charts.c (renamed from collectors/cgroups.plugin/cgroup-charts.c) | 2 | ||||
-rw-r--r-- | src/collectors/cgroups.plugin/cgroup-discovery.c (renamed from collectors/cgroups.plugin/cgroup-discovery.c) | 94 | ||||
-rw-r--r-- | src/collectors/cgroups.plugin/cgroup-internals.h (renamed from collectors/cgroups.plugin/cgroup-internals.h) | 21 | ||||
-rwxr-xr-x | src/collectors/cgroups.plugin/cgroup-name.sh.in (renamed from collectors/cgroups.plugin/cgroup-name.sh.in) | 0 | ||||
-rwxr-xr-x | src/collectors/cgroups.plugin/cgroup-network-helper.sh.in (renamed from collectors/cgroups.plugin/cgroup-network-helper.sh.in) | 0 | ||||
-rw-r--r-- | src/collectors/cgroups.plugin/cgroup-network.c (renamed from collectors/cgroups.plugin/cgroup-network.c) | 3 | ||||
-rw-r--r-- | src/collectors/cgroups.plugin/cgroup-top.c (renamed from collectors/cgroups.plugin/cgroup-top.c) | 52 | ||||
-rw-r--r-- | src/collectors/cgroups.plugin/integrations/containers.md (renamed from collectors/cgroups.plugin/integrations/containers.md) | 14 | ||||
-rw-r--r-- | src/collectors/cgroups.plugin/integrations/kubernetes_containers.md (renamed from collectors/cgroups.plugin/integrations/kubernetes_containers.md) | 14 | ||||
-rw-r--r-- | src/collectors/cgroups.plugin/integrations/libvirt_containers.md (renamed from collectors/cgroups.plugin/integrations/libvirt_containers.md) | 14 | ||||
-rw-r--r-- | src/collectors/cgroups.plugin/integrations/lxc_containers.md (renamed from collectors/cgroups.plugin/integrations/lxc_containers.md) | 14 | ||||
-rw-r--r-- | src/collectors/cgroups.plugin/integrations/ovirt_containers.md (renamed from collectors/cgroups.plugin/integrations/ovirt_containers.md) | 14 | ||||
-rw-r--r-- | src/collectors/cgroups.plugin/integrations/proxmox_containers.md (renamed from collectors/cgroups.plugin/integrations/proxmox_containers.md) | 14 | ||||
-rw-r--r-- | src/collectors/cgroups.plugin/integrations/systemd_services.md (renamed from collectors/cgroups.plugin/integrations/systemd_services.md) | 6 | ||||
-rw-r--r-- | src/collectors/cgroups.plugin/integrations/virtual_machines.md (renamed from collectors/cgroups.plugin/integrations/virtual_machines.md) | 14 | ||||
-rw-r--r-- | src/collectors/cgroups.plugin/sys_fs_cgroup.c (renamed from collectors/cgroups.plugin/sys_fs_cgroup.c) | 116 | ||||
-rw-r--r-- | src/collectors/cgroups.plugin/sys_fs_cgroup.h (renamed from collectors/cgroups.plugin/sys_fs_cgroup.h) | 0 | ||||
-rw-r--r-- | src/collectors/cgroups.plugin/tests/test_cgroups_plugin.c (renamed from collectors/cgroups.plugin/tests/test_cgroups_plugin.c) | 0 | ||||
-rw-r--r-- | src/collectors/cgroups.plugin/tests/test_cgroups_plugin.h (renamed from collectors/cgroups.plugin/tests/test_cgroups_plugin.h) | 0 | ||||
-rw-r--r-- | src/collectors/cgroups.plugin/tests/test_doubles.c (renamed from collectors/cgroups.plugin/tests/test_doubles.c) | 4 |
23 files changed, 196 insertions, 1547 deletions
diff --git a/collectors/cgroups.plugin/Makefile.am b/collectors/cgroups.plugin/Makefile.am deleted file mode 100644 index 0f6062420..000000000 --- a/collectors/cgroups.plugin/Makefile.am +++ /dev/null @@ -1,23 +0,0 @@ -# SPDX-License-Identifier: GPL-3.0-or-later - -AUTOMAKE_OPTIONS = subdir-objects -MAINTAINERCLEANFILES = $(srcdir)/Makefile.in - -CLEANFILES = \ - cgroup-name.sh \ - cgroup-network-helper.sh \ - $(NULL) - -include $(top_srcdir)/build/subst.inc -SUFFIXES = .in - -dist_plugins_SCRIPTS = \ - cgroup-name.sh \ - cgroup-network-helper.sh \ - $(NULL) - -dist_noinst_DATA = \ - cgroup-name.sh.in \ - cgroup-network-helper.sh.in \ - README.md \ - $(NULL) diff --git a/collectors/cgroups.plugin/README.md b/collectors/cgroups.plugin/README.md deleted file mode 100644 index ba6a20e5e..000000000 --- a/collectors/cgroups.plugin/README.md +++ /dev/null @@ -1,302 +0,0 @@ -<!-- -title: "Monitor Cgroups (cgroups.plugin)" -custom_edit_url: "https://github.com/netdata/netdata/edit/master/collectors/cgroups.plugin/README.md" -sidebar_label: "Monitor Cgroups" -learn_status: "Published" -learn_topic_type: "References" -learn_rel_path: "Integrations/Monitor/Virtualized environments/Containers" ---> - -# Monitor Cgroups (cgroups.plugin) - -You can monitor containers and virtual machines using **cgroups**. - -cgroups (or control groups), are a Linux kernel feature that provides accounting and resource usage limiting for -processes. When cgroups are bundled with namespaces (i.e. isolation), they form what we usually call **containers**. - -cgroups are hierarchical, meaning that cgroups can contain child cgroups, which can contain more cgroups, etc. All -accounting is reported (and resource usage limits are applied) also in a hierarchical way. - -To visualize cgroup metrics Netdata provides configuration for cherry picking the cgroups of interest. 
By default ( -without any configuration) Netdata should pick **systemd services**, all kinds of **containers** (lxc, docker, etc) -and **virtual machines** spawn by managers that register them with cgroups (qemu, libvirt, etc). - -## Configuring Netdata for cgroups - -In general, no additional settings are required. Netdata discovers all available cgroups on the host system and -collects their metrics. - -### How Netdata finds the available cgroups - -Linux exposes resource usage reporting and provides dynamic configuration for cgroups, using virtual files (usually) -under `/sys/fs/cgroup`. Netdata reads `/proc/self/mountinfo` to detect the exact mount point of cgroups. Netdata also -allows manual configuration of this mount point, using these settings: - -```text -[plugin:cgroups] - check for new cgroups every = 10 - path to /sys/fs/cgroup/cpuacct = /sys/fs/cgroup/cpuacct - path to /sys/fs/cgroup/blkio = /sys/fs/cgroup/blkio - path to /sys/fs/cgroup/memory = /sys/fs/cgroup/memory - path to /sys/fs/cgroup/devices = /sys/fs/cgroup/devices -``` - -Netdata rescans these directories for added or removed cgroups every `check for new cgroups every` seconds. - -### Hierarchical search for cgroups - -Since cgroups are hierarchical, for each of the directories shown above, Netdata walks through the subdirectories -recursively searching for cgroups (each subdirectory is another cgroup). 
- -To provide a sane default for this setting, Netdata uses the following pattern list (patterns starting with `!` give a -negative match and their order is important: the first matching a path will be used): - -```text -[plugin:cgroups] - search for cgroups in subpaths matching = !*/init.scope !*-qemu !/init.scope !/system !/systemd !/user !/user.slice * -``` - -So, we disable checking for **child cgroups** in systemd internal -cgroups ([systemd services are monitored by Netdata](#monitoring-systemd-services)), user cgroups (normally used for -desktop and remote user sessions), qemu virtual machines (child cgroups of virtual machines) and `init.scope`. All -others are enabled. - -### Unified cgroups (cgroups v2) support - -Netdata automatically detects cgroups version. If detection fails Netdata assumes v1. -To switch to v2 manually add: - -```text -[plugin:cgroups] - use unified cgroups = yes - path to unified cgroups = /sys/fs/cgroup -``` - -Unified cgroups use same name pattern matching as v1 cgroups. `cgroup_enable_systemd_services_detailed_memory` is -currently unsupported when using unified cgroups. 
- -### Enabled cgroups - -To provide a sane default, Netdata uses the -following [pattern list](https://github.com/netdata/netdata/blob/master/libnetdata/simple_pattern/README.md): - -- Checks the pattern against the path of the cgroup - - ```text - [plugin:cgroups] - enable by default cgroups matching = !*/init.scope *.scope !*/vcpu* !*/emulator !*.mount !*.partition !*.service !*.slice !*.swap !*.user !/ !/docker !/libvirt !/lxc !/lxc/*/ns !/lxc/*/ns/* !/machine !/qemu !/system !/systemd !/user * - ``` - -- Checks the pattern against the name of the cgroup (as you see it on the dashboard) - - ```text - [plugin:cgroups] - enable by default cgroups names matching = * - ``` - -Renaming is configured with the following options: - -```text -[plugin:cgroups] - run script to rename cgroups matching = *.scope *docker* *lxc* *qemu* !/ !*.mount !*.partition !*.service !*.slice !*.swap !*.user * - script to get cgroup names = /usr/libexec/netdata/plugins.d/cgroup-name.sh -``` - -The whole point for the additional pattern list, is to limit the number of times the script will be called. Without this -pattern list, the script might be called thousands of times, depending on the number of cgroups available in the system. - -The above pattern list is matched against the path of the cgroup. For matched cgroups, Netdata calls the -script [cgroup-name.sh](https://raw.githubusercontent.com/netdata/netdata/master/collectors/cgroups.plugin/cgroup-name.sh) -to get its name. This script queries `docker`, `kubectl`, `podman`, or applies heuristics to find give a name for the -cgroup. - -#### Note on Podman container names - -Podman's security model is a lot more restrictive than Docker's, so Netdata will not be able to detect container names -out of the box unless they were started by the same user as Netdata itself. - -If Podman is used in "rootful" mode, it's also possible to use `podman system service` to grant Netdata access to -container names. 
To do this, ensure `podman system service` is running and Netdata has access -to `/run/podman/podman.sock` (the default permissions as specified by upstream are `0600`, with owner `root`, so you -will have to adjust the configuration). - -[Docker Socket Proxy (HAProxy)](https://github.com/Tecnativa/docker-socket-proxy) or [CetusGuard](https://github.com/hectorm/cetusguard) -can also be used to give Netdata restricted access to the socket. Note that `PODMAN_HOST` in Netdata's environment should -be set to the proxy's URL in this case. - -### Charts with zero metrics - -By default, Netdata will enable monitoring metrics only when they are not zero. If they are constantly zero they are -ignored. Metrics that will start having values, after Netdata is started, will be detected and charts will be -automatically added to the dashboard (a refresh of the dashboard is needed for them to appear though). Set `yes` for a -chart instead of `auto` to enable it permanently. For example: - -```text -[plugin:cgroups] - enable memory (used mem including cache) = yes -``` - -You can also set the `enable zero metrics` option to `yes` in the `[global]` section which enables charts with zero -metrics for all internal Netdata plugins. - -### Alerts - -CPU and memory limits are watched and used to rise alerts. Memory usage for every cgroup is checked against `ram` -and `ram+swap` limits. CPU usage for every cgroup is checked against `cpuset.cpus` and `cpu.cfs_period_us` + `cpu.cfs_quota_us` pair assigned for the cgroup. Configuration for the alerts is available in `health.d/cgroups.conf` -file. - -## Monitoring systemd services - -Netdata monitors **systemd services**. 
Example: - -![image](https://cloud.githubusercontent.com/assets/2662304/21964372/20cd7b84-db53-11e6-98a2-b9c986b082c0.png) - -Support per distribution: - -| system | charts shown | `/sys/fs/cgroup` tree | comments | -|:----------------:|:------------:|:------------------------------------:|:--------------------------| -| Arch Linux | YES | | | -| Gentoo | NO | | can be enabled, see below | -| Ubuntu 16.04 LTS | YES | | | -| Ubuntu 16.10 | YES | [here](http://pastebin.com/PiWbQEXy) | | -| Fedora 25 | YES | [here](http://pastebin.com/ax0373wF) | | -| Debian 8 | NO | | can be enabled, see below | -| AMI | NO | [here](http://pastebin.com/FrxmptjL) | not a systemd system | -| CentOS 7.3.1611 | NO | [here](http://pastebin.com/SpzgezAg) | can be enabled, see below | - -### Monitored systemd service metrics - -- CPU utilization -- Used memory -- RSS memory -- Mapped memory -- Cache memory -- Writeback memory -- Memory minor page faults -- Memory major page faults -- Memory charging activity -- Memory uncharging activity -- Memory limit failures -- Swap memory used -- Disk read bandwidth -- Disk write bandwidth -- Disk read operations -- Disk write operations -- Throttle disk read bandwidth -- Throttle disk write bandwidth -- Throttle disk read operations -- Throttle disk write operations -- Queued disk read operations -- Queued disk write operations -- Merged disk read operations -- Merged disk write operations - -### How to enable cgroup accounting on systemd systems that is by default disabled - -You can verify there is no accounting enabled, by running `systemd-cgtop`. The program will show only resources for -cgroup `/`, but all services will show nothing. 
- -To enable cgroup accounting, execute this: - -```sh -sed -e 's|^#Default\(.*\)Accounting=.*$|Default\1Accounting=yes|g' /etc/systemd/system.conf >/tmp/system.conf -``` - -To see the changes it made, run this: - -```sh -# diff /etc/systemd/system.conf /tmp/system.conf -40,44c40,44 -< #DefaultCPUAccounting=no -< #DefaultIOAccounting=no -< #DefaultBlockIOAccounting=no -< #DefaultMemoryAccounting=no -< #DefaultTasksAccounting=yes ---- -> DefaultCPUAccounting=yes -> DefaultIOAccounting=yes -> DefaultBlockIOAccounting=yes -> DefaultMemoryAccounting=yes -> DefaultTasksAccounting=yes -``` - -If you are happy with the changes, run: - -```sh -# copy the file to the right location -sudo cp /tmp/system.conf /etc/systemd/system.conf - -# restart systemd to take it into account -sudo systemctl daemon-reexec -``` - -(`systemctl daemon-reload` does not reload the configuration of the server - so you have to -execute `systemctl daemon-reexec`). - -Now, when you run `systemd-cgtop`, services will start reporting usage (if it does not, restart any service to wake it up). Refresh your Netdata dashboard, and you will have the charts too. - -In case memory accounting is missing, you will need to enable it at your kernel, by appending the following kernel boot -options and rebooting: - -```sh -cgroup_enable=memory swapaccount=1 -``` - -You can add the above, directly at the `linux` line in your `/boot/grub/grub.cfg` or appending them to -the `GRUB_CMDLINE_LINUX` in `/etc/default/grub` (in which case you will have to run `update-grub` before rebooting). On -DigitalOcean debian images you may have to set it at `/etc/default/grub.d/50-cloudimg-settings.cfg`. 
- -Which systemd services are monitored by Netdata is determined by the following pattern list: - -```text -[plugin:cgroups] - cgroups to match as systemd services = !/system.slice/*/*.service /system.slice/*.service -``` - -- - - - -## Monitoring ephemeral containers - -Netdata monitors containers automatically when it is installed at the host, or when it is installed in a container that -has access to the `/proc` and `/sys` filesystems of the host. - -Network interfaces and cgroups (containers) are self-cleaned. When a network interface or container stops, Netdata might log -a few errors in error.log complaining about files it cannot find, but immediately: - -1. It will detect this is a removed container or network interface -2. It will freeze/pause all alerts for them -3. It will mark their charts as obsolete -4. Obsolete charts are not be offered on new dashboard sessions (so hit F5 and the charts are gone) -5. Existing dashboard sessions will continue to see them, but of course they will not refresh -6. Obsolete charts will be removed from memory, 1 hour after the last user viewed them (configurable - with `[global].cleanup obsolete charts after seconds = 3600` (at `netdata.conf`). -7. 
When obsolete charts are removed from memory they are also deleted from disk (configurable - with `[global].delete obsolete charts files = yes`) - -### Monitored container metrics - -- CPU usage -- CPU usage within the limits -- CPU usage per core -- Memory usage -- Writeback memory -- Memory activity -- Memory page faults -- Used memory -- Used RAM within the limits -- Memory utilization -- Memory limit failures -- I/O bandwidth (all disks) -- Serviced I/O operations (all disks) -- Throttle I/O bandwidth (all disks) -- Throttle serviced I/O operations (all disks) -- Queued I/O operations (all disks) -- Merged I/O operations (all disks) -- CPU pressure -- Memory pressure -- Memory full pressure -- I/O pressure -- I/O full pressure - -Network interfaces are monitored by means of -the [proc plugin](https://github.com/netdata/netdata/blob/master/collectors/proc.plugin/README.md#monitored-network-interface-metrics). diff --git a/collectors/cgroups.plugin/metadata.yaml b/collectors/cgroups.plugin/metadata.yaml deleted file mode 100644 index a1abbb5a9..000000000 --- a/collectors/cgroups.plugin/metadata.yaml +++ /dev/null @@ -1,1022 +0,0 @@ -plugin_name: cgroups.plugin -modules: - - &module - meta: &meta - plugin_name: cgroups.plugin - module_name: /sys/fs/cgroup - monitored_instance: - name: Containers - link: "" - categories: - - data-collection.containers-and-vms - icon_filename: container.svg - related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - containers - most_popular: true - overview: &overview - data_collection: &data_collection - metrics_description: "Monitor Containers for performance, resource usage, and health status." 
- method_description: "" - supported_platforms: - include: [] - exclude: [] - multi_instance: true - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: - - name: cgroup_10min_cpu_usage - link: https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf - metric: cgroup.cpu_limit - info: average cgroup CPU utilization over the last 10 minutes - - name: cgroup_ram_in_use - link: https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf - metric: cgroup.mem_usage - info: cgroup memory utilization - - name: cgroup_1m_received_packets_rate - link: https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf - metric: cgroup.net_packets - info: average number of packets received by the network interface ${label:device} over the last minute - - name: cgroup_10s_received_packets_storm - link: https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf - metric: cgroup.net_packets - info: - ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over - the last minute - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: cgroup - description: "" - labels: - - name: container_name - description: The container name or group path if name resolution fails. - - name: image - description: Docker/Podman container image name. 
- metrics: - - name: cgroup.cpu_limit - description: CPU Usage within the limits - unit: "percentage" - chart_type: line - dimensions: - - name: used - - name: cgroup.cpu - description: CPU Usage (100% = 1 core) - unit: "percentage" - chart_type: stacked - dimensions: - - name: user - - name: system - - name: cgroup.cpu_per_core - description: CPU Usage (100% = 1 core) Per Core - unit: "percentage" - chart_type: line - dimensions: - - name: a dimension per core - - name: cgroup.throttled - description: CPU Throttled Runnable Periods - unit: "percentage" - chart_type: line - dimensions: - - name: throttled - - name: cgroup.throttled_duration - description: CPU Throttled Time Duration - unit: "ms" - chart_type: line - dimensions: - - name: duration - - name: cgroup.cpu_shares - description: CPU Time Relative Share - unit: "shares" - chart_type: line - dimensions: - - name: shares - - name: cgroup.mem - description: Memory Usage - unit: "MiB" - chart_type: stacked - dimensions: - - name: cache - - name: rss - - name: swap - - name: rss_huge - - name: mapped_file - - name: cgroup.writeback - description: Writeback Memory - unit: "MiB" - chart_type: area - dimensions: - - name: dirty - - name: writeback - - name: cgroup.mem_activity - description: Memory Activity - unit: "MiB/s" - chart_type: line - dimensions: - - name: in - - name: out - - name: cgroup.pgfaults - description: Memory Page Faults - unit: "MiB/s" - chart_type: line - dimensions: - - name: pgfault - - name: swap - - name: cgroup.mem_usage - description: Used Memory - unit: "MiB" - chart_type: stacked - dimensions: - - name: ram - - name: swap - - name: cgroup.mem_usage_limit - description: Used RAM within the limits - unit: "MiB" - chart_type: stacked - dimensions: - - name: available - - name: used - - name: cgroup.mem_utilization - description: Memory Utilization - unit: "percentage" - chart_type: line - dimensions: - - name: utilization - - name: cgroup.mem_failcnt - description: Memory Limit Failures 
- unit: "count" - chart_type: line - dimensions: - - name: failures - - name: cgroup.io - description: I/O Bandwidth (all disks) - unit: "KiB/s" - chart_type: area - dimensions: - - name: read - - name: write - - name: cgroup.serviced_ops - description: Serviced I/O Operations (all disks) - unit: "operations/s" - chart_type: line - dimensions: - - name: read - - name: write - - name: cgroup.throttle_io - description: Throttle I/O Bandwidth (all disks) - unit: "KiB/s" - chart_type: area - dimensions: - - name: read - - name: write - - name: cgroup.throttle_serviced_ops - description: Throttle Serviced I/O Operations (all disks) - unit: "operations/s" - chart_type: line - dimensions: - - name: read - - name: write - - name: cgroup.queued_ops - description: Queued I/O Operations (all disks) - unit: "operations" - chart_type: line - dimensions: - - name: read - - name: write - - name: cgroup.merged_ops - description: Merged I/O Operations (all disks) - unit: "operations/s" - chart_type: line - dimensions: - - name: read - - name: write - - name: cgroup.cpu_some_pressure - description: CPU some pressure - unit: "percentage" - chart_type: line - dimensions: - - name: some10 - - name: some60 - - name: some300 - - name: cgroup.cpu_some_pressure_stall_time - description: CPU some pressure stall time - unit: "ms" - chart_type: line - dimensions: - - name: time - - name: cgroup.cpu_full_pressure - description: CPU full pressure - unit: "percentage" - chart_type: line - dimensions: - - name: some10 - - name: some60 - - name: some300 - - name: cgroup.cpu_full_pressure_stall_time - description: CPU full pressure stall time - unit: "ms" - chart_type: line - dimensions: - - name: time - - name: cgroup.memory_some_pressure - description: Memory some pressure - unit: "percentage" - chart_type: line - dimensions: - - name: some10 - - name: some60 - - name: some300 - - name: cgroup.memory_some_pressure_stall_time - description: Memory some pressure stall time - unit: "ms" - 
chart_type: line - dimensions: - - name: time - - name: cgroup.memory_full_pressure - description: Memory full pressure - unit: "percentage" - chart_type: line - dimensions: - - name: some10 - - name: some60 - - name: some300 - - name: cgroup.memory_full_pressure_stall_time - description: Memory full pressure stall time - unit: "ms" - chart_type: line - dimensions: - - name: time - - name: cgroup.io_some_pressure - description: I/O some pressure - unit: "percentage" - chart_type: line - dimensions: - - name: some10 - - name: some60 - - name: some300 - - name: cgroup.io_some_pressure_stall_time - description: I/O some pressure stall time - unit: "ms" - chart_type: line - dimensions: - - name: time - - name: cgroup.io_full_pressure - description: I/O some pressure - unit: "percentage" - chart_type: line - dimensions: - - name: some10 - - name: some60 - - name: some300 - - name: cgroup.io_full_pressure_stall_time - description: I/O some pressure stall time - unit: "ms" - chart_type: line - dimensions: - - name: time - - name: cgroup.pids_current - description: Number of processes - unit: "pids" - chart_type: line - dimensions: - - name: pids - - name: cgroup network device - description: "" - labels: - - name: container_name - description: The container name or group path if name resolution fails. - - name: image - description: Docker/Podman container image name. - - name: device - description: "The name of the host network interface linked to the container's network interface." - - name: container_device - description: Container network interface name. - - name: interface_type - description: 'Network interface type. Always "virtual" for the containers.' 
- metrics: - - name: cgroup.net_net - description: Bandwidth - unit: "kilobits/s" - chart_type: area - dimensions: - - name: received - - name: sent - - name: cgroup.net_packets - description: Packets - unit: "pps" - chart_type: line - dimensions: - - name: received - - name: sent - - name: multicast - - name: cgroup.net_errors - description: Interface Errors - unit: "errors/s" - chart_type: line - dimensions: - - name: inbound - - name: outbound - - name: cgroup.net_drops - description: Interface Drops - unit: "errors/s" - chart_type: line - dimensions: - - name: inbound - - name: outbound - - name: cgroup.net_fifo - description: Interface FIFO Buffer Errors - unit: "errors/s" - chart_type: line - dimensions: - - name: receive - - name: transmit - - name: cgroup.net_compressed - description: Interface FIFO Buffer Errors - unit: "pps" - chart_type: line - dimensions: - - name: receive - - name: sent - - name: cgroup.net_events - description: Network Interface Events - unit: "events/s" - chart_type: line - dimensions: - - name: frames - - name: collisions - - name: carrier - - name: cgroup.net_operstate - description: Interface Operational State - unit: "state" - chart_type: line - dimensions: - - name: up - - name: down - - name: notpresent - - name: lowerlayerdown - - name: testing - - name: dormant - - name: unknown - - name: cgroup.net_carrier - description: Interface Physical Link State - unit: "state" - chart_type: line - dimensions: - - name: up - - name: down - - name: cgroup.net_mtu - description: Interface MTU - unit: "octets" - chart_type: line - dimensions: - - name: mtu - - <<: *module - meta: - <<: *meta - monitored_instance: - name: Kubernetes Containers - link: https://kubernetes.io/ - icon_filename: kubernetes.svg - categories: - #- data-collection.containers-and-vms - - data-collection.kubernetes - keywords: - - k8s - - kubernetes - - pods - - containers - overview: - <<: *overview - data-collection: - <<: *data_collection - metrics_description: 
Monitor Kubernetes Clusters for performance, resource usage, and health status. - alerts: - - name: k8s_cgroup_10min_cpu_usage - link: https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf - metric: k8s.cgroup.cpu_limit - info: average cgroup CPU utilization over the last 10 minutes - - name: k8s_cgroup_ram_in_use - link: https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf - metric: k8s.cgroup.mem_usage - info: cgroup memory utilization - - name: k8s_cgroup_1m_received_packets_rate - link: https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf - metric: k8s.cgroup.net_packets - info: average number of packets received by the network interface ${label:device} over the last minute - - name: k8s_cgroup_10s_received_packets_storm - link: https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf - metric: k8s.cgroup.net_packets - info: - ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over - the last minute - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: k8s cgroup - description: These metrics refer to the Pod container. - labels: - - name: k8s_node_name - description: 'Node name. The value of _pod.spec.nodeName_.' - - name: k8s_namespace - description: 'Namespace name. The value of _pod.metadata.namespace_.' - - name: k8s_controller_kind - description: 'Controller kind (ReplicaSet, DaemonSet, StatefulSet, Job, etc.). The value of _pod.OwnerReferences.Controller.Kind_.' - - name: k8s_controller_name - description: 'Controller name.The value of _pod.OwnerReferences.Controller.Name_.' - - name: k8s_pod_name - description: 'Pod name. The value of _pod.metadata.name_.' - - name: k8s_container_name - description: 'Container name. The value of _pod.spec.containers.name_.' - - name: k8s_kind - description: 'Instance kind: "pod" or "container".' 
- - name: k8s_qos_class - description: 'QoS class (guaranteed, burstable, besteffort).' - - name: k8s_cluster_id - description: 'Cluster ID. The value of kube-system namespace _namespace.metadata.uid_.' - metrics: - - name: k8s.cgroup.cpu_limit - description: CPU Usage within the limits - unit: "percentage" - chart_type: line - dimensions: - - name: used - - name: k8s.cgroup.cpu - description: CPU Usage (100% = 1000 mCPU) - unit: "percentage" - chart_type: stacked - dimensions: - - name: user - - name: system - - name: k8s.cgroup.cpu_per_core - description: CPU Usage (100% = 1000 mCPU) Per Core - unit: "percentage" - chart_type: line - dimensions: - - name: a dimension per core - - name: k8s.cgroup.throttled - description: CPU Throttled Runnable Periods - unit: "percentage" - chart_type: line - dimensions: - - name: throttled - - name: k8s.cgroup.throttled_duration - description: CPU Throttled Time Duration - unit: "ms" - chart_type: line - dimensions: - - name: duration - - name: k8s.cgroup.cpu_shares - description: CPU Time Relative Share - unit: "shares" - chart_type: line - dimensions: - - name: shares - - name: k8s.cgroup.mem - description: Memory Usage - unit: "MiB" - chart_type: stacked - dimensions: - - name: cache - - name: rss - - name: swap - - name: rss_huge - - name: mapped_file - - name: k8s.cgroup.writeback - description: Writeback Memory - unit: "MiB" - chart_type: area - dimensions: - - name: dirty - - name: writeback - - name: k8s.cgroup.mem_activity - description: Memory Activity - unit: "MiB/s" - chart_type: line - dimensions: - - name: in - - name: out - - name: k8s.cgroup.pgfaults - description: Memory Page Faults - unit: "MiB/s" - chart_type: line - dimensions: - - name: pgfault - - name: swap - - name: k8s.cgroup.mem_usage - description: Used Memory - unit: "MiB" - chart_type: stacked - dimensions: - - name: ram - - name: swap - - name: k8s.cgroup.mem_usage_limit - description: Used RAM within the limits - unit: "MiB" - chart_type: stacked - 
dimensions: - - name: available - - name: used - - name: k8s.cgroup.mem_utilization - description: Memory Utilization - unit: "percentage" - chart_type: line - dimensions: - - name: utilization - - name: k8s.cgroup.mem_failcnt - description: Memory Limit Failures - unit: "count" - chart_type: line - dimensions: - - name: failures - - name: k8s.cgroup.io - description: I/O Bandwidth (all disks) - unit: "KiB/s" - chart_type: area - dimensions: - - name: read - - name: write - - name: k8s.cgroup.serviced_ops - description: Serviced I/O Operations (all disks) - unit: "operations/s" - chart_type: line - dimensions: - - name: read - - name: write - - name: k8s.cgroup.throttle_io - description: Throttle I/O Bandwidth (all disks) - unit: "KiB/s" - chart_type: area - dimensions: - - name: read - - name: write - - name: k8s.cgroup.throttle_serviced_ops - description: Throttle Serviced I/O Operations (all disks) - unit: "operations/s" - chart_type: line - dimensions: - - name: read - - name: write - - name: k8s.cgroup.queued_ops - description: Queued I/O Operations (all disks) - unit: "operations" - chart_type: line - dimensions: - - name: read - - name: write - - name: k8s.cgroup.merged_ops - description: Merged I/O Operations (all disks) - unit: "operations/s" - chart_type: line - dimensions: - - name: read - - name: write - - name: k8s.cgroup.cpu_some_pressure - description: CPU some pressure - unit: "percentage" - chart_type: line - dimensions: - - name: some10 - - name: some60 - - name: some300 - - name: k8s.cgroup.cpu_some_pressure_stall_time - description: CPU some pressure stall time - unit: "ms" - chart_type: line - dimensions: - - name: time - - name: k8s.cgroup.cpu_full_pressure - description: CPU full pressure - unit: "percentage" - chart_type: line - dimensions: - - name: some10 - - name: some60 - - name: some300 - - name: k8s.cgroup.cpu_full_pressure_stall_time - description: CPU full pressure stall time - unit: "ms" - chart_type: line - dimensions: - - name: 
time - - name: k8s.cgroup.memory_some_pressure - description: Memory some pressure - unit: "percentage" - chart_type: line - dimensions: - - name: some10 - - name: some60 - - name: some300 - - name: k8s.cgroup.memory_some_pressure_stall_time - description: Memory some pressure stall time - unit: "ms" - chart_type: line - dimensions: - - name: time - - name: k8s.cgroup.memory_full_pressure - description: Memory full pressure - unit: "percentage" - chart_type: line - dimensions: - - name: some10 - - name: some60 - - name: some300 - - name: k8s.cgroup.memory_full_pressure_stall_time - description: Memory full pressure stall time - unit: "ms" - chart_type: line - dimensions: - - name: time - - name: k8s.cgroup.io_some_pressure - description: I/O some pressure - unit: "percentage" - chart_type: line - dimensions: - - name: some10 - - name: some60 - - name: some300 - - name: k8s.cgroup.io_some_pressure_stall_time - description: I/O some pressure stall time - unit: "ms" - chart_type: line - dimensions: - - name: time - - name: k8s.cgroup.io_full_pressure - description: I/O some pressure - unit: "percentage" - chart_type: line - dimensions: - - name: some10 - - name: some60 - - name: some300 - - name: k8s.cgroup.io_full_pressure_stall_time - description: I/O some pressure stall time - unit: "ms" - chart_type: line - dimensions: - - name: time - - name: k8s.cgroup.pids_current - description: Number of processes - unit: "pids" - chart_type: line - dimensions: - - name: pids - - name: k8s cgroup network device - description: These metrics refer to the Pod container network interface. - labels: - - name: device - description: "The name of the host network interface linked to the container's network interface." - - name: container_device - description: Container network interface name. - - name: interface_type - description: 'Network interface type. Always "virtual" for the containers.' - - name: k8s_node_name - description: 'Node name. The value of _pod.spec.nodeName_.' 
- - name: k8s_namespace - description: 'Namespace name. The value of _pod.metadata.namespace_.' - - name: k8s_controller_kind - description: 'Controller kind (ReplicaSet, DaemonSet, StatefulSet, Job, etc.). The value of _pod.OwnerReferences.Controller.Kind_.' - - name: k8s_controller_name - description: 'Controller name.The value of _pod.OwnerReferences.Controller.Name_.' - - name: k8s_pod_name - description: 'Pod name. The value of _pod.metadata.name_.' - - name: k8s_container_name - description: 'Container name. The value of _pod.spec.containers.name_.' - - name: k8s_kind - description: 'Instance kind: "pod" or "container".' - - name: k8s_qos_class - description: 'QoS class (guaranteed, burstable, besteffort).' - - name: k8s_cluster_id - description: 'Cluster ID. The value of kube-system namespace _namespace.metadata.uid_.' - metrics: - - name: k8s.cgroup.net_net - description: Bandwidth - unit: "kilobits/s" - chart_type: area - dimensions: - - name: received - - name: sent - - name: k8s.cgroup.net_packets - description: Packets - unit: "pps" - chart_type: line - dimensions: - - name: received - - name: sent - - name: multicast - - name: k8s.cgroup.net_errors - description: Interface Errors - unit: "errors/s" - chart_type: line - dimensions: - - name: inbound - - name: outbound - - name: k8s.cgroup.net_drops - description: Interface Drops - unit: "errors/s" - chart_type: line - dimensions: - - name: inbound - - name: outbound - - name: k8s.cgroup.net_fifo - description: Interface FIFO Buffer Errors - unit: "errors/s" - chart_type: line - dimensions: - - name: receive - - name: transmit - - name: k8s.cgroup.net_compressed - description: Interface FIFO Buffer Errors - unit: "pps" - chart_type: line - dimensions: - - name: receive - - name: sent - - name: k8s.cgroup.net_events - description: Network Interface Events - unit: "events/s" - chart_type: line - dimensions: - - name: frames - - name: collisions - - name: carrier - - name: k8s.cgroup.net_operstate - 
description: Interface Operational State - unit: "state" - chart_type: line - dimensions: - - name: up - - name: down - - name: notpresent - - name: lowerlayerdown - - name: testing - - name: dormant - - name: unknown - - name: k8s.cgroup.net_carrier - description: Interface Physical Link State - unit: "state" - chart_type: line - dimensions: - - name: up - - name: down - - name: k8s.cgroup.net_mtu - description: Interface MTU - unit: "octets" - chart_type: line - dimensions: - - name: mtu - - <<: *module - meta: - <<: *meta - monitored_instance: - name: Systemd Services - link: "" - icon_filename: systemd.svg - categories: - - data-collection.systemd - keywords: - - systemd - - services - overview: - <<: *overview - data-collection: - <<: *data_collection - metrics_desctiption: "Monitor Systemd Services for performance, resource usage, and health status." - alerts: [] - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: systemd service - description: "" - labels: - - name: service_name - description: Service name - metrics: - - name: systemd.service.cpu.utilization - description: Systemd Services CPU utilization (100% = 1 core) - unit: percentage - chart_type: stacked - dimensions: - - name: user - - name: system - - name: systemd.service.memory.usage - description: Systemd Services Used Memory - unit: MiB - chart_type: stacked - dimensions: - - name: ram - - name: swap - - name: systemd.service.memory.failcnt - description: Systemd Services Memory Limit Failures - unit: failures/s - chart_type: line - dimensions: - - name: fail - - name: systemd.service.memory.ram.usage - description: Systemd Services Memory - unit: MiB - chart_type: stacked - dimensions: - - name: rss - - name: cache - - name: mapped_file - - name: rss_huge - - name: systemd.service.memory.writeback - description: Systemd Services Writeback Memory - unit: MiB - chart_type: stacked - dimensions: - - name: writeback - - name: dirty - - 
name: systemd.service.memory.paging.faults - description: Systemd Services Memory Minor and Major Page Faults - unit: MiB/s - chart_type: area - dimensions: - - name: minor - - name: major - - name: systemd.service.memory.paging.io - description: Systemd Services Memory Paging IO - unit: MiB/s - chart_type: area - dimensions: - - name: in - - name: out - - name: systemd.service.disk.io - description: Systemd Services Disk Read/Write Bandwidth - unit: KiB/s - chart_type: area - dimensions: - - name: read - - name: write - - name: systemd.service.disk.iops - description: Systemd Services Disk Read/Write Operations - unit: operations/s - chart_type: line - dimensions: - - name: read - - name: write - - name: systemd.service.disk.throttle.io - description: Systemd Services Throttle Disk Read/Write Bandwidth - unit: KiB/s - chart_type: area - dimensions: - - name: read - - name: write - - name: systemd.service.disk.throttle.iops - description: Systemd Services Throttle Disk Read/Write Operations - unit: operations/s - chart_type: line - dimensions: - - name: read - - name: write - - name: systemd.service.disk.queued_iops - description: Systemd Services Queued Disk Read/Write Operations - unit: operations/s - chart_type: line - dimensions: - - name: read - - name: write - - name: systemd.service.disk.merged_iops - description: Systemd Services Merged Disk Read/Write Operations - unit: operations/s - chart_type: line - dimensions: - - name: read - - name: write - - name: systemd.service.pids.current - description: Systemd Services Number of Processes - unit: pids - chart_type: line - dimensions: - - name: pids - - <<: *module - meta: - <<: *meta - monitored_instance: - name: Virtual Machines - link: "" - icon_filename: container.svg - categories: - - data-collection.containers-and-vms - keywords: - - vms - - virtualization - - container - overview: - <<: *overview - data_collection: - <<: *data_collection - metrics_description: "Monitor Virtual Machines for performance, 
resource usage, and health status." - - <<: *module - meta: - <<: *meta - monitored_instance: - name: LXC Containers - link: "" - icon_filename: lxc.png - categories: - - data-collection.containers-and-vms - keywords: - - lxc - - lxd - - container - overview: - <<: *overview - data_collection: - <<: *data_collection - metrics_description: "Monitor LXC Containers for performance, resource usage, and health status." - - <<: *module - meta: - <<: *meta - monitored_instance: - name: Libvirt Containers - link: "" - icon_filename: libvirt.png - categories: - - data-collection.containers-and-vms - keywords: - - libvirt - - container - overview: - <<: *overview - data_collection: - <<: *data_collection - metrics_description: "Monitor Libvirt for performance, resource usage, and health status." - - <<: *module - meta: - <<: *meta - monitored_instance: - name: oVirt Containers - link: "" - icon_filename: ovirt.svg - categories: - - data-collection.containers-and-vms - keywords: - - ovirt - - container - overview: - <<: *overview - data_collection: - <<: *data_collection - metrics_description: "Monitor oVirt for performance, resource usage, and health status." - - <<: *module - meta: - <<: *meta - monitored_instance: - name: Proxmox Containers - link: "" - icon_filename: proxmox.png - categories: - - data-collection.containers-and-vms - keywords: - - proxmox - - container - overview: - <<: *overview - data_collection: - <<: *data_collection - metrics_description: "Monitor Proxmox for performance, resource usage, and health status." 
diff --git a/collectors/cgroups.plugin/cgroup-charts.c b/src/collectors/cgroups.plugin/cgroup-charts.c index a89e8ac45..1a337cd99 100644 --- a/collectors/cgroups.plugin/cgroup-charts.c +++ b/src/collectors/cgroups.plugin/cgroup-charts.c @@ -96,7 +96,7 @@ void update_cpu_utilization_limit_chart(struct cgroup *cg, NETDATA_DOUBLE cpu_li cg->prev_cpu_usage = cpu_usage; - rrdsetvar_custom_chart_variable_set(cg->st_cpu, cg->chart_var_cpu_limit, cpu_limit); + rrdvar_chart_variable_set(cg->st_cpu, cg->chart_var_cpu_limit, cpu_limit); rrdset_done(chart); } diff --git a/collectors/cgroups.plugin/cgroup-discovery.c b/src/collectors/cgroups.plugin/cgroup-discovery.c index ede35ed8a..e5d029cfb 100644 --- a/collectors/cgroups.plugin/cgroup-discovery.c +++ b/src/collectors/cgroups.plugin/cgroup-discovery.c @@ -25,6 +25,10 @@ char cgroup_chart_id_prefix[] = "cgroup_"; char services_chart_id_prefix[] = "systemd_"; char *cgroups_rename_script = NULL; +// Shared memory with information from detected cgroups +netdata_ebpf_cgroup_shm_t shm_cgroup_ebpf = {NULL, NULL}; +int shm_fd_cgroup_ebpf = -1; +sem_t *shm_mutex_cgroup_ebpf = SEM_FAILED; // ---------------------------------------------------------------------------- @@ -42,7 +46,7 @@ static inline void cgroup_free_network_interfaces(struct cgroup *cg) { cg->interfaces = i->next; // delete the registration of proc_net_dev rename - netdev_rename_device_del(i->host_device); + cgroup_rename_task_device_del(i->host_device); freez((void *)i->host_device); freez((void *)i->container_device); @@ -1027,6 +1031,82 @@ static int discovery_is_cgroup_duplicate(struct cgroup *cg) { } // ---------------------------------------------------------------------------- +// ebpf shared memory + +static void netdata_cgroup_ebpf_set_values(size_t length) +{ + sem_wait(shm_mutex_cgroup_ebpf); + + shm_cgroup_ebpf.header->cgroup_max = cgroup_root_max; + shm_cgroup_ebpf.header->systemd_enabled = cgroup_enable_systemd_services | + 
cgroup_enable_systemd_services_detailed_memory | + cgroup_used_memory; + shm_cgroup_ebpf.header->body_length = length; + + sem_post(shm_mutex_cgroup_ebpf); +} + +static void netdata_cgroup_ebpf_initialize_shm() +{ + shm_fd_cgroup_ebpf = shm_open(NETDATA_SHARED_MEMORY_EBPF_CGROUP_NAME, O_CREAT | O_RDWR, 0660); + if (shm_fd_cgroup_ebpf < 0) { + collector_error("Cannot initialize shared memory used by cgroup and eBPF, integration won't happen."); + return; + } + + size_t length = sizeof(netdata_ebpf_cgroup_shm_header_t) + cgroup_root_max * sizeof(netdata_ebpf_cgroup_shm_body_t); + if (ftruncate(shm_fd_cgroup_ebpf, length)) { + collector_error("Cannot set size for shared memory."); + goto end_init_shm; + } + + shm_cgroup_ebpf.header = (netdata_ebpf_cgroup_shm_header_t *) mmap(NULL, length, + PROT_READ | PROT_WRITE, MAP_SHARED, + shm_fd_cgroup_ebpf, 0); + + if (unlikely(MAP_FAILED == shm_cgroup_ebpf.header)) { + shm_cgroup_ebpf.header = NULL; + collector_error("Cannot map shared memory used between cgroup and eBPF, integration won't happen"); + goto end_init_shm; + } + shm_cgroup_ebpf.body = (netdata_ebpf_cgroup_shm_body_t *) ((char *)shm_cgroup_ebpf.header + + sizeof(netdata_ebpf_cgroup_shm_header_t)); + + shm_mutex_cgroup_ebpf = sem_open(NETDATA_NAMED_SEMAPHORE_EBPF_CGROUP_NAME, O_CREAT, + S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH, 1); + + if (shm_mutex_cgroup_ebpf != SEM_FAILED) { + netdata_cgroup_ebpf_set_values(length); + return; + } + + collector_error("Cannot create semaphore, integration between eBPF and cgroup won't happen"); + munmap(shm_cgroup_ebpf.header, length); + shm_cgroup_ebpf.header = NULL; + + end_init_shm: + close(shm_fd_cgroup_ebpf); + shm_fd_cgroup_ebpf = -1; + shm_unlink(NETDATA_SHARED_MEMORY_EBPF_CGROUP_NAME); +} + +static void cgroup_cleanup_ebpf_integration() +{ + if (shm_mutex_cgroup_ebpf != SEM_FAILED) { + sem_close(shm_mutex_cgroup_ebpf); + } + + if (shm_cgroup_ebpf.header) { + shm_cgroup_ebpf.header->cgroup_root_count = 0; + 
munmap(shm_cgroup_ebpf.header, shm_cgroup_ebpf.header->body_length); + } + + if (shm_fd_cgroup_ebpf > 0) { + close(shm_fd_cgroup_ebpf); + } +} + +// ---------------------------------------------------------------------------- // cgroup network interfaces #define CGROUP_NETWORK_INTERFACE_MAX_LINE 2048 @@ -1084,8 +1164,13 @@ static inline void read_cgroup_network_interfaces(struct cgroup *cg) { collector_info("CGROUP: cgroup '%s' has network interface '%s' as '%s'", cg->id, i->host_device, i->container_device); // register a device rename to proc_net_dev.c - netdev_rename_device_add(i->host_device, i->container_device, cg->chart_id, cg->chart_labels, - k8s_is_kubepod(cg) ? "k8s." : "", cgroup_netdev_get(cg)); + cgroup_rename_task_add( + i->host_device, + i->container_device, + cg->chart_id, + cg->chart_labels, + k8s_is_kubepod(cg) ? "k8s." : "", + cgroup_netdev_get(cg)); } } @@ -1226,6 +1311,8 @@ void cgroup_discovery_worker(void *ptr) service_register(SERVICE_THREAD_TYPE_LIBUV, NULL, NULL, NULL, false); + netdata_cgroup_ebpf_initialize_shm(); + while (service_running(SERVICE_COLLECTORS)) { worker_is_idle(); @@ -1239,6 +1326,7 @@ void cgroup_discovery_worker(void *ptr) discovery_find_all_cgroups(); } collector_info("discovery thread stopped"); + cgroup_cleanup_ebpf_integration(); worker_unregister(); service_exits(); __atomic_store_n(&discovery_thread.exited,1,__ATOMIC_RELAXED); diff --git a/collectors/cgroups.plugin/cgroup-internals.h b/src/collectors/cgroups.plugin/cgroup-internals.h index a69802240..1f5be7707 100644 --- a/collectors/cgroups.plugin/cgroup-internals.h +++ b/src/collectors/cgroups.plugin/cgroup-internals.h @@ -287,16 +287,16 @@ struct cgroup { char *filename_cpu_cfs_quota; unsigned long long cpu_cfs_quota; - const RRDSETVAR_ACQUIRED *chart_var_cpu_limit; + const RRDVAR_ACQUIRED *chart_var_cpu_limit; NETDATA_DOUBLE prev_cpu_usage; char *filename_memory_limit; unsigned long long memory_limit; - const RRDSETVAR_ACQUIRED *chart_var_memory_limit; + const 
RRDVAR_ACQUIRED *chart_var_memory_limit; char *filename_memoryswap_limit; unsigned long long memoryswap_limit; - const RRDSETVAR_ACQUIRED *chart_var_memoryswap_limit; + const RRDVAR_ACQUIRED *chart_var_memoryswap_limit; const DICTIONARY_ITEM *cgroup_netdev_link; @@ -386,10 +386,6 @@ extern uint32_t throttled_time_hash; extern uint32_t throttled_usec_hash; extern struct cgroup *cgroup_root; -extern netdata_ebpf_cgroup_shm_t shm_cgroup_ebpf; -extern int shm_fd_cgroup_ebpf; -extern sem_t *shm_mutex_cgroup_ebpf; - enum cgroups_type { CGROUPS_AUTODETECT_FAIL, CGROUPS_V1, CGROUPS_V2 }; enum cgroups_systemd_setting { @@ -452,15 +448,10 @@ static inline char *cgroup_chart_type(char *buffer, struct cgroup *cg) { } #define RRDFUNCTIONS_CGTOP_HELP "View running containers" +#define RRDFUNCTIONS_SYSTEMD_SERVICES_HELP "View systemd services" -int cgroup_function_cgroup_top(BUFFER *wb, int timeout, const char *function, void *collector_data, - rrd_function_result_callback_t result_cb, void *result_cb_data, - rrd_function_is_cancelled_cb_t is_cancelled_cb, void *is_cancelled_cb_data, - rrd_function_register_canceller_cb_t register_canceller_cb, void *register_canceller_cb_data); -int cgroup_function_systemd_top(BUFFER *wb, int timeout, const char *function, void *collector_data, - rrd_function_result_callback_t result_cb, void *result_cb_data, - rrd_function_is_cancelled_cb_t is_cancelled_cb, void *is_cancelled_cb_data, - rrd_function_register_canceller_cb_t register_canceller_cb, void *register_canceller_cb_data); +int cgroup_function_cgroup_top(BUFFER *wb, const char *function); +int cgroup_function_systemd_top(BUFFER *wb, const char *function); void cgroup_netdev_link_init(void); const DICTIONARY_ITEM *cgroup_netdev_get(struct cgroup *cg); diff --git a/collectors/cgroups.plugin/cgroup-name.sh.in b/src/collectors/cgroups.plugin/cgroup-name.sh.in index 0f8b63256..0f8b63256 100755 --- a/collectors/cgroups.plugin/cgroup-name.sh.in +++ 
b/src/collectors/cgroups.plugin/cgroup-name.sh.in diff --git a/collectors/cgroups.plugin/cgroup-network-helper.sh.in b/src/collectors/cgroups.plugin/cgroup-network-helper.sh.in index da9b9162a..da9b9162a 100755 --- a/collectors/cgroups.plugin/cgroup-network-helper.sh.in +++ b/src/collectors/cgroups.plugin/cgroup-network-helper.sh.in diff --git a/collectors/cgroups.plugin/cgroup-network.c b/src/collectors/cgroups.plugin/cgroup-network.c index 508ea07c6..085a6aa6f 100644 --- a/collectors/cgroups.plugin/cgroup-network.c +++ b/src/collectors/cgroups.plugin/cgroup-network.c @@ -7,7 +7,6 @@ #ifndef _GNU_SOURCE #define _GNU_SOURCE /* See feature_test_macros(7) */ #endif -#include <sched.h> #endif char env_netdata_host_prefix[FILENAME_MAX + 50] = ""; @@ -183,7 +182,7 @@ int proc_pid_fd(const char *prefix, const char *ns, pid_t pid) { char filename[FILENAME_MAX + 1]; snprintfz(filename, FILENAME_MAX, "%s/proc/%d/%s", prefix, (int)pid, ns); - int fd = open(filename, O_RDONLY); + int fd = open(filename, O_RDONLY | O_CLOEXEC); if(fd == -1) collector_error("Cannot open proc_pid_fd() file '%s'", filename); diff --git a/collectors/cgroups.plugin/cgroup-top.c b/src/collectors/cgroups.plugin/cgroup-top.c index 8d44d3b56..aa413dad1 100644 --- a/collectors/cgroups.plugin/cgroup-top.c +++ b/src/collectors/cgroups.plugin/cgroup-top.c @@ -98,13 +98,7 @@ void cgroup_netdev_get_bandwidth(struct cgroup *cg, NETDATA_DOUBLE *received, NE *sent = t->sent[slot]; } -int cgroup_function_cgroup_top(BUFFER *wb, int timeout __maybe_unused, const char *function __maybe_unused, - void *collector_data __maybe_unused, - rrd_function_result_callback_t result_cb, void *result_cb_data, - rrd_function_is_cancelled_cb_t is_cancelled_cb, void *is_cancelled_cb_data, - rrd_function_register_canceller_cb_t register_canceller_cb __maybe_unused, - void *register_canceller_cb_data __maybe_unused) { - +int cgroup_function_cgroup_top(BUFFER *wb, const char *function __maybe_unused) { buffer_flush(wb); 
wb->content_type = CT_APPLICATION_JSON; buffer_json_initialize(wb, "\"", "\"", 0, true, BUFFER_JSON_OPTIONS_DEFAULT); @@ -113,11 +107,13 @@ int cgroup_function_cgroup_top(BUFFER *wb, int timeout __maybe_unused, const cha buffer_json_member_add_uint64(wb, "status", HTTP_RESP_OK); buffer_json_member_add_string(wb, "type", "table"); buffer_json_member_add_time_t(wb, "update_every", 1); + buffer_json_member_add_boolean(wb, "has_history", false); buffer_json_member_add_string(wb, "help", RRDFUNCTIONS_CGTOP_HELP); buffer_json_member_add_array(wb, "data"); double max_pids = 0.0; double max_cpu = 0.0; + double max_cpu_throttled = 0.0; double max_ram = 0.0; double max_disk_io_read = 0.0; double max_disk_io_written = 0.0; @@ -149,6 +145,9 @@ int cgroup_function_cgroup_top(BUFFER *wb, int timeout __maybe_unused, const cha max_cpu = MAX(max_cpu, cpu); } + double cpu_throttled = (double)cg->cpuacct_cpu_throttling.nr_throttled_perc; + max_cpu_throttled = MAX(max_cpu_throttled, cpu_throttled); + double ram = rrddim_get_last_stored_value(cg->st_mem_rd_ram, &max_ram, 1.0); rd = cg->st_throttle_io_rd_read ? 
cg->st_throttle_io_rd_read : cg->st_io_rd_read; @@ -167,6 +166,7 @@ int cgroup_function_cgroup_top(BUFFER *wb, int timeout __maybe_unused, const cha buffer_json_add_array_item_double(wb, pids_current); buffer_json_add_array_item_double(wb, cpu); + buffer_json_add_array_item_double(wb, cpu_throttled); buffer_json_add_array_item_double(wb, ram); buffer_json_add_array_item_double(wb, disk_io_read); buffer_json_add_array_item_double(wb, disk_io_written); @@ -215,6 +215,13 @@ int cgroup_function_cgroup_top(BUFFER *wb, int timeout __maybe_unused, const cha RRDF_FIELD_OPTS_VISIBLE, NULL); + buffer_rrdf_table_add_field(wb, field_id++, "CPU Throttling", "CPU Throttled Runnable Periods", + RRDF_FIELD_TYPE_BAR_WITH_INTEGER, RRDF_FIELD_VISUAL_BAR, RRDF_FIELD_TRANSFORM_NUMBER, + 0, "%", max_cpu_throttled, RRDF_FIELD_SORT_DESCENDING, NULL, + RRDF_FIELD_SUMMARY_SUM, RRDF_FIELD_FILTER_NONE, + is_inside_k8s ? RRDF_FIELD_OPTS_VISIBLE : RRDF_FIELD_OPTS_NONE, + NULL); + // RAM buffer_rrdf_table_add_field(wb, field_id++, "RAM", "RAM Usage", RRDF_FIELD_TYPE_BAR_WITH_INTEGER, RRDF_FIELD_VISUAL_BAR, RRDF_FIELD_TRANSFORM_NUMBER, @@ -331,25 +338,10 @@ int cgroup_function_cgroup_top(BUFFER *wb, int timeout __maybe_unused, const cha buffer_json_member_add_time_t(wb, "expires", now_realtime_sec() + 1); buffer_json_finalize(wb); - int response = HTTP_RESP_OK; - if(is_cancelled_cb && is_cancelled_cb(is_cancelled_cb_data)) { - buffer_flush(wb); - response = HTTP_RESP_CLIENT_CLOSED_REQUEST; - } - - if(result_cb) - result_cb(wb, response, result_cb_data); - - return response; + return HTTP_RESP_OK; } -int cgroup_function_systemd_top(BUFFER *wb, int timeout __maybe_unused, const char *function __maybe_unused, - void *collector_data __maybe_unused, - rrd_function_result_callback_t result_cb, void *result_cb_data, - rrd_function_is_cancelled_cb_t is_cancelled_cb, void *is_cancelled_cb_data, - rrd_function_register_canceller_cb_t register_canceller_cb __maybe_unused, - void *register_canceller_cb_data 
__maybe_unused) { - +int cgroup_function_systemd_top(BUFFER *wb, const char *function __maybe_unused) { buffer_flush(wb); wb->content_type = CT_APPLICATION_JSON; buffer_json_initialize(wb, "\"", "\"", 0, true, BUFFER_JSON_OPTIONS_DEFAULT); @@ -358,6 +350,7 @@ int cgroup_function_systemd_top(BUFFER *wb, int timeout __maybe_unused, const ch buffer_json_member_add_uint64(wb, "status", HTTP_RESP_OK); buffer_json_member_add_string(wb, "type", "table"); buffer_json_member_add_time_t(wb, "update_every", 1); + buffer_json_member_add_boolean(wb, "has_history", false); buffer_json_member_add_string(wb, "help", RRDFUNCTIONS_CGTOP_HELP); buffer_json_member_add_array(wb, "data"); @@ -507,14 +500,5 @@ int cgroup_function_systemd_top(BUFFER *wb, int timeout __maybe_unused, const ch buffer_json_member_add_time_t(wb, "expires", now_realtime_sec() + 1); buffer_json_finalize(wb); - int response = HTTP_RESP_OK; - if(is_cancelled_cb && is_cancelled_cb(is_cancelled_cb_data)) { - buffer_flush(wb); - response = HTTP_RESP_CLIENT_CLOSED_REQUEST; - } - - if(result_cb) - result_cb(wb, response, result_cb_data); - - return response; + return HTTP_RESP_OK; } diff --git a/collectors/cgroups.plugin/integrations/containers.md b/src/collectors/cgroups.plugin/integrations/containers.md index 6273d1e91..e769fc5f9 100644 --- a/collectors/cgroups.plugin/integrations/containers.md +++ b/src/collectors/cgroups.plugin/integrations/containers.md @@ -1,9 +1,9 @@ <!--startmeta -custom_edit_url: "https://github.com/netdata/netdata/edit/master/collectors/cgroups.plugin/integrations/containers.md" -meta_yaml: "https://github.com/netdata/netdata/edit/master/collectors/cgroups.plugin/metadata.yaml" +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/collectors/cgroups.plugin/integrations/containers.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/collectors/cgroups.plugin/metadata.yaml" sidebar_label: "Containers" learn_status: "Published" -learn_rel_path: "Data 
Collection/Containers and VMs" +learn_rel_path: "Collecting Metrics/Containers and VMs" most_popular: True message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE COLLECTOR'S metadata.yaml FILE" endmeta--> @@ -140,10 +140,10 @@ The following alerts are available: | Alert name | On metric | Description | |:------------|:----------|:------------| -| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes | -| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization | -| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute | -| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute | +| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes | +| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization | +| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute | +| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received 
packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute | ## Setup diff --git a/collectors/cgroups.plugin/integrations/kubernetes_containers.md b/src/collectors/cgroups.plugin/integrations/kubernetes_containers.md index 9be32a12a..c71b5736c 100644 --- a/collectors/cgroups.plugin/integrations/kubernetes_containers.md +++ b/src/collectors/cgroups.plugin/integrations/kubernetes_containers.md @@ -1,9 +1,9 @@ <!--startmeta -custom_edit_url: "https://github.com/netdata/netdata/edit/master/collectors/cgroups.plugin/integrations/kubernetes_containers.md" -meta_yaml: "https://github.com/netdata/netdata/edit/master/collectors/cgroups.plugin/metadata.yaml" +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/collectors/cgroups.plugin/integrations/kubernetes_containers.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/collectors/cgroups.plugin/metadata.yaml" sidebar_label: "Kubernetes Containers" learn_status: "Published" -learn_rel_path: "Data Collection/Kubernetes" +learn_rel_path: "Collecting Metrics/Kubernetes" most_popular: True message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE COLLECTOR'S metadata.yaml FILE" endmeta--> @@ -154,10 +154,10 @@ The following alerts are available: | Alert name | On metric | Description | |:------------|:----------|:------------| -| [ k8s_cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | k8s.cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes | -| [ k8s_cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | k8s.cgroup.mem_usage | cgroup memory utilization | -| [ k8s_cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | k8s.cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute | -| [ 
k8s_cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | k8s.cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute | +| [ k8s_cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | k8s.cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes | +| [ k8s_cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | k8s.cgroup.mem_usage | cgroup memory utilization | +| [ k8s_cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | k8s.cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute | +| [ k8s_cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | k8s.cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute | ## Setup diff --git a/collectors/cgroups.plugin/integrations/libvirt_containers.md b/src/collectors/cgroups.plugin/integrations/libvirt_containers.md index fed454698..409b43609 100644 --- a/collectors/cgroups.plugin/integrations/libvirt_containers.md +++ b/src/collectors/cgroups.plugin/integrations/libvirt_containers.md @@ -1,9 +1,9 @@ <!--startmeta -custom_edit_url: "https://github.com/netdata/netdata/edit/master/collectors/cgroups.plugin/integrations/libvirt_containers.md" -meta_yaml: "https://github.com/netdata/netdata/edit/master/collectors/cgroups.plugin/metadata.yaml" +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/collectors/cgroups.plugin/integrations/libvirt_containers.md" +meta_yaml: 
"https://github.com/netdata/netdata/edit/master/src/collectors/cgroups.plugin/metadata.yaml" sidebar_label: "Libvirt Containers" learn_status: "Published" -learn_rel_path: "Data Collection/Containers and VMs" +learn_rel_path: "Collecting Metrics/Containers and VMs" most_popular: True message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE COLLECTOR'S metadata.yaml FILE" endmeta--> @@ -140,10 +140,10 @@ The following alerts are available: | Alert name | On metric | Description | |:------------|:----------|:------------| -| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes | -| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization | -| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute | -| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute | +| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes | +| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization | +| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute | +| [ 
cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute | ## Setup diff --git a/collectors/cgroups.plugin/integrations/lxc_containers.md b/src/collectors/cgroups.plugin/integrations/lxc_containers.md index 3f05ffd5f..14897e468 100644 --- a/collectors/cgroups.plugin/integrations/lxc_containers.md +++ b/src/collectors/cgroups.plugin/integrations/lxc_containers.md @@ -1,9 +1,9 @@ <!--startmeta -custom_edit_url: "https://github.com/netdata/netdata/edit/master/collectors/cgroups.plugin/integrations/lxc_containers.md" -meta_yaml: "https://github.com/netdata/netdata/edit/master/collectors/cgroups.plugin/metadata.yaml" +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/collectors/cgroups.plugin/integrations/lxc_containers.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/collectors/cgroups.plugin/metadata.yaml" sidebar_label: "LXC Containers" learn_status: "Published" -learn_rel_path: "Data Collection/Containers and VMs" +learn_rel_path: "Collecting Metrics/Containers and VMs" most_popular: True message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE COLLECTOR'S metadata.yaml FILE" endmeta--> @@ -140,10 +140,10 @@ The following alerts are available: | Alert name | On metric | Description | |:------------|:----------|:------------| -| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes | -| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization | -| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | 
cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute | -| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute | +| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes | +| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization | +| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute | +| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute | ## Setup diff --git a/collectors/cgroups.plugin/integrations/ovirt_containers.md b/src/collectors/cgroups.plugin/integrations/ovirt_containers.md index 5771aeea1..49f8c8091 100644 --- a/collectors/cgroups.plugin/integrations/ovirt_containers.md +++ b/src/collectors/cgroups.plugin/integrations/ovirt_containers.md @@ -1,9 +1,9 @@ <!--startmeta -custom_edit_url: "https://github.com/netdata/netdata/edit/master/collectors/cgroups.plugin/integrations/ovirt_containers.md" -meta_yaml: "https://github.com/netdata/netdata/edit/master/collectors/cgroups.plugin/metadata.yaml" +custom_edit_url: 
"https://github.com/netdata/netdata/edit/master/src/collectors/cgroups.plugin/integrations/ovirt_containers.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/collectors/cgroups.plugin/metadata.yaml" sidebar_label: "oVirt Containers" learn_status: "Published" -learn_rel_path: "Data Collection/Containers and VMs" +learn_rel_path: "Collecting Metrics/Containers and VMs" most_popular: True message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE COLLECTOR'S metadata.yaml FILE" endmeta--> @@ -140,10 +140,10 @@ The following alerts are available: | Alert name | On metric | Description | |:------------|:----------|:------------| -| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes | -| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization | -| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute | -| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute | +| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes | +| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization | +| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | 
cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute | +| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute | ## Setup diff --git a/collectors/cgroups.plugin/integrations/proxmox_containers.md b/src/collectors/cgroups.plugin/integrations/proxmox_containers.md index 1804a40ca..fa2177b46 100644 --- a/collectors/cgroups.plugin/integrations/proxmox_containers.md +++ b/src/collectors/cgroups.plugin/integrations/proxmox_containers.md @@ -1,9 +1,9 @@ <!--startmeta -custom_edit_url: "https://github.com/netdata/netdata/edit/master/collectors/cgroups.plugin/integrations/proxmox_containers.md" -meta_yaml: "https://github.com/netdata/netdata/edit/master/collectors/cgroups.plugin/metadata.yaml" +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/collectors/cgroups.plugin/integrations/proxmox_containers.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/collectors/cgroups.plugin/metadata.yaml" sidebar_label: "Proxmox Containers" learn_status: "Published" -learn_rel_path: "Data Collection/Containers and VMs" +learn_rel_path: "Collecting Metrics/Containers and VMs" most_popular: True message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE COLLECTOR'S metadata.yaml FILE" endmeta--> @@ -140,10 +140,10 @@ The following alerts are available: | Alert name | On metric | Description | |:------------|:----------|:------------| -| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes | -| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup 
memory utilization | -| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute | -| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute | +| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes | +| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization | +| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute | +| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute | ## Setup diff --git a/collectors/cgroups.plugin/integrations/systemd_services.md b/src/collectors/cgroups.plugin/integrations/systemd_services.md index 0ce906366..e3a80d549 100644 --- a/collectors/cgroups.plugin/integrations/systemd_services.md +++ b/src/collectors/cgroups.plugin/integrations/systemd_services.md @@ -1,9 +1,9 @@ <!--startmeta -custom_edit_url: "https://github.com/netdata/netdata/edit/master/collectors/cgroups.plugin/integrations/systemd_services.md" -meta_yaml: 
"https://github.com/netdata/netdata/edit/master/collectors/cgroups.plugin/metadata.yaml" +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/collectors/cgroups.plugin/integrations/systemd_services.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/collectors/cgroups.plugin/metadata.yaml" sidebar_label: "Systemd Services" learn_status: "Published" -learn_rel_path: "Data Collection/Systemd" +learn_rel_path: "Collecting Metrics/Systemd" most_popular: True message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE COLLECTOR'S metadata.yaml FILE" endmeta--> diff --git a/collectors/cgroups.plugin/integrations/virtual_machines.md b/src/collectors/cgroups.plugin/integrations/virtual_machines.md index 6a64923c4..b81c82fb9 100644 --- a/collectors/cgroups.plugin/integrations/virtual_machines.md +++ b/src/collectors/cgroups.plugin/integrations/virtual_machines.md @@ -1,9 +1,9 @@ <!--startmeta -custom_edit_url: "https://github.com/netdata/netdata/edit/master/collectors/cgroups.plugin/integrations/virtual_machines.md" -meta_yaml: "https://github.com/netdata/netdata/edit/master/collectors/cgroups.plugin/metadata.yaml" +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/collectors/cgroups.plugin/integrations/virtual_machines.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/collectors/cgroups.plugin/metadata.yaml" sidebar_label: "Virtual Machines" learn_status: "Published" -learn_rel_path: "Data Collection/Containers and VMs" +learn_rel_path: "Collecting Metrics/Containers and VMs" most_popular: True message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE COLLECTOR'S metadata.yaml FILE" endmeta--> @@ -140,10 +140,10 @@ The following alerts are available: | Alert name | On metric | Description | |:------------|:----------|:------------| -| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over 
the last 10 minutes | -| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization | -| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute | -| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute | +| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes | +| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization | +| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute | +| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute | ## Setup diff --git a/collectors/cgroups.plugin/sys_fs_cgroup.c b/src/collectors/cgroups.plugin/sys_fs_cgroup.c index 705edf6f7..fd30b4acc 100644 --- a/collectors/cgroups.plugin/sys_fs_cgroup.c +++ b/src/collectors/cgroups.plugin/sys_fs_cgroup.c @@ -85,11 +85,6 @@ struct cgroups_systemd_config_setting cgroups_systemd_options[] = { { .name = NULL, .setting = SYSTEMD_CGROUP_ERR }, }; -// Shared memory 
with information from detected cgroups -netdata_ebpf_cgroup_shm_t shm_cgroup_ebpf = {NULL, NULL}; -int shm_fd_cgroup_ebpf = -1; -sem_t *shm_mutex_cgroup_ebpf = SEM_FAILED; - struct discovery_thread discovery_thread; @@ -255,7 +250,7 @@ void read_cgroup_plugin_configuration() { if(cgroup_update_every < localhost->rrd_update_every) cgroup_update_every = localhost->rrd_update_every; - cgroup_check_for_new_every = (int)config_get_number("plugin:cgroups", "check for new cgroups every", (long long)cgroup_check_for_new_every * (long long)cgroup_update_every); + cgroup_check_for_new_every = (int)config_get_number("plugin:cgroups", "check for new cgroups every", cgroup_check_for_new_every); if(cgroup_check_for_new_every < cgroup_update_every) cgroup_check_for_new_every = cgroup_update_every; @@ -418,6 +413,8 @@ void read_cgroup_plugin_configuration() { " !*/init.scope " // ignore init.scope " !/system.slice/run-*.scope " // ignore system.slice/run-XXXX.scope + " *user.slice/docker-*" // allow docker rootless containers + " !*user.slice*" // ignore the rest stuff in user.slice " *.scope " // we need all other *.scope for sure // ---------------------------------------------------------------- @@ -452,6 +449,7 @@ void read_cgroup_plugin_configuration() { " !/lxc.monitor* " " !/lxc.pivot " " !/lxc.payload " + " !*lxcfs.service/.control" " !/machine " " !/qemu " " !/system " @@ -474,7 +472,6 @@ void read_cgroup_plugin_configuration() { " !/system " " !/systemd " " !/user " - " !/user.slice " " !/lxc/*/* " // #2161 #2649 " !/lxc.monitor " " !/lxc.payload/*/* " @@ -526,63 +523,6 @@ void read_cgroup_plugin_configuration() { mountinfo_free_all(root); } -void netdata_cgroup_ebpf_set_values(size_t length) -{ - sem_wait(shm_mutex_cgroup_ebpf); - - shm_cgroup_ebpf.header->cgroup_max = cgroup_root_max; - shm_cgroup_ebpf.header->systemd_enabled = cgroup_enable_systemd_services | - cgroup_enable_systemd_services_detailed_memory | - cgroup_used_memory; - shm_cgroup_ebpf.header->body_length 
= length; - - sem_post(shm_mutex_cgroup_ebpf); -} - -void netdata_cgroup_ebpf_initialize_shm() -{ - shm_fd_cgroup_ebpf = shm_open(NETDATA_SHARED_MEMORY_EBPF_CGROUP_NAME, O_CREAT | O_RDWR, 0660); - if (shm_fd_cgroup_ebpf < 0) { - collector_error("Cannot initialize shared memory used by cgroup and eBPF, integration won't happen."); - return; - } - - size_t length = sizeof(netdata_ebpf_cgroup_shm_header_t) + cgroup_root_max * sizeof(netdata_ebpf_cgroup_shm_body_t); - if (ftruncate(shm_fd_cgroup_ebpf, length)) { - collector_error("Cannot set size for shared memory."); - goto end_init_shm; - } - - shm_cgroup_ebpf.header = (netdata_ebpf_cgroup_shm_header_t *) mmap(NULL, length, - PROT_READ | PROT_WRITE, MAP_SHARED, - shm_fd_cgroup_ebpf, 0); - - if (unlikely(MAP_FAILED == shm_cgroup_ebpf.header)) { - shm_cgroup_ebpf.header = NULL; - collector_error("Cannot map shared memory used between cgroup and eBPF, integration won't happen"); - goto end_init_shm; - } - shm_cgroup_ebpf.body = (netdata_ebpf_cgroup_shm_body_t *) ((char *)shm_cgroup_ebpf.header + - sizeof(netdata_ebpf_cgroup_shm_header_t)); - - shm_mutex_cgroup_ebpf = sem_open(NETDATA_NAMED_SEMAPHORE_EBPF_CGROUP_NAME, O_CREAT, - S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH, 1); - - if (shm_mutex_cgroup_ebpf != SEM_FAILED) { - netdata_cgroup_ebpf_set_values(length); - return; - } - - collector_error("Cannot create semaphore, integration between eBPF and cgroup won't happen"); - munmap(shm_cgroup_ebpf.header, length); - shm_cgroup_ebpf.header = NULL; - -end_init_shm: - close(shm_fd_cgroup_ebpf); - shm_fd_cgroup_ebpf = -1; - shm_unlink(NETDATA_SHARED_MEMORY_EBPF_CGROUP_NAME); -} - // --------------------------------------------------------------------------------------------- static unsigned long long calc_delta(unsigned long long curr, unsigned long long prev) { @@ -1280,12 +1220,12 @@ cpu_limits2_err: static inline int update_memory_limits(struct cgroup *cg) { char **filename = &cg->filename_memory_limit; - 
const RRDSETVAR_ACQUIRED **chart_var = &cg->chart_var_memory_limit; + const RRDVAR_ACQUIRED **chart_var = &cg->chart_var_memory_limit; unsigned long long *value = &cg->memory_limit; if(*filename) { if(unlikely(!*chart_var)) { - *chart_var = rrdsetvar_custom_chart_variable_add_and_acquire(cg->st_mem_usage, "memory_limit"); + *chart_var = rrdvar_chart_variable_add_and_acquire(cg->st_mem_usage, "memory_limit"); if(!*chart_var) { collector_error("Cannot create cgroup %s chart variable '%s'. Will not update its limit anymore.", cg->id, "memory_limit"); freez(*filename); @@ -1301,12 +1241,13 @@ static inline int update_memory_limits(struct cgroup *cg) { *filename = NULL; } else { - rrdsetvar_custom_chart_variable_set(cg->st_mem_usage, *chart_var, (NETDATA_DOUBLE)(*value) / (1024.0 * 1024.0)); + rrdvar_chart_variable_set( + cg->st_mem_usage, *chart_var, (NETDATA_DOUBLE)(*value) / (1024.0 * 1024.0)); return 1; } } else { - char buffer[30 + 1]; - int ret = read_file(*filename, buffer, 30); + char buffer[32]; + int ret = read_txt_file(*filename, buffer, sizeof(buffer)); if(ret) { collector_error("Cannot refresh cgroup %s memory limit by reading '%s'. 
Will not update its limit anymore.", cg->id, *filename); freez(*filename); @@ -1316,11 +1257,12 @@ static inline int update_memory_limits(struct cgroup *cg) { char *s = "max\n\0"; if(strcmp(s, buffer) == 0){ *value = UINT64_MAX; - rrdsetvar_custom_chart_variable_set(cg->st_mem_usage, *chart_var, (NETDATA_DOUBLE)(*value) / (1024.0 * 1024.0)); + rrdvar_chart_variable_set( + cg->st_mem_usage, *chart_var, (NETDATA_DOUBLE)(*value) / (1024.0 * 1024.0)); return 1; } *value = str2ull(buffer, NULL); - rrdsetvar_custom_chart_variable_set(cg->st_mem_usage, *chart_var, (NETDATA_DOUBLE)(*value) / (1024.0 * 1024.0)); + rrdvar_chart_variable_set(cg->st_mem_usage, *chart_var, (NETDATA_DOUBLE)(*value) / (1024.0 * 1024.0)); return 1; } } @@ -1398,7 +1340,7 @@ void update_cgroup_charts() { } if(unlikely(!cg->chart_var_cpu_limit)) { - cg->chart_var_cpu_limit = rrdsetvar_custom_chart_variable_add_and_acquire(cg->st_cpu, "cpu_limit"); + cg->chart_var_cpu_limit = rrdvar_chart_variable_add_and_acquire(cg->st_cpu, "cpu_limit"); if(!cg->chart_var_cpu_limit) { collector_error("Cannot create cgroup %s chart variable 'cpu_limit'. 
Will not update its limit anymore.", cg->id); if(cg->filename_cpuset_cpus) freez(cg->filename_cpuset_cpus); @@ -1428,7 +1370,7 @@ void update_cgroup_charts() { rrdset_is_obsolete___safe_from_collector_thread(cg->st_cpu_limit); cg->st_cpu_limit = NULL; } - rrdsetvar_custom_chart_variable_set(cg->st_cpu, cg->chart_var_cpu_limit, NAN); + rrdvar_chart_variable_set(cg->st_cpu, cg->chart_var_cpu_limit, NAN); } } } @@ -1592,19 +1534,6 @@ static void cgroup_main_cleanup(void *ptr) { } } - if (shm_mutex_cgroup_ebpf != SEM_FAILED) { - sem_close(shm_mutex_cgroup_ebpf); - } - - if (shm_cgroup_ebpf.header) { - shm_cgroup_ebpf.header->cgroup_root_count = 0; - munmap(shm_cgroup_ebpf.header, shm_cgroup_ebpf.header->body_length); - } - - if (shm_fd_cgroup_ebpf > 0) { - close(shm_fd_cgroup_ebpf); - } - static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; } @@ -1641,8 +1570,6 @@ void *cgroups_main(void *ptr) { cgroup_read_host_total_ram(); - netdata_cgroup_ebpf_initialize_shm(); - if (uv_mutex_init(&cgroup_root_mutex)) { collector_error("CGROUP: cannot initialize mutex for the main cgroup list"); goto exit; @@ -1669,10 +1596,17 @@ void *cgroups_main(void *ptr) { // we register this only on localhost // for the other nodes, the origin server should register it - rrd_collector_started(); // this creates a collector that runs for as long as netdata runs cgroup_netdev_link_init(); - rrd_function_add(localhost, NULL, "containers-vms", 10, RRDFUNCTIONS_CGTOP_HELP, true, cgroup_function_cgroup_top, NULL); - rrd_function_add(localhost, NULL, "systemd-services", 10, RRDFUNCTIONS_CGTOP_HELP, true, cgroup_function_systemd_top, NULL); + + rrd_function_add_inline(localhost, NULL, "containers-vms", 10, + RRDFUNCTIONS_PRIORITY_DEFAULT / 2, RRDFUNCTIONS_CGTOP_HELP, + "top", HTTP_ACCESS_ANONYMOUS_DATA, + cgroup_function_cgroup_top); + + rrd_function_add_inline(localhost, NULL, "systemd-services", 10, + RRDFUNCTIONS_PRIORITY_DEFAULT / 3, RRDFUNCTIONS_SYSTEMD_SERVICES_HELP, + "top", 
HTTP_ACCESS_ANONYMOUS_DATA, + cgroup_function_systemd_top); heartbeat_t hb; heartbeat_init(&hb); diff --git a/collectors/cgroups.plugin/sys_fs_cgroup.h b/src/collectors/cgroups.plugin/sys_fs_cgroup.h index e8cfcf5f6..e8cfcf5f6 100644 --- a/collectors/cgroups.plugin/sys_fs_cgroup.h +++ b/src/collectors/cgroups.plugin/sys_fs_cgroup.h diff --git a/collectors/cgroups.plugin/tests/test_cgroups_plugin.c b/src/collectors/cgroups.plugin/tests/test_cgroups_plugin.c index bb1fb3988..bb1fb3988 100644 --- a/collectors/cgroups.plugin/tests/test_cgroups_plugin.c +++ b/src/collectors/cgroups.plugin/tests/test_cgroups_plugin.c diff --git a/collectors/cgroups.plugin/tests/test_cgroups_plugin.h b/src/collectors/cgroups.plugin/tests/test_cgroups_plugin.h index 3d68e9230..3d68e9230 100644 --- a/collectors/cgroups.plugin/tests/test_cgroups_plugin.h +++ b/src/collectors/cgroups.plugin/tests/test_cgroups_plugin.h diff --git a/collectors/cgroups.plugin/tests/test_doubles.c b/src/collectors/cgroups.plugin/tests/test_doubles.c index b13d4b19c..53fefa9c2 100644 --- a/collectors/cgroups.plugin/tests/test_doubles.c +++ b/src/collectors/cgroups.plugin/tests/test_doubles.c @@ -101,7 +101,7 @@ collected_number rrddim_set_by_pointer(RRDSET *st, RRDDIM *rd, collected_number return 0; } -const RRDSETVAR_ACQUIRED *rrdsetvar_custom_chart_variable_add_and_acquire(RRDSET *st, const char *name) +const RRDVAR_ACQUIRED *rrdvar_chart_variable_add_and_acquire(RRDSET *st, const char *name) { UNUSED(st); UNUSED(name); @@ -109,7 +109,7 @@ const RRDSETVAR_ACQUIRED *rrdsetvar_custom_chart_variable_add_and_acquire(RRDSET return NULL; } -void rrdsetvar_custom_chart_variable_set(RRDSET *st, const RRDSETVAR_ACQUIRED *rsa, NETDATA_DOUBLE value) +void rrdvar_chart_variable_set(RRDSET *st, const RRDVAR_ACQUIRED *rsa, NETDATA_DOUBLE value) { UNUSED(st); UNUSED(rsa); |