Diffstat
-rw-r--r-- | src/collectors/cgroups.plugin/README.md | 300
-rw-r--r-- | src/collectors/cgroups.plugin/cgroup-charts.c (renamed from collectors/cgroups.plugin/cgroup-charts.c) | 2
-rw-r--r-- | src/collectors/cgroups.plugin/cgroup-discovery.c (renamed from collectors/cgroups.plugin/cgroup-discovery.c) | 94
-rw-r--r-- | src/collectors/cgroups.plugin/cgroup-internals.h (renamed from collectors/cgroups.plugin/cgroup-internals.h) | 21
-rwxr-xr-x | src/collectors/cgroups.plugin/cgroup-name.sh.in (renamed from collectors/cgroups.plugin/cgroup-name.sh.in) | 0
-rwxr-xr-x | src/collectors/cgroups.plugin/cgroup-network-helper.sh.in (renamed from collectors/cgroups.plugin/cgroup-network-helper.sh.in) | 0
-rw-r--r-- | src/collectors/cgroups.plugin/cgroup-network.c (renamed from collectors/cgroups.plugin/cgroup-network.c) | 3
-rw-r--r-- | src/collectors/cgroups.plugin/cgroup-top.c (renamed from collectors/cgroups.plugin/cgroup-top.c) | 52
-rw-r--r-- | src/collectors/cgroups.plugin/integrations/containers.md (renamed from collectors/cgroups.plugin/integrations/containers.md) | 14
-rw-r--r-- | src/collectors/cgroups.plugin/integrations/kubernetes_containers.md (renamed from collectors/cgroups.plugin/integrations/kubernetes_containers.md) | 14
-rw-r--r-- | src/collectors/cgroups.plugin/integrations/libvirt_containers.md (renamed from collectors/cgroups.plugin/integrations/libvirt_containers.md) | 14
-rw-r--r-- | src/collectors/cgroups.plugin/integrations/lxc_containers.md (renamed from collectors/cgroups.plugin/integrations/lxc_containers.md) | 14
-rw-r--r-- | src/collectors/cgroups.plugin/integrations/ovirt_containers.md (renamed from collectors/cgroups.plugin/integrations/ovirt_containers.md) | 14
-rw-r--r-- | src/collectors/cgroups.plugin/integrations/proxmox_containers.md (renamed from collectors/cgroups.plugin/integrations/proxmox_containers.md) | 14
-rw-r--r-- | src/collectors/cgroups.plugin/integrations/systemd_services.md (renamed from collectors/cgroups.plugin/integrations/systemd_services.md) | 6
-rw-r--r-- | src/collectors/cgroups.plugin/integrations/virtual_machines.md (renamed from collectors/cgroups.plugin/integrations/virtual_machines.md) | 14
-rw-r--r-- | src/collectors/cgroups.plugin/metadata.yaml | 1022
-rw-r--r-- | src/collectors/cgroups.plugin/sys_fs_cgroup.c (renamed from collectors/cgroups.plugin/sys_fs_cgroup.c) | 116
-rw-r--r-- | src/collectors/cgroups.plugin/sys_fs_cgroup.h (renamed from collectors/cgroups.plugin/sys_fs_cgroup.h) | 0
-rw-r--r-- | src/collectors/cgroups.plugin/tests/test_cgroups_plugin.c (renamed from collectors/cgroups.plugin/tests/test_cgroups_plugin.c) | 0
-rw-r--r-- | src/collectors/cgroups.plugin/tests/test_cgroups_plugin.h (renamed from collectors/cgroups.plugin/tests/test_cgroups_plugin.h) | 0
-rw-r--r-- | src/collectors/cgroups.plugin/tests/test_doubles.c (renamed from collectors/cgroups.plugin/tests/test_doubles.c) | 4
22 files changed, 1518 insertions, 200 deletions
diff --git a/src/collectors/cgroups.plugin/README.md b/src/collectors/cgroups.plugin/README.md
new file mode 100644
index 000000000..49e5bd54e
--- /dev/null
+++ b/src/collectors/cgroups.plugin/README.md
@@ -0,0 +1,300 @@

<!--
title: "Monitor Cgroups (cgroups.plugin)"
custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/collectors/cgroups.plugin/README.md"
sidebar_label: "Monitor Cgroups"
learn_status: "Published"
learn_topic_type: "References"
learn_rel_path: "Integrations/Monitor/Virtualized environments/Containers"
-->

# Monitor Cgroups (cgroups.plugin)

You can monitor containers and virtual machines using **cgroups**.

cgroups (or control groups) are a Linux kernel feature that provides accounting and resource usage limiting for
processes. When cgroups are bundled with namespaces (i.e. isolation), they form what we usually call **containers**.

cgroups are hierarchical, meaning that cgroups can contain child cgroups, which can contain more cgroups, and so on.
All accounting is reported (and all resource usage limits are applied) hierarchically as well.

To visualize cgroup metrics, Netdata provides configuration for cherry-picking the cgroups of interest. By default
(without any configuration), Netdata picks **systemd services**, all kinds of **containers** (lxc, docker, etc.) and
**virtual machines** spawned by managers that register them with cgroups (qemu, libvirt, etc.).

## Configuring Netdata for cgroups

In general, no additional settings are required. Netdata discovers all available cgroups on the host system and
collects their metrics.

### How Netdata finds the available cgroups

Linux exposes resource usage reporting, and provides dynamic configuration for cgroups, via virtual files (usually)
under `/sys/fs/cgroup`. Netdata reads `/proc/self/mountinfo` to detect the exact mount point of cgroups. Netdata also
allows manual configuration of this mount point, using these settings:

```text
[plugin:cgroups]
    check for new cgroups every = 10
    path to /sys/fs/cgroup/cpuacct = /sys/fs/cgroup/cpuacct
    path to /sys/fs/cgroup/blkio = /sys/fs/cgroup/blkio
    path to /sys/fs/cgroup/memory = /sys/fs/cgroup/memory
    path to /sys/fs/cgroup/devices = /sys/fs/cgroup/devices
```

Netdata rescans these directories for added or removed cgroups every `check for new cgroups every` seconds.

### Hierarchical search for cgroups

Since cgroups are hierarchical, for each of the directories shown above, Netdata walks through the subdirectories
recursively searching for cgroups (each subdirectory is another cgroup).

To provide a sane default for this setting, Netdata uses the following pattern list (patterns starting with `!` give a
negative match, and their order is important: the first pattern matching a path is the one used):

```text
[plugin:cgroups]
    search for cgroups in subpaths matching = !*/init.scope !*-qemu !/init.scope !/system !/systemd !/user !/user.slice *
```

So, we disable checking for **child cgroups** in systemd internal
cgroups ([systemd services are monitored by Netdata](#monitoring-systemd-services)), user cgroups (normally used for
desktop and remote user sessions), qemu virtual machines (child cgroups of virtual machines) and `init.scope`. All
others are enabled.

### Unified cgroups (cgroups v2) support

Netdata automatically detects the cgroups version. If detection fails, Netdata assumes v1.
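If you are unsure which cgroups version a host runs, the filesystem type mounted at `/sys/fs/cgroup` tells you; a
quick check, assuming the standard mount point:

```sh
# prints cgroup2fs on cgroups v2 (unified); tmpfs on cgroups v1
stat -fc %T /sys/fs/cgroup
```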
To switch to v2 manually, add:

```text
[plugin:cgroups]
    use unified cgroups = yes
    path to unified cgroups = /sys/fs/cgroup
```

Unified cgroups use the same name pattern matching as v1 cgroups. `cgroup_enable_systemd_services_detailed_memory` is
currently unsupported when using unified cgroups.

### Enabled cgroups

To provide a sane default, Netdata uses the
following [pattern list](https://github.com/netdata/netdata/blob/master/src/libnetdata/simple_pattern/README.md):

- Checks the pattern against the path of the cgroup

  ```text
  [plugin:cgroups]
      enable by default cgroups matching = !*/init.scope *.scope !*/vcpu* !*/emulator !*.mount !*.partition !*.service !*.slice !*.swap !*.user !/ !/docker !/libvirt !/lxc !/lxc/*/ns !/lxc/*/ns/* !/machine !/qemu !/system !/systemd !/user *
  ```

- Checks the pattern against the name of the cgroup (as you see it on the dashboard)

  ```text
  [plugin:cgroups]
      enable by default cgroups names matching = *
  ```

Renaming is configured with the following options:

```text
[plugin:cgroups]
    run script to rename cgroups matching = *.scope *docker* *lxc* *qemu* !/ !*.mount !*.partition !*.service !*.slice !*.swap !*.user *
    script to get cgroup names = /usr/libexec/netdata/plugins.d/cgroup-name.sh
```

The whole point of the additional pattern list is to limit the number of times the script is called. Without this
pattern list, the script might be called thousands of times, depending on the number of cgroups available on the
system.

The above pattern list is matched against the path of the cgroup. For matched cgroups, Netdata calls the
script [cgroup-name.sh](https://raw.githubusercontent.com/netdata/netdata/master/src/collectors/cgroups.plugin/cgroup-name.sh)
to get its name. This script queries `docker`, `kubectl`, `podman`, or applies heuristics, to find a name for the
cgroup.

#### Note on Podman container names

Podman's security model is a lot more restrictive than Docker's, so Netdata will not be able to detect container names
out of the box, unless the containers were started by the same user as Netdata itself.

If Podman is used in "rootful" mode, it's also possible to use `podman system service` to grant Netdata access to
container names. To do this, ensure `podman system service` is running and Netdata has access
to `/run/podman/podman.sock` (the default permissions as specified by upstream are `0600`, with owner `root`, so you
will have to adjust the configuration).

[Docker Socket Proxy (HAProxy)](https://github.com/Tecnativa/docker-socket-proxy) or [CetusGuard](https://github.com/hectorm/cetusguard)
can also be used to give Netdata restricted access to the socket. Note that `PODMAN_HOST` in Netdata's environment
should be set to the proxy's URL in this case.

### Charts with zero metrics

By default, Netdata enables monitoring for a metric only when it is not zero. Metrics that are constantly zero are
ignored. Metrics that start having values after Netdata has started are detected, and their charts are automatically
added to the dashboard (a refresh of the dashboard is needed for them to appear, though). To enable a chart
permanently, set it to `yes` instead of `auto`. For example:

```text
[plugin:cgroups]
    enable memory (used mem including cache) = yes
```

You can also set the `enable zero metrics` option to `yes` in the `[global]` section, which enables charts with zero
metrics for all internal Netdata plugins.
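In `netdata.conf`, that global switch is:

```text
[global]
    enable zero metrics = yes
```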
### Alerts

CPU and memory limits are watched and used to raise alerts. Memory usage for every cgroup is checked against the `ram`
and `ram+swap` limits. CPU usage for every cgroup is checked against `cpuset.cpus` and the `cpu.cfs_period_us` +
`cpu.cfs_quota_us` pair assigned to the cgroup. Configuration for the alerts is available in the
`health.d/cgroups.conf` file.

## Monitoring systemd services

Netdata monitors **systemd services**. Example:

![image](https://cloud.githubusercontent.com/assets/2662304/21964372/20cd7b84-db53-11e6-98a2-b9c986b082c0.png)

Support per distribution:

| system           | charts shown | `/sys/fs/cgroup` tree                | comments                  |
|:----------------:|:------------:|:------------------------------------:|:--------------------------|
| Arch Linux       | YES          |                                      |                           |
| Gentoo           | NO           |                                      | can be enabled, see below |
| Ubuntu 16.04 LTS | YES          |                                      |                           |
| Ubuntu 16.10     | YES          | [here](http://pastebin.com/PiWbQEXy) |                           |
| Fedora 25        | YES          | [here](http://pastebin.com/ax0373wF) |                           |
| Debian 8         | NO           |                                      | can be enabled, see below |
| AMI              | NO           | [here](http://pastebin.com/FrxmptjL) | not a systemd system      |
| CentOS 7.3.1611  | NO           | [here](http://pastebin.com/SpzgezAg) | can be enabled, see below |

### Monitored systemd service metrics

- CPU utilization
- Used memory
- RSS memory
- Mapped memory
- Cache memory
- Writeback memory
- Memory minor page faults
- Memory major page faults
- Memory charging activity
- Memory uncharging activity
- Memory limit failures
- Swap memory used
- Disk read bandwidth
- Disk write bandwidth
- Disk read operations
- Disk write operations
- Throttle disk read bandwidth
- Throttle disk write bandwidth
- Throttle disk read operations
- Throttle disk write operations
- Queued disk read operations
- Queued disk write operations
- Merged disk read operations
- Merged disk write operations

### How to enable cgroup accounting on systemd systems where it is disabled by default

You can verify that accounting is disabled by running `systemd-cgtop`: the program will show resources only for cgroup
`/`, while all services will show nothing.

To enable cgroup accounting, execute this:

```sh
sed -e 's|^#Default\(.*\)Accounting=.*$|Default\1Accounting=yes|g' /etc/systemd/system.conf >/tmp/system.conf
```

To see the changes it made, run this:

```sh
# diff /etc/systemd/system.conf /tmp/system.conf
40,44c40,44
< #DefaultCPUAccounting=no
< #DefaultIOAccounting=no
< #DefaultBlockIOAccounting=no
< #DefaultMemoryAccounting=no
< #DefaultTasksAccounting=yes
---
> DefaultCPUAccounting=yes
> DefaultIOAccounting=yes
> DefaultBlockIOAccounting=yes
> DefaultMemoryAccounting=yes
> DefaultTasksAccounting=yes
```

If you are happy with the changes, run:

```sh
# copy the file to the right location
sudo cp /tmp/system.conf /etc/systemd/system.conf

# restart systemd to take it into account
sudo systemctl daemon-reexec
```

(`systemctl daemon-reload` does not reload this configuration, so you have to execute `systemctl daemon-reexec`.)

Now, when you run `systemd-cgtop`, services will start reporting usage (if they do not, restart a service to wake it
up). Refresh your Netdata dashboard, and you will have the charts too.
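If you prefer not to change the system-wide defaults, systemd can also enable accounting per unit. A minimal sketch
(the unit name `nginx.service` is only an illustration):

```sh
# persistently enable accounting for a single unit (systemd writes a drop-in for it)
sudo systemctl set-property nginx.service CPUAccounting=yes MemoryAccounting=yes TasksAccounting=yes
```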
In case memory accounting is missing, you will need to enable it in your kernel, by appending the following kernel
boot options and rebooting:

```sh
cgroup_enable=memory swapaccount=1
```

You can add the above directly to the `linux` line in your `/boot/grub/grub.cfg`, or append them to
`GRUB_CMDLINE_LINUX` in `/etc/default/grub` (in which case you will have to run `update-grub` before rebooting). On
DigitalOcean Debian images you may have to set it in `/etc/default/grub.d/50-cloudimg-settings.cfg`.

Which systemd services are monitored by Netdata is determined by the following pattern list:

```text
[plugin:cgroups]
    cgroups to match as systemd services = !/system.slice/*/*.service /system.slice/*.service
```

- - -

## Monitoring ephemeral containers

Netdata monitors containers automatically when it is installed on the host, or when it is installed in a container
that has access to the `/proc` and `/sys` filesystems of the host.

Network interfaces and cgroups (containers) are self-cleaned. When a network interface or container stops, Netdata
might log a few errors in `error.log` complaining about files it cannot find, but immediately:

1. It will detect that this is a removed container or network interface
2. It will freeze/pause all alerts for them
3. It will mark their charts as obsolete
4. Obsolete charts are not offered to new dashboard sessions (so hit F5 and the charts are gone)
5. Existing dashboard sessions will continue to see them, but of course they will not refresh
6. Obsolete charts will be removed from memory, 1 hour after the last user viewed them (configurable with
   `cleanup obsolete charts after seconds = 3600` in the `[global]` section of `netdata.conf`)

### Monitored container metrics

- CPU usage
- CPU usage within the limits
- CPU usage per core
- Memory usage
- Writeback memory
- Memory activity
- Memory page faults
- Used memory
- Used RAM within the limits
- Memory utilization
- Memory limit failures
- I/O bandwidth (all disks)
- Serviced I/O operations (all disks)
- Throttle I/O bandwidth (all disks)
- Throttle serviced I/O operations (all disks)
- Queued I/O operations (all disks)
- Merged I/O operations (all disks)
- CPU pressure
- Memory pressure
- Memory full pressure
- I/O pressure
- I/O full pressure

Network interfaces are monitored by means of
the [proc plugin](https://github.com/netdata/netdata/blob/master/src/collectors/proc.plugin/README.md#monitored-network-interface-metrics).
diff --git a/collectors/cgroups.plugin/cgroup-charts.c b/src/collectors/cgroups.plugin/cgroup-charts.c
index a89e8ac45..1a337cd99 100644
--- a/collectors/cgroups.plugin/cgroup-charts.c
+++ b/src/collectors/cgroups.plugin/cgroup-charts.c
@@ -96,7 +96,7 @@ void update_cpu_utilization_limit_chart(struct cgroup *cg, NETDATA_DOUBLE cpu_li
 
     cg->prev_cpu_usage = cpu_usage;
 
-    rrdsetvar_custom_chart_variable_set(cg->st_cpu, cg->chart_var_cpu_limit, cpu_limit);
+    rrdvar_chart_variable_set(cg->st_cpu, cg->chart_var_cpu_limit, cpu_limit);
     rrdset_done(chart);
 }
 
diff --git a/collectors/cgroups.plugin/cgroup-discovery.c b/src/collectors/cgroups.plugin/cgroup-discovery.c
index ede35ed8a..e5d029cfb 100644
--- a/collectors/cgroups.plugin/cgroup-discovery.c
+++ b/src/collectors/cgroups.plugin/cgroup-discovery.c
@@ -25,6 +25,10 @@ char cgroup_chart_id_prefix[] = "cgroup_";
 char services_chart_id_prefix[] = "systemd_";
 char *cgroups_rename_script = NULL;
 
+// Shared memory with information from detected cgroups
+netdata_ebpf_cgroup_shm_t shm_cgroup_ebpf = {NULL, NULL};
+int shm_fd_cgroup_ebpf = -1;
+sem_t *shm_mutex_cgroup_ebpf = SEM_FAILED;
 
 // ----------------------------------------------------------------------------
 
@@ -42,7 +46,7 @@ static inline void cgroup_free_network_interfaces(struct cgroup *cg) {
         cg->interfaces = i->next;
 
         // delete the registration of proc_net_dev rename
-        netdev_rename_device_del(i->host_device);
+        cgroup_rename_task_device_del(i->host_device);
 
         freez((void *)i->host_device);
         freez((void *)i->container_device);
@@ -1027,6 +1031,82 @@ static int discovery_is_cgroup_duplicate(struct cgroup *cg) {
 }
 
 // ----------------------------------------------------------------------------
+// ebpf shared memory
+
+static void netdata_cgroup_ebpf_set_values(size_t length)
+{
+    sem_wait(shm_mutex_cgroup_ebpf);
+
+    shm_cgroup_ebpf.header->cgroup_max = cgroup_root_max;
+    shm_cgroup_ebpf.header->systemd_enabled = cgroup_enable_systemd_services |
+                                              cgroup_enable_systemd_services_detailed_memory |
+                                              cgroup_used_memory;
+    shm_cgroup_ebpf.header->body_length = length;
+
+    sem_post(shm_mutex_cgroup_ebpf);
+}
+
+static void netdata_cgroup_ebpf_initialize_shm()
+{
+    shm_fd_cgroup_ebpf = shm_open(NETDATA_SHARED_MEMORY_EBPF_CGROUP_NAME, O_CREAT | O_RDWR, 0660);
+    if (shm_fd_cgroup_ebpf < 0) {
+        collector_error("Cannot initialize shared memory used by cgroup and eBPF, integration won't happen.");
+        return;
+    }
+
+    size_t length = sizeof(netdata_ebpf_cgroup_shm_header_t) + cgroup_root_max * sizeof(netdata_ebpf_cgroup_shm_body_t);
+    if (ftruncate(shm_fd_cgroup_ebpf, length)) {
+        collector_error("Cannot set size for shared memory.");
+        goto end_init_shm;
+    }
+
+    shm_cgroup_ebpf.header = (netdata_ebpf_cgroup_shm_header_t *) mmap(NULL, length,
+                                                                       PROT_READ | PROT_WRITE, MAP_SHARED,
+                                                                       shm_fd_cgroup_ebpf, 0);
+
+    if (unlikely(MAP_FAILED == shm_cgroup_ebpf.header)) {
+        shm_cgroup_ebpf.header = NULL;
+        collector_error("Cannot map shared memory used between cgroup and eBPF, integration won't happen");
+        goto end_init_shm;
+    }
+    shm_cgroup_ebpf.body = (netdata_ebpf_cgroup_shm_body_t *) ((char *)shm_cgroup_ebpf.header +
+                                                               sizeof(netdata_ebpf_cgroup_shm_header_t));
+
+    shm_mutex_cgroup_ebpf = sem_open(NETDATA_NAMED_SEMAPHORE_EBPF_CGROUP_NAME, O_CREAT,
+                                     S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH, 1);
+
+    if (shm_mutex_cgroup_ebpf != SEM_FAILED) {
+        netdata_cgroup_ebpf_set_values(length);
+        return;
+    }
+
+    collector_error("Cannot create semaphore, integration between eBPF and cgroup won't happen");
+
+    munmap(shm_cgroup_ebpf.header, length);
+    shm_cgroup_ebpf.header = NULL;
+
+end_init_shm:
+    close(shm_fd_cgroup_ebpf);
+    shm_fd_cgroup_ebpf = -1;
+    shm_unlink(NETDATA_SHARED_MEMORY_EBPF_CGROUP_NAME);
+}
+
+static void cgroup_cleanup_ebpf_integration()
+{
+    if (shm_mutex_cgroup_ebpf != SEM_FAILED) {
+        sem_close(shm_mutex_cgroup_ebpf);
+    }
+
+    if (shm_cgroup_ebpf.header) {
+        shm_cgroup_ebpf.header->cgroup_root_count = 0;
+        munmap(shm_cgroup_ebpf.header, shm_cgroup_ebpf.header->body_length);
+    }
+
+    if (shm_fd_cgroup_ebpf > 0) {
+        close(shm_fd_cgroup_ebpf);
+    }
+}
+
+// ----------------------------------------------------------------------------
 // cgroup network interfaces
 
 #define CGROUP_NETWORK_INTERFACE_MAX_LINE 2048
@@ -1084,8 +1164,13 @@ static inline void read_cgroup_network_interfaces(struct cgroup *cg) {
             collector_info("CGROUP: cgroup '%s' has network interface '%s' as '%s'", cg->id, i->host_device, i->container_device);
 
             // register a device rename to proc_net_dev.c
-            netdev_rename_device_add(i->host_device, i->container_device, cg->chart_id, cg->chart_labels,
-                                     k8s_is_kubepod(cg) ? "k8s." : "", cgroup_netdev_get(cg));
+            cgroup_rename_task_add(
+                i->host_device,
+                i->container_device,
+                cg->chart_id,
+                cg->chart_labels,
+                k8s_is_kubepod(cg) ? "k8s." : "",
+                cgroup_netdev_get(cg));
         }
     }
 
@@ -1226,6 +1311,8 @@ void cgroup_discovery_worker(void *ptr)
 
     service_register(SERVICE_THREAD_TYPE_LIBUV, NULL, NULL, NULL, false);
 
+    netdata_cgroup_ebpf_initialize_shm();
+
     while (service_running(SERVICE_COLLECTORS)) {
         worker_is_idle();
 
@@ -1239,6 +1326,7 @@ void cgroup_discovery_worker(void *ptr)
         discovery_find_all_cgroups();
     }
     collector_info("discovery thread stopped");
+    cgroup_cleanup_ebpf_integration();
    worker_unregister();
    service_exits();
    __atomic_store_n(&discovery_thread.exited,1,__ATOMIC_RELAXED);
diff --git a/collectors/cgroups.plugin/cgroup-internals.h b/src/collectors/cgroups.plugin/cgroup-internals.h
index a69802240..1f5be7707 100644
--- a/collectors/cgroups.plugin/cgroup-internals.h
+++ b/src/collectors/cgroups.plugin/cgroup-internals.h
@@ -287,16 +287,16 @@ struct cgroup {
     char *filename_cpu_cfs_quota;
     unsigned long long cpu_cfs_quota;
 
-    const RRDSETVAR_ACQUIRED *chart_var_cpu_limit;
+    const RRDVAR_ACQUIRED *chart_var_cpu_limit;
     NETDATA_DOUBLE prev_cpu_usage;
 
     char *filename_memory_limit;
     unsigned long long memory_limit;
-    const RRDSETVAR_ACQUIRED *chart_var_memory_limit;
+    const RRDVAR_ACQUIRED *chart_var_memory_limit;
 
     char *filename_memoryswap_limit;
     unsigned long long memoryswap_limit;
-    const RRDSETVAR_ACQUIRED *chart_var_memoryswap_limit;
+    const RRDVAR_ACQUIRED *chart_var_memoryswap_limit;
 
     const DICTIONARY_ITEM *cgroup_netdev_link;
 
@@ -386,10 +386,6 @@ extern uint32_t throttled_time_hash;
 extern uint32_t throttled_usec_hash;
 extern struct cgroup *cgroup_root;
 
-extern netdata_ebpf_cgroup_shm_t shm_cgroup_ebpf;
-extern int shm_fd_cgroup_ebpf;
-extern sem_t *shm_mutex_cgroup_ebpf;
-
 enum cgroups_type { CGROUPS_AUTODETECT_FAIL, CGROUPS_V1, CGROUPS_V2 };
 
 enum cgroups_systemd_setting {
@@ -452,15 +448,10 @@ static inline char *cgroup_chart_type(char *buffer, struct cgroup *cg) {
 }
 
 #define RRDFUNCTIONS_CGTOP_HELP "View running containers"
+#define RRDFUNCTIONS_SYSTEMD_SERVICES_HELP "View systemd services"
 
-int cgroup_function_cgroup_top(BUFFER *wb, int timeout, const char *function, void *collector_data,
-                               rrd_function_result_callback_t result_cb, void *result_cb_data,
-                               rrd_function_is_cancelled_cb_t is_cancelled_cb, void *is_cancelled_cb_data,
-                               rrd_function_register_canceller_cb_t register_canceller_cb, void *register_canceller_cb_data);
-int cgroup_function_systemd_top(BUFFER *wb, int timeout, const char *function, void *collector_data,
-                                rrd_function_result_callback_t result_cb, void *result_cb_data,
-                                rrd_function_is_cancelled_cb_t is_cancelled_cb, void *is_cancelled_cb_data,
-                                rrd_function_register_canceller_cb_t register_canceller_cb, void *register_canceller_cb_data);
+int cgroup_function_cgroup_top(BUFFER *wb, const char *function);
+int cgroup_function_systemd_top(BUFFER *wb, const char *function);
 
 void cgroup_netdev_link_init(void);
 const DICTIONARY_ITEM *cgroup_netdev_get(struct cgroup *cg);
diff --git a/collectors/cgroups.plugin/cgroup-name.sh.in b/src/collectors/cgroups.plugin/cgroup-name.sh.in
index 0f8b63256..0f8b63256 100755
--- a/collectors/cgroups.plugin/cgroup-name.sh.in
+++ b/src/collectors/cgroups.plugin/cgroup-name.sh.in
diff --git a/collectors/cgroups.plugin/cgroup-network-helper.sh.in b/src/collectors/cgroups.plugin/cgroup-network-helper.sh.in
index da9b9162a..da9b9162a 100755
--- a/collectors/cgroups.plugin/cgroup-network-helper.sh.in
+++ b/src/collectors/cgroups.plugin/cgroup-network-helper.sh.in
diff --git a/collectors/cgroups.plugin/cgroup-network.c b/src/collectors/cgroups.plugin/cgroup-network.c
index 508ea07c6..085a6aa6f 100644
--- a/collectors/cgroups.plugin/cgroup-network.c
+++ b/src/collectors/cgroups.plugin/cgroup-network.c
@@ -7,7 +7,6 @@
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE /* See feature_test_macros(7) */
 #endif
-#include <sched.h>
 #endif
 
 char env_netdata_host_prefix[FILENAME_MAX + 50] = "";
@@ -183,7 +182,7 @@ int proc_pid_fd(const char *prefix, const char *ns, pid_t pid) {
     char filename[FILENAME_MAX + 1];
     snprintfz(filename, FILENAME_MAX, "%s/proc/%d/%s", prefix, (int)pid, ns);
 
-    int fd = open(filename, O_RDONLY);
+    int fd = open(filename, O_RDONLY | O_CLOEXEC);
     if(fd == -1)
         collector_error("Cannot open proc_pid_fd() file '%s'", filename);
 
diff --git a/collectors/cgroups.plugin/cgroup-top.c b/src/collectors/cgroups.plugin/cgroup-top.c
index 8d44d3b56..aa413dad1 100644
--- a/collectors/cgroups.plugin/cgroup-top.c
+++ b/src/collectors/cgroups.plugin/cgroup-top.c
@@ -98,13 +98,7 @@ void cgroup_netdev_get_bandwidth(struct cgroup *cg, NETDATA_DOUBLE *received, NE
     *sent = t->sent[slot];
 }
 
-int cgroup_function_cgroup_top(BUFFER *wb, int timeout __maybe_unused, const char *function __maybe_unused,
-                               void *collector_data __maybe_unused,
-                               rrd_function_result_callback_t result_cb, void *result_cb_data,
-                               rrd_function_is_cancelled_cb_t is_cancelled_cb, void *is_cancelled_cb_data,
-                               rrd_function_register_canceller_cb_t register_canceller_cb __maybe_unused,
-                               void *register_canceller_cb_data __maybe_unused) {
-
+int cgroup_function_cgroup_top(BUFFER *wb, const char *function __maybe_unused) {
     buffer_flush(wb);
     wb->content_type = CT_APPLICATION_JSON;
     buffer_json_initialize(wb, "\"", "\"", 0, true, BUFFER_JSON_OPTIONS_DEFAULT);
@@ -113,11 +107,13 @@ int cgroup_function_cgroup_top(BUFFER *wb, int timeout __maybe_unused, const cha
     buffer_json_member_add_uint64(wb, "status", HTTP_RESP_OK);
     buffer_json_member_add_string(wb, "type", "table");
     buffer_json_member_add_time_t(wb, "update_every", 1);
+    buffer_json_member_add_boolean(wb, "has_history", false);
     buffer_json_member_add_string(wb, "help", RRDFUNCTIONS_CGTOP_HELP);
     buffer_json_member_add_array(wb, "data");
 
     double max_pids = 0.0;
     double max_cpu = 0.0;
+    double max_cpu_throttled = 0.0;
     double max_ram = 0.0;
     double max_disk_io_read = 0.0;
     double max_disk_io_written = 0.0;
@@ -149,6 +145,9 @@ int cgroup_function_cgroup_top(BUFFER *wb, int timeout __maybe_unused, const cha
             max_cpu = MAX(max_cpu, cpu);
         }
 
+        double cpu_throttled = (double)cg->cpuacct_cpu_throttling.nr_throttled_perc;
+        max_cpu_throttled = MAX(max_cpu_throttled, cpu_throttled);
+
         double ram = rrddim_get_last_stored_value(cg->st_mem_rd_ram, &max_ram, 1.0);
 
         rd = cg->st_throttle_io_rd_read ? cg->st_throttle_io_rd_read : cg->st_io_rd_read;
@@ -167,6 +166,7 @@ int cgroup_function_cgroup_top(BUFFER *wb, int timeout __maybe_unused, const cha
 
         buffer_json_add_array_item_double(wb, pids_current);
         buffer_json_add_array_item_double(wb, cpu);
+        buffer_json_add_array_item_double(wb, cpu_throttled);
         buffer_json_add_array_item_double(wb, ram);
         buffer_json_add_array_item_double(wb, disk_io_read);
         buffer_json_add_array_item_double(wb, disk_io_written);
@@ -215,6 +215,13 @@ int cgroup_function_cgroup_top(BUFFER *wb, int timeout __maybe_unused, const cha
                                    RRDF_FIELD_OPTS_VISIBLE, NULL);
 
+    buffer_rrdf_table_add_field(wb, field_id++, "CPU Throttling", "CPU Throttled Runnable Periods",
+                                RRDF_FIELD_TYPE_BAR_WITH_INTEGER, RRDF_FIELD_VISUAL_BAR, RRDF_FIELD_TRANSFORM_NUMBER,
+                                0, "%", max_cpu_throttled, RRDF_FIELD_SORT_DESCENDING, NULL,
+                                RRDF_FIELD_SUMMARY_SUM, RRDF_FIELD_FILTER_NONE,
+                                is_inside_k8s ? RRDF_FIELD_OPTS_VISIBLE : RRDF_FIELD_OPTS_NONE,
+                                NULL);
+
     // RAM
     buffer_rrdf_table_add_field(wb, field_id++, "RAM", "RAM Usage",
                                 RRDF_FIELD_TYPE_BAR_WITH_INTEGER, RRDF_FIELD_VISUAL_BAR, RRDF_FIELD_TRANSFORM_NUMBER,
@@ -331,25 +338,10 @@ int cgroup_function_cgroup_top(BUFFER *wb, int timeout __maybe_unused, const cha
     buffer_json_member_add_time_t(wb, "expires", now_realtime_sec() + 1);
     buffer_json_finalize(wb);
 
-    int response = HTTP_RESP_OK;
-    if(is_cancelled_cb && is_cancelled_cb(is_cancelled_cb_data)) {
-        buffer_flush(wb);
-        response = HTTP_RESP_CLIENT_CLOSED_REQUEST;
-    }
-
-    if(result_cb)
-        result_cb(wb, response, result_cb_data);
-
-    return response;
+    return HTTP_RESP_OK;
 }
 
-int cgroup_function_systemd_top(BUFFER *wb, int timeout __maybe_unused, const char *function __maybe_unused,
-                                void *collector_data __maybe_unused,
-                                rrd_function_result_callback_t result_cb, void *result_cb_data,
-                                rrd_function_is_cancelled_cb_t is_cancelled_cb, void *is_cancelled_cb_data,
-                                rrd_function_register_canceller_cb_t register_canceller_cb __maybe_unused,
-                                void *register_canceller_cb_data __maybe_unused) {
-
+int cgroup_function_systemd_top(BUFFER *wb, const char *function __maybe_unused) {
     buffer_flush(wb);
     wb->content_type = CT_APPLICATION_JSON;
     buffer_json_initialize(wb, "\"", "\"", 0, true, BUFFER_JSON_OPTIONS_DEFAULT);
@@ -358,6 +350,7 @@ int cgroup_function_systemd_top(BUFFER *wb, int timeout __maybe_unused, const ch
     buffer_json_member_add_uint64(wb, "status", HTTP_RESP_OK);
     buffer_json_member_add_string(wb, "type", "table");
     buffer_json_member_add_time_t(wb, "update_every", 1);
+    buffer_json_member_add_boolean(wb, "has_history", false);
     buffer_json_member_add_string(wb, "help", RRDFUNCTIONS_CGTOP_HELP);
     buffer_json_member_add_array(wb, "data");
 
@@ -507,14 +500,5 @@ int cgroup_function_systemd_top(BUFFER *wb, int timeout __maybe_unused, const ch
     buffer_json_member_add_time_t(wb, "expires", now_realtime_sec() + 1);
     buffer_json_finalize(wb);
 
-    int response = HTTP_RESP_OK;
-    if(is_cancelled_cb && is_cancelled_cb(is_cancelled_cb_data)) {
-        buffer_flush(wb);
-        response = HTTP_RESP_CLIENT_CLOSED_REQUEST;
-    }
-
-    if(result_cb)
-        result_cb(wb, response, result_cb_data);
-
-    return response;
+    return HTTP_RESP_OK;
 }
diff --git a/collectors/cgroups.plugin/integrations/containers.md b/src/collectors/cgroups.plugin/integrations/containers.md
index 6273d1e91..e769fc5f9 100644
--- a/collectors/cgroups.plugin/integrations/containers.md
+++ b/src/collectors/cgroups.plugin/integrations/containers.md
@@ -1,9 +1,9 @@
 <!--startmeta
-custom_edit_url: "https://github.com/netdata/netdata/edit/master/collectors/cgroups.plugin/integrations/containers.md"
-meta_yaml: "https://github.com/netdata/netdata/edit/master/collectors/cgroups.plugin/metadata.yaml"
+custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/collectors/cgroups.plugin/integrations/containers.md"
+meta_yaml: "https://github.com/netdata/netdata/edit/master/src/collectors/cgroups.plugin/metadata.yaml"
 sidebar_label: "Containers"
 learn_status: "Published"
-learn_rel_path: "Data Collection/Containers and VMs"
+learn_rel_path: "Collecting Metrics/Containers and VMs"
 most_popular: True
 message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE COLLECTOR'S metadata.yaml FILE"
 endmeta-->
@@ -140,10 +140,10 @@ The following alerts are available:
 
 | Alert name | On metric | Description |
 |:------------|:----------|:------------|
-| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes |
-| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization |
-| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute |
-| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute |
+| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes |
+| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization |
+| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute |
+| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute |
 
 
 ## Setup
diff --git a/collectors/cgroups.plugin/integrations/kubernetes_containers.md b/src/collectors/cgroups.plugin/integrations/kubernetes_containers.md
index 9be32a12a..c71b5736c 100644
--- a/collectors/cgroups.plugin/integrations/kubernetes_containers.md
+++ b/src/collectors/cgroups.plugin/integrations/kubernetes_containers.md
@@ -1,9 +1,9 @@
 <!--startmeta
-custom_edit_url: "https://github.com/netdata/netdata/edit/master/collectors/cgroups.plugin/integrations/kubernetes_containers.md"
-meta_yaml: "https://github.com/netdata/netdata/edit/master/collectors/cgroups.plugin/metadata.yaml"
+custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/collectors/cgroups.plugin/integrations/kubernetes_containers.md"
+meta_yaml: "https://github.com/netdata/netdata/edit/master/src/collectors/cgroups.plugin/metadata.yaml"
 sidebar_label: "Kubernetes Containers"
 learn_status: "Published"
-learn_rel_path: "Data Collection/Kubernetes"
+learn_rel_path: "Collecting Metrics/Kubernetes"
 most_popular: True
 message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE COLLECTOR'S metadata.yaml FILE"
 endmeta-->
@@ -154,10 +154,10 @@ The following alerts are available:
 
 | Alert name | On metric | Description |
 |:------------|:----------|:------------|
-| [ k8s_cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | k8s.cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes |
-| [ k8s_cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | k8s.cgroup.mem_usage | cgroup memory utilization |
-| [ k8s_cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | k8s.cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute |
-| [ k8s_cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | k8s.cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute |
+| [ k8s_cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | k8s.cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes |
+| [ k8s_cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | k8s.cgroup.mem_usage | cgroup memory utilization |
+| [ k8s_cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | k8s.cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute |
+| [ k8s_cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | k8s.cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute |
 
 
 ## Setup
diff --git a/collectors/cgroups.plugin/integrations/libvirt_containers.md b/src/collectors/cgroups.plugin/integrations/libvirt_containers.md
index fed454698..409b43609 100644
--- a/collectors/cgroups.plugin/integrations/libvirt_containers.md
+++ b/src/collectors/cgroups.plugin/integrations/libvirt_containers.md
@@ -1,9 +1,9 @@
 <!--startmeta
-custom_edit_url: "https://github.com/netdata/netdata/edit/master/collectors/cgroups.plugin/integrations/libvirt_containers.md"
-meta_yaml: "https://github.com/netdata/netdata/edit/master/collectors/cgroups.plugin/metadata.yaml"
+custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/collectors/cgroups.plugin/integrations/libvirt_containers.md"
+meta_yaml: "https://github.com/netdata/netdata/edit/master/src/collectors/cgroups.plugin/metadata.yaml"
 sidebar_label: "Libvirt Containers"
 learn_status: "Published"
-learn_rel_path: "Data Collection/Containers and VMs"
+learn_rel_path: "Collecting Metrics/Containers and VMs"
 most_popular: True
 message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE COLLECTOR'S metadata.yaml FILE"
 endmeta-->
@@ -140,10 +140,10 @@ The following alerts are available:
 
 | Alert name | On metric | Description |
 |:------------|:----------|:------------|
-| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes |
-| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization |
-| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute |
-| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute |
+| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes |
+| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization |
+| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute |
+| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute |
 
 
 ## Setup
diff --git a/collectors/cgroups.plugin/integrations/lxc_containers.md b/src/collectors/cgroups.plugin/integrations/lxc_containers.md
index 3f05ffd5f..14897e468 100644
--- a/collectors/cgroups.plugin/integrations/lxc_containers.md
+++ b/src/collectors/cgroups.plugin/integrations/lxc_containers.md
@@ -1,9 +1,9 @@
 <!--startmeta
-custom_edit_url: "https://github.com/netdata/netdata/edit/master/collectors/cgroups.plugin/integrations/lxc_containers.md"
-meta_yaml: "https://github.com/netdata/netdata/edit/master/collectors/cgroups.plugin/metadata.yaml"
+custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/collectors/cgroups.plugin/integrations/lxc_containers.md"
+meta_yaml: "https://github.com/netdata/netdata/edit/master/src/collectors/cgroups.plugin/metadata.yaml"
 sidebar_label: "LXC Containers"
 learn_status: "Published"
-learn_rel_path: "Data Collection/Containers and VMs"
+learn_rel_path: "Collecting Metrics/Containers and VMs"
 most_popular: True
 message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE COLLECTOR'S metadata.yaml FILE"
 endmeta-->
@@ -140,10 +140,10 @@ The following alerts are available:
 
 | Alert name | On metric | Description |
 |:------------|:----------|:------------|
-| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes |
-| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization |
-| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute |
-| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute |
+| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes |
+| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization |
+| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute |
+| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute |
 
 
 ## Setup
diff --git a/collectors/cgroups.plugin/integrations/ovirt_containers.md b/src/collectors/cgroups.plugin/integrations/ovirt_containers.md
index 5771aeea1..49f8c8091 100644
--- a/collectors/cgroups.plugin/integrations/ovirt_containers.md
+++ b/src/collectors/cgroups.plugin/integrations/ovirt_containers.md
@@ -1,9 +1,9 @@
 <!--startmeta
-custom_edit_url: "https://github.com/netdata/netdata/edit/master/collectors/cgroups.plugin/integrations/ovirt_containers.md"
-meta_yaml: "https://github.com/netdata/netdata/edit/master/collectors/cgroups.plugin/metadata.yaml"
+custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/collectors/cgroups.plugin/integrations/ovirt_containers.md"
+meta_yaml: "https://github.com/netdata/netdata/edit/master/src/collectors/cgroups.plugin/metadata.yaml"
 sidebar_label: "oVirt Containers"
 learn_status: "Published"
-learn_rel_path: "Data Collection/Containers and VMs"
+learn_rel_path: "Collecting Metrics/Containers and VMs"
 most_popular: True
 message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE COLLECTOR'S metadata.yaml FILE"
 endmeta-->
@@ -140,10 +140,10 @@ The following alerts are available:
 
 | Alert name | On metric | Description |
 |:------------|:----------|:------------|
-| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes |
-| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization |
-| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute |
-| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute |
+| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes |
+| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization |
+| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute |
+| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute |
 
 
 ## Setup
diff --git a/collectors/cgroups.plugin/integrations/proxmox_containers.md b/src/collectors/cgroups.plugin/integrations/proxmox_containers.md
index 1804a40ca..fa2177b46 100644
--- a/collectors/cgroups.plugin/integrations/proxmox_containers.md
+++ b/src/collectors/cgroups.plugin/integrations/proxmox_containers.md
@@ -1,9 +1,9 @@
 <!--startmeta
-custom_edit_url: "https://github.com/netdata/netdata/edit/master/collectors/cgroups.plugin/integrations/proxmox_containers.md"
-meta_yaml: "https://github.com/netdata/netdata/edit/master/collectors/cgroups.plugin/metadata.yaml"
+custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/collectors/cgroups.plugin/integrations/proxmox_containers.md"
+meta_yaml: "https://github.com/netdata/netdata/edit/master/src/collectors/cgroups.plugin/metadata.yaml"
 sidebar_label: "Proxmox Containers"
 learn_status: "Published"
-learn_rel_path: "Data Collection/Containers and VMs"
+learn_rel_path: "Collecting Metrics/Containers and VMs"
 most_popular: True
 message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE COLLECTOR'S metadata.yaml FILE"
 endmeta-->
@@ -140,10 +140,10 @@ The following alerts are available:
 
 | Alert name | On metric | Description |
 |:------------|:----------|:------------|
-| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes |
-| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization |
-| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute |
-| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute |
+| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes |
+| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization |
+| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute |
+| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute |
 
 
 ## Setup
diff --git a/collectors/cgroups.plugin/integrations/systemd_services.md b/src/collectors/cgroups.plugin/integrations/systemd_services.md
index 0ce906366..e3a80d549 100644
--- a/collectors/cgroups.plugin/integrations/systemd_services.md
+++ b/src/collectors/cgroups.plugin/integrations/systemd_services.md
@@ -1,9 +1,9 @@
 <!--startmeta
-custom_edit_url: "https://github.com/netdata/netdata/edit/master/collectors/cgroups.plugin/integrations/systemd_services.md"
-meta_yaml: "https://github.com/netdata/netdata/edit/master/collectors/cgroups.plugin/metadata.yaml"
+custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/collectors/cgroups.plugin/integrations/systemd_services.md"
+meta_yaml: "https://github.com/netdata/netdata/edit/master/src/collectors/cgroups.plugin/metadata.yaml"
 sidebar_label: "Systemd Services"
 learn_status: "Published"
-learn_rel_path: "Data Collection/Systemd"
+learn_rel_path: "Collecting Metrics/Systemd"
 most_popular: True
 message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE COLLECTOR'S metadata.yaml FILE"
 endmeta-->
diff --git a/collectors/cgroups.plugin/integrations/virtual_machines.md b/src/collectors/cgroups.plugin/integrations/virtual_machines.md
index 6a64923c4..b81c82fb9 100644
--- a/collectors/cgroups.plugin/integrations/virtual_machines.md
+++ b/src/collectors/cgroups.plugin/integrations/virtual_machines.md
@@ -1,9 +1,9 @@
 <!--startmeta
-custom_edit_url: "https://github.com/netdata/netdata/edit/master/collectors/cgroups.plugin/integrations/virtual_machines.md"
-meta_yaml: "https://github.com/netdata/netdata/edit/master/collectors/cgroups.plugin/metadata.yaml"
+custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/collectors/cgroups.plugin/integrations/virtual_machines.md"
+meta_yaml: "https://github.com/netdata/netdata/edit/master/src/collectors/cgroups.plugin/metadata.yaml"
 sidebar_label: "Virtual Machines"
 learn_status: "Published"
-learn_rel_path: "Data Collection/Containers and VMs"
+learn_rel_path: "Collecting Metrics/Containers and VMs"
 most_popular: True
 message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE COLLECTOR'S metadata.yaml FILE"
 endmeta-->
@@ -140,10 +140,10 @@ The following alerts are available:
 
 | Alert name | On metric | Description |
 |:------------|:----------|:------------|
-| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes |
-| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization |
-| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute |
-| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute |
+| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes |
+| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization |
+| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute |
+| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute |
 
 
 ## Setup
diff --git a/src/collectors/cgroups.plugin/metadata.yaml b/src/collectors/cgroups.plugin/metadata.yaml
new file mode 100644
index 000000000..100681386
--- /dev/null
+++ b/src/collectors/cgroups.plugin/metadata.yaml
@@ -0,0 +1,1022 @@

plugin_name: cgroups.plugin
modules:
  - &module
    meta: &meta
      plugin_name: cgroups.plugin
      module_name: /sys/fs/cgroup
      monitored_instance:
        name: Containers
        link: ""
        categories:
          - data-collection.containers-and-vms
        icon_filename: container.svg
      related_resources:
        integrations:
          list: []
      info_provided_to_referring_integrations:
        description: ""
      keywords:
        - containers
      most_popular: true
    overview: &overview
      data_collection: &data_collection
        metrics_description: "Monitor Containers for performance, resource usage, and health status."
        method_description: ""
      supported_platforms:
        include: []
        exclude: []
      multi_instance: true
      additional_permissions:
        description: ""
      default_behavior:
        auto_detection:
          description: ""
        limits:
          description: ""
        performance_impact:
          description: ""
    setup:
      prerequisites:
        list: []
      configuration:
        file:
          name: ""
          description: ""
        options:
          description: ""
          folding:
            title: ""
            enabled: true
          list: []
        examples:
          folding:
            enabled: true
            title: ""
          list: []
    troubleshooting:
      problems:
        list: []
    alerts:
      - name: cgroup_10min_cpu_usage
        link: https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf
        metric: cgroup.cpu_limit
        info: average cgroup CPU utilization over the last 10 minutes
      - name: cgroup_ram_in_use
        link: https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf
        metric: cgroup.mem_usage
        info: cgroup memory utilization
      - name: cgroup_1m_received_packets_rate
        link: https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf
        metric: cgroup.net_packets
        info: average number of packets received by the network interface ${label:device} over the last minute
      - name: cgroup_10s_received_packets_storm
        link: https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf
        metric: cgroup.net_packets
        info:
          ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over
          the last minute
    metrics:
      folding:
        title: Metrics
        enabled: false
      description: ""
      availability: []
      scopes:
        - name: cgroup
          description: ""
          labels:
            - name: container_name
              description: The container name or group path if name resolution fails.
            - name: image
              description: Docker/Podman container image name.
+ metrics: + - name: cgroup.cpu_limit + description: CPU Usage within the limits + unit: "percentage" + chart_type: line + dimensions: + - name: used + - name: cgroup.cpu + description: CPU Usage (100% = 1 core) + unit: "percentage" + chart_type: stacked + dimensions: + - name: user + - name: system + - name: cgroup.cpu_per_core + description: CPU Usage (100% = 1 core) Per Core + unit: "percentage" + chart_type: line + dimensions: + - name: a dimension per core + - name: cgroup.throttled + description: CPU Throttled Runnable Periods + unit: "percentage" + chart_type: line + dimensions: + - name: throttled + - name: cgroup.throttled_duration + description: CPU Throttled Time Duration + unit: "ms" + chart_type: line + dimensions: + - name: duration + - name: cgroup.cpu_shares + description: CPU Time Relative Share + unit: "shares" + chart_type: line + dimensions: + - name: shares + - name: cgroup.mem + description: Memory Usage + unit: "MiB" + chart_type: stacked + dimensions: + - name: cache + - name: rss + - name: swap + - name: rss_huge + - name: mapped_file + - name: cgroup.writeback + description: Writeback Memory + unit: "MiB" + chart_type: area + dimensions: + - name: dirty + - name: writeback + - name: cgroup.mem_activity + description: Memory Activity + unit: "MiB/s" + chart_type: line + dimensions: + - name: in + - name: out + - name: cgroup.pgfaults + description: Memory Page Faults + unit: "MiB/s" + chart_type: line + dimensions: + - name: pgfault + - name: swap + - name: cgroup.mem_usage + description: Used Memory + unit: "MiB" + chart_type: stacked + dimensions: + - name: ram + - name: swap + - name: cgroup.mem_usage_limit + description: Used RAM within the limits + unit: "MiB" + chart_type: stacked + dimensions: + - name: available + - name: used + - name: cgroup.mem_utilization + description: Memory Utilization + unit: "percentage" + chart_type: line + dimensions: + - name: utilization + - name: cgroup.mem_failcnt + description: Memory Limit Failures + unit: "count" + chart_type: line + dimensions: + - name: failures + - name: cgroup.io + description: I/O Bandwidth (all disks) + unit: "KiB/s" + chart_type: area + dimensions: + - name: read + - name: write + - name: cgroup.serviced_ops + description: Serviced I/O Operations (all disks) + unit: "operations/s" + chart_type: line + dimensions: + - name: read + - name: write + - name: cgroup.throttle_io + description: Throttle I/O Bandwidth (all disks) + unit: "KiB/s" + chart_type: area + dimensions: + - name: read + - name: write + - name: cgroup.throttle_serviced_ops + description: Throttle Serviced I/O Operations (all disks) + unit: "operations/s" + chart_type: line + dimensions: + - name: read + - name: write + - name: cgroup.queued_ops + description: Queued I/O Operations (all disks) + unit: "operations" + chart_type: line + dimensions: + - name: read + - name: write + - name: cgroup.merged_ops + description: Merged I/O Operations (all disks) + unit: "operations/s" + chart_type: line + dimensions: + - name: read + - name: write + - name: cgroup.cpu_some_pressure + description: CPU some pressure + unit: "percentage" + chart_type: line + dimensions: + - name: some10 + - name: some60 + - name: some300 + - name: cgroup.cpu_some_pressure_stall_time + description: CPU some pressure stall time + unit: "ms" + chart_type: line + dimensions: + - name: time + - name: cgroup.cpu_full_pressure + description: CPU full pressure + unit: "percentage" + chart_type: line + dimensions: + - name: some10 + - name: some60 + - name: some300 + - 
name: cgroup.cpu_full_pressure_stall_time + description: CPU full pressure stall time + unit: "ms" + chart_type: line + dimensions: + - name: time + - name: cgroup.memory_some_pressure + description: Memory some pressure + unit: "percentage" + chart_type: line + dimensions: + - name: some10 + - name: some60 + - name: some300 + - name: cgroup.memory_some_pressure_stall_time + description: Memory some pressure stall time + unit: "ms" + chart_type: line + dimensions: + - name: time + - name: cgroup.memory_full_pressure + description: Memory full pressure + unit: "percentage" + chart_type: line + dimensions: + - name: some10 + - name: some60 + - name: some300 + - name: cgroup.memory_full_pressure_stall_time + description: Memory full pressure stall time + unit: "ms" + chart_type: line + dimensions: + - name: time + - name: cgroup.io_some_pressure + description: I/O some pressure + unit: "percentage" + chart_type: line + dimensions: + - name: some10 + - name: some60 + - name: some300 + - name: cgroup.io_some_pressure_stall_time + description: I/O some pressure stall time + unit: "ms" + chart_type: line + dimensions: + - name: time + - name: cgroup.io_full_pressure + description: I/O full pressure + unit: "percentage" + chart_type: line + dimensions: + - name: some10 + - name: some60 + - name: some300 + - name: cgroup.io_full_pressure_stall_time + description: I/O full pressure stall time + unit: "ms" + chart_type: line + dimensions: + - name: time + - name: cgroup.pids_current + description: Number of processes + unit: "pids" + chart_type: line + dimensions: + - name: pids + - name: cgroup network device + description: "" + labels: + - name: container_name + description: The container name or group path if name resolution fails. + - name: image + description: Docker/Podman container image name. + - name: device + description: "The name of the host network interface linked to the container's network interface." + - name: container_device + description: Container network interface name. + - name: interface_type + description: 'Network interface type. Always "virtual" for the containers.'
+ metrics: + - name: cgroup.net_net + description: Bandwidth + unit: "kilobits/s" + chart_type: area + dimensions: + - name: received + - name: sent + - name: cgroup.net_packets + description: Packets + unit: "pps" + chart_type: line + dimensions: + - name: received + - name: sent + - name: multicast + - name: cgroup.net_errors + description: Interface Errors + unit: "errors/s" + chart_type: line + dimensions: + - name: inbound + - name: outbound + - name: cgroup.net_drops + description: Interface Drops + unit: "errors/s" + chart_type: line + dimensions: + - name: inbound + - name: outbound + - name: cgroup.net_fifo + description: Interface FIFO Buffer Errors + unit: "errors/s" + chart_type: line + dimensions: + - name: receive + - name: transmit + - name: cgroup.net_compressed + description: Interface Compressed Packets + unit: "pps" + chart_type: line + dimensions: + - name: receive + - name: sent + - name: cgroup.net_events + description: Network Interface Events + unit: "events/s" + chart_type: line + dimensions: + - name: frames + - name: collisions + - name: carrier + - name: cgroup.net_operstate + description: Interface Operational State + unit: "state" + chart_type: line + dimensions: + - name: up + - name: down + - name: notpresent + - name: lowerlayerdown + - name: testing + - name: dormant + - name: unknown + - name: cgroup.net_carrier + description: Interface Physical Link State + unit: "state" + chart_type: line + dimensions: + - name: up + - name: down + - name: cgroup.net_mtu + description: Interface MTU + unit: "octets" + chart_type: line + dimensions: + - name: mtu + - <<: *module + meta: + <<: *meta + monitored_instance: + name: Kubernetes Containers + link: https://kubernetes.io/ + icon_filename: kubernetes.svg + categories: + #- data-collection.containers-and-vms + - data-collection.kubernetes + keywords: + - k8s + - kubernetes + - pods + - containers + overview: + <<: *overview + data_collection: + <<: *data_collection + metrics_description: Monitor Kubernetes Clusters for performance, resource usage, and health status. + alerts: + - name: k8s_cgroup_10min_cpu_usage + link: https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf + metric: k8s.cgroup.cpu_limit + info: average cgroup CPU utilization over the last 10 minutes + - name: k8s_cgroup_ram_in_use + link: https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf + metric: k8s.cgroup.mem_usage + info: cgroup memory utilization + - name: k8s_cgroup_1m_received_packets_rate + link: https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf + metric: k8s.cgroup.net_packets + info: average number of packets received by the network interface ${label:device} over the last minute + - name: k8s_cgroup_10s_received_packets_storm + link: https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf + metric: k8s.cgroup.net_packets + info: + ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over + the last minute + metrics: + folding: + title: Metrics + enabled: false + description: "" + availability: [] + scopes: + - name: k8s cgroup + description: These metrics refer to the Pod container. + labels: + - name: k8s_node_name + description: 'Node name. The value of _pod.spec.nodeName_.' + - name: k8s_namespace + description: 'Namespace name. The value of _pod.metadata.namespace_.'
+ - name: k8s_controller_kind + description: 'Controller kind (ReplicaSet, DaemonSet, StatefulSet, Job, etc.). The value of _pod.OwnerReferences.Controller.Kind_.' + - name: k8s_controller_name + description: 'Controller name. The value of _pod.OwnerReferences.Controller.Name_.' + - name: k8s_pod_name + description: 'Pod name. The value of _pod.metadata.name_.' + - name: k8s_container_name + description: 'Container name. The value of _pod.spec.containers.name_.' + - name: k8s_kind + description: 'Instance kind: "pod" or "container".' + - name: k8s_qos_class + description: 'QoS class (guaranteed, burstable, besteffort).' + - name: k8s_cluster_id + description: 'Cluster ID. The value of kube-system namespace _namespace.metadata.uid_.' + metrics: + - name: k8s.cgroup.cpu_limit + description: CPU Usage within the limits + unit: "percentage" + chart_type: line + dimensions: + - name: used + - name: k8s.cgroup.cpu + description: CPU Usage (100% = 1000 mCPU) + unit: "percentage" + chart_type: stacked + dimensions: + - name: user + - name: system + - name: k8s.cgroup.cpu_per_core + description: CPU Usage (100% = 1000 mCPU) Per Core + unit: "percentage" + chart_type: line + dimensions: + - name: a dimension per core + - name: k8s.cgroup.throttled + description: CPU Throttled Runnable Periods + unit: "percentage" + chart_type: line + dimensions: + - name: throttled + - name: k8s.cgroup.throttled_duration + description: CPU Throttled Time Duration + unit: "ms" + chart_type: line + dimensions: + - name: duration + - name: k8s.cgroup.cpu_shares + description: CPU Time Relative Share + unit: "shares" + chart_type: line + dimensions: + - name: shares + - name: k8s.cgroup.mem + description: Memory Usage + unit: "MiB" + chart_type: stacked + dimensions: + - name: cache + - name: rss + - name: swap + - name: rss_huge + - name: mapped_file + - name: k8s.cgroup.writeback + description: Writeback Memory + unit: "MiB" + chart_type: area + dimensions: + - name: dirty + - name: writeback + - name: k8s.cgroup.mem_activity + description: Memory Activity + unit: "MiB/s" + chart_type: line + dimensions: + - name: in + - name: out + - name: k8s.cgroup.pgfaults + description: Memory Page Faults + unit: "MiB/s" + chart_type: line + dimensions: + - name: pgfault + - name: swap + - name: k8s.cgroup.mem_usage + description: Used Memory + unit: "MiB" + chart_type: stacked + dimensions: + - name: ram + - name: swap + - name: k8s.cgroup.mem_usage_limit + description: Used RAM within the limits + unit: "MiB" + chart_type: stacked + dimensions: + - name: available + - name: used + - name: k8s.cgroup.mem_utilization + description: Memory Utilization + unit: "percentage" + chart_type: line + dimensions: + - name: utilization + - name: k8s.cgroup.mem_failcnt + description: Memory Limit Failures + unit: "count" + chart_type: line + dimensions: + - name: failures + - name: k8s.cgroup.io + description: I/O Bandwidth (all disks) + unit: "KiB/s" + chart_type: area + dimensions: + - name: read + - name: write + - name: k8s.cgroup.serviced_ops + description: Serviced I/O Operations (all disks) + unit: "operations/s" + chart_type: line + dimensions: + - name: read + - name: write + - name: k8s.cgroup.throttle_io + description: Throttle I/O Bandwidth (all disks) + unit: "KiB/s" + chart_type: area + dimensions: + - name: read + - name: write + - name: k8s.cgroup.throttle_serviced_ops + description: Throttle Serviced I/O Operations (all disks) + unit: "operations/s" + chart_type: line + dimensions: + - name: read + - name: write + - name: 
k8s.cgroup.queued_ops + description: Queued I/O Operations (all disks) + unit: "operations" + chart_type: line + dimensions: + - name: read + - name: write + - name: k8s.cgroup.merged_ops + description: Merged I/O Operations (all disks) + unit: "operations/s" + chart_type: line + dimensions: + - name: read + - name: write + - name: k8s.cgroup.cpu_some_pressure + description: CPU some pressure + unit: "percentage" + chart_type: line + dimensions: + - name: some10 + - name: some60 + - name: some300 + - name: k8s.cgroup.cpu_some_pressure_stall_time + description: CPU some pressure stall time + unit: "ms" + chart_type: line + dimensions: + - name: time + - name: k8s.cgroup.cpu_full_pressure + description: CPU full pressure + unit: "percentage" + chart_type: line + dimensions: + - name: some10 + - name: some60 + - name: some300 + - name: k8s.cgroup.cpu_full_pressure_stall_time + description: CPU full pressure stall time + unit: "ms" + chart_type: line + dimensions: + - name: time + - name: k8s.cgroup.memory_some_pressure + description: Memory some pressure + unit: "percentage" + chart_type: line + dimensions: + - name: some10 + - name: some60 + - name: some300 + - name: k8s.cgroup.memory_some_pressure_stall_time + description: Memory some pressure stall time + unit: "ms" + chart_type: line + dimensions: + - name: time + - name: k8s.cgroup.memory_full_pressure + description: Memory full pressure + unit: "percentage" + chart_type: line + dimensions: + - name: some10 + - name: some60 + - name: some300 + - name: k8s.cgroup.memory_full_pressure_stall_time + description: Memory full pressure stall time + unit: "ms" + chart_type: line + dimensions: + - name: time + - name: k8s.cgroup.io_some_pressure + description: I/O some pressure + unit: "percentage" + chart_type: line + dimensions: + - name: some10 + - name: some60 + - name: some300 + - name: k8s.cgroup.io_some_pressure_stall_time + description: I/O some pressure stall time + unit: "ms" + chart_type: line + dimensions: + - name: time + - name: k8s.cgroup.io_full_pressure + description: I/O full pressure + unit: "percentage" + chart_type: line + dimensions: + - name: some10 + - name: some60 + - name: some300 + - name: k8s.cgroup.io_full_pressure_stall_time + description: I/O full pressure stall time + unit: "ms" + chart_type: line + dimensions: + - name: time + - name: k8s.cgroup.pids_current + description: Number of processes + unit: "pids" + chart_type: line + dimensions: + - name: pids + - name: k8s cgroup network device + description: These metrics refer to the Pod container network interface. + labels: + - name: device + description: "The name of the host network interface linked to the container's network interface." + - name: container_device + description: Container network interface name. + - name: interface_type + description: 'Network interface type. Always "virtual" for the containers.' + - name: k8s_node_name + description: 'Node name. The value of _pod.spec.nodeName_.' + - name: k8s_namespace + description: 'Namespace name. The value of _pod.metadata.namespace_.' + - name: k8s_controller_kind + description: 'Controller kind (ReplicaSet, DaemonSet, StatefulSet, Job, etc.). The value of _pod.OwnerReferences.Controller.Kind_.' + - name: k8s_controller_name + description: 'Controller name. The value of _pod.OwnerReferences.Controller.Name_.' + - name: k8s_pod_name + description: 'Pod name. The value of _pod.metadata.name_.' + - name: k8s_container_name + description: 'Container name. The value of _pod.spec.containers.name_.'
+ - name: k8s_kind + description: 'Instance kind: "pod" or "container".' + - name: k8s_qos_class + description: 'QoS class (guaranteed, burstable, besteffort).' + - name: k8s_cluster_id + description: 'Cluster ID. The value of kube-system namespace _namespace.metadata.uid_.' + metrics: + - name: k8s.cgroup.net_net + description: Bandwidth + unit: "kilobits/s" + chart_type: area + dimensions: + - name: received + - name: sent + - name: k8s.cgroup.net_packets + description: Packets + unit: "pps" + chart_type: line + dimensions: + - name: received + - name: sent + - name: multicast + - name: k8s.cgroup.net_errors + description: Interface Errors + unit: "errors/s" + chart_type: line + dimensions: + - name: inbound + - name: outbound + - name: k8s.cgroup.net_drops + description: Interface Drops + unit: "errors/s" + chart_type: line + dimensions: + - name: inbound + - name: outbound + - name: k8s.cgroup.net_fifo + description: Interface FIFO Buffer Errors + unit: "errors/s" + chart_type: line + dimensions: + - name: receive + - name: transmit + - name: k8s.cgroup.net_compressed + description: Interface Compressed Packets + unit: "pps" + chart_type: line + dimensions: + - name: receive + - name: sent + - name: k8s.cgroup.net_events + description: Network Interface Events + unit: "events/s" + chart_type: line + dimensions: + - name: frames + - name: collisions + - name: carrier + - name: k8s.cgroup.net_operstate + description: Interface Operational State + unit: "state" + chart_type: line + dimensions: + - name: up + - name: down + - name: notpresent + - name: lowerlayerdown + - name: testing + - name: dormant + - name: unknown + - name: k8s.cgroup.net_carrier + description: Interface Physical Link State + unit: "state" + chart_type: line + dimensions: + - name: up + - name: down + - name: k8s.cgroup.net_mtu + description: Interface MTU + unit: "octets" + chart_type: line + dimensions: + - name: mtu + - <<: *module + meta: + <<: *meta + monitored_instance: + name: Systemd Services + link: "" + icon_filename: systemd.svg + categories: + - data-collection.systemd + keywords: + - systemd + - services + overview: + <<: *overview + data_collection: + <<: *data_collection + metrics_description: "Monitor Systemd Services for performance, resource usage, and health status."
+ alerts: [] + metrics: + folding: + title: Metrics + enabled: false + description: "" + availability: [] + scopes: + - name: systemd service + description: "" + labels: + - name: service_name + description: Service name + metrics: + - name: systemd.service.cpu.utilization + description: Systemd Services CPU utilization (100% = 1 core) + unit: percentage + chart_type: stacked + dimensions: + - name: user + - name: system + - name: systemd.service.memory.usage + description: Systemd Services Used Memory + unit: MiB + chart_type: stacked + dimensions: + - name: ram + - name: swap + - name: systemd.service.memory.failcnt + description: Systemd Services Memory Limit Failures + unit: failures/s + chart_type: line + dimensions: + - name: fail + - name: systemd.service.memory.ram.usage + description: Systemd Services Memory + unit: MiB + chart_type: stacked + dimensions: + - name: rss + - name: cache + - name: mapped_file + - name: rss_huge + - name: systemd.service.memory.writeback + description: Systemd Services Writeback Memory + unit: MiB + chart_type: stacked + dimensions: + - name: writeback + - name: dirty + - name: systemd.service.memory.paging.faults + description: Systemd Services Memory Minor and Major Page Faults + unit: MiB/s + chart_type: area + dimensions: + - name: minor + - name: major + - name: systemd.service.memory.paging.io + description: Systemd Services Memory Paging IO + unit: MiB/s + chart_type: area + dimensions: + - name: in + - name: out + - name: systemd.service.disk.io + description: Systemd Services Disk Read/Write Bandwidth + unit: KiB/s + chart_type: area + dimensions: + - name: read + - name: write + - name: systemd.service.disk.iops + description: Systemd Services Disk Read/Write Operations + unit: operations/s + chart_type: line + dimensions: + - name: read + - name: write + - name: systemd.service.disk.throttle.io + description: Systemd Services Throttle Disk Read/Write Bandwidth + unit: KiB/s + chart_type: area + dimensions: + - name: read + - name: write + - name: systemd.service.disk.throttle.iops + description: Systemd Services Throttle Disk Read/Write Operations + unit: operations/s + chart_type: line + dimensions: + - name: read + - name: write + - name: systemd.service.disk.queued_iops + description: Systemd Services Queued Disk Read/Write Operations + unit: operations/s + chart_type: line + dimensions: + - name: read + - name: write + - name: systemd.service.disk.merged_iops + description: Systemd Services Merged Disk Read/Write Operations + unit: operations/s + chart_type: line + dimensions: + - name: read + - name: write + - name: systemd.service.pids.current + description: Systemd Services Number of Processes + unit: pids + chart_type: line + dimensions: + - name: pids + - <<: *module + meta: + <<: *meta + monitored_instance: + name: Virtual Machines + link: "" + icon_filename: container.svg + categories: + - data-collection.containers-and-vms + keywords: + - vms + - virtualization + - container + overview: + <<: *overview + data_collection: + <<: *data_collection + metrics_description: "Monitor Virtual Machines for performance, resource usage, and health status." + - <<: *module + meta: + <<: *meta + monitored_instance: + name: LXC Containers + link: "" + icon_filename: lxc.png + categories: + - data-collection.containers-and-vms + keywords: + - lxc + - lxd + - container + overview: + <<: *overview + data_collection: + <<: *data_collection + metrics_description: "Monitor LXC Containers for performance, resource usage, and health status." 
+ - <<: *module + meta: + <<: *meta + monitored_instance: + name: Libvirt Containers + link: "" + icon_filename: libvirt.png + categories: + - data-collection.containers-and-vms + keywords: + - libvirt + - container + overview: + <<: *overview + data_collection: + <<: *data_collection + metrics_description: "Monitor Libvirt for performance, resource usage, and health status." + - <<: *module + meta: + <<: *meta + monitored_instance: + name: oVirt Containers + link: "" + icon_filename: ovirt.svg + categories: + - data-collection.containers-and-vms + keywords: + - ovirt + - container + overview: + <<: *overview + data_collection: + <<: *data_collection + metrics_description: "Monitor oVirt for performance, resource usage, and health status." + - <<: *module + meta: + <<: *meta + monitored_instance: + name: Proxmox Containers + link: "" + icon_filename: proxmox.png + categories: + - data-collection.containers-and-vms + keywords: + - proxmox + - container + overview: + <<: *overview + data_collection: + <<: *data_collection + metrics_description: "Monitor Proxmox for performance, resource usage, and health status." diff --git a/collectors/cgroups.plugin/sys_fs_cgroup.c b/src/collectors/cgroups.plugin/sys_fs_cgroup.c index 705edf6f7..fd30b4acc 100644 --- a/collectors/cgroups.plugin/sys_fs_cgroup.c +++ b/src/collectors/cgroups.plugin/sys_fs_cgroup.c @@ -85,11 +85,6 @@ struct cgroups_systemd_config_setting cgroups_systemd_options[] = { { .name = NULL, .setting = SYSTEMD_CGROUP_ERR }, }; -// Shared memory with information from detected cgroups -netdata_ebpf_cgroup_shm_t shm_cgroup_ebpf = {NULL, NULL}; -int shm_fd_cgroup_ebpf = -1; -sem_t *shm_mutex_cgroup_ebpf = SEM_FAILED; - struct discovery_thread discovery_thread; @@ -255,7 +250,7 @@ void read_cgroup_plugin_configuration() { if(cgroup_update_every < localhost->rrd_update_every) cgroup_update_every = localhost->rrd_update_every; - cgroup_check_for_new_every = (int)config_get_number("plugin:cgroups", "check for new cgroups every", (long long)cgroup_check_for_new_every * (long long)cgroup_update_every); + cgroup_check_for_new_every = (int)config_get_number("plugin:cgroups", "check for new cgroups every", cgroup_check_for_new_every); if(cgroup_check_for_new_every < cgroup_update_every) cgroup_check_for_new_every = cgroup_update_every; @@ -418,6 +413,8 @@ void read_cgroup_plugin_configuration() { " !*/init.scope " // ignore init.scope " !/system.slice/run-*.scope " // ignore system.slice/run-XXXX.scope + " *user.slice/docker-*" // allow docker rootless containers + " !*user.slice*" // ignore the rest stuff in user.slice " *.scope " // we need all other *.scope for sure // ---------------------------------------------------------------- @@ -452,6 +449,7 @@ void read_cgroup_plugin_configuration() { " !/lxc.monitor* " " !/lxc.pivot " " !/lxc.payload " + " !*lxcfs.service/.control" " !/machine " " !/qemu " " !/system " @@ -474,7 +472,6 @@ void read_cgroup_plugin_configuration() { " !/system " " !/systemd " " !/user " - " !/user.slice " " !/lxc/*/* " // #2161 #2649 " !/lxc.monitor " " !/lxc.payload/*/* " @@ -526,63 +523,6 @@ void read_cgroup_plugin_configuration() { mountinfo_free_all(root); } -void netdata_cgroup_ebpf_set_values(size_t length) -{ - sem_wait(shm_mutex_cgroup_ebpf); - - shm_cgroup_ebpf.header->cgroup_max = cgroup_root_max; - shm_cgroup_ebpf.header->systemd_enabled = cgroup_enable_systemd_services | - cgroup_enable_systemd_services_detailed_memory | - cgroup_used_memory; - shm_cgroup_ebpf.header->body_length = length; - - 
sem_post(shm_mutex_cgroup_ebpf); -} - -void netdata_cgroup_ebpf_initialize_shm() -{ - shm_fd_cgroup_ebpf = shm_open(NETDATA_SHARED_MEMORY_EBPF_CGROUP_NAME, O_CREAT | O_RDWR, 0660); - if (shm_fd_cgroup_ebpf < 0) { - collector_error("Cannot initialize shared memory used by cgroup and eBPF, integration won't happen."); - return; - } - - size_t length = sizeof(netdata_ebpf_cgroup_shm_header_t) + cgroup_root_max * sizeof(netdata_ebpf_cgroup_shm_body_t); - if (ftruncate(shm_fd_cgroup_ebpf, length)) { - collector_error("Cannot set size for shared memory."); - goto end_init_shm; - } - - shm_cgroup_ebpf.header = (netdata_ebpf_cgroup_shm_header_t *) mmap(NULL, length, - PROT_READ | PROT_WRITE, MAP_SHARED, - shm_fd_cgroup_ebpf, 0); - - if (unlikely(MAP_FAILED == shm_cgroup_ebpf.header)) { - shm_cgroup_ebpf.header = NULL; - collector_error("Cannot map shared memory used between cgroup and eBPF, integration won't happen"); - goto end_init_shm; - } - shm_cgroup_ebpf.body = (netdata_ebpf_cgroup_shm_body_t *) ((char *)shm_cgroup_ebpf.header + - sizeof(netdata_ebpf_cgroup_shm_header_t)); - - shm_mutex_cgroup_ebpf = sem_open(NETDATA_NAMED_SEMAPHORE_EBPF_CGROUP_NAME, O_CREAT, - S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH, 1); - - if (shm_mutex_cgroup_ebpf != SEM_FAILED) { - netdata_cgroup_ebpf_set_values(length); - return; - } - - collector_error("Cannot create semaphore, integration between eBPF and cgroup won't happen"); - munmap(shm_cgroup_ebpf.header, length); - shm_cgroup_ebpf.header = NULL; - -end_init_shm: - close(shm_fd_cgroup_ebpf); - shm_fd_cgroup_ebpf = -1; - shm_unlink(NETDATA_SHARED_MEMORY_EBPF_CGROUP_NAME); -} - // --------------------------------------------------------------------------------------------- static unsigned long long calc_delta(unsigned long long curr, unsigned long long prev) { @@ -1280,12 +1220,12 @@ cpu_limits2_err: static inline int update_memory_limits(struct cgroup *cg) { char **filename = &cg->filename_memory_limit; - const RRDSETVAR_ACQUIRED **chart_var = &cg->chart_var_memory_limit; + const RRDVAR_ACQUIRED **chart_var = &cg->chart_var_memory_limit; unsigned long long *value = &cg->memory_limit; if(*filename) { if(unlikely(!*chart_var)) { - *chart_var = rrdsetvar_custom_chart_variable_add_and_acquire(cg->st_mem_usage, "memory_limit"); + *chart_var = rrdvar_chart_variable_add_and_acquire(cg->st_mem_usage, "memory_limit"); if(!*chart_var) { collector_error("Cannot create cgroup %s chart variable '%s'. Will not update its limit anymore.", cg->id, "memory_limit"); freez(*filename); @@ -1301,12 +1241,13 @@ static inline int update_memory_limits(struct cgroup *cg) { *filename = NULL; } else { - rrdsetvar_custom_chart_variable_set(cg->st_mem_usage, *chart_var, (NETDATA_DOUBLE)(*value) / (1024.0 * 1024.0)); + rrdvar_chart_variable_set( + cg->st_mem_usage, *chart_var, (NETDATA_DOUBLE)(*value) / (1024.0 * 1024.0)); return 1; } } else { - char buffer[30 + 1]; - int ret = read_file(*filename, buffer, 30); + char buffer[32]; + int ret = read_txt_file(*filename, buffer, sizeof(buffer)); if(ret) { collector_error("Cannot refresh cgroup %s memory limit by reading '%s'. 
Will not update its limit anymore.", cg->id, *filename); freez(*filename); @@ -1316,11 +1257,12 @@ static inline int update_memory_limits(struct cgroup *cg) { char *s = "max\n\0"; if(strcmp(s, buffer) == 0){ *value = UINT64_MAX; - rrdsetvar_custom_chart_variable_set(cg->st_mem_usage, *chart_var, (NETDATA_DOUBLE)(*value) / (1024.0 * 1024.0)); + rrdvar_chart_variable_set( + cg->st_mem_usage, *chart_var, (NETDATA_DOUBLE)(*value) / (1024.0 * 1024.0)); return 1; } *value = str2ull(buffer, NULL); - rrdsetvar_custom_chart_variable_set(cg->st_mem_usage, *chart_var, (NETDATA_DOUBLE)(*value) / (1024.0 * 1024.0)); + rrdvar_chart_variable_set(cg->st_mem_usage, *chart_var, (NETDATA_DOUBLE)(*value) / (1024.0 * 1024.0)); return 1; } } @@ -1398,7 +1340,7 @@ void update_cgroup_charts() { } if(unlikely(!cg->chart_var_cpu_limit)) { - cg->chart_var_cpu_limit = rrdsetvar_custom_chart_variable_add_and_acquire(cg->st_cpu, "cpu_limit"); + cg->chart_var_cpu_limit = rrdvar_chart_variable_add_and_acquire(cg->st_cpu, "cpu_limit"); if(!cg->chart_var_cpu_limit) { collector_error("Cannot create cgroup %s chart variable 'cpu_limit'. Will not update its limit anymore.", cg->id); if(cg->filename_cpuset_cpus) freez(cg->filename_cpuset_cpus); @@ -1428,7 +1370,7 @@ void update_cgroup_charts() { rrdset_is_obsolete___safe_from_collector_thread(cg->st_cpu_limit); cg->st_cpu_limit = NULL; } - rrdsetvar_custom_chart_variable_set(cg->st_cpu, cg->chart_var_cpu_limit, NAN); + rrdvar_chart_variable_set(cg->st_cpu, cg->chart_var_cpu_limit, NAN); } } } @@ -1592,19 +1534,6 @@ static void cgroup_main_cleanup(void *ptr) { } } - if (shm_mutex_cgroup_ebpf != SEM_FAILED) { - sem_close(shm_mutex_cgroup_ebpf); - } - - if (shm_cgroup_ebpf.header) { - shm_cgroup_ebpf.header->cgroup_root_count = 0; - munmap(shm_cgroup_ebpf.header, shm_cgroup_ebpf.header->body_length); - } - - if (shm_fd_cgroup_ebpf > 0) { - close(shm_fd_cgroup_ebpf); - } - static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; } @@ -1641,8 +1570,6 @@ void *cgroups_main(void *ptr) { cgroup_read_host_total_ram(); - netdata_cgroup_ebpf_initialize_shm(); - if (uv_mutex_init(&cgroup_root_mutex)) { collector_error("CGROUP: cannot initialize mutex for the main cgroup list"); goto exit; @@ -1669,10 +1596,17 @@ void *cgroups_main(void *ptr) { // we register this only on localhost // for the other nodes, the origin server should register it - rrd_collector_started(); // this creates a collector that runs for as long as netdata runs cgroup_netdev_link_init(); - rrd_function_add(localhost, NULL, "containers-vms", 10, RRDFUNCTIONS_CGTOP_HELP, true, cgroup_function_cgroup_top, NULL); - rrd_function_add(localhost, NULL, "systemd-services", 10, RRDFUNCTIONS_CGTOP_HELP, true, cgroup_function_systemd_top, NULL); + + rrd_function_add_inline(localhost, NULL, "containers-vms", 10, + RRDFUNCTIONS_PRIORITY_DEFAULT / 2, RRDFUNCTIONS_CGTOP_HELP, + "top", HTTP_ACCESS_ANONYMOUS_DATA, + cgroup_function_cgroup_top); + + rrd_function_add_inline(localhost, NULL, "systemd-services", 10, + RRDFUNCTIONS_PRIORITY_DEFAULT / 3, RRDFUNCTIONS_SYSTEMD_SERVICES_HELP, + "top", HTTP_ACCESS_ANONYMOUS_DATA, + cgroup_function_systemd_top); heartbeat_t hb; heartbeat_init(&hb); diff --git a/collectors/cgroups.plugin/sys_fs_cgroup.h b/src/collectors/cgroups.plugin/sys_fs_cgroup.h index e8cfcf5f6..e8cfcf5f6 100644 --- a/collectors/cgroups.plugin/sys_fs_cgroup.h +++ b/src/collectors/cgroups.plugin/sys_fs_cgroup.h diff --git a/collectors/cgroups.plugin/tests/test_cgroups_plugin.c 
b/src/collectors/cgroups.plugin/tests/test_cgroups_plugin.c index bb1fb3988..bb1fb3988 100644 --- a/collectors/cgroups.plugin/tests/test_cgroups_plugin.c +++ b/src/collectors/cgroups.plugin/tests/test_cgroups_plugin.c diff --git a/collectors/cgroups.plugin/tests/test_cgroups_plugin.h b/src/collectors/cgroups.plugin/tests/test_cgroups_plugin.h index 3d68e9230..3d68e9230 100644 --- a/collectors/cgroups.plugin/tests/test_cgroups_plugin.h +++ b/src/collectors/cgroups.plugin/tests/test_cgroups_plugin.h diff --git a/collectors/cgroups.plugin/tests/test_doubles.c b/src/collectors/cgroups.plugin/tests/test_doubles.c index b13d4b19c..53fefa9c2 100644 --- a/collectors/cgroups.plugin/tests/test_doubles.c +++ b/src/collectors/cgroups.plugin/tests/test_doubles.c @@ -101,7 +101,7 @@ collected_number rrddim_set_by_pointer(RRDSET *st, RRDDIM *rd, collected_number return 0; } -const RRDSETVAR_ACQUIRED *rrdsetvar_custom_chart_variable_add_and_acquire(RRDSET *st, const char *name) +const RRDVAR_ACQUIRED *rrdvar_chart_variable_add_and_acquire(RRDSET *st, const char *name) { UNUSED(st); UNUSED(name); @@ -109,7 +109,7 @@ const RRDSETVAR_ACQUIRED *rrdsetvar_custom_chart_variable_add_and_acquire(RRDSET return NULL; } -void rrdsetvar_custom_chart_variable_set(RRDSET *st, const RRDSETVAR_ACQUIRED *rsa, NETDATA_DOUBLE value) +void rrdvar_chart_variable_set(RRDSET *st, const RRDVAR_ACQUIRED *rsa, NETDATA_DOUBLE value) { UNUSED(st); UNUSED(rsa); |
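Editor's note on the `*_some_pressure` / `*_full_pressure` charts defined in the metadata above: their `some10`/`some60`/`some300` dimensions and `_stall_time` companions map onto the cgroup v2 PSI files (`cpu.pressure`, `memory.pressure`, `io.pressure`), whose `some` and `full` lines carry 10-, 60- and 300-second averages plus a cumulative stall total in microseconds. The following is a minimal standalone sketch of reading one such file; the hard-coded path is only an example, and this is illustrative parsing, not the plugin's actual PSI reader.

```c
#include <stdio.h>

int main(void) {
    /* hypothetical path: point it at any cgroup's cpu.pressure */
    FILE *fp = fopen("/sys/fs/cgroup/system.slice/cpu.pressure", "r");
    if (!fp) {
        perror("fopen");
        return 1;
    }

    char kind[8];                      /* "some" or "full" */
    double avg10, avg60, avg300;       /* the 10s/60s/300s averages */
    unsigned long long total;          /* cumulative stall time, microseconds */

    /* each line looks like: some avg10=0.00 avg60=0.00 avg300=0.00 total=12345 */
    while (fscanf(fp, "%7s avg10=%lf avg60=%lf avg300=%lf total=%llu",
                  kind, &avg10, &avg60, &avg300, &total) == 5) {
        printf("%s: 10s=%.2f%% 60s=%.2f%% 300s=%.2f%% stall=%llu us\n",
               kind, avg10, avg60, avg300, total);
    }

    fclose(fp);
    return 0;
}
```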
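The `read_cgroup_plugin_configuration()` hunk above adds `*user.slice/docker-*` ahead of `!*user.slice*` in the default search patterns, which only works because the first pattern to match a path wins: rootless Docker cgroups are accepted before the rest of `user.slice` is rejected. The sketch below imitates that first-match-wins behavior using POSIX `fnmatch()` as a stand-in for Netdata's simple patterns (which have their own syntax), so treat it as an approximation for illustration only.

```c
#include <stdio.h>
#include <fnmatch.h>
#include <stdbool.h>

/* '!'-prefixed entries reject, others accept; the first match decides. */
static bool path_allowed(const char *path, const char *patterns[], int n) {
    for (int i = 0; i < n; i++) {
        const char *p = patterns[i];
        bool negative = (p[0] == '!');
        if (negative)
            p++;
        if (fnmatch(p, path, 0) == 0)
            return !negative;          /* first matching pattern wins */
    }
    return false;                      /* nothing matched: reject */
}

int main(void) {
    const char *patterns[] = {
        "*user.slice/docker-*",        /* allow docker rootless containers */
        "!*user.slice*",               /* ignore the rest of user.slice */
        "*.scope",                     /* keep all other scopes */
        "*",                           /* everything else */
    };
    const char *paths[] = {
        "/user.slice/user-1000.slice/docker-abc.scope",
        "/user.slice/user-1000.slice/session-3.scope",
        "/system.slice/nginx.service",
    };
    for (int i = 0; i < 3; i++)
        printf("%-50s -> %s\n", paths[i],
               path_allowed(paths[i], patterns, 4) ? "monitor" : "skip");
    return 0;
}
```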
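The `update_memory_limits()` hunk above switches the plugin from `read_file()` to `read_txt_file()` with a 32-byte buffer, while keeping the special case where cgroup v2 reports `max` (no limit), which the plugin maps to `UINT64_MAX` before publishing the value through the `memory_limit` chart variable. Below is a minimal standalone sketch of that logic under stated assumptions: `read_memory_limit` and the hard-coded path are hypothetical, and the Netdata-internal chart-variable calls are omitted.

```c
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <inttypes.h>

/* Read a cgroup v2 memory.max file; "max" means unlimited (UINT64_MAX). */
static int read_memory_limit(const char *path, uint64_t *value) {
    char buffer[32];
    FILE *fp = fopen(path, "r");
    if (!fp)
        return -1;                         /* mirrors "stop updating the limit" */
    if (!fgets(buffer, sizeof(buffer), fp)) {
        fclose(fp);
        return -1;
    }
    fclose(fp);

    buffer[strcspn(buffer, "\n")] = '\0';  /* strip the trailing newline first,
                                              instead of comparing against "max\n" */
    if (strcmp(buffer, "max") == 0)
        *value = UINT64_MAX;
    else
        *value = strtoull(buffer, NULL, 10);
    return 0;
}

int main(void) {
    uint64_t limit;
    /* hypothetical cgroup path; adjust to a real one on your system */
    if (read_memory_limit("/sys/fs/cgroup/system.slice/memory.max", &limit) != 0)
        return 1;
    if (limit == UINT64_MAX)
        printf("memory limit: unlimited\n");
    else
        printf("memory limit: %" PRIu64 " bytes (%.1f MiB)\n",
               limit, (double)limit / (1024.0 * 1024.0));
    return 0;
}
```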