diff options
Diffstat (limited to 'collectors/proc.plugin/metadata.yaml')
-rw-r--r-- | collectors/proc.plugin/metadata.yaml | 5299 |
1 files changed, 0 insertions, 5299 deletions
diff --git a/collectors/proc.plugin/metadata.yaml b/collectors/proc.plugin/metadata.yaml deleted file mode 100644 index 45351b36f..000000000 --- a/collectors/proc.plugin/metadata.yaml +++ /dev/null @@ -1,5299 +0,0 @@ -plugin_name: proc.plugin -modules: - - meta: - plugin_name: proc.plugin - module_name: /proc/stat - monitored_instance: - name: System statistics - link: "" - categories: - - data-collection.linux-systems.system-metrics - icon_filename: "linuxserver.svg" - related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - cpu utilization - - process counts - most_popular: false - overview: - data_collection: - metrics_description: | - CPU utilization, states and frequencies and key Linux system performance metrics. - - The `/proc/stat` file provides various types of system statistics: - - - The overall system CPU usage statistics - - Per CPU core statistics - - The total context switching of the system - - The total number of processes running - - The total CPU interrupts - - The total CPU softirqs - - The collector also reads: - - - `/proc/schedstat` for statistics about the process scheduler in the Linux kernel. - - `/sys/devices/system/cpu/[X]/thermal_throttle/core_throttle_count` to get the count of thermal throttling events for a specific CPU core on Linux systems. - - `/sys/devices/system/cpu/[X]/thermal_throttle/package_throttle_count` to get the count of thermal throttling events for a specific CPU package on a Linux system. - - `/sys/devices/system/cpu/[X]/cpufreq/scaling_cur_freq` to get the current operating frequency of a specific CPU core. - - `/sys/devices/system/cpu/[X]/cpufreq/stats/time_in_state` to get the amount of time the CPU has spent in each of its available frequency states. - - `/sys/devices/system/cpu/[X]/cpuidle/state[X]/name` to get the names of the idle states for each CPU core in a Linux system. 
- - `/sys/devices/system/cpu/[X]/cpuidle/state[X]/time` to get the total time each specific CPU core has spent in each idle state since the system was started. - method_description: "" - supported_platforms: - include: ["linux"] - exclude: [] - multi_instance: false - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: | - The collector auto-detects all metrics. No configuration is needed. - limits: - description: "" - performance_impact: - description: | - The collector disables cpu frequency and idle state monitoring when there are more than 128 CPU cores available. - setup: - prerequisites: - list: [] - configuration: - file: - section_name: "plugin:proc:/proc/stat" - name: "netdata.conf" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: - - name: 10min_cpu_usage - link: https://github.com/netdata/netdata/blob/master/health/health.d/cpu.conf - metric: system.cpu - info: average CPU utilization over the last 10 minutes (excluding iowait, nice and steal) - os: "linux" - - name: 10min_cpu_iowait - link: https://github.com/netdata/netdata/blob/master/health/health.d/cpu.conf - metric: system.cpu - info: average CPU iowait time over the last 10 minutes - os: "linux" - - name: 20min_steal_cpu - link: https://github.com/netdata/netdata/blob/master/health/health.d/cpu.conf - metric: system.cpu - info: average CPU steal time over the last 20 minutes - os: "linux" - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: global - description: "" - labels: [] - metrics: - - name: system.cpu - description: Total CPU utilization - unit: "percentage" - chart_type: stacked - dimensions: - - name: guest_nice - - name: guest - - name: steal - - name: softirq - - name: irq - - name: user - - name: system - - name: nice - - 
name: iowait - - name: idle - - name: system.intr - description: CPU Interrupts - unit: "interrupts/s" - chart_type: line - dimensions: - - name: interrupts - - name: system.ctxt - description: CPU Context Switches - unit: "context switches/s" - chart_type: line - dimensions: - - name: switches - - name: system.forks - description: Started Processes - unit: "processes/s" - chart_type: line - dimensions: - - name: started - - name: system.processes - description: System Processes - unit: "processes" - chart_type: line - dimensions: - - name: running - - name: blocked - - name: cpu.core_throttling - description: Core Thermal Throttling Events - unit: "events/s" - chart_type: line - dimensions: - - name: a dimension per cpu core - - name: cpu.package_throttling - description: Package Thermal Throttling Events - unit: "events/s" - chart_type: line - dimensions: - - name: a dimension per package - - name: cpu.cpufreq - description: Current CPU Frequency - unit: "MHz" - chart_type: line - dimensions: - - name: a dimension per cpu core - - name: cpu core - description: "" - labels: - - name: cpu - description: TBD - metrics: - - name: cpu.cpu - description: Core utilization - unit: "percentage" - chart_type: stacked - dimensions: - - name: guest_nice - - name: guest - - name: steal - - name: softirq - - name: irq - - name: user - - name: system - - name: nice - - name: iowait - - name: idle - - name: cpuidle.cpu_cstate_residency_time - description: C-state residency time - unit: "percentage" - chart_type: stacked - dimensions: - - name: a dimension per c-state - - meta: - plugin_name: proc.plugin - module_name: /proc/sys/kernel/random/entropy_avail - monitored_instance: - name: Entropy - link: "" - categories: - - data-collection.linux-systems.system-metrics - icon_filename: "syslog.png" - related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - entropy - most_popular: false - overview: - data_collection: - 
metrics_description: | - Entropy, a measure of the randomness or unpredictability of data. - - In the context of cryptography, entropy is used to generate random numbers or keys that are essential for - secure communication and encryption. Without a good source of entropy, cryptographic protocols can become - vulnerable to attacks that exploit the predictability of the generated keys. - - In most operating systems, entropy is generated by collecting random events from various sources, such as - hardware interrupts, mouse movements, keyboard presses, and disk activity. These events are fed into a pool - of entropy, which is then used to generate random numbers when needed. - - The `/dev/random` device in Linux is one such source of entropy, and it provides an interface for programs - to access the pool of entropy. When a program requests random numbers, it reads from the `/dev/random` device, - which blocks until enough entropy is available to generate the requested numbers. This ensures that the - generated numbers are truly random and not predictable. - - However, if the pool of entropy gets depleted, the `/dev/random` device may block indefinitely, causing - programs that rely on random numbers to slow down or even freeze. This is especially problematic for - cryptographic protocols that require a continuous stream of random numbers, such as SSL/TLS and SSH. - - To avoid this issue, some systems use a hardware random number generator (RNG) to generate high-quality - entropy. A hardware RNG generates random numbers by measuring physical phenomena, such as thermal noise or - radioactive decay. These sources of randomness are considered to be more reliable and unpredictable than - software-based sources. - - One such hardware RNG is the Trusted Platform Module (TPM), which is a dedicated hardware chip that is used - for cryptographic operations and secure boot. 
The TPM contains a built-in hardware RNG that generates - high-quality entropy, which can be used to seed the pool of entropy in the operating system. - - Alternatively, software-based solutions such as `Haveged` can be used to generate additional entropy by - exploiting sources of randomness in the system, such as CPU utilization and network traffic. These solutions - can help to mitigate the risk of entropy depletion, but they may not be as reliable as hardware-based solutions. - method_description: "" - supported_platforms: - include: ["linux"] - exclude: [] - multi_instance: false - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: - - name: lowest_entropy - link: https://github.com/netdata/netdata/blob/master/health/health.d/entropy.conf - metric: system.entropy - info: minimum number of bits of entropy available for the kernel’s random number generator - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: global - description: "" - labels: [] - metrics: - - name: system.entropy - description: Available Entropy - unit: "entropy" - chart_type: line - dimensions: - - name: entropy - - meta: - plugin_name: proc.plugin - module_name: /proc/uptime - monitored_instance: - name: System Uptime - link: "" - categories: - - data-collection.linux-systems.system-metrics - icon_filename: "linuxserver.svg" - related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - uptime - most_popular: false - overview: - data_collection: - metrics_description: | - The 
amount of time the system has been up (running). - - Uptime is a critical aspect of overall system performance: - - - **Availability**: Uptime monitoring can show whether a server is consistently available or experiences frequent downtimes. - - **Performance Monitoring**: While server uptime alone doesn't provide detailed performance data, analyzing the duration and frequency of downtimes can help identify patterns or trends. - - **Proactive problem detection**: If server uptime monitoring reveals unexpected downtimes or a decreasing uptime trend, it can serve as an early warning sign of potential problems. - - **Root cause analysis**: When investigating server downtime, the uptime metric alone may not provide enough information to pinpoint the exact cause. - - **Load balancing**: Uptime data can indirectly indicate load balancing issues if certain servers have significantly lower uptimes than others. - - **Optimize maintenance efforts**: Servers with consistently low uptimes or frequent downtimes may require more attention. - - **Compliance requirements**: Server uptime data can be used to demonstrate compliance with regulatory requirements or SLAs that mandate a minimum level of server availability. 
- method_description: "" - supported_platforms: - include: ["linux"] - exclude: [] - multi_instance: false - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: [] - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: global - description: "" - labels: [] - metrics: - - name: system.uptime - description: System Uptime - unit: "seconds" - chart_type: line - dimensions: - - name: uptime - - meta: - plugin_name: proc.plugin - module_name: /proc/vmstat - monitored_instance: - name: Memory Statistics - link: "" - categories: - - data-collection.linux-systems.memory-metrics - icon_filename: "linuxserver.svg" - related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - swap - - page faults - - oom - - numa - most_popular: false - overview: - data_collection: - metrics_description: | - Linux Virtual memory subsystem. - - Information about memory management, indicating how effectively the kernel allocates and frees - memory resources in response to system demands. - - Monitors page faults, which occur when a process requests a portion of its memory that isn't - immediately available. Monitoring these events can help diagnose inefficiencies in memory management and - provide insights into application behavior. - - Tracks swapping activity — a vital aspect of memory management where the kernel moves data from RAM to - swap space, and vice versa, based on memory demand and usage. 
It also monitors the utilization of zswap, - a compressed cache for swap pages, and provides insights into its usage and performance implications. - - In the context of virtualized environments, it tracks the ballooning mechanism which is used to balance - memory resources between host and guest systems. - - For systems using NUMA architecture, it provides insights into the local and remote memory accesses, which - can impact the performance based on the memory access times. - - The collector also watches for 'Out of Memory' kills, a drastic measure taken by the system when it runs out - of memory resources. - method_description: "" - supported_platforms: - include: ["linux"] - exclude: [] - multi_instance: false - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: - - name: 30min_ram_swapped_out - link: https://github.com/netdata/netdata/blob/master/health/health.d/swap.conf - metric: mem.swapio - info: percentage of the system RAM swapped in the last 30 minutes - os: "linux freebsd" - - name: oom_kill - link: https://github.com/netdata/netdata/blob/master/health/health.d/ram.conf - metric: mem.oom_kill - info: number of out of memory kills in the last 30 minutes - os: "linux" - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: global - description: "" - labels: [] - metrics: - - name: mem.swapio - description: Swap I/O - unit: "KiB/s" - chart_type: area - dimensions: - - name: in - - name: out - - name: system.pgpgio - description: Memory Paged from/to disk - unit: "KiB/s" - chart_type: area - dimensions: - - 
name: in - - name: out - - name: system.pgfaults - description: Memory Page Faults - unit: "faults/s" - chart_type: line - dimensions: - - name: minor - - name: major - - name: mem.balloon - description: Memory Ballooning Operations - unit: "KiB/s" - chart_type: line - dimensions: - - name: inflate - - name: deflate - - name: migrate - - name: mem.zswapio - description: ZSwap I/O - unit: "KiB/s" - chart_type: area - dimensions: - - name: in - - name: out - - name: mem.ksm_cow - description: KSM Copy On Write Operations - unit: "KiB/s" - chart_type: line - dimensions: - - name: swapin - - name: write - - name: mem.thp_faults - description: Transparent Huge Page Fault Allocations - unit: "events/s" - chart_type: line - dimensions: - - name: alloc - - name: fallback - - name: fallback_charge - - name: mem.thp_file - description: Transparent Huge Page File Allocations - unit: "events/s" - chart_type: line - dimensions: - - name: alloc - - name: fallback - - name: mapped - - name: fallback_charge - - name: mem.thp_zero - description: Transparent Huge Zero Page Allocations - unit: "events/s" - chart_type: line - dimensions: - - name: alloc - - name: failed - - name: mem.thp_collapse - description: Transparent Huge Pages Collapsed by khugepaged - unit: "events/s" - chart_type: line - dimensions: - - name: alloc - - name: failed - - name: mem.thp_split - description: Transparent Huge Page Splits - unit: "events/s" - chart_type: line - dimensions: - - name: split - - name: failed - - name: split_pmd - - name: split_deferred - - name: mem.thp_swapout - description: Transparent Huge Pages Swap Out - unit: "events/s" - chart_type: line - dimensions: - - name: swapout - - name: fallback - - name: mem.thp_compact - description: Transparent Huge Pages Compaction - unit: "events/s" - chart_type: line - dimensions: - - name: success - - name: fail - - name: stall - - name: mem.oom_kill - description: Out of Memory Kills - unit: "kills/s" - chart_type: line - dimensions: - - name: 
kills - - name: mem.numa - description: NUMA events - unit: "events/s" - chart_type: line - dimensions: - - name: local - - name: foreign - - name: interleave - - name: other - - name: pte_updates - - name: huge_pte_updates - - name: hint_faults - - name: hint_faults_local - - name: pages_migrated - - meta: - plugin_name: proc.plugin - module_name: /proc/interrupts - monitored_instance: - name: Interrupts - link: "" - categories: - - data-collection.linux-systems.cpu-metrics - icon_filename: "linuxserver.svg" - related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - interrupts - most_popular: false - overview: - data_collection: - metrics_description: | - Monitors `/proc/interrupts`, a file organized by CPU and then by the type of interrupt. - The numbers reported are the counts of the interrupts that have occurred of each type. - - An interrupt is a signal to the processor emitted by hardware or software indicating an event that needs - immediate attention. The processor then interrupts its current activities and executes the interrupt handler - to deal with the event. This is part of the way a computer multitasks and handles concurrent processing. - - The types of interrupts include: - - - **I/O interrupts**: These are caused by I/O devices like the keyboard, mouse, printer, etc. For example, when - you type something on the keyboard, an interrupt is triggered so the processor can handle the new input. - - - **Timer interrupts**: These are generated at regular intervals by the system's timer circuit. It's primarily - used to switch the CPU among different tasks. - - - **Software interrupts**: These are generated by a program requiring disk I/O operations, or other system resources. - - - **Hardware interrupts**: These are caused by hardware conditions such as power failure, overheating, etc. 
- - Monitoring `/proc/interrupts` can be used for: - - - **Performance tuning**: If an interrupt is happening very frequently, it could be a sign that a device is not - configured correctly, or there is a software bug causing unnecessary interrupts. This could lead to system - performance degradation. - - - **System troubleshooting**: If you're seeing a lot of unexpected interrupts, it could be a sign of a hardware problem. - - - **Understanding system behavior**: More generally, keeping an eye on what interrupts are occurring can help you - understand what your system is doing. It can provide insights into the system's interaction with hardware, - drivers, and other parts of the kernel. - method_description: "" - supported_platforms: - include: [] - exclude: [] - multi_instance: true - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: [] - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: global - description: "" - labels: [] - metrics: - - name: system.interrupts - description: System interrupts - unit: "interrupts/s" - chart_type: stacked - dimensions: - - name: a dimension per device - - name: cpu core - description: "" - labels: - - name: cpu - description: TBD - metrics: - - name: cpu.interrupts - description: CPU interrupts - unit: "interrupts/s" - chart_type: stacked - dimensions: - - name: a dimension per device - - meta: - plugin_name: proc.plugin - module_name: /proc/loadavg - monitored_instance: - name: System Load Average - link: "" - categories: - - data-collection.linux-systems.system-metrics - 
icon_filename: "linuxserver.svg" - related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - load - - load average - most_popular: false - overview: - data_collection: - metrics_description: | - The `/proc/loadavg` file provides information about the system load average. - - The load average is a measure of the amount of computational work that a system performs. It is a - representation of the average system load over a period of time. - - This file contains three numbers representing the system load averages for the last 1, 5, and 15 minutes, - respectively. It also includes the currently running processes and the total number of processes. - - Monitoring the load average can be used for: - - - **System performance**: If the load average is too high, it may indicate that your system is overloaded. - On a system with a single CPU, if the load average is 1, it means the single CPU is fully utilized. If the - load averages are consistently higher than the number of CPUs/cores, it may indicate that your system is - overloaded and tasks are waiting for CPU time. - - - **Troubleshooting**: If the load average is unexpectedly high, it can be a sign of a problem. This could be - due to a runaway process, a software bug, or a hardware issue. - - - **Capacity planning**: By monitoring the load average over time, you can understand the trends in your - system's workload. This can help with capacity planning and scaling decisions. - - Remember that load average not only considers CPU usage, but also includes processes waiting for disk I/O. - Therefore, high load averages could be due to I/O contention as well as CPU contention. 
- method_description: "" - supported_platforms: - include: [] - exclude: [] - multi_instance: false - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: - - name: load_cpu_number - link: https://github.com/netdata/netdata/blob/master/health/health.d/load.conf - metric: system.load - info: number of active CPU cores in the system - os: "linux" - - name: load_average_15 - link: https://github.com/netdata/netdata/blob/master/health/health.d/load.conf - metric: system.load - info: system fifteen-minute load average - os: "linux" - - name: load_average_5 - link: https://github.com/netdata/netdata/blob/master/health/health.d/load.conf - metric: system.load - info: system five-minute load average - os: "linux" - - name: load_average_1 - link: https://github.com/netdata/netdata/blob/master/health/health.d/load.conf - metric: system.load - info: system one-minute load average - os: "linux" - - name: active_processes - link: https://github.com/netdata/netdata/blob/master/health/health.d/processes.conf - metric: system.active_processes - info: system process IDs (PID) space utilization - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: global - description: "" - labels: [] - metrics: - - name: system.load - description: System Load Average - unit: "load" - chart_type: line - dimensions: - - name: load1 - - name: load5 - - name: load15 - - name: system.active_processes - description: System Active Processes - unit: "processes" - chart_type: line - dimensions: - - name: active - - meta: - plugin_name: proc.plugin - 
module_name: /proc/pressure - monitored_instance: - name: Pressure Stall Information - link: "" - categories: - - data-collection.linux-systems.pressure-metrics - icon_filename: "linuxserver.svg" - related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - pressure - most_popular: false - overview: - data_collection: - metrics_description: | - Introduced in Linux kernel 4.20, `/proc/pressure` provides information about system pressure stall information - (PSI). PSI is a feature that allows the system to track the amount of time the system is stalled due to - resource contention, such as CPU, memory, or I/O. - - The collector monitors 4 separate files for CPU, memory, I/O, and IRQ: - - - **cpu**: Tracks the amount of time tasks are stalled due to CPU contention. - - **memory**: Tracks the amount of time tasks are stalled due to memory contention. - - **io**: Tracks the amount of time tasks are stalled due to I/O contention. - - **irq**: Tracks the amount of time tasks are stalled due to IRQ contention. - - Each of them provides metrics for stall time over the last 10 seconds, 1 minute, and 5 minutes. - - Monitoring the /proc/pressure files can provide important insights into system performance and capacity planning: - - - **Identifying resource contention**: If these metrics are consistently high, it indicates that tasks are - frequently being stalled due to lack of resources, which can significantly degrade system performance. - - - **Troubleshooting performance issues**: If a system is experiencing performance issues, these metrics can - help identify whether resource contention is the cause. - - - **Capacity planning**: By monitoring these metrics over time, you can understand trends in resource - utilization and make informed decisions about when to add more resources to your system.
- method_description: "" - supported_platforms: - include: [] - exclude: [] - multi_instance: false - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: [] - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: global - description: "" - labels: [] - metrics: - - name: system.cpu_some_pressure - description: CPU some pressure - unit: "percentage" - chart_type: line - dimensions: - - name: some10 - - name: some60 - - name: some300 - - name: system.cpu_some_pressure_stall_time - description: CPU some pressure stall time - unit: "ms" - chart_type: line - dimensions: - - name: time - - name: system.cpu_full_pressure - description: CPU full pressure - unit: "percentage" - chart_type: line - dimensions: - - name: some10 - - name: some60 - - name: some300 - - name: system.cpu_full_pressure_stall_time - description: CPU full pressure stall time - unit: "ms" - chart_type: line - dimensions: - - name: time - - name: system.memory_some_pressure - description: Memory some pressure - unit: "percentage" - chart_type: line - dimensions: - - name: some10 - - name: some60 - - name: some300 - - name: system.memory_some_pressure_stall_time - description: Memory some pressure stall time - unit: "ms" - chart_type: line - dimensions: - - name: time - - name: system.memory_full_pressure - description: Memory full pressure - unit: "percentage" - chart_type: line - dimensions: - - name: some10 - - name: some60 - - name: some300 - - name: system.memory_full_pressure_stall_time - description: Memory full pressure stall time - unit: "ms" - 
chart_type: line - dimensions: - - name: time - - name: system.io_some_pressure - description: I/O some pressure - unit: "percentage" - chart_type: line - dimensions: - - name: some10 - - name: some60 - - name: some300 - - name: system.io_some_pressure_stall_time - description: I/O some pressure stall time - unit: "ms" - chart_type: line - dimensions: - - name: time - - name: system.io_full_pressure - description: I/O full pressure - unit: "percentage" - chart_type: line - dimensions: - - name: some10 - - name: some60 - - name: some300 - - name: system.io_full_pressure_stall_time - description: I/O full pressure stall time - unit: "ms" - chart_type: line - dimensions: - - name: time - - meta: - plugin_name: proc.plugin - module_name: /proc/softirqs - monitored_instance: - name: SoftIRQ statistics - link: "" - categories: - - data-collection.linux-systems.cpu-metrics - icon_filename: "linuxserver.svg" - related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - softirqs - - interrupts - most_popular: false - overview: - data_collection: - metrics_description: | - In the Linux kernel, handling of hardware interrupts is split into two halves: the top half and the bottom half. - The top half is the routine that responds immediately to an interrupt, while the bottom half is deferred to be processed later. - - Softirqs are a mechanism in the Linux kernel used to handle the bottom halves of interrupts, which can be - deferred and processed later in a context where it's safe to enable interrupts. - - The actual work of handling the interrupt is offloaded to a softirq and executed later when the system - decides it's a good time to process them. This helps to keep the system responsive by not blocking the top - half for too long, which could lead to missed interrupts. - - Monitoring `/proc/softirqs` is useful for: - - - **Performance tuning**: A high rate of softirqs could indicate a performance issue.
For instance, a high - rate of network softirqs (`NET_RX` and `NET_TX`) could indicate a network performance issue. - - - **Troubleshooting**: If a system is behaving unexpectedly, checking the softirqs could provide clues about - what is going on. For example, a sudden increase in block device softirqs (BLOCK) might indicate a problem - with a disk. - - - **Understanding system behavior**: Knowing what types of softirqs are happening can help you understand what - your system is doing, particularly in terms of how it's interacting with hardware and how it's handling - interrupts. - method_description: "" - supported_platforms: - include: [] - exclude: [] - multi_instance: true - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: [] - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: global - description: "" - labels: [] - metrics: - - name: system.softirqs - description: System softirqs - unit: "softirqs/s" - chart_type: stacked - dimensions: - - name: a dimension per softirq - - name: cpu core - description: "" - labels: - - name: cpu - description: TBD - metrics: - - name: cpu.softirqs - description: CPU softirqs - unit: "softirqs/s" - chart_type: stacked - dimensions: - - name: a dimension per softirq - - meta: - plugin_name: proc.plugin - module_name: /proc/net/softnet_stat - monitored_instance: - name: Softnet Statistics - link: "" - categories: - - data-collection.linux-systems.network-metrics - icon_filename: "linuxserver.svg" - related_resources: - integrations: - list: [] - 
info_provided_to_referring_integrations: - description: "" - keywords: - - softnet - most_popular: false - overview: - data_collection: - metrics_description: | - `/proc/net/softnet_stat` provides statistics that relate to the handling of network packets by softirq. - - It provides information about: - - - Total number of processed packets (`processed`). - - Packets dropped because `net.core.netdev_max_backlog` was exceeded (`dropped`). - - Times ksoftirq ran out of quota (`squeezed`). - - Times net_rx_action was rescheduled. - - Number of times processed all lists before quota. - - Number of times did not process all lists due to quota. - - Number of times net_rx_action was rescheduled for GRO (Generic Receive Offload) cells. - - Number of times GRO cells were processed. - - Monitoring the /proc/net/softnet_stat file can be useful for: - - - **Network performance monitoring**: By tracking the total number of processed packets and how many packets - were dropped, you can gain insights into your system's network performance. - - - **Troubleshooting**: If you're experiencing network-related issues, this collector can provide valuable clues. - For instance, a high number of dropped packets may indicate a network problem. - - - **Capacity planning**: If your system is consistently processing near its maximum capacity of network - packets, it might be time to consider upgrading your network infrastructure. 
- method_description: "" - supported_platforms: - include: [] - exclude: [] - multi_instance: true - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: - - name: 1min_netdev_backlog_exceeded - link: https://github.com/netdata/netdata/blob/master/health/health.d/softnet.conf - metric: system.softnet_stat - info: average number of dropped packets in the last minute due to exceeded net.core.netdev_max_backlog - os: "linux" - - name: 1min_netdev_budget_ran_outs - link: https://github.com/netdata/netdata/blob/master/health/health.d/softnet.conf - metric: system.softnet_stat - info: - average number of times ksoftirq ran out of sysctl net.core.netdev_budget or net.core.netdev_budget_usecs with work remaining over the last - minute (this can be a cause for dropped packets) - os: "linux" - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: global - description: "" - labels: [] - metrics: - - name: system.softnet_stat - description: System softnet_stat - unit: "events/s" - chart_type: line - dimensions: - - name: processed - - name: dropped - - name: squeezed - - name: received_rps - - name: flow_limit_count - - name: cpu core - description: "" - labels: [] - metrics: - - name: cpu.softnet_stat - description: CPU softnet_stat - unit: "events/s" - chart_type: line - dimensions: - - name: processed - - name: dropped - - name: squeezed - - name: received_rps - - name: flow_limit_count - - meta: - plugin_name: proc.plugin - module_name: /proc/meminfo - monitored_instance: - name: Memory Usage - link: "" - categories: - - 
data-collection.linux-systems.memory-metrics - icon_filename: "linuxserver.svg" - related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - memory - - ram - - available - - committed - most_popular: false - overview: - data_collection: - metrics_description: | - `/proc/meminfo` provides detailed information about the system's current memory usage. It includes information - about different types of memory, RAM, Swap, ZSwap, HugePages, Transparent HugePages (THP), Kernel memory, - SLAB memory, memory mappings, and more. - - Monitoring /proc/meminfo can be useful for: - - - **Performance Tuning**: Understanding your system's memory usage can help you make decisions about system - tuning and optimization. For example, if your system is frequently low on free memory, it might benefit - from more RAM. - - - **Troubleshooting**: If your system is experiencing problems, `/proc/meminfo` can provide clues about - whether memory usage is a factor. For example, if your system is slow and cached swap is high, it could - mean that your system is swapping out a lot of memory to disk, which can degrade performance. - - - **Capacity Planning**: By monitoring memory usage over time, you can understand trends and make informed - decisions about future capacity needs. 
- method_description: "" - supported_platforms: - include: [] - exclude: [] - multi_instance: false - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: - - name: ram_in_use - link: https://github.com/netdata/netdata/blob/master/health/health.d/ram.conf - metric: system.ram - info: system memory utilization - os: "linux" - - name: ram_available - link: https://github.com/netdata/netdata/blob/master/health/health.d/ram.conf - metric: mem.available - info: percentage of estimated amount of RAM available for userspace processes, without causing swapping - os: "linux" - - name: used_swap - link: https://github.com/netdata/netdata/blob/master/health/health.d/swap.conf - metric: mem.swap - info: swap memory utilization - os: "linux freebsd" - - name: 1hour_memory_hw_corrupted - link: https://github.com/netdata/netdata/blob/master/health/health.d/memory.conf - metric: mem.hwcorrupt - info: amount of memory corrupted due to a hardware failure - os: "linux" - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: global - description: "" - labels: [] - metrics: - - name: system.ram - description: System RAM - unit: "MiB" - chart_type: stacked - dimensions: - - name: free - - name: used - - name: cached - - name: buffers - - name: mem.available - description: Available RAM for applications - unit: "MiB" - chart_type: area - dimensions: - - name: avail - - name: mem.swap - description: System Swap - unit: "MiB" - chart_type: stacked - dimensions: - - name: free - - name: used - - name: mem.swap_cached - description: Swap Memory 
Cached in RAM - unit: "MiB" - chart_type: stacked - dimensions: - - name: cached - - name: mem.zswap - description: Zswap Usage - unit: "MiB" - chart_type: stacked - dimensions: - - name: in-ram - - name: on-disk - - name: mem.hwcorrupt - description: Corrupted Memory detected by ECC - unit: "MiB" - chart_type: line - dimensions: - - name: HardwareCorrupted - - name: mem.committed - description: Committed (Allocated) Memory - unit: "MiB" - chart_type: area - dimensions: - - name: Committed_AS - - name: mem.writeback - description: Writeback Memory - unit: "MiB" - chart_type: line - dimensions: - - name: Dirty - - name: Writeback - - name: FuseWriteback - - name: NfsWriteback - - name: Bounce - - name: mem.kernel - description: Memory Used by Kernel - unit: "MiB" - chart_type: stacked - dimensions: - - name: Slab - - name: KernelStack - - name: PageTables - - name: VmallocUsed - - name: Percpu - - name: mem.slab - description: Reclaimable Kernel Memory - unit: "MiB" - chart_type: stacked - dimensions: - - name: reclaimable - - name: unreclaimable - - name: mem.hugepages - description: Dedicated HugePages Memory - unit: "MiB" - chart_type: stacked - dimensions: - - name: free - - name: used - - name: surplus - - name: reserved - - name: mem.thp - description: Transparent HugePages Memory - unit: "MiB" - chart_type: stacked - dimensions: - - name: anonymous - - name: shmem - - name: mem.thp_details - description: Details of Transparent HugePages Usage - unit: "MiB" - chart_type: line - dimensions: - - name: ShmemPmdMapped - - name: FileHugePages - - name: FilePmdMapped - - name: mem.reclaiming - description: Memory Reclaiming - unit: "MiB" - chart_type: line - dimensions: - - name: Active - - name: Inactive - - name: Active(anon) - - name: Inactive(anon) - - name: Active(file) - - name: Inactive(file) - - name: Unevictable - - name: Mlocked - - name: mem.high_low - description: High and Low Used and Free Memory Areas - unit: "MiB" - chart_type: stacked - dimensions: - - 
name: high_used - - name: low_used - - name: high_free - - name: low_free - - name: mem.cma - description: Contiguous Memory Allocator (CMA) Memory - unit: "MiB" - chart_type: stacked - dimensions: - - name: used - - name: free - - name: mem.directmaps - description: Direct Memory Mappings - unit: "MiB" - chart_type: stacked - dimensions: - - name: 4k - - name: 2m - - name: 4m - - name: 1g - - meta: - plugin_name: proc.plugin - module_name: /proc/pagetypeinfo - monitored_instance: - name: Page types - link: "" - categories: - - data-collection.linux-systems.memory-metrics - icon_filename: "microchip.svg" - related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - memory page types - most_popular: false - overview: - data_collection: - metrics_description: "This integration provides metrics about the system's memory page types" - method_description: "" - supported_platforms: - include: [] - exclude: [] - multi_instance: false - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: [] - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: global - description: "" - labels: [] - metrics: - - name: mem.pagetype_global - description: System orders available - unit: "B" - chart_type: stacked - dimensions: - - name: a dimension per pagesize - - name: node, zone, type - description: "" - labels: - - name: node_id - description: TBD - - name: node_zone - description: TBD - - name: node_type - description: TBD - metrics: - - name: mem.pagetype - description: 
pagetype_Node{node}_{zone}_{type} - unit: "B" - chart_type: stacked - dimensions: - - name: a dimension per pagesize - - meta: - plugin_name: proc.plugin - module_name: /sys/devices/system/edac/mc - monitored_instance: - name: Memory modules (DIMMs) - link: "" - categories: - - data-collection.linux-systems.memory-metrics - icon_filename: "microchip.svg" - related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - edac - - ecc - - dimm - - ram - - hardware - most_popular: false - overview: - data_collection: - metrics_description: | - The Error Detection and Correction (EDAC) subsystem detects and reports errors in the system's memory, - primarily ECC (Error-Correcting Code) memory errors. - - The collector provides data for: - - - Per memory controller (MC): correctable and uncorrectable errors. These can be of 2 kinds: - - errors related to a DIMM - - errors that cannot be associated with a DIMM - - - Per memory DIMM: correctable and uncorrectable errors. There are 2 kinds: - - memory controllers that can identify the physical DIMMs and report errors directly for them, - - memory controllers that report errors for memory address ranges that can be linked to DIMMs. - In this case more DIMMs may be reported than are physically installed. 
- method_description: "" - supported_platforms: - include: [] - exclude: [] - multi_instance: true - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: - - name: ecc_memory_mc_noinfo_correctable - metric: mem.edac_mc - info: memory controller ${label:controller} ECC correctable errors (unknown DIMM slot) in the last 10 minutes - link: https://github.com/netdata/netdata/blob/master/health/health.d/memory.conf - - name: ecc_memory_mc_noinfo_uncorrectable - metric: mem.edac_mc - info: memory controller ${label:controller} ECC uncorrectable errors (unknown DIMM slot) in the last 10 minutes - link: https://github.com/netdata/netdata/blob/master/health/health.d/memory.conf - - name: ecc_memory_dimm_correctable - metric: mem.edac_mc_dimm - info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC correctable errors in the last 10 minutes - link: https://github.com/netdata/netdata/blob/master/health/health.d/memory.conf - - name: ecc_memory_dimm_uncorrectable - metric: mem.edac_mc_dimm - info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC uncorrectable errors in the last 10 minutes - link: https://github.com/netdata/netdata/blob/master/health/health.d/memory.conf - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: memory controller - description: These metrics refer to the memory controller. 
- labels: - - name: controller - description: "[mcX](https://www.kernel.org/doc/html/v5.0/admin-guide/ras.html#mcx-directories) directory name of this memory controller." - - name: mc_name - description: Memory controller type. - - name: size_mb - description: The amount of memory in megabytes that this memory controller manages. - - name: max_location - description: Last available memory slot in this memory controller. - metrics: - - name: mem.edac_mc - description: Memory Controller (MC) Error Detection And Correction (EDAC) Errors - unit: errors/s - chart_type: line - dimensions: - - name: correctable - - name: uncorrectable - - name: correctable_noinfo - - name: uncorrectable_noinfo - - name: memory module - description: These metrics refer to the memory module (or rank, [depends on the memory controller](https://www.kernel.org/doc/html/v5.0/admin-guide/ras.html#f5)). - labels: - - name: controller - description: "[mcX](https://www.kernel.org/doc/html/v5.0/admin-guide/ras.html#mcx-directories) directory name of this memory controller." - - name: dimm - description: "[dimmX or rankX](https://www.kernel.org/doc/html/v5.0/admin-guide/ras.html#dimmx-or-rankx-directories) directory name of this memory module." - - name: dimm_dev_type - description: Type of DRAM device used in this memory module. For example, x1, x2, x4, x8. - - name: dimm_edac_mode - description: Used type of error detection and correction. For example, S4ECD4ED would mean a Chipkill with x4 DRAM. - - name: dimm_label - description: Label assigned to this memory module. - - name: dimm_location - description: Location of the memory module. - - name: dimm_mem_type - description: Type of the memory module. - - name: size - description: The amount of memory in megabytes that this memory module manages. 
- metrics: - - name: mem.edac_mc_dimm - description: DIMM Error Detection And Correction (EDAC) Errors - unit: errors/s - chart_type: line - dimensions: - - name: correctable - - name: uncorrectable - - meta: - plugin_name: proc.plugin - module_name: /sys/devices/system/node - monitored_instance: - name: Non-Uniform Memory Access - link: "" - categories: - - data-collection.linux-systems.memory-metrics - icon_filename: "linuxserver.svg" - related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - numa - most_popular: false - overview: - data_collection: - metrics_description: | - Information about NUMA (Non-Uniform Memory Access) nodes on the system. - - NUMA is a method of configuring a cluster of microprocessors in a multiprocessing system so that they can - share memory locally, improving performance and the ability of the system to be expanded. NUMA is used in a - symmetric multiprocessing (SMP) system. - - In a NUMA system, processors, memory, and I/O devices are grouped together into cells, also known as nodes. - Each node has its own memory and set of I/O devices, and one or more processors. While a processor can access - memory in any of the nodes, it does so faster when accessing memory within its own node. - - The collector provides statistics on memory allocations for processes running on the NUMA nodes, revealing the - efficiency of memory allocations in multi-node systems. 
- method_description: "" - supported_platforms: - include: [] - exclude: [] - multi_instance: true - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: [] - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: numa node - description: "" - labels: - - name: numa_node - description: TBD - metrics: - - name: mem.numa_nodes - description: NUMA events - unit: "events/s" - chart_type: line - dimensions: - - name: hit - - name: miss - - name: local - - name: foreign - - name: interleave - - name: other - - meta: - plugin_name: proc.plugin - module_name: /sys/kernel/mm/ksm - monitored_instance: - name: Kernel Same-Page Merging - link: "" - categories: - - data-collection.linux-systems.memory-metrics - icon_filename: "microchip.svg" - related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - ksm - - samepage - - merging - most_popular: false - overview: - data_collection: - metrics_description: | - Kernel Samepage Merging (KSM) is a memory-saving feature in Linux that enables the kernel to examine the - memory of different processes and identify identical pages. It then merges these identical pages into a - single page that the processes share. This is particularly useful for virtualization, where multiple virtual - machines might be running the same operating system or applications and have many identical pages. - - The collector provides information about the operation and effectiveness of KSM on your system. 
- method_description: "" - supported_platforms: - include: [] - exclude: [] - multi_instance: false - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: [] - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: global - description: "" - labels: [] - metrics: - - name: mem.ksm - description: Kernel Same Page Merging - unit: "MiB" - chart_type: stacked - dimensions: - - name: shared - - name: unshared - - name: sharing - - name: volatile - - name: mem.ksm_savings - description: Kernel Same Page Merging Savings - unit: "MiB" - chart_type: area - dimensions: - - name: savings - - name: offered - - name: mem.ksm_ratios - description: Kernel Same Page Merging Effectiveness - unit: "percentage" - chart_type: line - dimensions: - - name: savings - - meta: - plugin_name: proc.plugin - module_name: /sys/block/zram - monitored_instance: - name: ZRAM - link: "" - categories: - - data-collection.linux-systems.memory-metrics - icon_filename: "microchip.svg" - related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - zram - most_popular: false - overview: - data_collection: - metrics_description: | - zRAM, or compressed RAM, is a block device that uses a portion of your system's RAM as a block device. - The data written to this block device is compressed and stored in memory. - - The collector provides information about the operation and the effectiveness of zRAM on your system. 
- method_description: "" - supported_platforms: - include: [] - exclude: [] - multi_instance: true - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: [] - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: zram device - description: "" - labels: - - name: device - description: TBD - metrics: - - name: mem.zram_usage - description: ZRAM Memory Usage - unit: "MiB" - chart_type: area - dimensions: - - name: compressed - - name: metadata - - name: mem.zram_savings - description: ZRAM Memory Savings - unit: "MiB" - chart_type: area - dimensions: - - name: savings - - name: original - - name: mem.zram_ratio - description: ZRAM Compression Ratio (original to compressed) - unit: "ratio" - chart_type: line - dimensions: - - name: ratio - - name: mem.zram_efficiency - description: ZRAM Efficiency - unit: "percentage" - chart_type: line - dimensions: - - name: percent - - meta: - plugin_name: proc.plugin - module_name: ipc - monitored_instance: - name: Inter Process Communication - link: "" - categories: - - data-collection.linux-systems.ipc-metrics - icon_filename: "network-wired.svg" - related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - ipc - - semaphores - - shared memory - most_popular: false - overview: - data_collection: - metrics_description: | - IPC stands for Inter-Process Communication. It is a mechanism which allows processes to communicate with each - other and synchronize their actions. 
- - This collector exposes information about: - - - Message Queues: This allows messages to be exchanged between processes. It's a more flexible method that - allows messages to be placed onto a queue and read at a later time. - - - Shared Memory: This method allows for the fastest form of IPC because processes can exchange data by - reading/writing into shared memory segments. - - - Semaphores: They are used to synchronize the operations performed by independent processes. So, if multiple - processes are trying to access a single shared resource, semaphores can ensure that only one process - accesses the resource at a given time. - method_description: "" - supported_platforms: - include: [] - exclude: [] - multi_instance: false - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: - - name: semaphores_used - link: https://github.com/netdata/netdata/blob/master/health/health.d/ipc.conf - metric: system.ipc_semaphores - info: IPC semaphore utilization - os: "linux" - - name: semaphore_arrays_used - link: https://github.com/netdata/netdata/blob/master/health/health.d/ipc.conf - metric: system.ipc_semaphore_arrays - info: IPC semaphore arrays utilization - os: "linux" - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: global - description: "" - labels: [] - metrics: - - name: system.ipc_semaphores - description: IPC Semaphores - unit: "semaphores" - chart_type: area - dimensions: - - name: semaphores - - name: system.ipc_semaphore_arrays - description: IPC Semaphore Arrays - unit: "arrays" - chart_type: area - 
dimensions: - - name: arrays - - name: system.message_queue_message - description: IPC Message Queue Number of Messages - unit: "messages" - chart_type: stacked - dimensions: - - name: a dimension per queue - - name: system.message_queue_bytes - description: IPC Message Queue Used Bytes - unit: "bytes" - chart_type: stacked - dimensions: - - name: a dimension per queue - - name: system.shared_memory_segments - description: IPC Shared Memory Number of Segments - unit: "segments" - chart_type: stacked - dimensions: - - name: segments - - name: system.shared_memory_bytes - description: IPC Shared Memory Used Bytes - unit: "bytes" - chart_type: stacked - dimensions: - - name: bytes - - meta: - plugin_name: proc.plugin - module_name: /proc/diskstats - monitored_instance: - name: Disk Statistics - link: "" - categories: - - data-collection.linux-systems.disk-metrics - icon_filename: "hard-drive.svg" - related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - disk - - disks - - io - - bcache - - block devices - most_popular: false - overview: - data_collection: - metrics_description: | - Detailed statistics for each of your system's disk devices and partitions. - The data is reported by the kernel and can be used to monitor disk activity on a Linux system. - - Get valuable insight into how your disks are performing and where potential bottlenecks might be. 
- method_description: "" - supported_platforms: - include: [] - exclude: [] - multi_instance: true - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: - - name: 10min_disk_backlog - link: https://github.com/netdata/netdata/blob/master/health/health.d/disks.conf - metric: disk.backlog - info: average backlog size of the ${label:device} disk over the last 10 minutes - os: "linux" - - name: 10min_disk_utilization - link: https://github.com/netdata/netdata/blob/master/health/health.d/disks.conf - metric: disk.util - info: average percentage of time ${label:device} disk was busy over the last 10 minutes - os: "linux freebsd" - - name: bcache_cache_dirty - link: https://github.com/netdata/netdata/blob/master/health/health.d/bcache.conf - metric: disk.bcache_cache_alloc - info: percentage of cache space used for dirty data and metadata (this usually means your SSD cache is too small) - - name: bcache_cache_errors - link: https://github.com/netdata/netdata/blob/master/health/health.d/bcache.conf - metric: disk.bcache_cache_read_races - info: - number of times data was read from the cache, the bucket was reused and invalidated in the last 10 minutes (when this occurs the data is - reread from the backing device) - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: global - description: "" - labels: [] - metrics: - - name: system.io - description: Disk I/O - unit: "KiB/s" - chart_type: area - dimensions: - - name: in - - name: out - - name: disk - description: "" - labels: - - name: device - description: TBD - - name: 
mount_point - description: TBD - - name: device_type - description: TBD - metrics: - - name: disk.io - description: Disk I/O Bandwidth - unit: "KiB/s" - chart_type: area - dimensions: - - name: reads - - name: writes - - name: disk_ext.io - description: Amount of Discarded Data - unit: "KiB/s" - chart_type: area - dimensions: - - name: discards - - name: disk.ops - description: Disk Completed I/O Operations - unit: "operations/s" - chart_type: line - dimensions: - - name: reads - - name: writes - - name: disk_ext.ops - description: Disk Completed Extended I/O Operations - unit: "operations/s" - chart_type: line - dimensions: - - name: discards - - name: flushes - - name: disk.qops - description: Disk Current I/O Operations - unit: "operations" - chart_type: line - dimensions: - - name: operations - - name: disk.backlog - description: Disk Backlog - unit: "milliseconds" - chart_type: area - dimensions: - - name: backlog - - name: disk.busy - description: Disk Busy Time - unit: "milliseconds" - chart_type: area - dimensions: - - name: busy - - name: disk.util - description: Disk Utilization Time - unit: "% of time working" - chart_type: area - dimensions: - - name: utilization - - name: disk.mops - description: Disk Merged Operations - unit: "merged operations/s" - chart_type: line - dimensions: - - name: reads - - name: writes - - name: disk_ext.mops - description: Disk Merged Discard Operations - unit: "merged operations/s" - chart_type: line - dimensions: - - name: discards - - name: disk.iotime - description: Disk Total I/O Time - unit: "milliseconds/s" - chart_type: line - dimensions: - - name: reads - - name: writes - - name: disk_ext.iotime - description: Disk Total I/O Time for Extended Operations - unit: "milliseconds/s" - chart_type: line - dimensions: - - name: discards - - name: flushes - - name: disk.await - description: Average Completed I/O Operation Time - unit: "milliseconds/operation" - chart_type: line - dimensions: - - name: reads - - name: writes 
- - name: disk_ext.await - description: Average Completed Extended I/O Operation Time - unit: "milliseconds/operation" - chart_type: line - dimensions: - - name: discards - - name: flushes - - name: disk.avgsz - description: Average Completed I/O Operation Bandwidth - unit: "KiB/operation" - chart_type: area - dimensions: - - name: reads - - name: writes - - name: disk_ext.avgsz - description: Average Amount of Discarded Data - unit: "KiB/operation" - chart_type: area - dimensions: - - name: discards - - name: disk.svctm - description: Average Service Time - unit: "milliseconds/operation" - chart_type: line - dimensions: - - name: svctm - - name: disk.bcache_cache_alloc - description: BCache Cache Allocations - unit: "percentage" - chart_type: stacked - dimensions: - - name: ununsed - - name: dirty - - name: clean - - name: metadata - - name: undefined - - name: disk.bcache_hit_ratio - description: BCache Cache Hit Ratio - unit: "percentage" - chart_type: line - dimensions: - - name: 5min - - name: 1hour - - name: 1day - - name: ever - - name: disk.bcache_rates - description: BCache Rates - unit: "KiB/s" - chart_type: area - dimensions: - - name: congested - - name: writeback - - name: disk.bcache_size - description: BCache Cache Sizes - unit: "MiB" - chart_type: area - dimensions: - - name: dirty - - name: disk.bcache_usage - description: BCache Cache Usage - unit: "percentage" - chart_type: area - dimensions: - - name: avail - - name: disk.bcache_cache_read_races - description: BCache Cache Read Races - unit: "operations/s" - chart_type: line - dimensions: - - name: races - - name: errors - - name: disk.bcache - description: BCache Cache I/O Operations - unit: "operations/s" - chart_type: line - dimensions: - - name: hits - - name: misses - - name: collisions - - name: readaheads - - name: disk.bcache_bypass - description: BCache Cache Bypass I/O Operations - unit: "operations/s" - chart_type: line - dimensions: - - name: hits - - name: misses - - meta: - 
plugin_name: proc.plugin - module_name: /proc/mdstat - monitored_instance: - name: MD RAID - link: "" - categories: - - data-collection.linux-systems.disk-metrics - icon_filename: "hard-drive.svg" - related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - raid - - mdadm - - mdstat - - raid - most_popular: false - overview: - data_collection: - metrics_description: "This integration monitors the status of MD RAID devices." - method_description: "" - supported_platforms: - include: [] - exclude: [] - multi_instance: true - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: - - name: mdstat_last_collected - link: https://github.com/netdata/netdata/blob/master/health/health.d/mdstat.conf - metric: md.disks - info: number of seconds since the last successful data collection - - name: mdstat_disks - link: https://github.com/netdata/netdata/blob/master/health/health.d/mdstat.conf - metric: md.disks - info: - number of devices in the down state for the ${label:device} ${label:raid_level} array. Any number > 0 indicates that the array is degraded. 
- - name: mdstat_mismatch_cnt - link: https://github.com/netdata/netdata/blob/master/health/health.d/mdstat.conf - metric: md.mismatch_cnt - info: number of unsynchronized blocks for the ${label:device} ${label:raid_level} array - - name: mdstat_nonredundant_last_collected - link: https://github.com/netdata/netdata/blob/master/health/health.d/mdstat.conf - metric: md.nonredundant - info: number of seconds since the last successful data collection - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: global - description: "" - labels: [] - metrics: - - name: md.health - description: Faulty Devices In MD - unit: "failed disks" - chart_type: line - dimensions: - - name: a dimension per md array - - name: md array - description: "" - labels: - - name: device - description: TBD - - name: raid_level - description: TBD - metrics: - - name: md.disks - description: Disks Stats - unit: "disks" - chart_type: stacked - dimensions: - - name: inuse - - name: down - - name: md.mismatch_cnt - description: Mismatch Count - unit: "unsynchronized blocks" - chart_type: line - dimensions: - - name: count - - name: md.status - description: Current Status - unit: "percent" - chart_type: line - dimensions: - - name: check - - name: resync - - name: recovery - - name: reshape - - name: md.expected_time_until_operation_finish - description: Approximate Time Until Finish - unit: "seconds" - chart_type: line - dimensions: - - name: finish_in - - name: md.operation_speed - description: Operation Speed - unit: "KiB/s" - chart_type: line - dimensions: - - name: speed - - name: md.nonredundant - description: Nonredundant Array Availability - unit: "boolean" - chart_type: line - dimensions: - - name: available - - meta: - plugin_name: proc.plugin - module_name: /proc/net/dev - monitored_instance: - name: Network interfaces - link: "" - categories: - - data-collection.linux-systems.network-metrics - icon_filename: "network-wired.svg" - 
related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - network interfaces - most_popular: false - overview: - data_collection: - metrics_description: "Monitor network interface metrics about bandwidth, state, errors and more." - method_description: "" - supported_platforms: - include: [] - exclude: [] - multi_instance: true - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: - - name: interface_speed - link: https://github.com/netdata/netdata/blob/master/health/health.d/net.conf - metric: net.net - info: network interface ${label:device} current speed - os: "*" - - name: 1m_received_traffic_overflow - link: https://github.com/netdata/netdata/blob/master/health/health.d/net.conf - metric: net.net - info: average inbound utilization for the network interface ${label:device} over the last minute - os: "linux" - - name: 1m_sent_traffic_overflow - link: https://github.com/netdata/netdata/blob/master/health/health.d/net.conf - metric: net.net - info: average outbound utilization for the network interface ${label:device} over the last minute - os: "linux" - - name: inbound_packets_dropped_ratio - link: https://github.com/netdata/netdata/blob/master/health/health.d/net.conf - metric: net.drops - info: ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes - os: "linux" - - name: outbound_packets_dropped_ratio - link: https://github.com/netdata/netdata/blob/master/health/health.d/net.conf - metric: net.drops - info: ratio of outbound dropped packets for the network 
interface ${label:device} over the last 10 minutes - os: "linux" - - name: wifi_inbound_packets_dropped_ratio - link: https://github.com/netdata/netdata/blob/master/health/health.d/net.conf - metric: net.drops - info: ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes - os: "linux" - - name: wifi_outbound_packets_dropped_ratio - link: https://github.com/netdata/netdata/blob/master/health/health.d/net.conf - metric: net.drops - info: ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes - os: "linux" - - name: 1m_received_packets_rate - link: https://github.com/netdata/netdata/blob/master/health/health.d/net.conf - metric: net.packets - info: average number of packets received by the network interface ${label:device} over the last minute - os: "linux freebsd" - - name: 10s_received_packets_storm - link: https://github.com/netdata/netdata/blob/master/health/health.d/net.conf - metric: net.packets - info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute - os: "linux freebsd" - - name: 10min_fifo_errors - link: https://github.com/netdata/netdata/blob/master/health/health.d/net.conf - metric: net.fifo - info: number of FIFO errors for the network interface ${label:device} in the last 10 minutes - os: "linux" - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: global - description: "" - labels: [] - metrics: - - name: system.net - description: Physical Network Interfaces Aggregated Bandwidth - unit: "kilobits/s" - chart_type: area - dimensions: - - name: received - - name: sent - - name: network device - description: "" - labels: - - name: interface_type - description: TBD - - name: device - description: TBD - metrics: - - name: net.net - description: Bandwidth - unit: "kilobits/s" - chart_type: area - dimensions: 
- - name: received - - name: sent - - name: net.speed - description: Interface Speed - unit: "kilobits/s" - chart_type: line - dimensions: - - name: speed - - name: net.duplex - description: Interface Duplex State - unit: "state" - chart_type: line - dimensions: - - name: full - - name: half - - name: unknown - - name: net.operstate - description: Interface Operational State - unit: "state" - chart_type: line - dimensions: - - name: up - - name: down - - name: notpresent - - name: lowerlayerdown - - name: testing - - name: dormant - - name: unknown - - name: net.carrier - description: Interface Physical Link State - unit: "state" - chart_type: line - dimensions: - - name: up - - name: down - - name: net.mtu - description: Interface MTU - unit: "octets" - chart_type: line - dimensions: - - name: mtu - - name: net.packets - description: Packets - unit: "packets/s" - chart_type: line - dimensions: - - name: received - - name: sent - - name: multicast - - name: net.errors - description: Interface Errors - unit: "errors/s" - chart_type: line - dimensions: - - name: inbound - - name: outbound - - name: net.drops - description: Interface Drops - unit: "drops/s" - chart_type: line - dimensions: - - name: inbound - - name: outbound - - name: net.fifo - description: Interface FIFO Buffer Errors - unit: "errors" - chart_type: line - dimensions: - - name: receive - - name: transmit - - name: net.compressed - description: Compressed Packets - unit: "packets/s" - chart_type: line - dimensions: - - name: received - - name: sent - - name: net.events - description: Network Interface Events - unit: "events/s" - chart_type: line - dimensions: - - name: frames - - name: collisions - - name: carrier - - meta: - plugin_name: proc.plugin - module_name: /proc/net/wireless - monitored_instance: - name: Wireless network interfaces - link: "" - categories: - - data-collection.linux-systems.network-metrics - icon_filename: "network-wired.svg" - related_resources: - integrations: - list: [] - 
info_provided_to_referring_integrations: - description: "" - keywords: - - wireless devices - most_popular: false - overview: - data_collection: - metrics_description: "Monitor wireless devices with metrics about status, link quality, signal level, noise level and more." - method_description: "" - supported_platforms: - include: [] - exclude: [] - multi_instance: true - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: [] - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: wireless device - description: "" - labels: [] - metrics: - - name: wireless.status - description: Internal status reported by interface. - unit: "status" - chart_type: line - dimensions: - - name: status - - name: wireless.link_quality - description: Overall quality of the link. This is an aggregate value, and depends on the driver and hardware. - unit: "value" - chart_type: line - dimensions: - - name: link_quality - - name: wireless.signal_level - description: - The signal level is the wireless signal power level received by the wireless client. The closer the value is to 0, the stronger the - signal. - unit: "dBm" - chart_type: line - dimensions: - - name: signal_level - - name: wireless.noise_level - description: - The noise level indicates the amount of background noise in your environment. The closer the value to 0, the greater the noise level. - unit: "dBm" - chart_type: line - dimensions: - - name: noise_level - - name: wireless.discarded_packets - description: Packet discarded in the wireless adapter due to wireless specific problems. 
- unit: "packets/s" - chart_type: line - dimensions: - - name: nwid - - name: crypt - - name: frag - - name: retry - - name: misc - - name: wireless.missed_beacons - description: Number of missed beacons. - unit: "frames/s" - chart_type: line - dimensions: - - name: missed_beacons - - meta: - plugin_name: proc.plugin - module_name: /sys/class/infiniband - monitored_instance: - name: InfiniBand - link: "" - categories: - - data-collection.linux-systems.network-metrics - icon_filename: "network-wired.svg" - related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - infiniband - - rdma - most_popular: false - overview: - data_collection: - metrics_description: "This integration monitors InfiniBand network interface statistics." - method_description: "" - supported_platforms: - include: [] - exclude: [] - multi_instance: true - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: [] - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: infiniband port - description: "" - labels: [] - metrics: - - name: ib.bytes - description: Bandwidth usage - unit: "kilobits/s" - chart_type: area - dimensions: - - name: Received - - name: Sent - - name: ib.packets - description: Packets Statistics - unit: "packets/s" - chart_type: area - dimensions: - - name: Received - - name: Sent - - name: Mcast_rcvd - - name: Mcast_sent - - name: Ucast_rcvd - - name: Ucast_sent - - name: ib.errors - description: Error Counters - unit: "errors/s" - chart_type: line - dimensions: - - name:
Pkts_malformated - - name: Pkts_rcvd_discarded - - name: Pkts_sent_discarded - - name: Tick_Wait_to_send - - name: Pkts_missed_resource - - name: Buffer_overrun - - name: Link_Downed - - name: Link_recovered - - name: Link_integrity_err - - name: Link_minor_errors - - name: Pkts_rcvd_with_EBP - - name: Pkts_rcvd_discarded_by_switch - - name: Pkts_sent_discarded_by_switch - - name: ib.hwerrors - description: Hardware Errors - unit: "errors/s" - chart_type: line - dimensions: - - name: Duplicated_packets - - name: Pkt_Seq_Num_gap - - name: Ack_timer_expired - - name: Drop_missing_buffer - - name: Drop_out_of_sequence - - name: NAK_sequence_rcvd - - name: CQE_err_Req - - name: CQE_err_Resp - - name: CQE_Flushed_err_Req - - name: CQE_Flushed_err_Resp - - name: Remote_access_err_Req - - name: Remote_access_err_Resp - - name: Remote_invalid_req - - name: Local_length_err_Resp - - name: RNR_NAK_Packets - - name: CNP_Pkts_ignored - - name: RoCE_ICRC_Errors - - name: ib.hwpackets - description: Hardware Packets Statistics - unit: "packets/s" - chart_type: line - dimensions: - - name: RoCEv2_Congestion_sent - - name: RoCEv2_Congestion_rcvd - - name: IB_Congestion_handled - - name: ATOMIC_req_rcvd - - name: Connection_req_rcvd - - name: Read_req_rcvd - - name: Write_req_rcvd - - name: RoCE_retrans_adaptive - - name: RoCE_retrans_timeout - - name: RoCE_slow_restart - - name: RoCE_slow_restart_congestion - - name: RoCE_slow_restart_count - - meta: - plugin_name: proc.plugin - module_name: /proc/net/netstat - monitored_instance: - name: Network statistics - link: "" - categories: - - data-collection.linux-systems.network-metrics - icon_filename: "network-wired.svg" - related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - ip - - udp - - udplite - - icmp - - netstat - - snmp - most_popular: false - overview: - data_collection: - metrics_description: "This integration provides metrics from the `netstat`, `snmp` and 
`snmp6` modules." - method_description: "" - supported_platforms: - include: [] - exclude: [] - multi_instance: true - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: - - name: 1m_tcp_syn_queue_drops - link: https://github.com/netdata/netdata/blob/master/health/health.d/tcp_listen.conf - metric: ip.tcp_syn_queue - info: average number of SYN requests was dropped due to the full TCP SYN queue over the last minute (SYN cookies were not enabled) - os: "linux" - - name: 1m_tcp_syn_queue_cookies - link: https://github.com/netdata/netdata/blob/master/health/health.d/tcp_listen.conf - metric: ip.tcp_syn_queue - info: average number of sent SYN cookies due to the full TCP SYN queue over the last minute - os: "linux" - - name: 1m_tcp_accept_queue_overflows - link: https://github.com/netdata/netdata/blob/master/health/health.d/tcp_listen.conf - metric: ip.tcp_accept_queue - info: average number of overflows in the TCP accept queue over the last minute - os: "linux" - - name: 1m_tcp_accept_queue_drops - link: https://github.com/netdata/netdata/blob/master/health/health.d/tcp_listen.conf - metric: ip.tcp_accept_queue - info: average number of dropped packets in the TCP accept queue over the last minute - os: "linux" - - name: tcp_connections - link: https://github.com/netdata/netdata/blob/master/health/health.d/tcp_conn.conf - metric: ip.tcpsock - info: TCP connections utilization - os: "linux" - - name: 1m_ip_tcp_resets_sent - link: https://github.com/netdata/netdata/blob/master/health/health.d/tcp_resets.conf - metric: ip.tcphandshake - info: average number of sent TCP RESETS over 
the last minute - os: "linux" - - name: 10s_ip_tcp_resets_sent - link: https://github.com/netdata/netdata/blob/master/health/health.d/tcp_resets.conf - metric: ip.tcphandshake - info: - average number of sent TCP RESETS over the last 10 seconds. This can indicate a port scan, or that a service running on this host has - crashed. Netdata will not send a clear notification for this alarm. - os: "linux" - - name: 1m_ip_tcp_resets_received - link: https://github.com/netdata/netdata/blob/master/health/health.d/tcp_resets.conf - metric: ip.tcphandshake - info: average number of received TCP RESETS over the last minute - os: "linux freebsd" - - name: 10s_ip_tcp_resets_received - link: https://github.com/netdata/netdata/blob/master/health/health.d/tcp_resets.conf - metric: ip.tcphandshake - info: - average number of received TCP RESETS over the last 10 seconds. This can be an indication that a service this host needs has crashed. - Netdata will not send a clear notification for this alarm. - os: "linux freebsd" - - name: 1m_ipv4_udp_receive_buffer_errors - link: https://github.com/netdata/netdata/blob/master/health/health.d/udp_errors.conf - metric: ipv4.udperrors - info: average number of UDP receive buffer errors over the last minute - os: "linux freebsd" - - name: 1m_ipv4_udp_send_buffer_errors - link: https://github.com/netdata/netdata/blob/master/health/health.d/udp_errors.conf - metric: ipv4.udperrors - info: average number of UDP send buffer errors over the last minute - os: "linux" - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: global - description: "" - labels: [] - metrics: - - name: system.ip - description: IPv4 Bandwidth - unit: "kilobits/s" - chart_type: area - dimensions: - - name: received - - name: sent - - name: ip.tcpmemorypressures - description: TCP Memory Pressures - unit: "events/s" - chart_type: line - dimensions: - - name: pressures - - name: ip.tcpconnaborts - description: TCP 
Connection Aborts - unit: "connections/s" - chart_type: line - dimensions: - - name: baddata - - name: userclosed - - name: nomemory - - name: timeout - - name: linger - - name: failed - - name: ip.tcpreorders - description: TCP Reordered Packets by Detection Method - unit: "packets/s" - chart_type: line - dimensions: - - name: timestamp - - name: sack - - name: fack - - name: reno - - name: ip.tcpofo - description: TCP Out-Of-Order Queue - unit: "packets/s" - chart_type: line - dimensions: - - name: inqueue - - name: dropped - - name: merged - - name: pruned - - name: ip.tcpsyncookies - description: TCP SYN Cookies - unit: "packets/s" - chart_type: line - dimensions: - - name: received - - name: sent - - name: failed - - name: ip.tcp_syn_queue - description: TCP SYN Queue Issues - unit: "packets/s" - chart_type: line - dimensions: - - name: drops - - name: cookies - - name: ip.tcp_accept_queue - description: TCP Accept Queue Issues - unit: "packets/s" - chart_type: line - dimensions: - - name: overflows - - name: drops - - name: ip.tcpsock - description: IPv4 TCP Connections - unit: "active connections" - chart_type: line - dimensions: - - name: connections - - name: ip.tcppackets - description: IPv4 TCP Packets - unit: "packets/s" - chart_type: line - dimensions: - - name: received - - name: sent - - name: ip.tcperrors - description: IPv4 TCP Errors - unit: "packets/s" - chart_type: line - dimensions: - - name: InErrs - - name: InCsumErrors - - name: RetransSegs - - name: ip.tcpopens - description: IPv4 TCP Opens - unit: "connections/s" - chart_type: line - dimensions: - - name: active - - name: passive - - name: ip.tcphandshake - description: IPv4 TCP Handshake Issues - unit: "events/s" - chart_type: line - dimensions: - - name: EstabResets - - name: OutRsts - - name: AttemptFails - - name: SynRetrans - - name: ipv4.packets - description: IPv4 Packets - unit: "packets/s" - chart_type: line - dimensions: - - name: received - - name: sent - - name: forwarded - - 
name: delivered - - name: ipv4.errors - description: IPv4 Errors - unit: "packets/s" - chart_type: line - dimensions: - - name: InDiscards - - name: OutDiscards - - name: InNoRoutes - - name: OutNoRoutes - - name: InHdrErrors - - name: InAddrErrors - - name: InTruncatedPkts - - name: InCsumErrors - - name: ipv4.bcast - description: IP Broadcast Bandwidth - unit: "kilobits/s" - chart_type: area - dimensions: - - name: received - - name: sent - - name: ipv4.bcastpkts - description: IP Broadcast Packets - unit: "packets/s" - chart_type: line - dimensions: - - name: received - - name: sent - - name: ipv4.mcast - description: IPv4 Multicast Bandwidth - unit: "kilobits/s" - chart_type: area - dimensions: - - name: received - - name: sent - - name: ipv4.mcastpkts - description: IP Multicast Packets - unit: "packets/s" - chart_type: line - dimensions: - - name: received - - name: sent - - name: ipv4.icmp - description: IPv4 ICMP Packets - unit: "packets/s" - chart_type: line - dimensions: - - name: received - - name: sent - - name: ipv4.icmpmsg - description: IPv4 ICMP Messages - unit: "packets/s" - chart_type: line - dimensions: - - name: InEchoReps - - name: OutEchoReps - - name: InDestUnreachs - - name: OutDestUnreachs - - name: InRedirects - - name: OutRedirects - - name: InEchos - - name: OutEchos - - name: InRouterAdvert - - name: OutRouterAdvert - - name: InRouterSelect - - name: OutRouterSelect - - name: InTimeExcds - - name: OutTimeExcds - - name: InParmProbs - - name: OutParmProbs - - name: InTimestamps - - name: OutTimestamps - - name: InTimestampReps - - name: OutTimestampReps - - name: ipv4.icmp_errors - description: IPv4 ICMP Errors - unit: "packets/s" - chart_type: line - dimensions: - - name: InErrors - - name: OutErrors - - name: InCsumErrors - - name: ipv4.udppackets - description: IPv4 UDP Packets - unit: "packets/s" - chart_type: line - dimensions: - - name: received - - name: sent - - name: ipv4.udperrors - description: IPv4 UDP Errors - unit:
"events/s" - chart_type: line - dimensions: - - name: RcvbufErrors - - name: SndbufErrors - - name: InErrors - - name: NoPorts - - name: InCsumErrors - - name: IgnoredMulti - - name: ipv4.udplite - description: IPv4 UDPLite Packets - unit: "packets/s" - chart_type: line - dimensions: - - name: received - - name: sent - - name: ipv4.udplite_errors - description: IPv4 UDPLite Errors - unit: "packets/s" - chart_type: line - dimensions: - - name: RcvbufErrors - - name: SndbufErrors - - name: InErrors - - name: NoPorts - - name: InCsumErrors - - name: IgnoredMulti - - name: ipv4.ecnpkts - description: IP ECN Statistics - unit: "packets/s" - chart_type: line - dimensions: - - name: CEP - - name: NoECTP - - name: ECTP0 - - name: ECTP1 - - name: ipv4.fragsin - description: IPv4 Fragments Reassembly - unit: "packets/s" - chart_type: line - dimensions: - - name: ok - - name: failed - - name: all - - name: ipv4.fragsout - description: IPv4 Fragments Sent - unit: "packets/s" - chart_type: line - dimensions: - - name: ok - - name: failed - - name: created - - name: system.ipv6 - description: IPv6 Bandwidth - unit: "kilobits/s" - chart_type: area - dimensions: - - name: received - - name: sent - - name: ipv6.packets - description: IPv6 Packets - unit: "packets/s" - chart_type: line - dimensions: - - name: received - - name: sent - - name: forwarded - - name: delivers - - name: ipv6.errors - description: IPv6 Errors - unit: "packets/s" - chart_type: line - dimensions: - - name: InDiscards - - name: OutDiscards - - name: InHdrErrors - - name: InAddrErrors - - name: InUnknownProtos - - name: InTooBigErrors - - name: InTruncatedPkts - - name: InNoRoutes - - name: OutNoRoutes - - name: ipv6.bcast - description: IPv6 Broadcast Bandwidth - unit: "kilobits/s" - chart_type: area - dimensions: - - name: received - - name: sent - - name: ipv6.mcast - description: IPv6 Multicast Bandwidth - unit: "kilobits/s" - chart_type: area - dimensions: - - name: received - - name: sent - - name: 
ipv6.mcastpkts - description: IPv6 Multicast Packets - unit: "packets/s" - chart_type: line - dimensions: - - name: received - - name: sent - - name: ipv6.udppackets - description: IPv6 UDP Packets - unit: "packets/s" - chart_type: line - dimensions: - - name: received - - name: sent - - name: ipv6.udperrors - description: IPv6 UDP Errors - unit: "events/s" - chart_type: line - dimensions: - - name: RcvbufErrors - - name: SndbufErrors - - name: InErrors - - name: NoPorts - - name: InCsumErrors - - name: IgnoredMulti - - name: ipv6.udplitepackets - description: IPv6 UDPlite Packets - unit: "packets/s" - chart_type: line - dimensions: - - name: received - - name: sent - - name: ipv6.udpliteerrors - description: IPv6 UDP Lite Errors - unit: "events/s" - chart_type: line - dimensions: - - name: RcvbufErrors - - name: SndbufErrors - - name: InErrors - - name: NoPorts - - name: InCsumErrors - - name: ipv6.icmp - description: IPv6 ICMP Messages - unit: "messages/s" - chart_type: line - dimensions: - - name: received - - name: sent - - name: ipv6.icmpredir - description: IPv6 ICMP Redirects - unit: "redirects/s" - chart_type: line - dimensions: - - name: received - - name: sent - - name: ipv6.icmperrors - description: IPv6 ICMP Errors - unit: "errors/s" - chart_type: line - dimensions: - - name: InErrors - - name: OutErrors - - name: InCsumErrors - - name: InDestUnreachs - - name: InPktTooBigs - - name: InTimeExcds - - name: InParmProblems - - name: OutDestUnreachs - - name: OutPktTooBigs - - name: OutTimeExcds - - name: OutParmProblems - - name: ipv6.icmpechos - description: IPv6 ICMP Echo - unit: "messages/s" - chart_type: line - dimensions: - - name: InEchos - - name: OutEchos - - name: InEchoReplies - - name: OutEchoReplies - - name: ipv6.groupmemb - description: IPv6 ICMP Group Membership - unit: "messages/s" - chart_type: line - dimensions: - - name: InQueries - - name: OutQueries - - name: InResponses - - name: OutResponses - - name: InReductions - - name: 
OutReductions - - name: ipv6.icmprouter - description: IPv6 Router Messages - unit: "messages/s" - chart_type: line - dimensions: - - name: InSolicits - - name: OutSolicits - - name: InAdvertisements - - name: OutAdvertisements - - name: ipv6.icmpneighbor - description: IPv6 Neighbor Messages - unit: "messages/s" - chart_type: line - dimensions: - - name: InSolicits - - name: OutSolicits - - name: InAdvertisements - - name: OutAdvertisements - - name: ipv6.icmpmldv2 - description: IPv6 ICMP MLDv2 Reports - unit: "reports/s" - chart_type: line - dimensions: - - name: received - - name: sent - - name: ipv6.icmptypes - description: IPv6 ICMP Types - unit: "messages/s" - chart_type: line - dimensions: - - name: InType1 - - name: InType128 - - name: InType129 - - name: InType136 - - name: OutType1 - - name: OutType128 - - name: OutType129 - - name: OutType133 - - name: OutType135 - - name: OutType143 - - name: ipv6.ect - description: IPv6 ECT Packets - unit: "packets/s" - chart_type: line - dimensions: - - name: InNoECTPkts - - name: InECT1Pkts - - name: InECT0Pkts - - name: InCEPkts - - name: ipv6.ect - description: IPv6 ECT Packets - unit: "packets/s" - chart_type: line - dimensions: - - name: InNoECTPkts - - name: InECT1Pkts - - name: InECT0Pkts - - name: InCEPkts - - name: ipv6.fragsin - description: IPv6 Fragments Reassembly - unit: "packets/s" - chart_type: line - dimensions: - - name: ok - - name: failed - - name: timeout - - name: all - - name: ipv6.fragsout - description: IPv6 Fragments Sent - unit: "packets/s" - chart_type: line - dimensions: - - name: ok - - name: failed - - name: all - - meta: - plugin_name: proc.plugin - module_name: /proc/net/sockstat - monitored_instance: - name: Socket statistics - link: "" - categories: - - data-collection.linux-systems.network-metrics - icon_filename: "network-wired.svg" - related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - sockets - most_popular: 
false - overview: - data_collection: - metrics_description: "This integration provides socket statistics." - method_description: "" - supported_platforms: - include: [] - exclude: [] - multi_instance: true - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: - - name: tcp_orphans - link: https://github.com/netdata/netdata/blob/master/health/health.d/tcp_orphans.conf - metric: ipv4.sockstat_tcp_sockets - info: orphan IPv4 TCP sockets utilization - os: "linux" - - name: tcp_memory - link: https://github.com/netdata/netdata/blob/master/health/health.d/tcp_mem.conf - metric: ipv4.sockstat_tcp_mem - info: TCP memory utilization - os: "linux" - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: global - description: "" - labels: [] - metrics: - - name: ip.sockstat_sockets - description: Sockets used for all address families - unit: "sockets" - chart_type: line - dimensions: - - name: used - - name: ipv4.sockstat_tcp_sockets - description: IPv4 TCP Sockets - unit: "sockets" - chart_type: line - dimensions: - - name: alloc - - name: orphan - - name: inuse - - name: timewait - - name: ipv4.sockstat_tcp_mem - description: IPv4 TCP Sockets Memory - unit: "KiB" - chart_type: area - dimensions: - - name: mem - - name: ipv4.sockstat_udp_sockets - description: IPv4 UDP Sockets - unit: "sockets" - chart_type: line - dimensions: - - name: inuse - - name: ipv4.sockstat_udp_mem - description: IPv4 UDP Sockets Memory - unit: "sockets" - chart_type: line - dimensions: - - name: mem - - name: ipv4.sockstat_udplite_sockets - description: 
IPv4 UDPLITE Sockets - unit: "sockets" - chart_type: line - dimensions: - - name: inuse - - name: ipv4.sockstat_raw_sockets - description: IPv4 RAW Sockets - unit: "sockets" - chart_type: line - dimensions: - - name: inuse - - name: ipv4.sockstat_frag_sockets - description: IPv4 FRAG Sockets - unit: "fragments" - chart_type: line - dimensions: - - name: inuse - - name: ipv4.sockstat_frag_mem - description: IPv4 FRAG Sockets Memory - unit: "KiB" - chart_type: area - dimensions: - - name: mem - - meta: - plugin_name: proc.plugin - module_name: /proc/net/sockstat6 - monitored_instance: - name: IPv6 Socket Statistics - link: "" - categories: - - data-collection.linux-systems.network-metrics - icon_filename: "network-wired.svg" - related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - ipv6 sockets - most_popular: false - overview: - data_collection: - metrics_description: "This integration provides IPv6 socket statistics." - method_description: "" - supported_platforms: - include: [] - exclude: [] - multi_instance: true - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: [] - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: global - description: "" - labels: [] - metrics: - - name: ipv6.sockstat6_tcp_sockets - description: IPv6 TCP Sockets - unit: "sockets" - chart_type: line - dimensions: - - name: inuse - - name: ipv6.sockstat6_udp_sockets - description: IPv6 UDP Sockets - unit: "sockets" - chart_type: line - dimensions: - - name: inuse - - name: 
ipv6.sockstat6_udplite_sockets - description: IPv6 UDPLITE Sockets - unit: "sockets" - chart_type: line - dimensions: - - name: inuse - - name: ipv6.sockstat6_raw_sockets - description: IPv6 RAW Sockets - unit: "sockets" - chart_type: line - dimensions: - - name: inuse - - name: ipv6.sockstat6_frag_sockets - description: IPv6 FRAG Sockets - unit: "fragments" - chart_type: line - dimensions: - - name: inuse - - meta: - plugin_name: proc.plugin - module_name: /proc/net/ip_vs_stats - monitored_instance: - name: IP Virtual Server - link: "" - categories: - - data-collection.linux-systems.network-metrics - icon_filename: "network-wired.svg" - related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - ip virtual server - most_popular: false - overview: - data_collection: - metrics_description: "This integration monitors IP Virtual Server statistics" - method_description: "" - supported_platforms: - include: [] - exclude: [] - multi_instance: true - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: [] - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: global - description: "" - labels: [] - metrics: - - name: ipvs.sockets - description: IPVS New Connections - unit: "connections/s" - chart_type: line - dimensions: - - name: connections - - name: ipvs.packets - description: IPVS Packets - unit: "packets/s" - chart_type: line - dimensions: - - name: received - - name: sent - - name: ipvs.net - description: IPVS Bandwidth - unit: "kilobits/s" - chart_type: 
area - dimensions: - - name: received - - name: sent - - meta: - plugin_name: proc.plugin - module_name: /proc/net/rpc/nfs - monitored_instance: - name: NFS Client - link: "" - categories: - - data-collection.linux-systems.filesystem-metrics.nfs - icon_filename: "nfs.png" - related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - nfs client - - filesystem - most_popular: false - overview: - data_collection: - metrics_description: "This integration provides statistics from the Linux kernel's NFS Client." - method_description: "" - supported_platforms: - include: [] - exclude: [] - multi_instance: true - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: [] - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: global - description: "" - labels: [] - metrics: - - name: nfs.net - description: NFS Client Network - unit: "operations/s" - chart_type: stacked - dimensions: - - name: udp - - name: tcp - - name: nfs.rpc - description: NFS Client Remote Procedure Calls Statistics - unit: "calls/s" - chart_type: line - dimensions: - - name: calls - - name: retransmits - - name: auth_refresh - - name: nfs.proc2 - description: NFS v2 Client Remote Procedure Calls - unit: "calls/s" - chart_type: stacked - dimensions: - - name: a dimension per proc2 call - - name: nfs.proc3 - description: NFS v3 Client Remote Procedure Calls - unit: "calls/s" - chart_type: stacked - dimensions: - - name: a dimension per proc3 call - - name: nfs.proc4 - description: NFS v4 Client 
Remote Procedure Calls - unit: "calls/s" - chart_type: stacked - dimensions: - - name: a dimension per proc4 call - - meta: - plugin_name: proc.plugin - module_name: /proc/net/rpc/nfsd - monitored_instance: - name: NFS Server - link: "" - categories: - - data-collection.linux-systems.filesystem-metrics.nfs - icon_filename: "nfs.png" - related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - nfs server - - filesystem - most_popular: false - overview: - data_collection: - metrics_description: "This integration provides statistics from the Linux kernel's NFS Server." - method_description: "" - supported_platforms: - include: [] - exclude: [] - multi_instance: true - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: [] - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: global - description: "" - labels: [] - metrics: - - name: nfsd.readcache - description: NFS Server Read Cache - unit: "reads/s" - chart_type: stacked - dimensions: - - name: hits - - name: misses - - name: nocache - - name: nfsd.filehandles - description: NFS Server File Handles - unit: "handles/s" - chart_type: line - dimensions: - - name: stale - - name: nfsd.io - description: NFS Server I/O - unit: "kilobytes/s" - chart_type: area - dimensions: - - name: read - - name: write - - name: nfsd.threads - description: NFS Server Threads - unit: "threads" - chart_type: line - dimensions: - - name: threads - - name: nfsd.net - description: NFS Server Network Statistics - unit: "packets/s" - 
chart_type: line - dimensions: - - name: udp - - name: tcp - - name: nfsd.rpc - description: NFS Server Remote Procedure Calls Statistics - unit: "calls/s" - chart_type: line - dimensions: - - name: calls - - name: bad_format - - name: bad_auth - - name: nfsd.proc2 - description: NFS v2 Server Remote Procedure Calls - unit: "calls/s" - chart_type: stacked - dimensions: - - name: a dimension per proc2 call - - name: nfsd.proc3 - description: NFS v3 Server Remote Procedure Calls - unit: "calls/s" - chart_type: stacked - dimensions: - - name: a dimension per proc3 call - - name: nfsd.proc4 - description: NFS v4 Server Remote Procedure Calls - unit: "calls/s" - chart_type: stacked - dimensions: - - name: a dimension per proc4 call - - name: nfsd.proc4ops - description: NFS v4 Server Operations - unit: "operations/s" - chart_type: stacked - dimensions: - - name: a dimension per proc4 operation - - meta: - plugin_name: proc.plugin - module_name: /proc/net/sctp/snmp - monitored_instance: - name: SCTP Statistics - link: "" - categories: - - data-collection.linux-systems.network-metrics - icon_filename: "network-wired.svg" - related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - sctp - - stream control transmission protocol - most_popular: false - overview: - data_collection: - metrics_description: "This integration provides statistics about the Stream Control Transmission Protocol (SCTP)." 
- method_description: "" - supported_platforms: - include: [] - exclude: [] - multi_instance: true - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: [] - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: global - description: "" - labels: [] - metrics: - - name: sctp.established - description: SCTP current total number of established associations - unit: "associations" - chart_type: line - dimensions: - - name: established - - name: sctp.transitions - description: SCTP Association Transitions - unit: "transitions/s" - chart_type: line - dimensions: - - name: active - - name: passive - - name: aborted - - name: shutdown - - name: sctp.packets - description: SCTP Packets - unit: "packets/s" - chart_type: line - dimensions: - - name: received - - name: sent - - name: sctp.packet_errors - description: SCTP Packet Errors - unit: "packets/s" - chart_type: line - dimensions: - - name: invalid - - name: checksum - - name: sctp.fragmentation - description: SCTP Fragmentation - unit: "packets/s" - chart_type: line - dimensions: - - name: reassembled - - name: fragmented - - meta: - plugin_name: proc.plugin - module_name: /proc/net/stat/nf_conntrack - monitored_instance: - name: Conntrack - link: "" - categories: - - data-collection.linux-systems.firewall-metrics - icon_filename: "firewall.svg" - related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - connection tracking mechanism - - netfilter - - conntrack - most_popular: false - overview: - 
data_collection: - metrics_description: "This integration monitors the connection tracking mechanism of Netfilter in the Linux Kernel." - method_description: "" - supported_platforms: - include: [] - exclude: [] - multi_instance: true - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: - - name: netfilter_conntrack_full - link: https://github.com/netdata/netdata/blob/master/health/health.d/netfilter.conf - metric: netfilter.conntrack_sockets - info: netfilter connection tracker table size utilization - os: "linux" - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: global - description: "" - labels: [] - metrics: - - name: netfilter.conntrack_sockets - description: Connection Tracker Connections - unit: "active connections" - chart_type: line - dimensions: - - name: connections - - name: netfilter.conntrack_new - description: Connection Tracker New Connections - unit: "connections/s" - chart_type: line - dimensions: - - name: new - - name: ignore - - name: invalid - - name: netfilter.conntrack_changes - description: Connection Tracker Changes - unit: "changes/s" - chart_type: line - dimensions: - - name: inserted - - name: deleted - - name: delete_list - - name: netfilter.conntrack_expect - description: Connection Tracker Expectations - unit: "expectations/s" - chart_type: line - dimensions: - - name: created - - name: deleted - - name: new - - name: netfilter.conntrack_search - description: Connection Tracker Searches - unit: "searches/s" - chart_type: line - dimensions: - - name: searched - - name: restarted - 
- name: found - - name: netfilter.conntrack_errors - description: Connection Tracker Errors - unit: "events/s" - chart_type: line - dimensions: - - name: icmp_error - - name: error_failed - - name: drop - - name: early_drop - - meta: - plugin_name: proc.plugin - module_name: /proc/net/stat/synproxy - monitored_instance: - name: Synproxy - link: "" - categories: - - data-collection.linux-systems.firewall-metrics - icon_filename: "firewall.svg" - related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - synproxy - most_popular: false - overview: - data_collection: - metrics_description: "This integration provides statistics about the Synproxy netfilter module." - method_description: "" - supported_platforms: - include: [] - exclude: [] - multi_instance: true - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: [] - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: global - description: "" - labels: [] - metrics: - - name: netfilter.synproxy_syn_received - description: SYNPROXY SYN Packets received - unit: "packets/s" - chart_type: line - dimensions: - - name: received - - name: netfilter.synproxy_conn_reopened - description: SYNPROXY Connections Reopened - unit: "connections/s" - chart_type: line - dimensions: - - name: reopened - - name: netfilter.synproxy_cookies - description: SYNPROXY TCP Cookies - unit: "cookies/s" - chart_type: line - dimensions: - - name: valid - - name: invalid - - name: retransmits - - meta: - plugin_name: proc.plugin - 
module_name: /proc/spl/kstat/zfs - monitored_instance: - name: ZFS Pools - link: "" - categories: - - data-collection.linux-systems.filesystem-metrics.zfs - icon_filename: "filesystem.svg" - related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - zfs pools - - pools - - zfs - - filesystem - most_popular: false - overview: - data_collection: - metrics_description: "This integration provides metrics about the state of ZFS pools." - method_description: "" - supported_platforms: - include: [] - exclude: [] - multi_instance: true - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: - - name: zfs_pool_state_warn - link: https://github.com/netdata/netdata/blob/master/health/health.d/zfs.conf - metric: zfspool.state - info: ZFS pool ${label:pool} state is degraded - - name: zfs_pool_state_crit - link: https://github.com/netdata/netdata/blob/master/health/health.d/zfs.conf - metric: zfspool.state - info: ZFS pool ${label:pool} state is faulted or unavail - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: zfs pool - description: "" - labels: - - name: pool - description: TBD - metrics: - - name: zfspool.state - description: ZFS pool state - unit: "boolean" - chart_type: line - dimensions: - - name: online - - name: degraded - - name: faulted - - name: offline - - name: removed - - name: unavail - - name: suspended - - meta: - plugin_name: proc.plugin - module_name: /proc/spl/kstat/zfs/arcstats - monitored_instance: - name: ZFS Adaptive Replacement Cache - 
link: "" - categories: - - data-collection.linux-systems.filesystem-metrics.zfs - icon_filename: "filesystem.svg" - related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - zfs arc - - arc - - zfs - - filesystem - most_popular: false - overview: - data_collection: - metrics_description: "This integration monitors ZFS Adaptive Replacement Cache (ARC) statistics." - method_description: "" - supported_platforms: - include: [] - exclude: [] - multi_instance: true - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: - - name: zfs_memory_throttle - link: https://github.com/netdata/netdata/blob/master/health/health.d/zfs.conf - metric: zfs.memory_ops - info: number of times ZFS had to limit the ARC growth in the last 10 minutes - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: global - description: "" - labels: [] - metrics: - - name: zfs.arc_size - description: ZFS ARC Size - unit: "MiB" - chart_type: area - dimensions: - - name: arcsz - - name: target - - name: min - - name: max - - name: zfs.l2_size - description: ZFS L2 ARC Size - unit: "MiB" - chart_type: area - dimensions: - - name: actual - - name: size - - name: zfs.reads - description: ZFS Reads - unit: "reads/s" - chart_type: area - dimensions: - - name: arc - - name: demand - - name: prefetch - - name: metadata - - name: l2 - - name: zfs.bytes - description: ZFS ARC L2 Read/Write Rate - unit: "KiB/s" - chart_type: area - dimensions: - - name: read - - name: write - - name: zfs.hits - 
description: ZFS ARC Hits - unit: "percentage" - chart_type: stacked - dimensions: - - name: hits - - name: misses - - name: zfs.hits_rate - description: ZFS ARC Hits Rate - unit: "events/s" - chart_type: stacked - dimensions: - - name: hits - - name: misses - - name: zfs.dhits - description: ZFS Demand Hits - unit: "percentage" - chart_type: stacked - dimensions: - - name: hits - - name: misses - - name: zfs.dhits_rate - description: ZFS Demand Hits Rate - unit: "events/s" - chart_type: stacked - dimensions: - - name: hits - - name: misses - - name: zfs.phits - description: ZFS Prefetch Hits - unit: "percentage" - chart_type: stacked - dimensions: - - name: hits - - name: misses - - name: zfs.phits_rate - description: ZFS Prefetch Hits Rate - unit: "events/s" - chart_type: stacked - dimensions: - - name: hits - - name: misses - - name: zfs.mhits - description: ZFS Metadata Hits - unit: "percentage" - chart_type: stacked - dimensions: - - name: hits - - name: misses - - name: zfs.mhits_rate - description: ZFS Metadata Hits Rate - unit: "events/s" - chart_type: stacked - dimensions: - - name: hits - - name: misses - - name: zfs.l2hits - description: ZFS L2 Hits - unit: "percentage" - chart_type: stacked - dimensions: - - name: hits - - name: misses - - name: zfs.l2hits_rate - description: ZFS L2 Hits Rate - unit: "events/s" - chart_type: stacked - dimensions: - - name: hits - - name: misses - - name: zfs.list_hits - description: ZFS List Hits - unit: "hits/s" - chart_type: area - dimensions: - - name: mfu - - name: mfu_ghost - - name: mru - - name: mru_ghost - - name: zfs.arc_size_breakdown - description: ZFS ARC Size Breakdown - unit: "percentage" - chart_type: stacked - dimensions: - - name: recent - - name: frequent - - name: zfs.memory_ops - description: ZFS Memory Operations - unit: "operations/s" - chart_type: line - dimensions: - - name: direct - - name: throttled - - name: indirect - - name: zfs.important_ops - description: ZFS Important Operations - unit: 
"operations/s" - chart_type: line - dimensions: - - name: evict_skip - - name: deleted - - name: mutex_miss - - name: hash_collisions - - name: zfs.actual_hits - description: ZFS Actual Cache Hits - unit: "percentage" - chart_type: stacked - dimensions: - - name: hits - - name: misses - - name: zfs.actual_hits_rate - description: ZFS Actual Cache Hits Rate - unit: "events/s" - chart_type: stacked - dimensions: - - name: hits - - name: misses - - name: zfs.demand_data_hits - description: ZFS Data Demand Efficiency - unit: "percentage" - chart_type: stacked - dimensions: - - name: hits - - name: misses - - name: zfs.demand_data_hits_rate - description: ZFS Data Demand Efficiency Rate - unit: "events/s" - chart_type: stacked - dimensions: - - name: hits - - name: misses - - name: zfs.prefetch_data_hits - description: ZFS Data Prefetch Efficiency - unit: "percentage" - chart_type: stacked - dimensions: - - name: hits - - name: misses - - name: zfs.prefetch_data_hits_rate - description: ZFS Data Prefetch Efficiency Rate - unit: "events/s" - chart_type: stacked - dimensions: - - name: hits - - name: misses - - name: zfs.hash_elements - description: ZFS ARC Hash Elements - unit: "elements" - chart_type: line - dimensions: - - name: current - - name: max - - name: zfs.hash_chains - description: ZFS ARC Hash Chains - unit: "chains" - chart_type: line - dimensions: - - name: current - - name: max - - meta: - plugin_name: proc.plugin - module_name: /sys/fs/btrfs - monitored_instance: - name: BTRFS - link: "" - categories: - - data-collection.linux-systems.filesystem-metrics.btrfs - icon_filename: "filesystem.svg" - related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - btrfs - - filesystem - most_popular: false - overview: - data_collection: - metrics_description: "This integration provides usage and error statistics from the BTRFS filesystem." 
- method_description: "" - supported_platforms: - include: [] - exclude: [] - multi_instance: true - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: - - name: btrfs_allocated - link: https://github.com/netdata/netdata/blob/master/health/health.d/btrfs.conf - metric: btrfs.disk - info: percentage of allocated BTRFS physical disk space - os: "*" - - name: btrfs_data - link: https://github.com/netdata/netdata/blob/master/health/health.d/btrfs.conf - metric: btrfs.data - info: utilization of BTRFS data space - os: "*" - - name: btrfs_metadata - link: https://github.com/netdata/netdata/blob/master/health/health.d/btrfs.conf - metric: btrfs.metadata - info: utilization of BTRFS metadata space - os: "*" - - name: btrfs_system - link: https://github.com/netdata/netdata/blob/master/health/health.d/btrfs.conf - metric: btrfs.system - info: utilization of BTRFS system space - os: "*" - - name: btrfs_device_read_errors - link: https://github.com/netdata/netdata/blob/master/health/health.d/btrfs.conf - metric: btrfs.device_errors - info: number of encountered BTRFS read errors - os: "*" - - name: btrfs_device_write_errors - link: https://github.com/netdata/netdata/blob/master/health/health.d/btrfs.conf - metric: btrfs.device_errors - info: number of encountered BTRFS write errors - os: "*" - - name: btrfs_device_flush_errors - link: https://github.com/netdata/netdata/blob/master/health/health.d/btrfs.conf - metric: btrfs.device_errors - info: number of encountered BTRFS flush errors - os: "*" - - name: btrfs_device_corruption_errors - link: 
https://github.com/netdata/netdata/blob/master/health/health.d/btrfs.conf - metric: btrfs.device_errors - info: number of encountered BTRFS corruption errors - os: "*" - - name: btrfs_device_generation_errors - link: https://github.com/netdata/netdata/blob/master/health/health.d/btrfs.conf - metric: btrfs.device_errors - info: number of encountered BTRFS generation errors - os: "*" - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: btrfs filesystem - description: "" - labels: - - name: filesystem_uuid - description: TBD - - name: filesystem_label - description: TBD - metrics: - - name: btrfs.disk - description: BTRFS Physical Disk Allocation - unit: "MiB" - chart_type: stacked - dimensions: - - name: unallocated - - name: data_free - - name: data_used - - name: meta_free - - name: meta_used - - name: sys_free - - name: sys_used - - name: btrfs.data - description: BTRFS Data Allocation - unit: "MiB" - chart_type: stacked - dimensions: - - name: free - - name: used - - name: btrfs.metadata - description: BTRFS Metadata Allocation - unit: "MiB" - chart_type: stacked - dimensions: - - name: free - - name: used - - name: reserved - - name: btrfs.system - description: BTRFS System Allocation - unit: "MiB" - chart_type: stacked - dimensions: - - name: free - - name: used - - name: btrfs.commits - description: BTRFS Commits - unit: "commits" - chart_type: line - dimensions: - - name: commits - - name: btrfs.commits_perc_time - description: BTRFS Commits Time Share - unit: "percentage" - chart_type: line - dimensions: - - name: commits - - name: btrfs.commit_timings - description: BTRFS Commit Timings - unit: "ms" - chart_type: line - dimensions: - - name: last - - name: max - - name: btrfs device - description: "" - labels: - - name: device_id - description: TBD - - name: filesystem_uuid - description: TBD - - name: filesystem_label - description: TBD - metrics: - - name: btrfs.device_errors - description: BTRFS 
Device Errors - unit: "errors" - chart_type: line - dimensions: - - name: write_errs - - name: read_errs - - name: flush_errs - - name: corruption_errs - - name: generation_errs - - meta: - plugin_name: proc.plugin - module_name: /sys/class/power_supply - monitored_instance: - name: Power Supply - link: "" - categories: - - data-collection.linux-systems.power-supply-metrics - icon_filename: "powersupply.svg" - related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - psu - - power supply - most_popular: false - overview: - data_collection: - metrics_description: "This integration monitors Power supply metrics, such as battery status, AC power status and more." - method_description: "" - supported_platforms: - include: [] - exclude: [] - multi_instance: true - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: - - name: linux_power_supply_capacity - link: https://github.com/netdata/netdata/blob/master/health/health.d/linux_power_supply.conf - metric: powersupply.capacity - info: percentage of remaining power supply capacity - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: power device - description: "" - labels: - - name: device - description: TBD - metrics: - - name: powersupply.capacity - description: Battery capacity - unit: "percentage" - chart_type: line - dimensions: - - name: capacity - - name: powersupply.charge - description: Battery charge - unit: "Ah" - chart_type: line - dimensions: - - name: empty_design - - name: empty - - name: now - 
- name: full - - name: full_design - - name: powersupply.energy - description: Battery energy - unit: "Wh" - chart_type: line - dimensions: - - name: empty_design - - name: empty - - name: now - - name: full - - name: full_design - - name: powersupply.voltage - description: Power supply voltage - unit: "V" - chart_type: line - dimensions: - - name: min_design - - name: min - - name: now - - name: max - - name: max_design - - meta: - plugin_name: proc.plugin - module_name: /sys/class/drm - monitored_instance: - name: AMD GPU - link: "https://www.amd.com" - categories: - - data-collection.hardware-devices-and-sensors - icon_filename: amd.svg - related_resources: - integrations: - list: [] - info_provided_to_referring_integrations: - description: "" - keywords: - - amd - - gpu - - hardware - most_popular: false - overview: - data_collection: - metrics_description: "This integration monitors AMD GPU metrics, such as utilization, clock frequency and memory usage." - method_description: "It reads `/sys/class/drm` to collect metrics for every AMD GPU card instance it encounters." - supported_platforms: - include: - - Linux - exclude: [] - multi_instance: true - additional_permissions: - description: "" - default_behavior: - auto_detection: - description: "" - limits: - description: "" - performance_impact: - description: "" - setup: - prerequisites: - list: [] - configuration: - file: - name: "" - description: "" - options: - description: "" - folding: - title: "" - enabled: true - list: [] - examples: - folding: - enabled: true - title: "" - list: [] - troubleshooting: - problems: - list: [] - alerts: [] - metrics: - folding: - title: Metrics - enabled: false - description: "" - availability: [] - scopes: - - name: gpu - description: "These metrics refer to the GPU." - labels: - - name: product_name - description: GPU product name (e.g. 
AMD RX 6600) - metrics: - - name: amdgpu.gpu_utilization - description: GPU utilization - unit: "percentage" - chart_type: line - dimensions: - - name: utilization - - name: amdgpu.gpu_mem_utilization - description: GPU memory utilization - unit: "percentage" - chart_type: line - dimensions: - - name: utilization - - name: amdgpu.gpu_clk_frequency - description: GPU clock frequency - unit: "MHz" - chart_type: line - dimensions: - - name: frequency - - name: amdgpu.gpu_mem_clk_frequency - description: GPU memory clock frequency - unit: "MHz" - chart_type: line - dimensions: - - name: frequency - - name: amdgpu.gpu_mem_vram_usage_perc - description: VRAM memory usage percentage - unit: "percentage" - chart_type: line - dimensions: - - name: usage - - name: amdgpu.gpu_mem_vram_usage - description: VRAM memory usage - unit: "bytes" - chart_type: area - dimensions: - - name: free - - name: used - - name: amdgpu.gpu_mem_vis_vram_usage_perc - description: visible VRAM memory usage percentage - unit: "percentage" - chart_type: line - dimensions: - - name: usage - - name: amdgpu.gpu_mem_vis_vram_usage - description: visible VRAM memory usage - unit: "bytes" - chart_type: area - dimensions: - - name: free - - name: used - - name: amdgpu.gpu_mem_gtt_usage_perc - description: GTT memory usage percentage - unit: "percentage" - chart_type: line - dimensions: - - name: usage - - name: amdgpu.gpu_mem_gtt_usage - description: GTT memory usage - unit: "bytes" - chart_type: area - dimensions: - - name: free - - name: used |