summaryrefslogtreecommitdiffstats
path: root/daemon/global_statistics.c
diff options
context:
space:
mode:
Diffstat (limited to 'daemon/global_statistics.c')
-rw-r--r--daemon/global_statistics.c1489
1 files changed, 1033 insertions, 456 deletions
diff --git a/daemon/global_statistics.c b/daemon/global_statistics.c
index d5cc03159..98bf0bf9a 100644
--- a/daemon/global_statistics.c
+++ b/daemon/global_statistics.c
@@ -4,7 +4,15 @@
#define GLOBAL_STATS_RESET_WEB_USEC_MAX 0x01
-#define CONFIG_SECTION_GLOBAL_STATISTICS "global statistics"
+#define WORKER_JOB_GLOBAL 0
+#define WORKER_JOB_REGISTRY 1
+#define WORKER_JOB_WORKERS 2
+#define WORKER_JOB_DBENGINE 3
+#define WORKER_JOB_HEARTBEAT 4
+
+#if WORKER_UTILIZATION_MAX_JOB_TYPES < 5
+#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 5
+#endif
static struct global_statistics {
volatile uint16_t connected_clients;
@@ -37,37 +45,10 @@ static struct global_statistics {
.rrdr_result_points_generated = 0,
};
-#if defined(HAVE_C___ATOMIC)
-#else
-netdata_mutex_t global_statistics_mutex = NETDATA_MUTEX_INITIALIZER;
-
-static inline void global_statistics_lock(void) {
- netdata_mutex_lock(&global_statistics_mutex);
-}
-
-static inline void global_statistics_unlock(void) {
- netdata_mutex_unlock(&global_statistics_mutex);
-}
-#endif
-
-
void rrdr_query_completed(uint64_t db_points_read, uint64_t result_points_generated) {
-#if defined(HAVE_C___ATOMIC)
__atomic_fetch_add(&global_statistics.rrdr_queries_made, 1, __ATOMIC_SEQ_CST);
__atomic_fetch_add(&global_statistics.rrdr_db_points_read, db_points_read, __ATOMIC_SEQ_CST);
__atomic_fetch_add(&global_statistics.rrdr_result_points_generated, result_points_generated, __ATOMIC_SEQ_CST);
-#else
- #warning NOT using atomic operations - using locks for global statistics
- if (web_server_is_multithreaded)
- global_statistics_lock();
-
- global_statistics.rrdr_queries_made++;
- global_statistics.rrdr_db_points_read += db_points_read;
- global_statistics.rrdr_result_points_generated += result_points_generated;
-
- if (web_server_is_multithreaded)
- global_statistics_unlock();
-#endif
}
void finished_web_request_statistics(uint64_t dt,
@@ -75,7 +56,6 @@ void finished_web_request_statistics(uint64_t dt,
uint64_t bytes_sent,
uint64_t content_size,
uint64_t compressed_content_size) {
-#if defined(HAVE_C___ATOMIC)
uint64_t old_web_usec_max = global_statistics.web_usec_max;
while(dt > old_web_usec_max)
__atomic_compare_exchange(&global_statistics.web_usec_max, &old_web_usec_max, &dt, 1, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
@@ -86,61 +66,19 @@ void finished_web_request_statistics(uint64_t dt,
__atomic_fetch_add(&global_statistics.bytes_sent, bytes_sent, __ATOMIC_SEQ_CST);
__atomic_fetch_add(&global_statistics.content_size, content_size, __ATOMIC_SEQ_CST);
__atomic_fetch_add(&global_statistics.compressed_content_size, compressed_content_size, __ATOMIC_SEQ_CST);
-#else
-#warning NOT using atomic operations - using locks for global statistics
- if (web_server_is_multithreaded)
- global_statistics_lock();
-
- if (dt > global_statistics.web_usec_max)
- global_statistics.web_usec_max = dt;
-
- global_statistics.web_requests++;
- global_statistics.web_usec += dt;
- global_statistics.bytes_received += bytes_received;
- global_statistics.bytes_sent += bytes_sent;
- global_statistics.content_size += content_size;
- global_statistics.compressed_content_size += compressed_content_size;
-
- if (web_server_is_multithreaded)
- global_statistics_unlock();
-#endif
}
uint64_t web_client_connected(void) {
-#if defined(HAVE_C___ATOMIC)
__atomic_fetch_add(&global_statistics.connected_clients, 1, __ATOMIC_SEQ_CST);
- uint64_t id = __atomic_fetch_add(&global_statistics.web_client_count, 1, __ATOMIC_SEQ_CST);
-#else
- if (web_server_is_multithreaded)
- global_statistics_lock();
-
- global_statistics.connected_clients++;
- uint64_t id = global_statistics.web_client_count++;
-
- if (web_server_is_multithreaded)
- global_statistics_unlock();
-#endif
-
- return id;
+ return __atomic_fetch_add(&global_statistics.web_client_count, 1, __ATOMIC_SEQ_CST);
}
void web_client_disconnected(void) {
-#if defined(HAVE_C___ATOMIC)
__atomic_fetch_sub(&global_statistics.connected_clients, 1, __ATOMIC_SEQ_CST);
-#else
- if (web_server_is_multithreaded)
- global_statistics_lock();
-
- global_statistics.connected_clients--;
-
- if (web_server_is_multithreaded)
- global_statistics_unlock();
-#endif
}
static inline void global_statistics_copy(struct global_statistics *gs, uint8_t options) {
-#if defined(HAVE_C___ATOMIC)
gs->connected_clients = __atomic_fetch_add(&global_statistics.connected_clients, 0, __ATOMIC_SEQ_CST);
gs->web_requests = __atomic_fetch_add(&global_statistics.web_requests, 0, __ATOMIC_SEQ_CST);
gs->web_usec = __atomic_fetch_add(&global_statistics.web_usec, 0, __ATOMIC_SEQ_CST);
@@ -160,16 +98,6 @@ static inline void global_statistics_copy(struct global_statistics *gs, uint8_t
__atomic_compare_exchange(&global_statistics.web_usec_max, (uint64_t *) &gs->web_usec_max, &n, 1, __ATOMIC_SEQ_CST,
__ATOMIC_SEQ_CST);
}
-#else
- global_statistics_lock();
-
- memcpy(gs, (const void *)&global_statistics, sizeof(struct global_statistics));
-
- if (options & GLOBAL_STATS_RESET_WEB_USEC_MAX)
- global_statistics.web_usec_max = 0;
-
- global_statistics_unlock();
-#endif
}
static void global_statistics_charts(void) {
@@ -266,7 +194,7 @@ static void global_statistics_charts(void) {
"netdata"
, "clients"
, NULL
- , "netdata"
+ , "api"
, NULL
, "Netdata Web Clients"
, "connected clients"
@@ -297,7 +225,7 @@ static void global_statistics_charts(void) {
"netdata"
, "requests"
, NULL
- , "netdata"
+ , "api"
, NULL
, "Netdata Web Requests"
, "requests/s"
@@ -329,13 +257,13 @@ static void global_statistics_charts(void) {
"netdata"
, "net"
, NULL
- , "netdata"
+ , "api"
, NULL
, "Netdata Network Traffic"
, "kilobits/s"
, "netdata"
, "stats"
- , 130000
+ , 130400
, localhost->rrd_update_every
, RRDSET_TYPE_AREA
);
@@ -363,13 +291,13 @@ static void global_statistics_charts(void) {
"netdata"
, "response_time"
, NULL
- , "netdata"
+ , "api"
, NULL
, "Netdata API Response Time"
, "milliseconds/request"
, "netdata"
, "stats"
- , 130400
+ , 130500
, localhost->rrd_update_every
, RRDSET_TYPE_LINE
);
@@ -412,13 +340,13 @@ static void global_statistics_charts(void) {
"netdata"
, "compression_ratio"
, NULL
- , "netdata"
+ , "api"
, NULL
, "Netdata API Responses Compression Savings Ratio"
, "percentage"
, "netdata"
, "stats"
- , 130500
+ , 130600
, localhost->rrd_update_every
, RRDSET_TYPE_LINE
);
@@ -465,7 +393,7 @@ static void global_statistics_charts(void) {
, "queries/s"
, "netdata"
, "stats"
- , 130500
+ , 131000
, localhost->rrd_update_every
, RRDSET_TYPE_LINE
);
@@ -498,7 +426,7 @@ static void global_statistics_charts(void) {
, "points/s"
, "netdata"
, "stats"
- , 130501
+ , 131001
, localhost->rrd_update_every
, RRDSET_TYPE_AREA
);
@@ -516,450 +444,1099 @@ static void global_statistics_charts(void) {
}
// ----------------------------------------------------------------
+}
+static void dbengine_statistics_charts(void) {
#ifdef ENABLE_DBENGINE
- RRDHOST *host;
- unsigned long long stats_array[RRDENG_NR_STATS] = {0};
- unsigned long long local_stats_array[RRDENG_NR_STATS];
- unsigned dbengine_contexts = 0, counted_multihost_db = 0, i;
-
- rrd_rdlock();
- rrdhost_foreach_read(host) {
- if (host->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE && !rrdhost_flag_check(host, RRDHOST_FLAG_ARCHIVED)) {
- if (&multidb_ctx == host->rrdeng_ctx) {
- if (counted_multihost_db)
- continue; /* Only count multi-host DB once */
- counted_multihost_db = 1;
- }
- ++dbengine_contexts;
- /* get localhost's DB engine's statistics */
- rrdeng_get_37_statistics(host->rrdeng_ctx, local_stats_array);
- for (i = 0 ; i < RRDENG_NR_STATS ; ++i) {
- /* aggregate statistics across hosts */
- stats_array[i] += local_stats_array[i];
+ if(netdata_rwlock_tryrdlock(&rrd_rwlock) == 0) {
+ RRDHOST *host;
+ unsigned long long stats_array[RRDENG_NR_STATS] = {0};
+ unsigned long long local_stats_array[RRDENG_NR_STATS];
+ unsigned dbengine_contexts = 0, counted_multihost_db = 0, i;
+
+ rrdhost_foreach_read(host) {
+ if (host->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE && !rrdhost_flag_check(host, RRDHOST_FLAG_ARCHIVED)) {
+ if (&multidb_ctx == host->rrdeng_ctx) {
+ if (counted_multihost_db)
+ continue; /* Only count multi-host DB once */
+ counted_multihost_db = 1;
+ }
+ ++dbengine_contexts;
+ /* get localhost's DB engine's statistics */
+ rrdeng_get_37_statistics(host->rrdeng_ctx, local_stats_array);
+ for (i = 0; i < RRDENG_NR_STATS; ++i) {
+ /* aggregate statistics across hosts */
+ stats_array[i] += local_stats_array[i];
+ }
}
}
- }
- rrd_unlock();
-
- if (dbengine_contexts) {
- /* deduplicate global statistics by getting the ones from the last context */
- stats_array[30] = local_stats_array[30];
- stats_array[31] = local_stats_array[31];
- stats_array[32] = local_stats_array[32];
- stats_array[34] = local_stats_array[34];
- stats_array[36] = local_stats_array[36];
-
- // ----------------------------------------------------------------
-
- {
- static RRDSET *st_compression = NULL;
- static RRDDIM *rd_savings = NULL;
-
- if (unlikely(!st_compression)) {
- st_compression = rrdset_create_localhost(
- "netdata"
- , "dbengine_compression_ratio"
- , NULL
- , "dbengine"
- , NULL
- , "Netdata DB engine data extents' compression savings ratio"
- , "percentage"
- , "netdata"
- , "stats"
- , 130502
- , localhost->rrd_update_every
- , RRDSET_TYPE_LINE
- );
-
- rd_savings = rrddim_add(st_compression, "savings", NULL, 1, 1000, RRD_ALGORITHM_ABSOLUTE);
+ rrd_unlock();
+
+ if (dbengine_contexts) {
+ /* deduplicate global statistics by getting the ones from the last context */
+ stats_array[30] = local_stats_array[30];
+ stats_array[31] = local_stats_array[31];
+ stats_array[32] = local_stats_array[32];
+ stats_array[34] = local_stats_array[34];
+ stats_array[36] = local_stats_array[36];
+
+ // ----------------------------------------------------------------
+
+ {
+ static RRDSET *st_compression = NULL;
+ static RRDDIM *rd_savings = NULL;
+
+ if (unlikely(!st_compression)) {
+ st_compression = rrdset_create_localhost(
+ "netdata",
+ "dbengine_compression_ratio",
+ NULL,
+ "dbengine",
+ NULL,
+ "Netdata DB engine data extents' compression savings ratio",
+ "percentage",
+ "netdata",
+ "stats",
+ 132000,
+ localhost->rrd_update_every,
+ RRDSET_TYPE_LINE);
+
+ rd_savings = rrddim_add(st_compression, "savings", NULL, 1, 1000, RRD_ALGORITHM_ABSOLUTE);
+ } else
+ rrdset_next(st_compression);
+
+ unsigned long long ratio;
+ unsigned long long compressed_content_size = stats_array[12];
+ unsigned long long content_size = stats_array[11];
+
+ if (content_size) {
+ // allow negative savings
+ ratio = ((content_size - compressed_content_size) * 100 * 1000) / content_size;
+ } else {
+ ratio = 0;
+ }
+ rrddim_set_by_pointer(st_compression, rd_savings, ratio);
+
+ rrdset_done(st_compression);
}
- else
- rrdset_next(st_compression);
-
- unsigned long long ratio;
- unsigned long long compressed_content_size = stats_array[12];
- unsigned long long content_size = stats_array[11];
-
- if (content_size) {
- // allow negative savings
- ratio = ((content_size - compressed_content_size) * 100 * 1000) / content_size;
- } else {
- ratio = 0;
- }
- rrddim_set_by_pointer(st_compression, rd_savings, ratio);
- rrdset_done(st_compression);
- }
+ // ----------------------------------------------------------------
+
+ {
+ static RRDSET *st_pg_cache_hit_ratio = NULL;
+ static RRDDIM *rd_hit_ratio = NULL;
+
+ if (unlikely(!st_pg_cache_hit_ratio)) {
+ st_pg_cache_hit_ratio = rrdset_create_localhost(
+ "netdata",
+ "page_cache_hit_ratio",
+ NULL,
+ "dbengine",
+ NULL,
+ "Netdata DB engine page cache hit ratio",
+ "percentage",
+ "netdata",
+ "stats",
+ 132003,
+ localhost->rrd_update_every,
+ RRDSET_TYPE_LINE);
+
+ rd_hit_ratio = rrddim_add(st_pg_cache_hit_ratio, "ratio", NULL, 1, 1000, RRD_ALGORITHM_ABSOLUTE);
+ } else
+ rrdset_next(st_pg_cache_hit_ratio);
+
+ static unsigned long long old_hits = 0;
+ static unsigned long long old_misses = 0;
+ unsigned long long hits = stats_array[7];
+ unsigned long long misses = stats_array[8];
+ unsigned long long hits_delta;
+ unsigned long long misses_delta;
+ unsigned long long ratio;
+
+ hits_delta = hits - old_hits;
+ misses_delta = misses - old_misses;
+ old_hits = hits;
+ old_misses = misses;
+
+ if (hits_delta + misses_delta) {
+ ratio = (hits_delta * 100 * 1000) / (hits_delta + misses_delta);
+ } else {
+ ratio = 0;
+ }
+ rrddim_set_by_pointer(st_pg_cache_hit_ratio, rd_hit_ratio, ratio);
+
+ rrdset_done(st_pg_cache_hit_ratio);
+ }
- // ----------------------------------------------------------------
-
- {
- static RRDSET *st_pg_cache_hit_ratio = NULL;
- static RRDDIM *rd_hit_ratio = NULL;
-
- if (unlikely(!st_pg_cache_hit_ratio)) {
- st_pg_cache_hit_ratio = rrdset_create_localhost(
- "netdata"
- , "page_cache_hit_ratio"
- , NULL
- , "dbengine"
- , NULL
- , "Netdata DB engine page cache hit ratio"
- , "percentage"
- , "netdata"
- , "stats"
- , 130503
- , localhost->rrd_update_every
- , RRDSET_TYPE_LINE
- );
-
- rd_hit_ratio = rrddim_add(st_pg_cache_hit_ratio, "ratio", NULL, 1, 1000, RRD_ALGORITHM_ABSOLUTE);
+ // ----------------------------------------------------------------
+
+ {
+ static RRDSET *st_pg_cache_pages = NULL;
+ static RRDDIM *rd_descriptors = NULL;
+ static RRDDIM *rd_populated = NULL;
+ static RRDDIM *rd_dirty = NULL;
+ static RRDDIM *rd_backfills = NULL;
+ static RRDDIM *rd_evictions = NULL;
+ static RRDDIM *rd_used_by_collectors = NULL;
+
+ if (unlikely(!st_pg_cache_pages)) {
+ st_pg_cache_pages = rrdset_create_localhost(
+ "netdata",
+ "page_cache_stats",
+ NULL,
+ "dbengine",
+ NULL,
+ "Netdata dbengine page cache statistics",
+ "pages",
+ "netdata",
+ "stats",
+ 132004,
+ localhost->rrd_update_every,
+ RRDSET_TYPE_LINE);
+
+ rd_descriptors = rrddim_add(st_pg_cache_pages, "descriptors", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+ rd_populated = rrddim_add(st_pg_cache_pages, "populated", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+ rd_dirty = rrddim_add(st_pg_cache_pages, "dirty", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+ rd_backfills = rrddim_add(st_pg_cache_pages, "backfills", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
+ rd_evictions = rrddim_add(st_pg_cache_pages, "evictions", NULL, -1, 1, RRD_ALGORITHM_INCREMENTAL);
+ rd_used_by_collectors =
+ rrddim_add(st_pg_cache_pages, "used_by_collectors", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+ } else
+ rrdset_next(st_pg_cache_pages);
+
+ rrddim_set_by_pointer(st_pg_cache_pages, rd_descriptors, (collected_number)stats_array[27]);
+ rrddim_set_by_pointer(st_pg_cache_pages, rd_populated, (collected_number)stats_array[3]);
+ rrddim_set_by_pointer(st_pg_cache_pages, rd_dirty, (collected_number)stats_array[0] + stats_array[4]);
+ rrddim_set_by_pointer(st_pg_cache_pages, rd_backfills, (collected_number)stats_array[9]);
+ rrddim_set_by_pointer(st_pg_cache_pages, rd_evictions, (collected_number)stats_array[10]);
+ rrddim_set_by_pointer(st_pg_cache_pages, rd_used_by_collectors, (collected_number)stats_array[0]);
+ rrdset_done(st_pg_cache_pages);
}
- else
- rrdset_next(st_pg_cache_hit_ratio);
-
- static unsigned long long old_hits = 0;
- static unsigned long long old_misses = 0;
- unsigned long long hits = stats_array[7];
- unsigned long long misses = stats_array[8];
- unsigned long long hits_delta;
- unsigned long long misses_delta;
- unsigned long long ratio;
-
- hits_delta = hits - old_hits;
- misses_delta = misses - old_misses;
- old_hits = hits;
- old_misses = misses;
-
- if (hits_delta + misses_delta) {
- ratio = (hits_delta * 100 * 1000) / (hits_delta + misses_delta);
- } else {
- ratio = 0;
+
+ // ----------------------------------------------------------------
+
+ {
+ static RRDSET *st_long_term_pages = NULL;
+ static RRDDIM *rd_total = NULL;
+ static RRDDIM *rd_insertions = NULL;
+ static RRDDIM *rd_deletions = NULL;
+ static RRDDIM *rd_flushing_pressure_deletions = NULL;
+
+ if (unlikely(!st_long_term_pages)) {
+ st_long_term_pages = rrdset_create_localhost(
+ "netdata",
+ "dbengine_long_term_page_stats",
+ NULL,
+ "dbengine",
+ NULL,
+ "Netdata dbengine long-term page statistics",
+ "pages",
+ "netdata",
+ "stats",
+ 132005,
+ localhost->rrd_update_every,
+ RRDSET_TYPE_LINE);
+
+ rd_total = rrddim_add(st_long_term_pages, "total", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+ rd_insertions = rrddim_add(st_long_term_pages, "insertions", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
+ rd_deletions = rrddim_add(st_long_term_pages, "deletions", NULL, -1, 1, RRD_ALGORITHM_INCREMENTAL);
+ rd_flushing_pressure_deletions = rrddim_add(
+ st_long_term_pages, "flushing_pressure_deletions", NULL, -1, 1, RRD_ALGORITHM_INCREMENTAL);
+ } else
+ rrdset_next(st_long_term_pages);
+
+ rrddim_set_by_pointer(st_long_term_pages, rd_total, (collected_number)stats_array[2]);
+ rrddim_set_by_pointer(st_long_term_pages, rd_insertions, (collected_number)stats_array[5]);
+ rrddim_set_by_pointer(st_long_term_pages, rd_deletions, (collected_number)stats_array[6]);
+ rrddim_set_by_pointer(
+ st_long_term_pages, rd_flushing_pressure_deletions, (collected_number)stats_array[36]);
+ rrdset_done(st_long_term_pages);
}
- rrddim_set_by_pointer(st_pg_cache_hit_ratio, rd_hit_ratio, ratio);
- rrdset_done(st_pg_cache_hit_ratio);
- }
+ // ----------------------------------------------------------------
+
+ {
+ static RRDSET *st_io_stats = NULL;
+ static RRDDIM *rd_reads = NULL;
+ static RRDDIM *rd_writes = NULL;
+
+ if (unlikely(!st_io_stats)) {
+ st_io_stats = rrdset_create_localhost(
+ "netdata",
+ "dbengine_io_throughput",
+ NULL,
+ "dbengine",
+ NULL,
+ "Netdata DB engine I/O throughput",
+ "MiB/s",
+ "netdata",
+ "stats",
+ 132006,
+ localhost->rrd_update_every,
+ RRDSET_TYPE_LINE);
+
+ rd_reads = rrddim_add(st_io_stats, "reads", NULL, 1, 1024 * 1024, RRD_ALGORITHM_INCREMENTAL);
+ rd_writes = rrddim_add(st_io_stats, "writes", NULL, -1, 1024 * 1024, RRD_ALGORITHM_INCREMENTAL);
+ } else
+ rrdset_next(st_io_stats);
+
+ rrddim_set_by_pointer(st_io_stats, rd_reads, (collected_number)stats_array[17]);
+ rrddim_set_by_pointer(st_io_stats, rd_writes, (collected_number)stats_array[15]);
+ rrdset_done(st_io_stats);
+ }
- // ----------------------------------------------------------------
-
- {
- static RRDSET *st_pg_cache_pages = NULL;
- static RRDDIM *rd_descriptors = NULL;
- static RRDDIM *rd_populated = NULL;
- static RRDDIM *rd_dirty = NULL;
- static RRDDIM *rd_backfills = NULL;
- static RRDDIM *rd_evictions = NULL;
- static RRDDIM *rd_used_by_collectors = NULL;
-
- if (unlikely(!st_pg_cache_pages)) {
- st_pg_cache_pages = rrdset_create_localhost(
- "netdata"
- , "page_cache_stats"
- , NULL
- , "dbengine"
- , NULL
- , "Netdata dbengine page cache statistics"
- , "pages"
- , "netdata"
- , "stats"
- , 130504
- , localhost->rrd_update_every
- , RRDSET_TYPE_LINE
- );
-
- rd_descriptors = rrddim_add(st_pg_cache_pages, "descriptors", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
- rd_populated = rrddim_add(st_pg_cache_pages, "populated", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
- rd_dirty = rrddim_add(st_pg_cache_pages, "dirty", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
- rd_backfills = rrddim_add(st_pg_cache_pages, "backfills", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
- rd_evictions = rrddim_add(st_pg_cache_pages, "evictions", NULL, -1, 1, RRD_ALGORITHM_INCREMENTAL);
- rd_used_by_collectors = rrddim_add(st_pg_cache_pages, "used_by_collectors", NULL, 1, 1,
- RRD_ALGORITHM_ABSOLUTE);
+ // ----------------------------------------------------------------
+
+ {
+ static RRDSET *st_io_stats = NULL;
+ static RRDDIM *rd_reads = NULL;
+ static RRDDIM *rd_writes = NULL;
+
+ if (unlikely(!st_io_stats)) {
+ st_io_stats = rrdset_create_localhost(
+ "netdata",
+ "dbengine_io_operations",
+ NULL,
+ "dbengine",
+ NULL,
+ "Netdata DB engine I/O operations",
+ "operations/s",
+ "netdata",
+ "stats",
+ 132007,
+ localhost->rrd_update_every,
+ RRDSET_TYPE_LINE);
+
+ rd_reads = rrddim_add(st_io_stats, "reads", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
+ rd_writes = rrddim_add(st_io_stats, "writes", NULL, -1, 1, RRD_ALGORITHM_INCREMENTAL);
+ } else
+ rrdset_next(st_io_stats);
+
+ rrddim_set_by_pointer(st_io_stats, rd_reads, (collected_number)stats_array[18]);
+ rrddim_set_by_pointer(st_io_stats, rd_writes, (collected_number)stats_array[16]);
+ rrdset_done(st_io_stats);
}
- else
- rrdset_next(st_pg_cache_pages);
-
- rrddim_set_by_pointer(st_pg_cache_pages, rd_descriptors, (collected_number)stats_array[27]);
- rrddim_set_by_pointer(st_pg_cache_pages, rd_populated, (collected_number)stats_array[3]);
- rrddim_set_by_pointer(st_pg_cache_pages, rd_dirty, (collected_number)stats_array[0] + stats_array[4]);
- rrddim_set_by_pointer(st_pg_cache_pages, rd_backfills, (collected_number)stats_array[9]);
- rrddim_set_by_pointer(st_pg_cache_pages, rd_evictions, (collected_number)stats_array[10]);
- rrddim_set_by_pointer(st_pg_cache_pages, rd_used_by_collectors, (collected_number)stats_array[0]);
- rrdset_done(st_pg_cache_pages);
- }
- // ----------------------------------------------------------------
+ // ----------------------------------------------------------------
+
+ {
+ static RRDSET *st_errors = NULL;
+ static RRDDIM *rd_fs_errors = NULL;
+ static RRDDIM *rd_io_errors = NULL;
+ static RRDDIM *pg_cache_over_half_dirty_events = NULL;
+
+ if (unlikely(!st_errors)) {
+ st_errors = rrdset_create_localhost(
+ "netdata",
+ "dbengine_global_errors",
+ NULL,
+ "dbengine",
+ NULL,
+ "Netdata DB engine errors",
+ "errors/s",
+ "netdata",
+ "stats",
+ 132008,
+ localhost->rrd_update_every,
+ RRDSET_TYPE_LINE);
+
+ rd_io_errors = rrddim_add(st_errors, "io_errors", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
+ rd_fs_errors = rrddim_add(st_errors, "fs_errors", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
+ pg_cache_over_half_dirty_events =
+ rrddim_add(st_errors, "pg_cache_over_half_dirty_events", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
+ } else
+ rrdset_next(st_errors);
+
+ rrddim_set_by_pointer(st_errors, rd_io_errors, (collected_number)stats_array[30]);
+ rrddim_set_by_pointer(st_errors, rd_fs_errors, (collected_number)stats_array[31]);
+ rrddim_set_by_pointer(st_errors, pg_cache_over_half_dirty_events, (collected_number)stats_array[34]);
+ rrdset_done(st_errors);
+ }
- {
- static RRDSET *st_long_term_pages = NULL;
- static RRDDIM *rd_total = NULL;
- static RRDDIM *rd_insertions = NULL;
- static RRDDIM *rd_deletions = NULL;
- static RRDDIM *rd_flushing_pressure_deletions = NULL;
+ // ----------------------------------------------------------------
+
+ {
+ static RRDSET *st_fd = NULL;
+ static RRDDIM *rd_fd_current = NULL;
+ static RRDDIM *rd_fd_max = NULL;
+
+ if (unlikely(!st_fd)) {
+ st_fd = rrdset_create_localhost(
+ "netdata",
+ "dbengine_global_file_descriptors",
+ NULL,
+ "dbengine",
+ NULL,
+ "Netdata DB engine File Descriptors",
+ "descriptors",
+ "netdata",
+ "stats",
+ 132009,
+ localhost->rrd_update_every,
+ RRDSET_TYPE_LINE);
+
+ rd_fd_current = rrddim_add(st_fd, "current", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+ rd_fd_max = rrddim_add(st_fd, "max", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+ } else
+ rrdset_next(st_fd);
+
+ rrddim_set_by_pointer(st_fd, rd_fd_current, (collected_number)stats_array[32]);
+ /* Careful here, modify this accordingly if the File-Descriptor budget ever changes */
+ rrddim_set_by_pointer(st_fd, rd_fd_max, (collected_number)rlimit_nofile.rlim_cur / 4);
+ rrdset_done(st_fd);
+ }
- if (unlikely(!st_long_term_pages)) {
- st_long_term_pages = rrdset_create_localhost(
- "netdata"
- , "dbengine_long_term_page_stats"
- , NULL
- , "dbengine"
- , NULL
- , "Netdata dbengine long-term page statistics"
- , "pages"
- , "netdata"
- , "stats"
- , 130505
- , localhost->rrd_update_every
- , RRDSET_TYPE_LINE
- );
-
- rd_total = rrddim_add(st_long_term_pages, "total", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
- rd_insertions = rrddim_add(st_long_term_pages, "insertions", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
- rd_deletions = rrddim_add(st_long_term_pages, "deletions", NULL, -1, 1, RRD_ALGORITHM_INCREMENTAL);
- rd_flushing_pressure_deletions = rrddim_add(st_long_term_pages, "flushing_pressure_deletions", NULL, -1,
- 1, RRD_ALGORITHM_INCREMENTAL);
+ // ----------------------------------------------------------------
+
+ {
+ static RRDSET *st_ram_usage = NULL;
+ static RRDDIM *rd_cached = NULL;
+ static RRDDIM *rd_pinned = NULL;
+ static RRDDIM *rd_cache_metadata = NULL;
+ static RRDDIM *rd_index_metadata = NULL;
+ static RRDDIM *rd_pages_metadata = NULL;
+
+ collected_number cached_pages, pinned_pages, API_producers, populated_pages, cache_metadata, pages_on_disk,
+ page_cache_descriptors, index_metadata, pages_metadata;
+
+ if (unlikely(!st_ram_usage)) {
+ st_ram_usage = rrdset_create_localhost(
+ "netdata",
+ "dbengine_ram",
+ NULL,
+ "dbengine",
+ NULL,
+ "Netdata DB engine RAM usage",
+ "MiB",
+ "netdata",
+ "stats",
+ 132010,
+ localhost->rrd_update_every,
+ RRDSET_TYPE_STACKED);
+
+ rd_cached = rrddim_add(st_ram_usage, "cache", NULL, RRDENG_BLOCK_SIZE, 1024*1024, RRD_ALGORITHM_ABSOLUTE);
+ rd_pinned = rrddim_add(st_ram_usage, "collectors", NULL, RRDENG_BLOCK_SIZE, 1024*1024, RRD_ALGORITHM_ABSOLUTE);
+ rd_cache_metadata = rrddim_add(st_ram_usage, "cache metadata", NULL, 1, 1024*1024, RRD_ALGORITHM_ABSOLUTE);
+ rd_pages_metadata = rrddim_add(st_ram_usage, "pages metadata", NULL, 1, 1024*1024, RRD_ALGORITHM_ABSOLUTE);
+ rd_index_metadata = rrddim_add(st_ram_usage, "index metadata", NULL, 1, 1024*1024, RRD_ALGORITHM_ABSOLUTE);
+ } else
+ rrdset_next(st_ram_usage);
+
+ API_producers = (collected_number)stats_array[0];
+ pages_on_disk = (collected_number)stats_array[2];
+ populated_pages = (collected_number)stats_array[3];
+ page_cache_descriptors = (collected_number)stats_array[27];
+
+ if (API_producers * 2 > populated_pages) {
+ pinned_pages = API_producers;
+ } else {
+ pinned_pages = API_producers * 2;
+ }
+ cached_pages = populated_pages - pinned_pages;
+
+ cache_metadata = page_cache_descriptors * sizeof(struct page_cache_descr);
+
+ pages_metadata = pages_on_disk * sizeof(struct rrdeng_page_descr);
+
+ /* This is an empirical estimation for Judy array indexing and extent structures */
+ index_metadata = pages_on_disk * 58;
+
+ rrddim_set_by_pointer(st_ram_usage, rd_cached, cached_pages);
+ rrddim_set_by_pointer(st_ram_usage, rd_pinned, pinned_pages);
+ rrddim_set_by_pointer(st_ram_usage, rd_cache_metadata, cache_metadata);
+ rrddim_set_by_pointer(st_ram_usage, rd_pages_metadata, pages_metadata);
+ rrddim_set_by_pointer(st_ram_usage, rd_index_metadata, index_metadata);
+ rrdset_done(st_ram_usage);
}
- else
- rrdset_next(st_long_term_pages);
-
- rrddim_set_by_pointer(st_long_term_pages, rd_total, (collected_number)stats_array[2]);
- rrddim_set_by_pointer(st_long_term_pages, rd_insertions, (collected_number)stats_array[5]);
- rrddim_set_by_pointer(st_long_term_pages, rd_deletions, (collected_number)stats_array[6]);
- rrddim_set_by_pointer(st_long_term_pages, rd_flushing_pressure_deletions,
- (collected_number)stats_array[36]);
- rrdset_done(st_long_term_pages);
}
+ }
+#endif
+}
+
+static void update_heartbeat_charts() {
+ static RRDSET *st_heartbeat = NULL;
+ static RRDDIM *rd_heartbeat_min = NULL;
+ static RRDDIM *rd_heartbeat_max = NULL;
+ static RRDDIM *rd_heartbeat_avg = NULL;
+
+ if (unlikely(!st_heartbeat)) {
+ st_heartbeat = rrdset_create_localhost(
+ "netdata"
+ , "heartbeat"
+ , NULL
+ , "heartbeat"
+ , NULL
+ , "System clock jitter"
+ , "microseconds"
+ , "netdata"
+ , "stats"
+ , 900000
+ , localhost->rrd_update_every
+ , RRDSET_TYPE_AREA);
+
+ rd_heartbeat_min = rrddim_add(st_heartbeat, "min", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+ rd_heartbeat_max = rrddim_add(st_heartbeat, "max", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+ rd_heartbeat_avg = rrddim_add(st_heartbeat, "average", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+ } else
+ rrdset_next(st_heartbeat);
+
+ usec_t min, max, average;
+ size_t count;
+
+ heartbeat_statistics(&min, &max, &average, &count);
+
+ rrddim_set_by_pointer(st_heartbeat, rd_heartbeat_min, (collected_number)min);
+ rrddim_set_by_pointer(st_heartbeat, rd_heartbeat_max, (collected_number)max);
+ rrddim_set_by_pointer(st_heartbeat, rd_heartbeat_avg, (collected_number)average);
+
+ rrdset_done(st_heartbeat);
+}
+
+// ---------------------------------------------------------------------------------------------------------------------
+// worker utilization
+
+#define WORKERS_MIN_PERCENT_DEFAULT 10000.0
+
+struct worker_job_type {
+ char name[WORKER_UTILIZATION_MAX_JOB_NAME_LENGTH + 1];
+ size_t jobs_started;
+ usec_t busy_time;
+
+ RRDDIM *rd_jobs_started;
+ RRDDIM *rd_busy_time;
+};
+
+struct worker_thread {
+ pid_t pid;
+ int enabled;
+
+ int cpu_enabled;
+ double cpu;
+
+ kernel_uint_t utime;
+ kernel_uint_t stime;
+
+ kernel_uint_t utime_old;
+ kernel_uint_t stime_old;
+
+ usec_t collected_time;
+ usec_t collected_time_old;
+
+ size_t jobs_started;
+ usec_t busy_time;
+
+ struct worker_thread *next;
+};
+
+struct worker_utilization {
+ const char *name;
+ const char *family;
+ size_t priority;
+ uint32_t flags;
+
+ char *name_lowercase;
+
+ struct worker_job_type per_job_type[WORKER_UTILIZATION_MAX_JOB_TYPES];
+
+ size_t workers_registered;
+ size_t workers_busy;
+ usec_t workers_total_busy_time;
+ usec_t workers_total_duration;
+ size_t workers_total_jobs_started;
+ double workers_min_busy_time;
+ double workers_max_busy_time;
+
+ size_t workers_cpu_registered;
+ double workers_cpu_min;
+ double workers_cpu_max;
+ double workers_cpu_total;
+
+ struct worker_thread *threads;
+
+ RRDSET *st_workers_time;
+ RRDDIM *rd_workers_time_avg;
+ RRDDIM *rd_workers_time_min;
+ RRDDIM *rd_workers_time_max;
+
+ RRDSET *st_workers_cpu;
+ RRDDIM *rd_workers_cpu_avg;
+ RRDDIM *rd_workers_cpu_min;
+ RRDDIM *rd_workers_cpu_max;
+
+ RRDSET *st_workers_threads;
+ RRDDIM *rd_workers_threads_free;
+ RRDDIM *rd_workers_threads_busy;
+
+ RRDSET *st_workers_jobs_per_job_type;
+ RRDSET *st_workers_busy_per_job_type;
+
+ RRDDIM *rd_total_cpu_utilizaton;
+};
+
+static struct worker_utilization all_workers_utilization[] = {
+ { .name = "STATS", .family = "workers global statistics", .priority = 1000000 },
+ { .name = "HEALTH", .family = "workers health alarms", .priority = 1000000 },
+ { .name = "MLTRAIN", .family = "workers ML training", .priority = 1000000 },
+ { .name = "MLDETECT", .family = "workers ML detection", .priority = 1000000 },
+ { .name = "STREAMRCV", .family = "workers streaming receive", .priority = 1000000 },
+ { .name = "STREAMSND", .family = "workers streaming send", .priority = 1000000 },
+ { .name = "DBENGINE", .family = "workers dbengine instances", .priority = 1000000 },
+ { .name = "WEB", .family = "workers web server", .priority = 1000000 },
+ { .name = "ACLKQUERY", .family = "workers aclk query", .priority = 1000000 },
+ { .name = "ACLKSYNC", .family = "workers aclk host sync", .priority = 1000000 },
+ { .name = "PLUGINSD", .family = "workers plugins.d", .priority = 1000000 },
+ { .name = "STATSD", .family = "workers plugin statsd", .priority = 1000000 },
+ { .name = "STATSDFLUSH", .family = "workers plugin statsd flush", .priority = 1000000 },
+ { .name = "PROC", .family = "workers plugin proc", .priority = 1000000 },
+ { .name = "NETDEV", .family = "workers plugin proc netdev", .priority = 1000000 },
+ { .name = "FREEBSD", .family = "workers plugin freebsd", .priority = 1000000 },
+ { .name = "MACOS", .family = "workers plugin macos", .priority = 1000000 },
+ { .name = "CGROUPS", .family = "workers plugin cgroups", .priority = 1000000 },
+ { .name = "CGROUPSDISC", .family = "workers plugin cgroups find", .priority = 1000000 },
+ { .name = "DISKSPACE", .family = "workers plugin diskspace", .priority = 1000000 },
+ { .name = "TC", .family = "workers plugin tc", .priority = 1000000 },
+ { .name = "TIMEX", .family = "workers plugin timex", .priority = 1000000 },
+ { .name = "IDLEJITTER", .family = "workers plugin idlejitter", .priority = 1000000 },
+
+ // has to be terminated with a NULL
+ { .name = NULL, .family = NULL }
+};
+
+static void workers_total_cpu_utilization_chart(void) {
+ size_t i, cpu_enabled = 0;
+ for(i = 0; all_workers_utilization[i].name ;i++)
+ if(all_workers_utilization[i].workers_cpu_registered) cpu_enabled++;
+
+ if(!cpu_enabled) return;
+
+ static RRDSET *st = NULL;
+
+ if(!st) {
+ st = rrdset_create_localhost(
+ "netdata",
+ "workers_cpu",
+ NULL,
+ "workers",
+ "netdata.workers.cpu_total",
+ "Netdata Workers CPU Utilization (100% = 1 core)",
+ "%",
+ "netdata",
+ "stats",
+ 999000,
+ localhost->rrd_update_every,
+ RRDSET_TYPE_STACKED);
+ }
+
+ rrdset_next(st);
+
+ for(i = 0; all_workers_utilization[i].name ;i++) {
+ struct worker_utilization *wu = &all_workers_utilization[i];
+ if(!wu->workers_cpu_registered) continue;
+
+ if(!wu->rd_total_cpu_utilizaton)
+ wu->rd_total_cpu_utilizaton = rrddim_add(st, wu->name_lowercase, NULL, 1, 10000ULL, RRD_ALGORITHM_ABSOLUTE);
+
+ rrddim_set_by_pointer(st, wu->rd_total_cpu_utilizaton, (collected_number)((double)wu->workers_cpu_total * 10000.0));
+ }
+
+ rrdset_done(st);
+}
+
+static void workers_utilization_update_chart(struct worker_utilization *wu) {
+ if(!wu->workers_registered) return;
+
+ //fprintf(stderr, "%-12s WORKER UTILIZATION: %-3.2f%%, %zu jobs done, %zu running, on %zu workers, min %-3.02f%%, max %-3.02f%%.\n",
+ // wu->name,
+ // (double)wu->workers_total_busy_time * 100.0 / (double)wu->workers_total_duration,
+ // wu->workers_total_jobs_started, wu->workers_busy, wu->workers_registered,
+ // wu->workers_min_busy_time, wu->workers_max_busy_time);
+
+ // ----------------------------------------------------------------------
+
+ if(unlikely(!wu->st_workers_time)) {
+ char name[RRD_ID_LENGTH_MAX + 1];
+ snprintfz(name, RRD_ID_LENGTH_MAX, "workers_time_%s", wu->name_lowercase);
+
+ char context[RRD_ID_LENGTH_MAX + 1];
+ snprintf(context, RRD_ID_LENGTH_MAX, "netdata.workers.%s.time", wu->name_lowercase);
+
+ wu->st_workers_time = rrdset_create_localhost(
+ "netdata"
+ , name
+ , NULL
+ , wu->family
+ , context
+ , "Netdata Workers Busy Time (100% = all workers busy)"
+ , "%"
+ , "netdata"
+ , "stats"
+ , wu->priority
+ , localhost->rrd_update_every
+ , RRDSET_TYPE_AREA
+ );
+ }
+
+ // we add the min and max dimensions only when we have multiple workers
+
+ if(unlikely(!wu->rd_workers_time_min && wu->workers_registered > 1))
+ wu->rd_workers_time_min = rrddim_add(wu->st_workers_time, "min", NULL, 1, 10000, RRD_ALGORITHM_ABSOLUTE);
+
+ if(unlikely(!wu->rd_workers_time_max && wu->workers_registered > 1))
+ wu->rd_workers_time_max = rrddim_add(wu->st_workers_time, "max", NULL, 1, 10000, RRD_ALGORITHM_ABSOLUTE);
+
+ if(unlikely(!wu->rd_workers_time_avg))
+ wu->rd_workers_time_avg = rrddim_add(wu->st_workers_time, "average", NULL, 1, 10000, RRD_ALGORITHM_ABSOLUTE);
+
+ rrdset_next(wu->st_workers_time);
+
+ if(unlikely(wu->workers_min_busy_time == WORKERS_MIN_PERCENT_DEFAULT)) wu->workers_min_busy_time = 0.0;
+
+ if(wu->rd_workers_time_min)
+ rrddim_set_by_pointer(wu->st_workers_time, wu->rd_workers_time_min, (collected_number)((double)wu->workers_min_busy_time * 10000.0));
+
+ if(wu->rd_workers_time_max)
+ rrddim_set_by_pointer(wu->st_workers_time, wu->rd_workers_time_max, (collected_number)((double)wu->workers_max_busy_time * 10000.0));
- // ----------------------------------------------------------------
+ if(wu->workers_total_duration == 0)
+ rrddim_set_by_pointer(wu->st_workers_time, wu->rd_workers_time_avg, 0);
+ else
+ rrddim_set_by_pointer(wu->st_workers_time, wu->rd_workers_time_avg, (collected_number)((double)wu->workers_total_busy_time * 100.0 * 10000.0 / (double)wu->workers_total_duration));
- {
- static RRDSET *st_io_stats = NULL;
- static RRDDIM *rd_reads = NULL;
- static RRDDIM *rd_writes = NULL;
+ rrdset_done(wu->st_workers_time);
- if (unlikely(!st_io_stats)) {
- st_io_stats = rrdset_create_localhost(
+ // ----------------------------------------------------------------------
+
+#ifdef __linux__
+ if(wu->workers_cpu_registered || wu->st_workers_cpu) {
+ if(unlikely(!wu->st_workers_cpu)) {
+ char name[RRD_ID_LENGTH_MAX + 1];
+ snprintfz(name, RRD_ID_LENGTH_MAX, "workers_cpu_%s", wu->name_lowercase);
+
+ char context[RRD_ID_LENGTH_MAX + 1];
+ snprintf(context, RRD_ID_LENGTH_MAX, "netdata.workers.%s.cpu", wu->name_lowercase);
+
+ wu->st_workers_cpu = rrdset_create_localhost(
"netdata"
- , "dbengine_io_throughput"
- , NULL
- , "dbengine"
+ , name
, NULL
- , "Netdata DB engine I/O throughput"
- , "MiB/s"
+ , wu->family
+ , context
+ , "Netdata Workers CPU Utilization (100% = all workers busy)"
+ , "%"
, "netdata"
, "stats"
- , 130506
+ , wu->priority + 1
, localhost->rrd_update_every
- , RRDSET_TYPE_LINE
- );
+ , RRDSET_TYPE_AREA
+ );
+ }
- rd_reads = rrddim_add(st_io_stats, "reads", NULL, 1, 1024 * 1024, RRD_ALGORITHM_INCREMENTAL);
- rd_writes = rrddim_add(st_io_stats, "writes", NULL, -1, 1024 * 1024, RRD_ALGORITHM_INCREMENTAL);
- }
- else
- rrdset_next(st_io_stats);
+ if (unlikely(!wu->rd_workers_cpu_min && wu->workers_registered > 1))
+ wu->rd_workers_cpu_min = rrddim_add(wu->st_workers_cpu, "min", NULL, 1, 10000ULL, RRD_ALGORITHM_ABSOLUTE);
- rrddim_set_by_pointer(st_io_stats, rd_reads, (collected_number)stats_array[17]);
- rrddim_set_by_pointer(st_io_stats, rd_writes, (collected_number)stats_array[15]);
- rrdset_done(st_io_stats);
- }
+ if (unlikely(!wu->rd_workers_cpu_max && wu->workers_registered > 1))
+ wu->rd_workers_cpu_max = rrddim_add(wu->st_workers_cpu, "max", NULL, 1, 10000ULL, RRD_ALGORITHM_ABSOLUTE);
- // ----------------------------------------------------------------
-
- {
- static RRDSET *st_io_stats = NULL;
- static RRDDIM *rd_reads = NULL;
- static RRDDIM *rd_writes = NULL;
-
- if (unlikely(!st_io_stats)) {
- st_io_stats = rrdset_create_localhost(
- "netdata"
- , "dbengine_io_operations"
- , NULL
- , "dbengine"
- , NULL
- , "Netdata DB engine I/O operations"
- , "operations/s"
- , "netdata"
- , "stats"
- , 130507
- , localhost->rrd_update_every
- , RRDSET_TYPE_LINE
- );
-
- rd_reads = rrddim_add(st_io_stats, "reads", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
- rd_writes = rrddim_add(st_io_stats, "writes", NULL, -1, 1, RRD_ALGORITHM_INCREMENTAL);
- }
- else
- rrdset_next(st_io_stats);
+ if(unlikely(!wu->rd_workers_cpu_avg))
+ wu->rd_workers_cpu_avg = rrddim_add(wu->st_workers_cpu, "average", NULL, 1, 10000ULL, RRD_ALGORITHM_ABSOLUTE);
- rrddim_set_by_pointer(st_io_stats, rd_reads, (collected_number)stats_array[18]);
- rrddim_set_by_pointer(st_io_stats, rd_writes, (collected_number)stats_array[16]);
- rrdset_done(st_io_stats);
- }
+ rrdset_next(wu->st_workers_cpu);
- // ----------------------------------------------------------------
-
- {
- static RRDSET *st_errors = NULL;
- static RRDDIM *rd_fs_errors = NULL;
- static RRDDIM *rd_io_errors = NULL;
- static RRDDIM *pg_cache_over_half_dirty_events = NULL;
-
- if (unlikely(!st_errors)) {
- st_errors = rrdset_create_localhost(
- "netdata"
- , "dbengine_global_errors"
- , NULL
- , "dbengine"
- , NULL
- , "Netdata DB engine errors"
- , "errors/s"
- , "netdata"
- , "stats"
- , 130508
- , localhost->rrd_update_every
- , RRDSET_TYPE_LINE
- );
-
- rd_io_errors = rrddim_add(st_errors, "io_errors", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
- rd_fs_errors = rrddim_add(st_errors, "fs_errors", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
- pg_cache_over_half_dirty_events = rrddim_add(st_errors, "pg_cache_over_half_dirty_events", NULL, 1, 1,
- RRD_ALGORITHM_INCREMENTAL);
- }
- else
- rrdset_next(st_errors);
+ if(unlikely(wu->workers_cpu_min == WORKERS_MIN_PERCENT_DEFAULT)) wu->workers_cpu_min = 0.0;
- rrddim_set_by_pointer(st_errors, rd_io_errors, (collected_number)stats_array[30]);
- rrddim_set_by_pointer(st_errors, rd_fs_errors, (collected_number)stats_array[31]);
- rrddim_set_by_pointer(st_errors, pg_cache_over_half_dirty_events, (collected_number)stats_array[34]);
- rrdset_done(st_errors);
- }
+ if(wu->rd_workers_cpu_min)
+ rrddim_set_by_pointer(wu->st_workers_cpu, wu->rd_workers_cpu_min, (collected_number)(wu->workers_cpu_min * 10000ULL));
+
+ if(wu->rd_workers_cpu_max)
+ rrddim_set_by_pointer(wu->st_workers_cpu, wu->rd_workers_cpu_max, (collected_number)(wu->workers_cpu_max * 10000ULL));
- // ----------------------------------------------------------------
-
- {
- static RRDSET *st_fd = NULL;
- static RRDDIM *rd_fd_current = NULL;
- static RRDDIM *rd_fd_max = NULL;
-
- if (unlikely(!st_fd)) {
- st_fd = rrdset_create_localhost(
- "netdata"
- , "dbengine_global_file_descriptors"
- , NULL
- , "dbengine"
- , NULL
- , "Netdata DB engine File Descriptors"
- , "descriptors"
- , "netdata"
- , "stats"
- , 130509
- , localhost->rrd_update_every
- , RRDSET_TYPE_LINE
- );
-
- rd_fd_current = rrddim_add(st_fd, "current", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
- rd_fd_max = rrddim_add(st_fd, "max", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+ if(wu->workers_cpu_registered == 0)
+ rrddim_set_by_pointer(wu->st_workers_cpu, wu->rd_workers_cpu_avg, 0);
+ else
+ rrddim_set_by_pointer(wu->st_workers_cpu, wu->rd_workers_cpu_avg, (collected_number)( wu->workers_cpu_total * 10000ULL / (calculated_number)wu->workers_cpu_registered ));
+
+ rrdset_done(wu->st_workers_cpu);
+ }
+#endif
+
+ // ----------------------------------------------------------------------
+
+ if(unlikely(!wu->st_workers_jobs_per_job_type)) {
+ char name[RRD_ID_LENGTH_MAX + 1];
+ snprintfz(name, RRD_ID_LENGTH_MAX, "workers_jobs_by_type_%s", wu->name_lowercase);
+
+ char context[RRD_ID_LENGTH_MAX + 1];
+ snprintf(context, RRD_ID_LENGTH_MAX, "netdata.workers.%s.jobs_started_by_type", wu->name_lowercase);
+
+ wu->st_workers_jobs_per_job_type = rrdset_create_localhost(
+ "netdata"
+ , name
+ , NULL
+ , wu->family
+ , context
+ , "Netdata Workers Jobs Started by Type"
+ , "jobs"
+ , "netdata"
+ , "stats"
+ , wu->priority + 2
+ , localhost->rrd_update_every
+ , RRDSET_TYPE_STACKED
+ );
+ }
+
+ rrdset_next(wu->st_workers_jobs_per_job_type);
+
+ {
+ size_t i;
+ for(i = 0; i < WORKER_UTILIZATION_MAX_JOB_TYPES ;i++) {
+ if (wu->per_job_type[i].name[0]) {
+
+ if(unlikely(!wu->per_job_type[i].rd_jobs_started))
+ wu->per_job_type[i].rd_jobs_started = rrddim_add(wu->st_workers_jobs_per_job_type, wu->per_job_type[i].name, NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+
+ rrddim_set_by_pointer(wu->st_workers_jobs_per_job_type, wu->per_job_type[i].rd_jobs_started, (collected_number)(wu->per_job_type[i].jobs_started));
}
- else
- rrdset_next(st_fd);
+ }
+ }
+
+ rrdset_done(wu->st_workers_jobs_per_job_type);
+
+ // ----------------------------------------------------------------------
+
+ if(unlikely(!wu->st_workers_busy_per_job_type)) {
+ char name[RRD_ID_LENGTH_MAX + 1];
+ snprintfz(name, RRD_ID_LENGTH_MAX, "workers_busy_time_by_type_%s", wu->name_lowercase);
+
+ char context[RRD_ID_LENGTH_MAX + 1];
+ snprintf(context, RRD_ID_LENGTH_MAX, "netdata.workers.%s.time_by_type", wu->name_lowercase);
+
+ wu->st_workers_busy_per_job_type = rrdset_create_localhost(
+ "netdata"
+ , name
+ , NULL
+ , wu->family
+ , context
+ , "Netdata Workers Busy Time by Type"
+ , "ms"
+ , "netdata"
+ , "stats"
+ , wu->priority + 3
+ , localhost->rrd_update_every
+ , RRDSET_TYPE_STACKED
+ );
+ }
+
+ rrdset_next(wu->st_workers_busy_per_job_type);
+
+ {
+ size_t i;
+ for(i = 0; i < WORKER_UTILIZATION_MAX_JOB_TYPES ;i++) {
+ if (wu->per_job_type[i].name[0]) {
- rrddim_set_by_pointer(st_fd, rd_fd_current, (collected_number)stats_array[32]);
- /* Careful here, modify this accordingly if the File-Descriptor budget ever changes */
- rrddim_set_by_pointer(st_fd, rd_fd_max, (collected_number)rlimit_nofile.rlim_cur / 4);
- rrdset_done(st_fd);
+ if(unlikely(!wu->per_job_type[i].rd_busy_time))
+ wu->per_job_type[i].rd_busy_time = rrddim_add(wu->st_workers_busy_per_job_type, wu->per_job_type[i].name, NULL, 1, USEC_PER_MS, RRD_ALGORITHM_ABSOLUTE);
+
+ rrddim_set_by_pointer(wu->st_workers_busy_per_job_type, wu->per_job_type[i].rd_busy_time, (collected_number)(wu->per_job_type[i].busy_time));
+ }
}
+ }
- // ----------------------------------------------------------------
+ rrdset_done(wu->st_workers_busy_per_job_type);
- {
- static RRDSET *st_ram_usage = NULL;
- static RRDDIM *rd_cached = NULL;
- static RRDDIM *rd_pinned = NULL;
- static RRDDIM *rd_metadata = NULL;
+ // ----------------------------------------------------------------------
- collected_number cached_pages, pinned_pages, API_producers, populated_pages, metadata, pages_on_disk,
- page_cache_descriptors;
+ if(wu->st_workers_threads || wu->workers_registered > 1) {
+ if(unlikely(!wu->st_workers_threads)) {
+ char name[RRD_ID_LENGTH_MAX + 1];
+ snprintfz(name, RRD_ID_LENGTH_MAX, "workers_threads_%s", wu->name_lowercase);
- if (unlikely(!st_ram_usage)) {
- st_ram_usage = rrdset_create_localhost(
+ char context[RRD_ID_LENGTH_MAX + 1];
+ snprintf(context, RRD_ID_LENGTH_MAX, "netdata.workers.%s.threads", wu->name_lowercase);
+
+ wu->st_workers_threads = rrdset_create_localhost(
"netdata"
- , "dbengine_ram"
- , NULL
- , "dbengine"
+ , name
, NULL
- , "Netdata DB engine RAM usage"
- , "MiB"
+ , wu->family
+ , context
+ , "Netdata Workers Threads"
+ , "threads"
, "netdata"
, "stats"
- , 130510
+ , wu->priority + 4
, localhost->rrd_update_every
, RRDSET_TYPE_STACKED
- );
+ );
- rd_cached = rrddim_add(st_ram_usage, "cache", NULL, 1, 256, RRD_ALGORITHM_ABSOLUTE);
- rd_pinned = rrddim_add(st_ram_usage, "collectors", NULL, 1, 256, RRD_ALGORITHM_ABSOLUTE);
- rd_metadata = rrddim_add(st_ram_usage, "metadata", NULL, 1, 1048576, RRD_ALGORITHM_ABSOLUTE);
- }
- else
- rrdset_next(st_ram_usage);
-
- API_producers = (collected_number)stats_array[0];
- pages_on_disk = (collected_number)stats_array[2];
- populated_pages = (collected_number)stats_array[3];
- page_cache_descriptors = (collected_number)stats_array[27];
-
- if (API_producers * 2 > populated_pages) {
- pinned_pages = API_producers;
- } else{
- pinned_pages = API_producers * 2;
- }
- cached_pages = populated_pages - pinned_pages;
+ wu->rd_workers_threads_free = rrddim_add(wu->st_workers_threads, "free", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+ wu->rd_workers_threads_busy = rrddim_add(wu->st_workers_threads, "busy", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+ }
+ else
+ rrdset_next(wu->st_workers_threads);
- metadata = page_cache_descriptors * sizeof(struct page_cache_descr);
- metadata += pages_on_disk * sizeof(struct rrdeng_page_descr);
- /* This is an empirical estimation for Judy array indexing and extent structures */
- metadata += pages_on_disk * 58;
+ rrddim_set_by_pointer(wu->st_workers_threads, wu->rd_workers_threads_free, (collected_number)(wu->workers_registered - wu->workers_busy));
+ rrddim_set_by_pointer(wu->st_workers_threads, wu->rd_workers_threads_busy, (collected_number)(wu->workers_busy));
+ rrdset_done(wu->st_workers_threads);
+ }
+}
- rrddim_set_by_pointer(st_ram_usage, rd_cached, cached_pages);
- rrddim_set_by_pointer(st_ram_usage, rd_pinned, pinned_pages);
- rrddim_set_by_pointer(st_ram_usage, rd_metadata, metadata);
- rrdset_done(st_ram_usage);
+static void workers_utilization_reset_statistics(struct worker_utilization *wu) {
+ wu->workers_registered = 0;
+ wu->workers_busy = 0;
+ wu->workers_total_busy_time = 0;
+ wu->workers_total_duration = 0;
+ wu->workers_total_jobs_started = 0;
+ wu->workers_min_busy_time = WORKERS_MIN_PERCENT_DEFAULT;
+ wu->workers_max_busy_time = 0;
+
+ wu->workers_cpu_registered = 0;
+ wu->workers_cpu_min = WORKERS_MIN_PERCENT_DEFAULT;
+ wu->workers_cpu_max = 0;
+ wu->workers_cpu_total = 0;
+
+ size_t i;
+ for(i = 0; i < WORKER_UTILIZATION_MAX_JOB_TYPES ;i++) {
+ if(unlikely(!wu->name_lowercase)) {
+ wu->name_lowercase = strdupz(wu->name);
+ char *s = wu->name_lowercase;
+ for( ; *s ; s++) *s = tolower(*s);
}
+
+ wu->per_job_type[i].jobs_started = 0;
+ wu->per_job_type[i].busy_time = 0;
}
+
+ struct worker_thread *wt;
+ for(wt = wu->threads; wt ; wt = wt->next) {
+ wt->enabled = 0;
+ wt->cpu_enabled = 0;
+ }
+}
+
+static int read_thread_cpu_time_from_proc_stat(pid_t pid __maybe_unused, kernel_uint_t *utime __maybe_unused, kernel_uint_t *stime __maybe_unused) {
+#ifdef __linux__
+ char filename[200 + 1];
+ snprintfz(filename, 200, "/proc/self/task/%d/stat", pid);
+
+ procfile *ff = procfile_open(filename, " ", PROCFILE_FLAG_NO_ERROR_ON_FILE_IO);
+ if(!ff) return -1;
+
+ ff = procfile_readall(ff);
+ if(!ff) return -1;
+
+ *utime = str2kernel_uint_t(procfile_lineword(ff, 0, 13));
+ *stime = str2kernel_uint_t(procfile_lineword(ff, 0, 14));
+
+ procfile_close(ff);
+ return 0;
+#else
+ // TODO: add here cpu time detection per thread, for FreeBSD and MacOS
+ *utime = 0;
+ *stime = 0;
+ return 1;
#endif
+}
+static void workers_threads_cleanup(struct worker_utilization *wu) {
+ struct worker_thread *t;
+
+ // free threads at the beginning of the linked list
+ while(wu->threads && !wu->threads->enabled) {
+ t = wu->threads;
+ wu->threads = t->next;
+ t->next = NULL;
+ freez(t);
+ }
+
+ // free threads in the middle of the linked list
+ for(t = wu->threads; t && t->next ; t = t->next) {
+ if(t->next->enabled) continue;
+
+ struct worker_thread *to_remove = t->next;
+ t->next = to_remove->next;
+ to_remove->next = NULL;
+ freez(to_remove);
+ }
+}
+
+static struct worker_thread *worker_thread_find(struct worker_utilization *wu, pid_t pid) {
+ struct worker_thread *wt;
+ for(wt = wu->threads; wt && wt->pid != pid ; wt = wt->next) ;
+ return wt;
+}
+
+static struct worker_thread *worker_thread_create(struct worker_utilization *wu, pid_t pid) {
+ struct worker_thread *wt;
+
+ wt = (struct worker_thread *)callocz(1, sizeof(struct worker_thread));
+ wt->pid = pid;
+
+ // link it
+ wt->next = wu->threads;
+ wu->threads = wt;
+
+ return wt;
+}
+
+static struct worker_thread *worker_thread_find_or_create(struct worker_utilization *wu, pid_t pid) {
+ struct worker_thread *wt;
+ wt = worker_thread_find(wu, pid);
+ if(!wt) wt = worker_thread_create(wu, pid);
+
+ return wt;
+}
+
+static void worker_utilization_charts_callback(void *ptr, pid_t pid __maybe_unused, const char *thread_tag __maybe_unused, size_t utilization_usec __maybe_unused, size_t duration_usec __maybe_unused, size_t jobs_started __maybe_unused, size_t is_running __maybe_unused, const char **job_types_names __maybe_unused, size_t *job_types_jobs_started __maybe_unused, usec_t *job_types_busy_time __maybe_unused) {
+ struct worker_utilization *wu = (struct worker_utilization *)ptr;
+
+ // find the worker_thread in the list
+ struct worker_thread *wt = worker_thread_find_or_create(wu, pid);
+
+ wt->enabled = 1;
+ wt->busy_time = utilization_usec;
+ wt->jobs_started = jobs_started;
+
+ wt->utime_old = wt->utime;
+ wt->stime_old = wt->stime;
+ wt->collected_time_old = wt->collected_time;
+
+ wu->workers_total_busy_time += utilization_usec;
+ wu->workers_total_duration += duration_usec;
+ wu->workers_total_jobs_started += jobs_started;
+ wu->workers_busy += is_running;
+ wu->workers_registered++;
+
+ double util = (double)utilization_usec * 100.0 / (double)duration_usec;
+ if(util > wu->workers_max_busy_time)
+ wu->workers_max_busy_time = util;
+
+ if(util < wu->workers_min_busy_time)
+ wu->workers_min_busy_time = util;
+
+ // accumulate per job type statistics
+ size_t i;
+ for(i = 0; i < WORKER_UTILIZATION_MAX_JOB_TYPES ;i++) {
+ wu->per_job_type[i].jobs_started += job_types_jobs_started[i];
+ wu->per_job_type[i].busy_time += job_types_busy_time[i];
+
+ // new job type found
+ if(unlikely(!wu->per_job_type[i].name[0] && job_types_names[i]))
+ strncpyz(wu->per_job_type[i].name, job_types_names[i], WORKER_UTILIZATION_MAX_JOB_NAME_LENGTH);
+ }
+
+ // find its CPU utilization
+ if((!read_thread_cpu_time_from_proc_stat(pid, &wt->utime, &wt->stime))) {
+ wt->collected_time = now_realtime_usec();
+ usec_t delta = wt->collected_time - wt->collected_time_old;
+
+ double utime = (double)(wt->utime - wt->utime_old) / (double)system_hz * 100.0 * (double)USEC_PER_SEC / (double)delta;
+ double stime = (double)(wt->stime - wt->stime_old) / (double)system_hz * 100.0 * (double)USEC_PER_SEC / (double)delta;
+ double cpu = utime + stime;
+ wt->cpu = cpu;
+ wt->cpu_enabled = 1;
+
+ wu->workers_cpu_total += cpu;
+ if(cpu < wu->workers_cpu_min) wu->workers_cpu_min = cpu;
+ if(cpu > wu->workers_cpu_max) wu->workers_cpu_max = cpu;
+ }
+ wu->workers_cpu_registered += wt->cpu_enabled;
+}
+
+static void worker_utilization_charts(void) {
+ static size_t iterations = 0;
+ iterations++;
+
+ int i;
+ for(i = 0; all_workers_utilization[i].name ;i++) {
+ workers_utilization_reset_statistics(&all_workers_utilization[i]);
+ workers_foreach(all_workers_utilization[i].name, worker_utilization_charts_callback, &all_workers_utilization[i]);
+
+ // skip the first iteration, so that we don't accumulate startup utilization to our charts
+ if(likely(iterations > 1))
+ workers_utilization_update_chart(&all_workers_utilization[i]);
+
+ workers_threads_cleanup(&all_workers_utilization[i]);
+ }
+
+ workers_total_cpu_utilization_chart();
}
+static void worker_utilization_finish(void) {
+ int i;
+ for(i = 0; all_workers_utilization[i].name ;i++) {
+ struct worker_utilization *wu = &all_workers_utilization[i];
+
+ if(wu->name_lowercase) {
+ freez(wu->name_lowercase);
+ wu->name_lowercase = NULL;
+ }
+
+ // mark all threads as not enabled
+ struct worker_thread *t;
+ for(t = wu->threads; t ; t = t->next) t->enabled = 0;
+
+ // let the cleanup job free them
+ workers_threads_cleanup(wu);
+ }
+}
+
+// ---------------------------------------------------------------------------------------------------------------------
+
static void global_statistics_cleanup(void *ptr)
{
+ worker_unregister();
+
struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
static_thread->enabled = NETDATA_MAIN_THREAD_EXITING;
info("cleaning up...");
+ worker_utilization_finish();
+
static_thread->enabled = NETDATA_MAIN_THREAD_EXITED;
}
void *global_statistics_main(void *ptr)
{
+ worker_register("STATS");
+ worker_register_job_name(WORKER_JOB_GLOBAL, "global");
+ worker_register_job_name(WORKER_JOB_REGISTRY, "registry");
+ worker_register_job_name(WORKER_JOB_WORKERS, "workers");
+ worker_register_job_name(WORKER_JOB_DBENGINE, "dbengine");
+
netdata_thread_cleanup_push(global_statistics_cleanup, ptr);
int update_every =
- (int)config_get_number("CONFIG_SECTION_GLOBAL_STATISTICS", "update every", localhost->rrd_update_every);
+ (int)config_get_number(CONFIG_SECTION_GLOBAL_STATISTICS, "update every", localhost->rrd_update_every);
if (update_every < localhost->rrd_update_every)
update_every = localhost->rrd_update_every;
usec_t step = update_every * USEC_PER_SEC;
heartbeat_t hb;
heartbeat_init(&hb);
+
+ // keep the randomness at zero
+ // to make sure we are not close to any other thread
+ hb.randomness = 0;
+
while (!netdata_exit) {
+ worker_is_idle();
heartbeat_next(&hb, step);
+ worker_is_busy(WORKER_JOB_WORKERS);
+ worker_utilization_charts();
+
+ worker_is_busy(WORKER_JOB_GLOBAL);
global_statistics_charts();
+
+ worker_is_busy(WORKER_JOB_REGISTRY);
registry_statistics();
+
+ worker_is_busy(WORKER_JOB_DBENGINE);
+ dbengine_statistics_charts();
+
+ worker_is_busy(WORKER_JOB_HEARTBEAT);
+ update_heartbeat_charts();
}
netdata_thread_cleanup_pop(1);