summaryrefslogtreecommitdiffstats
path: root/libnetdata/worker_utilization
diff options
context:
space:
mode:
Diffstat (limited to 'libnetdata/worker_utilization')
-rw-r--r--libnetdata/worker_utilization/Makefile.am8
-rw-r--r--libnetdata/worker_utilization/README.md90
-rw-r--r--libnetdata/worker_utilization/worker_utilization.c210
-rw-r--r--libnetdata/worker_utilization/worker_utilization.h22
4 files changed, 330 insertions, 0 deletions
diff --git a/libnetdata/worker_utilization/Makefile.am b/libnetdata/worker_utilization/Makefile.am
new file mode 100644
index 00000000..161784b8
--- /dev/null
+++ b/libnetdata/worker_utilization/Makefile.am
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+AUTOMAKE_OPTIONS = subdir-objects
+MAINTAINERCLEANFILES = $(srcdir)/Makefile.in
+
+dist_noinst_DATA = \
+ README.md \
+ $(NULL)
diff --git a/libnetdata/worker_utilization/README.md b/libnetdata/worker_utilization/README.md
new file mode 100644
index 00000000..35f30b40
--- /dev/null
+++ b/libnetdata/worker_utilization/README.md
@@ -0,0 +1,90 @@
+<!--
+title: "Worker Utilization"
+custom_edit_url: https://github.com/netdata/netdata/edit/master/libnetdata/onewayallocator/README.md
+-->
+
+# Worker Utilization
+
+This library is to be used when there are 1 or more worker threads accepting requests
+of some kind and servicing them. The goal is to provide a very simple way to monitor
+worker threads utilization, as a percentage of the time they are busy and the amount
+of requests served.
+
+## Design goals
+
+1. Minimal, if any, impact on the performance of the workers
+2. Easy to be integrated into any kind of worker
+3. No state of any kind at the worker side
+
+## How to use
+
+When a working thread starts, call:
+
+```c
+void worker_register(const char *name);
+```
+
+This will create the necessary structures for the library to work.
+No need to keep a pointer to them. They are allocated as `__thread` variables.
+
+Then job types need to be defined. Job types are anything a worker does that can be
+counted and their execution time needs to be reported. The library is fast enough to
+be integrated even on workers that perform hundreds of thousands of actions per second.
+
+Job types are defined like this:
+
+```c
+void worker_register_job_type(size_t id, const char *name);
+```
+
+`id` is a number starting from zero. The library is compiled with a fixed size of 50
+ids (0 to 49). More can be allocated by setting `WORKER_UTILIZATION_MAX_JOB_TYPES` in
+`worker_utilization.h`. `name` can be any string up to 22 characters. This can be
+changed by setting `WORKER_UTILIZATION_MAX_JOB_NAME_LENGTH` in `worker_utilization.h`.
+
+Each thread that calls `worker_register(name)` will allocate about 3kB for maintaining
+the information required.
+
+When the thread stops, call:
+
+```c
+void worker_unregister(void);
+```
+
+Again, no parameters, or return values.
+
+> IMPORTANT: cancellable threads need to add a call to `worker_unregister()` to the
+> `pop` function that cleans up the thread. Failure to do so, will result in about
+> 3kB of memory leak for every thread that is stopped.
+
+When you are about to do some work in the working thread, call:
+
+```c
+void worker_is_busy(size_t id);
+```
+
+When you finish doing the job, call:
+
+```c
+void worker_is_idle(void);
+```
+
+Calls to `worker_is_busy(id)` can be made one after another (without calling
+`worker_is_idle()` between them) to switch jobs without losing any time between
+them and eliminating one of the 2 clock calls involved.
+
+## Implementation details
+
+Totally lockless, extremely fast, it should not introduce any kind of problems to the
+workers. Every time `worker_is_busy(id)` or `worker_is_idle()` are called, a call to
+`now_realtime_usec()` is done and a couple of variables are updated. That's it!
+
+The worker does not need to update the variables regularly. Based on the last status
+of the worker, the statistics collector of netdata will calculate if the thread is
+busy or idle all the time or part of the time. Works well for both thousands of jobs
+per second and unlimited working time (being totally busy with a single request for
+ages).
+
+The statistics collector is called by the global statistics thread of netdata. So,
+even if the workers are extremely busy with their jobs, netdata will be able to know
+how busy they are.
diff --git a/libnetdata/worker_utilization/worker_utilization.c b/libnetdata/worker_utilization/worker_utilization.c
new file mode 100644
index 00000000..bd3ad60e
--- /dev/null
+++ b/libnetdata/worker_utilization/worker_utilization.c
@@ -0,0 +1,210 @@
+#include "worker_utilization.h"
+
+#define WORKER_IDLE 'I'
+#define WORKER_BUSY 'B'
+
+struct worker_job_type {
+ char name[WORKER_UTILIZATION_MAX_JOB_NAME_LENGTH + 1];
+
+ // statistics controlled variables
+ size_t statistics_last_jobs_started;
+ usec_t statistics_last_busy_time;
+
+ // worker controlled variables
+ volatile size_t worker_jobs_started;
+ volatile usec_t worker_busy_time;
+};
+
+struct worker {
+ pid_t pid;
+ const char *tag;
+ const char *workname;
+ uint32_t workname_hash;
+
+ // statistics controlled variables
+ volatile usec_t statistics_last_checkpoint;
+ size_t statistics_last_jobs_started;
+ usec_t statistics_last_busy_time;
+
+ // the worker controlled variables
+ volatile size_t job_id;
+ volatile size_t jobs_started;
+ volatile usec_t busy_time;
+ volatile usec_t last_action_timestamp;
+ volatile char last_action;
+
+ struct worker_job_type per_job_type[WORKER_UTILIZATION_MAX_JOB_TYPES];
+
+ struct worker *next;
+};
+
+static netdata_mutex_t base_lock = NETDATA_MUTEX_INITIALIZER;
+static struct worker *base = NULL;
+static __thread struct worker *worker = NULL;
+
+void worker_register(const char *workname) {
+ if(unlikely(worker)) return;
+
+ worker = callocz(1, sizeof(struct worker));
+ worker->pid = gettid();
+ worker->tag = strdupz(netdata_thread_tag());
+ worker->workname = strdupz(workname);
+ worker->workname_hash = simple_hash(worker->workname);
+
+ usec_t now = now_realtime_usec();
+ worker->statistics_last_checkpoint = now;
+ worker->last_action_timestamp = now;
+ worker->last_action = WORKER_IDLE;
+
+ netdata_mutex_lock(&base_lock);
+ worker->next = base;
+ base = worker;
+ netdata_mutex_unlock(&base_lock);
+}
+
+void worker_register_job_name(size_t job_id, const char *name) {
+ if(unlikely(!worker)) return;
+
+ if(unlikely(job_id >= WORKER_UTILIZATION_MAX_JOB_TYPES)) {
+ error("WORKER_UTILIZATION: job_id %zu is too big. Max is %zu", job_id, (size_t)(WORKER_UTILIZATION_MAX_JOB_TYPES - 1));
+ return;
+ }
+ if (*worker->per_job_type[job_id].name) {
+ error("WORKER_UTILIZATION: duplicate job registration: worker '%s' job id %zu is '%s', ignoring '%s'", worker->workname, job_id, worker->per_job_type[job_id].name, name);
+ return;
+ }
+
+ strncpy(worker->per_job_type[job_id].name, name, WORKER_UTILIZATION_MAX_JOB_NAME_LENGTH);
+}
+
+void worker_unregister(void) {
+ if(unlikely(!worker)) return;
+
+ netdata_mutex_lock(&base_lock);
+ if(base == worker)
+ base = worker->next;
+ else {
+ struct worker *p;
+ for(p = base; p && p->next && p->next != worker ;p = p->next);
+ if(p && p->next == worker)
+ p->next = worker->next;
+ }
+ netdata_mutex_unlock(&base_lock);
+
+ freez((void *)worker->tag);
+ freez((void *)worker->workname);
+ freez(worker);
+
+ worker = NULL;
+}
+
+static inline void worker_is_idle_with_time(usec_t now) {
+ usec_t delta = now - worker->last_action_timestamp;
+ worker->busy_time += delta;
+ worker->per_job_type[worker->job_id].worker_busy_time += delta;
+
+ // the worker was busy
+ // set it to idle before we set the timestamp
+
+ worker->last_action = WORKER_IDLE;
+ if(likely(worker->last_action_timestamp < now))
+ worker->last_action_timestamp = now;
+}
+
+void worker_is_idle(void) {
+ if(unlikely(!worker)) return;
+ if(unlikely(worker->last_action != WORKER_BUSY)) return;
+
+ worker_is_idle_with_time(now_realtime_usec());
+}
+
+void worker_is_busy(size_t job_id) {
+ if(unlikely(!worker)) return;
+ if(unlikely(job_id >= WORKER_UTILIZATION_MAX_JOB_TYPES))
+ job_id = 0;
+
+ usec_t now = now_realtime_usec();
+
+ if(worker->last_action == WORKER_BUSY)
+ worker_is_idle_with_time(now);
+
+ // the worker was idle
+ // set the timestamp and then set it to busy
+
+ worker->job_id = job_id;
+ worker->per_job_type[job_id].worker_jobs_started++;
+ worker->jobs_started++;
+ worker->last_action_timestamp = now;
+ worker->last_action = WORKER_BUSY;
+}
+
+
+// statistics interface
+
+void workers_foreach(const char *workname, void (*callback)(void *data, pid_t pid, const char *thread_tag, size_t utilization_usec, size_t duration_usec, size_t jobs_started, size_t is_running, const char **job_types_names, size_t *job_types_jobs_started, usec_t *job_types_busy_time), void *data) {
+ netdata_mutex_lock(&base_lock);
+ uint32_t hash = simple_hash(workname);
+ usec_t busy_time, delta;
+ size_t i, jobs_started, jobs_running;
+
+ struct worker *p;
+ for(p = base; p ; p = p->next) {
+ if(hash != p->workname_hash || strcmp(workname, p->workname)) continue;
+
+ usec_t now = now_realtime_usec();
+
+ // find per job type statistics
+ const char *per_job_type_name[WORKER_UTILIZATION_MAX_JOB_TYPES];
+ size_t per_job_type_jobs_started[WORKER_UTILIZATION_MAX_JOB_TYPES];
+ usec_t per_job_type_busy_time[WORKER_UTILIZATION_MAX_JOB_TYPES];
+ for(i = 0; i < WORKER_UTILIZATION_MAX_JOB_TYPES ;i++) {
+ per_job_type_name[i] = p->per_job_type[i].name;
+
+ size_t tmp_jobs_started = p->per_job_type[i].worker_jobs_started;
+ per_job_type_jobs_started[i] = tmp_jobs_started - p->per_job_type[i].statistics_last_jobs_started;
+ p->per_job_type[i].statistics_last_jobs_started = tmp_jobs_started;
+
+ usec_t tmp_busy_time = p->per_job_type[i].worker_busy_time;
+ per_job_type_busy_time[i] = tmp_busy_time - p->per_job_type[i].statistics_last_busy_time;
+ p->per_job_type[i].statistics_last_busy_time = tmp_busy_time;
+ }
+
+ // get a copy of the worker variables
+ size_t worker_job_id = p->job_id;
+ usec_t worker_busy_time = p->busy_time;
+ size_t worker_jobs_started = p->jobs_started;
+ char worker_last_action = p->last_action;
+ usec_t worker_last_action_timestamp = p->last_action_timestamp;
+
+ delta = now - p->statistics_last_checkpoint;
+ p->statistics_last_checkpoint = now;
+
+ // this is the only variable both the worker thread and the statistics thread are writing
+ // we set this only when the worker is busy, so that the worker will not
+ // accumulate all the busy time, but only the time after the point we collected statistics
+ if(worker_last_action == WORKER_BUSY && p->last_action_timestamp == worker_last_action_timestamp && p->last_action == WORKER_BUSY)
+ p->last_action_timestamp = now;
+
+ // calculate delta busy time
+ busy_time = worker_busy_time - p->statistics_last_busy_time;
+ p->statistics_last_busy_time = worker_busy_time;
+
+ // calculate delta jobs done
+ jobs_started = worker_jobs_started - p->statistics_last_jobs_started;
+ p->statistics_last_jobs_started = worker_jobs_started;
+
+ jobs_running = 0;
+ if(worker_last_action == WORKER_BUSY) {
+ // the worker is still busy with something
+ // let's add that busy time to the reported one
+ usec_t dt = now - worker_last_action_timestamp;
+ busy_time += dt;
+ per_job_type_busy_time[worker_job_id] += dt;
+ jobs_running = 1;
+ }
+
+ callback(data, p->pid, p->tag, busy_time, delta, jobs_started, jobs_running, per_job_type_name, per_job_type_jobs_started, per_job_type_busy_time);
+ }
+
+ netdata_mutex_unlock(&base_lock);
+}
diff --git a/libnetdata/worker_utilization/worker_utilization.h b/libnetdata/worker_utilization/worker_utilization.h
new file mode 100644
index 00000000..8f16fe05
--- /dev/null
+++ b/libnetdata/worker_utilization/worker_utilization.h
@@ -0,0 +1,22 @@
+#ifndef WORKER_UTILIZATION_H
+#define WORKER_UTILIZATION_H 1
+
+#include "../libnetdata.h"
+
+// workers interfaces
+
+#define WORKER_UTILIZATION_MAX_JOB_TYPES 50
+#define WORKER_UTILIZATION_MAX_JOB_NAME_LENGTH 25
+
+extern void worker_register(const char *workname);
+extern void worker_register_job_name(size_t job_id, const char *name);
+extern void worker_unregister(void);
+
+extern void worker_is_idle(void);
+extern void worker_is_busy(size_t job_id);
+
+// statistics interface
+
+extern void workers_foreach(const char *workname, void (*callback)(void *data, pid_t pid, const char *thread_tag, size_t utilization_usec, size_t duration_usec, size_t jobs_started, size_t is_running, const char **job_types_names, size_t *job_types_jobs_started, usec_t *job_types_busy_time), void *data);
+
+#endif // WORKER_UTILIZATION_H