src/clock.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460

/*
 * General time-keeping code and variables
 *
 * Copyright 2000-2021 Willy Tarreau <w@1wt.eu>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 */

#include <sys/time.h>
#include <signal.h>
#include <time.h>

#ifdef USE_THREAD
#include <pthread.h>
#endif

#include <haproxy/api.h>
#include <haproxy/activity.h>
#include <haproxy/clock.h>
#include <haproxy/signal-t.h>
#include <haproxy/time.h>
#include <haproxy/tinfo-t.h>
#include <haproxy/tools.h>

/* Process-wide dates. global_now_ns/global_now_ms are shared by all threads
 * and are accessed with atomic operations in the functions below.
 */
struct timeval                   start_date;      /* the process's start date in wall-clock time */
struct timeval                   ready_date;      /* date when the process was considered ready */
ullong                           start_time_ns;   /* the process's start date in internal monotonic time (ns) */
volatile ullong                  global_now_ns;   /* common monotonic date between all threads, in ns (wraps every 585 yr) */
volatile uint                    global_now_ms;   /* common monotonic date in milliseconds (may wrap) */

THREAD_ALIGNED(64) static llong  now_offset;      /* signed offset in ns added to the system date to get the monotonic date; shared by all threads */

/* Per-thread view of the current date */
THREAD_LOCAL ullong              now_ns;          /* internal monotonic date derived from real clock, in ns (wraps every 585 yr) */
THREAD_LOCAL uint                now_ms;          /* internal monotonic date in milliseconds (may wrap) */
THREAD_LOCAL struct timeval      date;            /* the real current date (wall-clock time) */

/* Per-thread state used for idle measurement and ISO date formatting */
static THREAD_LOCAL struct timeval before_poll;   /* system date before calling poll() */
static THREAD_LOCAL struct timeval after_poll;    /* system date after leaving poll() */
static THREAD_LOCAL unsigned int samp_time;       /* total elapsed time over current sample, in microseconds */
static THREAD_LOCAL unsigned int idle_time;       /* total idle time over current sample, in microseconds */
static THREAD_LOCAL unsigned int iso_time_sec;     /* second for which iso_time_str was last formatted by this thread */
static THREAD_LOCAL char         iso_time_str[34]; /* ISO time representation of gettimeofday(): 32 chars + optional pad + zero */

/* CPU-time clock ID of each thread, indexed by thread ID. Entries are filled
 * by clock_set_local_source() and read by now_cpu_time_thread() and
 * clock_setup_signal_timer().
 */
#if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0) && defined(_POSIX_THREAD_CPUTIME)
static clockid_t per_thread_clock_id[MAX_THREADS];
#endif

/* Returns the system's monotonic time in nanoseconds if supported, otherwise
 * zero. Zero is also returned when the clock cannot be read at run time, so
 * that an unfilled timespec is never turned into a bogus date.
 */
uint64_t now_mono_time(void)
{
	uint64_t ret = 0;
#if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0) && defined(_POSIX_MONOTONIC_CLOCK)
	struct timespec ts;

	/* _POSIX_MONOTONIC_CLOCK may be defined to 0, in which case support
	 * is only known at run time: trust <ts> only if the call succeeded.
	 */
	if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0)
		ret = ts.tv_sec * 1000000000ULL + ts.tv_nsec;
#endif
	return ret;
}

/* Returns the system's monotonic time in nanoseconds.
 * Uses the coarse clock source if supported (for fast but
 * less precise queries with limited resource usage).
 * Falls back to now_mono_time() if the coarse source is not supported,
 * either at build time (CLOCK_MONOTONIC_COARSE not defined) or at run
 * time (clock_gettime() failed). now_mono_time() may itself return 0 if
 * not supported either.
 */
uint64_t now_mono_time_fast(void)
{
#if defined(CLOCK_MONOTONIC_COARSE)
	struct timespec ts;

	/* only use <ts> when it was really filled by the system */
	if (clock_gettime(CLOCK_MONOTONIC_COARSE, &ts) == 0)
		return (ts.tv_sec * 1000000000ULL + ts.tv_nsec);
#endif
	/* fallback to regular mono time,
	 * returns 0 if not supported
	 */
	return now_mono_time();
}

/* Returns the current thread's cumulated CPU time in nanoseconds if
 * supported, otherwise zero. Zero is also returned when the thread's CPU
 * clock cannot be read at run time.
 */
uint64_t now_cpu_time(void)
{
	uint64_t ret = 0;
#if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0) && defined(_POSIX_THREAD_CPUTIME)
	struct timespec ts;

	/* <ts> is left untouched on failure, so do not convert it then */
	if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) == 0)
		ret = ts.tv_sec * 1000000000ULL + ts.tv_nsec;
#endif
	return ret;
}

/* Returns the current thread's cumulated CPU time in nanoseconds.
 *
 * thread_local timer is cached so that call is less precise but also less
 * expensive if heavily used.
 * We use the mono time as a cache expiration hint since now_cpu_time() is
 * known to be much more expensive than now_mono_time_fast() on systems
 * supporting the COARSE clock source.
 *
 * Returns 0 if either now_mono_time_fast() or now_cpu_time() are not
 * supported.
 */
uint64_t now_cpu_time_fast(void)
{
	static THREAD_LOCAL uint64_t mono_cache = 0;
	static THREAD_LOCAL uint64_t cpu_cache = 0;
	uint64_t mono_cur;

	mono_cur = now_mono_time_fast();
	if (unlikely(mono_cur !=  mono_cache)) {
		/* global mono clock was updated: local cache is outdated */
		cpu_cache = now_cpu_time();
		mono_cache = mono_cur;
	}
	return cpu_cache;
}

/* Returns another thread's cumulated CPU time in nanoseconds if supported,
 * otherwise zero. <thr> is used as-is to index per_thread_clock_id[], so it
 * must designate a thread whose entry was set by clock_set_local_source().
 * Zero is also returned when that thread's clock cannot be read, instead of
 * converting an unfilled timespec.
 */
uint64_t now_cpu_time_thread(int thr)
{
	uint64_t ret = 0;
#if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0) && defined(_POSIX_THREAD_CPUTIME)
	struct timespec ts;

	/* the stored clock ID may not be readable: only trust <ts> on success */
	if (clock_gettime(per_thread_clock_id[thr], &ts) == 0)
		ret = ts.tv_sec * 1000000000ULL + ts.tv_nsec;
#endif
	return ret;
}

/* Records the CPU-time clock of the calling thread into its own slot of
 * per_thread_clock_id[] (indexed by <tid>). With threads enabled the ID is
 * retrieved using pthread_getcpuclockid(), otherwise CLOCK_THREAD_CPUTIME_ID
 * is used. Does nothing when per-thread CPU clocks are not available.
 */
void clock_set_local_source(void)
{
#if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0) && defined(_POSIX_THREAD_CPUTIME)
	clockid_t *slot = &per_thread_clock_id[tid];

#if defined(USE_THREAD)
	pthread_getcpuclockid(pthread_self(), slot);
#else
	*slot = CLOCK_THREAD_CPUTIME_ID;
#endif
#endif
}

/* registers a timer <tmr> of type timer_t delivering signal <sig> with value
 * <val>. It tries on the current thread's clock ID first and falls back to
 * CLOCK_REALTIME. Returns non-zero (1) on success, 0 on failure. 0 is also
 * returned, without doing anything, when the build does not provide USE_RT
 * and POSIX per-thread CPU-time clocks.
 */
int clock_setup_signal_timer(void *tmr, int sig, int val)
{
	int ret = 0;

#if defined(USE_RT) && (_POSIX_TIMERS > 0) && defined(_POSIX_THREAD_CPUTIME)
	struct sigevent sev = { };
	timer_t *timer = tmr;
	sigset_t set;

	/* unblock the WDTSIG signal we intend to use.
	 * NOTE(review): WDTSIG is unblocked regardless of <sig>, while the
	 * timer below is armed with <sig>. This only matches when callers
	 * pass WDTSIG as <sig> -- confirm against callers.
	 */
	sigemptyset(&set);
	sigaddset(&set, WDTSIG);
	ha_sigmask(SIG_UNBLOCK, &set, NULL);

	/* this timer will deliver <sig> when it fires, with <val> in the
	 * si_int field (presumably the thread ID, which matters since any
	 * thread may receive the signal).
	 */
	sev.sigev_notify          = SIGEV_SIGNAL;
	sev.sigev_signo           = sig;
	sev.sigev_value.sival_int = val;
	/* first try the calling thread's CPU clock, then the wall clock */
	if (timer_create(per_thread_clock_id[tid], &sev, timer) != -1 ||
	    timer_create(CLOCK_REALTIME, &sev, timer) != -1)
		ret = 1;
#endif
	return ret;
}

/* clock_update_local_date: sets <date> to system time, and sets the calling
 * thread's <now_ns> (and <now_ms>) to something as close as possible to real
 * time while following a monotonic function. The principle consists in
 * detecting backwards and forwards time jumps and ignoring the system date
 * when one is seen. This function should be called once after each poll, and
 * never farther apart than MAX_DELAY_MS*2. The poll's timeout should be
 * passed in <max_wait>, and the return value in <interrupted> (a non-zero
 * value means that we have not expired the timeout).
 *
 * clock_init_process_date() must have been called once first, and
 * clock_init_thread_date() must also have been called once for each thread.
 *
 * When the system date is plausible, the local monotonic date is the system
 * date plus <now_offset>, a signed number of nanoseconds shared by all
 * threads, so that the clock can be adjusted in both directions. This
 * function only reads that offset; it is rewritten elsewhere in this file
 * (see clock_update_global_date() and clock_adjust_now_offset()).
 */
void clock_update_local_date(int max_wait, int interrupted)
{
	struct timeval earliest, latest;
	int jumped;

	gettimeofday(&date, NULL);

	/* Bounds of the date we may legitimately observe, based on the date
	 * taken before polling and on the timeout:
	 *    - it cannot be older than before_poll ;
	 *    - if the timeout expired (not interrupted), it cannot be older
	 *      than before_poll + max_wait ;
	 *    - it cannot be newer than before_poll + max_wait + a margin of
	 *      100ms.
	 */
	_tv_ms_add(&earliest, &before_poll, max_wait);
	_tv_ms_add(&latest, &before_poll, max_wait + 100);

	jumped = __tv_islt(&date, &before_poll) ||                 /* big jump backwards */
	         (!interrupted && __tv_islt(&date, &earliest)) ||  /* small jump backwards */
	         __tv_islt(&latest, &date);                        /* big jump forwards */

	if (unlikely(jumped)) {
		/* The system date cannot be trusted: continue from the last
		 * known date, only advancing by the timeout when it is known
		 * to have fully elapsed.
		 */
		if (!interrupted)
			now_ns += ms_to_ns(max_wait);
	}
	else {
		/* The date is within expectations: the local monotonic date
		 * is the system date shifted by the shared offset.
		 */
		now_ns = tv_to_ns(&date) + HA_ATOMIC_LOAD(&now_offset);
	}

	now_ms = ns_to_ms(now_ns);
}

/* Reconciles the calling thread's monotonic date (<now_ns>, <now_ms>) with the
 * process-wide one (<global_now_ns>, <global_now_ms>). The local date is first
 * raised to the global date if it is behind. Then, unless both dates fall in
 * the same 32768-ns window (in which case the function returns at once and
 * leaves the shared variables untouched), the global date is advanced to the
 * local one using compare-and-swap, and <now_offset> is recomputed from the
 * resulting <now_ns> and the last sampled wall-clock <date>. It relies on
 * <now_ns> and <date> having been refreshed beforehand (see
 * clock_update_local_date()).
 */
void clock_update_global_date()
{
	ullong old_now_ns;
	uint old_now_ms;

	/* now that we have bounded the local time, let's check if it's
	 * realistic regarding the global date, which only moves forward,
	 * otherwise catch up.
	 */
	old_now_ns = _HA_ATOMIC_LOAD(&global_now_ns);
	old_now_ms = global_now_ms;

	do {
		/* never let the local date be older than the global one */
		if (now_ns < old_now_ns)
			now_ns = old_now_ns;

		/* now <now_ns> is expected to be the most accurate date,
		 * equal to <global_now_ns> or newer. Updating the global
		 * date too often causes extreme contention and is not
		 * needed: it's only used to help threads run at the
		 * same date in case of local drift, and the global date,
		 * which changes, is only used by freq counters (a choice
		 * which is debatable by the way since it changes under us).
		 * Tests have seen that the contention can be reduced from
		 * 37% in this function to almost 0% when keeping clocks
		 * synchronized no better than 32 microseconds, so that's
		 * what we're doing here.
		 */
		now_ms = ns_to_ms(now_ns);

		/* 0x7FFF covers the 15 lowest bits: if the local and global
		 * dates only differ by these bits, skip the shared update.
		 */
		if (!((now_ns ^ old_now_ns) & ~0x7FFFULL))
			return;

		/* let's try to update the global_now_ns (both in nanoseconds
		 * and ms forms) or loop again. Presumably a failed CAS reloads
		 * <old_now_ns>/<old_now_ms> with the value found in memory, so
		 * that the next round compares against fresh values.
		 */
	} while ((!_HA_ATOMIC_CAS(&global_now_ns, &old_now_ns, now_ns) ||
		  (now_ms  != old_now_ms && !_HA_ATOMIC_CAS(&global_now_ms, &old_now_ms, now_ms))) &&
		 __ha_cpu_relax());

	/* <now_ns> and <now_ms> are now updated to the last value of
	 * global_now_ns and global_now_ms, which were also monotonically
	 * updated. We can compute the latest offset, we don't care who writes
	 * it last, the variations will not break the monotonic property.
	 */
	HA_ATOMIC_STORE(&now_offset, now_ns - tv_to_ns(&date));
}

/* must be called once at boot to initialize some global variables. It samples
 * the wall-clock date, seeds the calling thread's poll dates as well as the
 * local and global monotonic dates from it, then shifts the monotonic dates
 * by an offset chosen so that the millisecond counter wraps soon after boot.
 * It finally marks the calling thread as 100% idle and calls
 * clock_update_date().
 */
void clock_init_process_date(void)
{
	/* start with no offset: monotonic dates equal the wall-clock date */
	now_offset = 0;
	gettimeofday(&date, NULL);
	after_poll = before_poll = date;
	now_ns = global_now_ns = tv_to_ns(&date);
	global_now_ms = ns_to_ms(now_ns);

	/* force the millisecond counter to wrap BOOT_TIME_WRAP_SEC seconds
	 * after boot: (uint)(-global_now_ms) is the number of milliseconds
	 * left before <global_now_ms> wraps, which is turned into seconds and
	 * reduced by BOOT_TIME_WRAP_SEC to get the shift to apply. This offset
	 * is applied to the process-wide time, and will be used to recompute
	 * the local time, both of which will match and continue from this
	 * shifted date.
	 * NOTE(review): this comment used to mention both "20s" and
	 * "5 seconds"; the effective delay is BOOT_TIME_WRAP_SEC, which is
	 * defined outside of this file.
	 */
	now_offset = sec_to_ns((uint)((uint)(-global_now_ms) / 1000U - BOOT_TIME_WRAP_SEC));
	global_now_ns += now_offset;
	now_ns = global_now_ns;
	now_ms = global_now_ms = ns_to_ms(now_ns);

	th_ctx->idle_pct = 100;
	clock_update_date(0, 1);
}

/* Recomputes the shared <now_offset> as the difference between the calling
 * thread's monotonic date <now_ns> and its last sampled wall-clock <date>
 * (converted to ns), and stores it atomically. No new time sample is taken
 * here, so both values are used as they currently are.
 */
void clock_adjust_now_offset(void)
{
	HA_ATOMIC_STORE(&now_offset, now_ns - tv_to_ns(&date));
}

/* Per-thread counterpart of clock_init_process_date(): must be called once by
 * each thread to set up its thread-local time variables. Note that other
 * threads might also be initializing and running in parallel.
 */
void clock_init_thread_date(void)
{
	/* both poll dates start from the current wall-clock date */
	gettimeofday(&date, NULL);
	before_poll = date;
	after_poll  = date;

	/* start from the monotonic date currently shared by all threads */
	now_ns = _HA_ATOMIC_LOAD(&global_now_ns);

	/* nothing was measured yet: report the thread as fully idle and take
	 * the first CPU time reference.
	 */
	th_ctx->idle_pct = 100;
	th_ctx->prev_cpu_time = now_cpu_time();

	clock_update_date(0, 1);
}

/* Returns the average CPU idle percentage over all running threads, between 0
 * and 100. A thread is taken into account when it is attached to a thread
 * group and its bit is set in this group's <threads_enabled> mask. Returns 0
 * when no such thread is found.
 */
uint clock_report_idle(void)
{
	uint sum = 0;      /* sum of the idle_pct of the counted threads */
	uint counted = 0;  /* number of threads taken into account */
	uint i;

	for (i = 0; i < MAX_THREADS; i++) {
		if (ha_thread_info[i].tg &&
		    (ha_thread_info[i].tg->threads_enabled & ha_thread_info[i].ltid_bit)) {
			sum += HA_ATOMIC_LOAD(&ha_thread_ctx[i].idle_pct);
			counted++;
		}
	}

	if (!counted)
		return 0;
	return sum / counted;
}

/* Accumulates the idle and total times of the last loop and refreshes the
 * thread's idle percentage once at least half a second was sampled, i.e. at
 * most twice a second. To be called after clock_update_date() when called
 * after poll(), and currently called only by clock_leaving_poll() below. It
 * relies on <before_poll> to be updated to the system time before calling
 * poll().
 */
static inline void clock_measure_idle(void)
{
	/* We worked between after_poll and before_poll, and slept between
	 * before_poll and date. The current second rarely changes, so the
	 * multiply is skipped when the seconds are equal.
	 */
	int span;

	/* time slept in the poller, in microseconds */
	span = date.tv_sec - before_poll.tv_sec;
	if (span)
		span *= 1000000;
	idle_time += span + (date.tv_usec - before_poll.tv_usec);

	/* total time since the previous measure, in microseconds */
	span = date.tv_sec - after_poll.tv_sec;
	if (span)
		span *= 1000000;
	samp_time += span + (date.tv_usec - after_poll.tv_usec);

	/* the next sample starts now */
	after_poll.tv_sec  = date.tv_sec;
	after_poll.tv_usec = date.tv_usec;

	if (samp_time >= 500000) {
		/* enough data: publish the rounded ratio and start over */
		HA_ATOMIC_STORE(&th_ctx->idle_pct, (100ULL * idle_time + samp_time / 2) / samp_time);
		idle_time = 0;
		samp_time = 0;
	}
}

/* Collect date and time information after leaving poll(). <timeout> must be
 * set to the maximum sleep time passed to poll (in milliseconds), and
 * <interrupted> must be zero if the poller reached the timeout or non-zero
 * otherwise, which generally is provided by the poller's return value.
 * Note that neither argument is used by the body below: it only refreshes
 * the idle ratio from the already updated <date>, then records the thread's
 * current CPU and monotonic times in th_ctx, which clock_entering_poll()
 * later compares against to estimate stolen time.
 */
void clock_leaving_poll(int timeout, int interrupted)
{
	/* relies on <date> having been updated since poll() returned */
	clock_measure_idle();
	/* references for the activity period which starts now */
	th_ctx->prev_cpu_time  = now_cpu_time();
	th_ctx->prev_mono_time = now_mono_time();
}

/* Collect date and time information before calling poll(). The date taken
 * here is used to count the run time of the loop which just ended and the
 * sleep time of the next poll. The CPU time and the monotonic time elapsed
 * since the references recorded in th_ctx are also compared to estimate the
 * amount of stolen time, which is reported, in units of half a millisecond,
 * when it reaches half a millisecond.
 */
void clock_entering_poll(void)
{
	uint64_t mono_elapsed;
	uint64_t cpu_elapsed;
	uint32_t loop_us;
	int64_t lost_ns;

	gettimeofday(&before_poll, NULL);

	/* duration of the loop which just ended, in microseconds */
	loop_us  = (before_poll.tv_sec - after_poll.tv_sec) * 1000000U;
	loop_us += before_poll.tv_usec - after_poll.tv_usec;

	cpu_elapsed  = now_cpu_time();
	mono_elapsed = now_mono_time();

	/* both references must be known, otherwise there is nothing to
	 * compare against.
	 */
	if (th_ctx->prev_cpu_time && th_ctx->prev_mono_time) {
		cpu_elapsed  -= th_ctx->prev_cpu_time;
		mono_elapsed -= th_ctx->prev_mono_time;
		lost_ns = mono_elapsed - cpu_elapsed;

		/* more than half a millisecond difference might indicate an
		 * undesired preemption.
		 */
		if (unlikely(lost_ns >= 500000))
			report_stolen_time(lost_ns / 500000);
	}

	/* update the average runtime */
	activity_count_runtime(loop_us);
}

/* returns the current date as returned by gettimeofday() in ISO+microsecond
 * format. It uses a thread-local static variable that the reader can consume
 * for as long as it wants until next call. Thus, do not call it from a signal
 * handler. If <pad> is non-0, a trailing space will be added. It will always
 * return exactly 32 or 33 characters (depending on padding) and will always be
 * zero-terminated, thus it will always fit into a 34 bytes buffer.
 * This also always include the local timezone (in +/-HH:mm format).
 *
 * The date and timezone parts are only formatted again when the second
 * changes; the microseconds are rewritten on every call. The string is
 * terminated according to <pad> on every call as well, so that a padded call
 * does not leave its trailing space to a later unpadded call made within the
 * same second.
 */
char *timeofday_as_iso_us(int pad)
{
	struct timeval new_date;
	struct tm tm;
	const char *offset;
	char c;

	gettimeofday(&new_date, NULL);
	if (new_date.tv_sec != iso_time_sec || !new_date.tv_sec) {
		get_localtime(new_date.tv_sec, &tm);
		offset = get_gmt_offset(new_date.tv_sec, &tm);
		if (unlikely(strftime(iso_time_str, sizeof(iso_time_str), "%Y-%m-%dT%H:%M:%S.000000+00:00", &tm) != 32))
			strlcpy2(iso_time_str, "YYYY-mm-ddTHH:MM:SS.000000-00:00", sizeof(iso_time_str)); // make the failure visible but respect format.
		/* overwrite the "+00:00" placeholder with the local offset
		 * (sign, two hour digits, two minute digits), skipping the
		 * colon at position 29. <offset> is presumably a 5-char
		 * "+HHMM" string.
		 */
		iso_time_str[26] = offset[0];
		iso_time_str[27] = offset[1];
		iso_time_str[28] = offset[2];
		iso_time_str[30] = offset[3];
		iso_time_str[31] = offset[4];
		iso_time_sec = new_date.tv_sec;
	}

	/* utoa_pad adds a trailing 0 so we save the char for restore */
	c = iso_time_str[26];
	utoa_pad(new_date.tv_usec, iso_time_str + 20, 7);
	iso_time_str[26] = c;

	if (pad) {
		iso_time_str[32] = ' ';
		iso_time_str[33] = 0;
	}
	else {
		/* drop the space possibly left by a previous padded call made
		 * during the same second.
		 */
		iso_time_str[32] = 0;
	}
	return iso_time_str;
}