1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
|
// SPDX-License-Identifier: GPL-3.0-or-later
#ifndef NETDATA_RRDENGINE_H
#define NETDATA_RRDENGINE_H
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <fcntl.h>
#include <lz4.h>
#include <Judy.h>
#include <openssl/sha.h>
#include <openssl/evp.h>
#include "daemon/common.h"
#include "../rrd.h"
#include "rrddiskprotocol.h"
#include "rrdenginelib.h"
#include "datafile.h"
#include "journalfile.h"
#include "rrdengineapi.h"
#include "pagecache.h"
#include "metric.h"
#include "cache.h"
#include "pdc.h"
// Pages-per-extent setting; only declared here, the definition lives in another translation unit.
// NOTE(review): presumably bounded by MAX_PAGES_PER_EXTENT - confirm where it is set.
extern unsigned rrdeng_pages_per_extent;
/* Forward declarations */
struct rrdengine_instance;
struct rrdeng_cmd;
// Upper bound on pages per extent; also sizes extent_io_descriptor.descr_array[].
#define MAX_PAGES_PER_EXTENT (64) /* TODO: can go higher only when journal supports bigger than 4KiB transactions */
// scanf()-style template: two unsigned numbers separated by '-' (field widths 1 and 10).
// NOTE(review): presumably <tier>-<file number> as used in file names - confirm against datafile/journalfile code.
#define RRDENG_FILE_NUMBER_SCAN_TMPL "%1u-%10u"
// printf()-style counterpart: the second number is zero-padded to at least 10 digits.
#define RRDENG_FILE_NUMBER_PRINT_TMPL "%1.1u-%10.10u"
// Status bits of a page while a query is routing / loading it.
// The values are single-bit flags, so several of them can be combined in one status word.
typedef enum __attribute__ ((__packed__)) {
    // ---- final statuses ----
    // a page that carries none of these is considered unroutable
    PDC_PAGE_READY                      = (1 << 0),  // pd->page is not null, ready to be processed
    PDC_PAGE_FAILED                     = (1 << 1),  // pd->page is null, loading failed
    PDC_PAGE_SKIP                       = (1 << 2),  // not good for us, don't use it
    PDC_PAGE_INVALID                    = (1 << 3),  // invalid page, don't use it
    PDC_PAGE_EMPTY                      = (1 << 4),  // the page has no data at all

    // ---- issue tracking ----
    PDC_PAGE_PREPROCESSED               = (1 << 5),  // used during preprocessing
    PDC_PAGE_PROCESSED                  = (1 << 6),  // the query caller processed it
    PDC_PAGE_RELEASED                   = (1 << 7),  // already released

    // ---- where the data was found: cache (preloaded) or disk ----
    PDC_PAGE_PRELOADED                  = (1 << 8),  // data found in memory
    PDC_PAGE_DISK_PENDING               = (1 << 9),  // data need to be loaded from disk

    // ---- worker related ----
    PDC_PAGE_FAILED_INVALID_EXTENT      = (1 << 10),
    PDC_PAGE_FAILED_NOT_IN_EXTENT       = (1 << 11),
    PDC_PAGE_FAILED_TO_MAP_EXTENT       = (1 << 12),
    PDC_PAGE_FAILED_TO_ACQUIRE_DATAFILE = (1 << 13),
    PDC_PAGE_EXTENT_FROM_CACHE          = (1 << 14),
    PDC_PAGE_EXTENT_FROM_DISK           = (1 << 15),
    PDC_PAGE_CANCELLED                  = (1 << 16), // the query thread had already left when we tried to load the page
    PDC_PAGE_SOURCE_MAIN_CACHE          = (1 << 17),
    PDC_PAGE_SOURCE_OPEN_CACHE          = (1 << 18),
    PDC_PAGE_SOURCE_JOURNAL_V2          = (1 << 19),
    PDC_PAGE_PRELOADED_PASS4            = (1 << 20),

    // ---- datafile acquired ----
    PDC_PAGE_DATAFILE_ACQUIRED          = (1 << 30),
} PDC_PAGE_STATUS;

// Any of these bits is enough to put a page on the query's global skip list.
#define PDC_PAGE_QUERY_GLOBAL_SKIP_LIST \
    (PDC_PAGE_FAILED | PDC_PAGE_SKIP | PDC_PAGE_INVALID | PDC_PAGE_RELEASED)
// Page Details Control (PDC): the state of one page-loading request, shared between
// the query thread and the workers that serve it (see the per-field comments).
// Lifetime is reference counted: `refcount` is guarded by `refcount_spinlock`.
typedef struct page_details_control {
struct rrdengine_instance *ctx;
struct metric *metric;
// NOTE(review): presumably signalled when the preparation phase ends (see prep_done) - confirm
struct completion prep_completion;
struct completion page_completion; // sync between the query thread and the workers
Pvoid_t page_list_JudyL; // the list of page details
unsigned completed_jobs; // the number of jobs completed last time the query thread checked
bool workers_should_stop; // true when the query thread left and the workers should stop
bool prep_done;
PDC_PAGE_STATUS common_status;
size_t pages_to_load_from_disk;
SPINLOCK refcount_spinlock; // spinlock to protect refcount
int32_t refcount; // the number of workers currently working on this request + 1 for the query thread
size_t executed_with_gaps;
// the time range of the request, in seconds
time_t start_time_s;
time_t end_time_s;
STORAGE_PRIORITY priority;
time_t optimal_end_time_s;
} PDC;
// Returns a PDC object; allocation and initialization are implemented elsewhere.
PDC *pdc_get(void);
// Details of a single page handled through a PDC.
// NOTE(review): presumably the element type stored in PDC.page_list_JudyL - confirm.
struct page_details {
// the datafile and the extent (position and size in bytes) this entry refers to
struct {
struct rrdengine_datafile *ptr;
uv_file file;
unsigned fileno;
struct {
uint64_t pos;
uint32_t bytes;
} extent;
} datafile;
// not NULL when PDC_PAGE_READY, NULL when PDC_PAGE_FAILED (see PDC_PAGE_STATUS)
struct pgc_page *page;
Word_t metric_id;
time_t first_time_s;
time_t last_time_s;
uint32_t update_every_s;
uint16_t page_length;
// PDC_PAGE_STATUS bits; accessed atomically via the pdc_page_status_*() macros
PDC_PAGE_STATUS status;
// links of a doubly-linked list
// NOTE(review): presumably the list of pages queued for loading - confirm
struct {
struct page_details *prev;
struct page_details *next;
} load;
};
// Returns a page_details object; allocation is implemented elsewhere.
struct page_details *page_details_get(void);
/*
 * Atomic accessors for the PDC_PAGE_STATUS bitmask kept in (pd)->status.
 *
 *  pd   - pointer to an object that has an integer `status` member
 *  flag - one or more PDC_PAGE_* bits
 *
 *  pdc_page_status_check() evaluates to the subset of `flag` currently set (0 if none),
 *                          using an acquire load.
 *  pdc_page_status_set()   ORs `flag` into the status and evaluates to the new status.
 *  pdc_page_status_clear() removes `flag` from the status and evaluates to the new status.
 *
 * set/clear use release ordering so they pair with the acquire load of check().
 */
#define pdc_page_status_check(pd, flag) (__atomic_load_n(&((pd)->status), __ATOMIC_ACQUIRE) & (flag))
#define pdc_page_status_set(pd, flag)   __atomic_or_fetch(&((pd)->status), (flag), __ATOMIC_RELEASE)
#define pdc_page_status_clear(pd, flag) __atomic_and_fetch(&((pd)->status), ~(flag), __ATOMIC_RELEASE)
// Per-extent bookkeeping record ("jv2" presumably stands for journal v2 - confirm).
// NOTE(review): field meanings are inferred from their names: `pos`/`bytes` look like the
// extent's offset and size, `number_of_pages` the pages it holds - verify against the users.
struct jv2_extents_info {
size_t index;
uint64_t pos;
unsigned bytes;
size_t number_of_pages;
};
// Per-metric bookkeeping record ("jv2" presumably stands for journal v2 - confirm).
struct jv2_metrics_info {
uuid_t *uuid;
uint32_t page_list_header;
// time range of the metric, in seconds
time_t first_time_s;
time_t last_time_s;
size_t number_of_pages;
// JudyL array; going by the name, pages are indexed by their start time
Pvoid_t JudyL_pages_by_start_time;
};
// Per-page bookkeeping record ("jv2" presumably stands for journal v2 - confirm).
struct jv2_page_info {
// time range and collection step of the page, in seconds
time_t start_time_s;
time_t end_time_s;
time_t update_every_s;
size_t page_length;
// NOTE(review): presumably refers to jv2_extents_info.index - confirm
uint32_t extent_index;
void *custom_data;
// private
struct pgc_page *page;
};
// Option bits kept in rrdeng_collect_handle.options.
typedef enum __attribute__ ((__packed__)) {
    RRDENG_1ST_METRIC_WRITER = (1 << 0),
} RRDENG_COLLECT_HANDLE_OPTIONS;
// Flag bits describing events seen while collecting into a page.
// Kept in rrdeng_collect_handle.page_flags and accepted by validate_page() and
// collect_page_flags_to_buffer(). Each value is a single bit, so flags can be combined.
typedef enum __attribute__ ((__packed__)) {
    RRDENG_PAGE_PAST_COLLECTION     = (1 << 0),
    RRDENG_PAGE_REPEATED_COLLECTION = (1 << 1),
    RRDENG_PAGE_BIG_GAP             = (1 << 2),
    RRDENG_PAGE_GAP                 = (1 << 3),
    RRDENG_PAGE_FUTURE_POINT        = (1 << 4),
    RRDENG_PAGE_CREATED_IN_FUTURE   = (1 << 5),
    RRDENG_PAGE_COMPLETED_IN_FUTURE = (1 << 6),
    RRDENG_PAGE_UNALIGNED           = (1 << 7),
    RRDENG_PAGE_CONFLICT            = (1 << 8),
    RRDENG_PAGE_FULL                = (1 << 9),
    RRDENG_PAGE_COLLECT_FINALIZE    = (1 << 10),
    RRDENG_PAGE_UPDATE_EVERY_CHANGE = (1 << 11),
    RRDENG_PAGE_STEP_TOO_SMALL      = (1 << 12),
    RRDENG_PAGE_STEP_UNALIGNED      = (1 << 13),
} RRDENG_COLLECT_PAGE_FLAGS;
// Collection handle of the dbengine: the state kept while data of one metric is collected
// into a page. It embeds the generic storage_collect_handle as its first member.
struct rrdeng_collect_handle {
struct storage_collect_handle common; // has to be first item
RRDENG_COLLECT_PAGE_FLAGS page_flags;
RRDENG_COLLECT_HANDLE_OPTIONS options;
uint8_t type;
struct metric *metric;
struct pgc_page *page;
// NOTE(review): presumably the buffer of the page being filled and its size in bytes - confirm
void *data;
size_t data_size;
struct pg_alignment *alignment;
uint32_t page_entries_max;
uint32_t page_position; // keep track of the current page size, to make sure we don't exceed it
// page time range and collection step, in microseconds (usec_t)
usec_t page_start_time_ut;
usec_t page_end_time_ut;
usec_t update_every_ut;
};
// Query handle of the dbengine: the state of one running query on one metric.
struct rrdeng_query_handle {
struct metric *metric;
struct pgc_page *page;
struct rrdengine_instance *ctx;
storage_number *metric_data;
struct page_details_control *pdc;
// the request
time_t start_time_s;
time_t end_time_s;
STORAGE_PRIORITY priority;
// internal data
time_t now_s;
time_t dt_s;
unsigned position;
unsigned entries;
// the members below exist only in builds with NETDATA_INTERNAL_CHECKS defined
#ifdef NETDATA_INTERNAL_CHECKS
// NOTE(review): declared as usec_t although the name carries the _s (seconds) suffix - confirm the unit
usec_t started_time_s;
pid_t query_pid;
struct rrdeng_query_handle *prev, *next;
#endif
};
// Get / give back a query handle; the implementation is not visible in this header.
struct rrdeng_query_handle *rrdeng_query_handle_get(void);
void rrdeng_query_handle_release(struct rrdeng_query_handle *handle);
// Opcodes of the commands accepted by rrdeng_enq_cmd().
// Values are assigned sequentially starting at 0, so RRDENG_OPCODE_MAX has to stay the
// last entry: it is the number of opcodes and the base of the worker ids defined below.
enum rrdeng_opcode {
    // can be used to return empty status or flush the command queue
    RRDENG_OPCODE_NOOP = 0,

    RRDENG_OPCODE_QUERY,
    RRDENG_OPCODE_EXTENT_WRITE,
    RRDENG_OPCODE_EXTENT_READ,
    RRDENG_OPCODE_FLUSHED_TO_OPEN,
    RRDENG_OPCODE_DATABASE_ROTATE,
    RRDENG_OPCODE_JOURNAL_INDEX,
    RRDENG_OPCODE_FLUSH_INIT,
    RRDENG_OPCODE_EVICT_INIT,
    RRDENG_OPCODE_CTX_SHUTDOWN,
    RRDENG_OPCODE_CTX_QUIESCE,
    RRDENG_OPCODE_CTX_POPULATE_MRG,
    RRDENG_OPCODE_CLEANUP,

    RRDENG_OPCODE_MAX
};
// WORKER IDS:
//   RRDENG_OPCODE_MAX                     : reserved for the cleanup
//   RRDENG_OPCODE_MAX + opcode            : reserved for the callbacks of each opcode
//   RRDENG_OPCODE_MAX + RRDENG_OPCODE_MAX : reserved for the timer
//   the ids right after the timer are the ones defined below
#define RRDENG_TIMER_CB                    (RRDENG_OPCODE_MAX * 2)
#define RRDENG_FLUSH_TRANSACTION_BUFFER_CB (RRDENG_TIMER_CB + 1)
#define RRDENG_OPCODES_WAITING             (RRDENG_TIMER_CB + 2)
#define RRDENG_WORKS_DISPATCHED            (RRDENG_TIMER_CB + 3)
#define RRDENG_WORKS_EXECUTING             (RRDENG_TIMER_CB + 4)
// Compact description of an extent: the datafile (number and libuv file handle) plus
// the position and size of the extent.
// NOTE(review): `page_length` presumably is the length of the page the record was made for - confirm.
struct extent_io_data {
unsigned fileno;
uv_file file;
uint64_t pos;
unsigned bytes;
uint16_t page_length;
};
// State of one extent I/O request issued through libuv (uv_fs_t / uv_buf_t).
struct extent_io_descriptor {
struct rrdengine_instance *ctx;
uv_fs_t uv_fs_request;
uv_buf_t iov;
uv_file file;
void *buf;
struct wal *wal;
// position and size of the extent
// NOTE(review): presumably an offset into `file`, in bytes - confirm
uint64_t pos;
unsigned bytes;
struct completion *completion;
// number of entries of descr_array[] in use (the array holds up to MAX_PAGES_PER_EXTENT)
unsigned descr_count;
struct page_descr_with_data *descr_array[MAX_PAGES_PER_EXTENT];
struct rrdengine_datafile *datafile;
struct extent_io_descriptor *next; /* multiple requests to be served by the same cached extent */
};
// State of a generic (non extent specific) I/O request issued through libuv.
// It is embedded in struct wal as `io_descr`.
struct generic_io_descriptor {
struct rrdengine_instance *ctx;
uv_fs_t req;
uv_buf_t iov;
void *buf;
void *data;
// NOTE(review): presumably file offset and size of the request, in bytes - confirm
uint64_t pos;
unsigned bytes;
struct completion *completion;
};
// A WAL buffer together with the I/O descriptor used for it.
// NOTE(review): "wal" presumably stands for write-ahead log (the journal) - confirm.
typedef struct wal {
uint64_t transaction_id;
void *buf;
// NOTE(review): two sizes are kept; presumably `size` is the used part and `buf_size`
// the capacity of `buf` - confirm against wal_get()
size_t size;
size_t buf_size;
struct generic_io_descriptor io_descr;
// links of a doubly-linked list named "cache"
struct {
struct wal *prev;
struct wal *next;
} cache;
} WAL;
// Get a WAL for the given context and size / give it back when done.
WAL *wal_get(struct rrdengine_instance *ctx, unsigned size);
void wal_release(WAL *wal);
/*
 * Debug statistics, not used by the code logic.
 * They only describe operations since the DB engine instance was loaded.
 * One copy is kept per instance (struct rrdengine_instance, member `stats`);
 * the ctx_io_*() / ctx_fs_error() helpers in this header update the I/O and error
 * counters with atomic adds.
 */
struct rrdengine_statistics {
rrdeng_stats_t before_decompress_bytes;
rrdeng_stats_t after_decompress_bytes;
rrdeng_stats_t before_compress_bytes;
rrdeng_stats_t after_compress_bytes;
rrdeng_stats_t io_write_bytes;
rrdeng_stats_t io_write_requests;
rrdeng_stats_t io_read_bytes;
rrdeng_stats_t io_read_requests;
rrdeng_stats_t datafile_creations;
rrdeng_stats_t datafile_deletions;
rrdeng_stats_t journalfile_creations;
rrdeng_stats_t journalfile_deletions;
rrdeng_stats_t io_errors;
rrdeng_stats_t fs_errors;
};
/* Global counter of I/O errors (incremented by ctx_io_error()) */
extern rrdeng_stats_t global_io_errors;
/* Global counter of file-system errors (incremented by ctx_fs_error()) */
extern rrdeng_stats_t global_fs_errors;
/* Number of file descriptors that have been reserved by dbengine */
extern rrdeng_stats_t rrdeng_reserved_file_descriptors;
/* Global counters for the inability to flush */
extern rrdeng_stats_t global_pg_cache_over_half_dirty_events;
extern rrdeng_stats_t global_flushing_pressure_page_deletions; /* number of deleted pages */
// One dbengine instance (referred to as "ctx" throughout this header).
// Members are grouped by purpose: static configuration, the datafiles list, an index
// named njfv2idx, counters accessed atomically, quiesce / shutdown state, loading state
// and debug statistics.
struct rrdengine_instance {
struct {
bool legacy; // true when the db is autonomous for a single host
int tier; // the tier of this ctx
uint8_t page_type; // default page type for this context
uint64_t max_disk_space; // the max disk space this ctx is allowed to use
uint8_t global_compress_alg; // the wanted compression algorithm
char dbfiles_path[FILENAME_MAX + 1];
} config;
struct {
uv_rwlock_t rwlock; // the linked list of datafiles is protected by this lock
struct rrdengine_datafile *first; // the oldest datafile - the newest is reachable via ->first->prev
} datafiles;
// JudyL array protected by a read-write spinlock
// NOTE(review): presumably an index of journal v2 files - confirm
struct {
RW_SPINLOCK spinlock;
Pvoid_t JudyL;
} njfv2idx;
// the helpers in this header read and update members of this group with __atomic builtins
struct {
unsigned last_fileno; // newest index of datafile and journalfile
unsigned last_flush_fileno; // newest index of datafile received data
size_t collectors_running;
size_t collectors_running_duplicate;
size_t inflight_queries; // the number of queries currently running
uint64_t current_disk_space; // the current disk space size used
uint64_t transaction_id; // the transaction id of the next extent flushing
bool migration_to_v2_running;
bool now_deleting_files;
unsigned extents_currently_being_flushed; // non-zero until we commit data to disk (both datafile and journal file)
time_t first_time_s;
} atomic;
// ctx_is_available_for_queries() is true only while both flags below are false
struct {
bool exit_mode;
bool enabled; // when set (before shutdown), queries are prohibited
struct completion completion;
} quiesce;
struct {
// array of `size` completions
// NOTE(review): presumably one per RRDENG_OPCODE_CTX_POPULATE_MRG job - confirm
struct {
size_t size;
struct completion *array;
} populate_mrg;
bool create_new_datafile_pair;
} loading;
struct rrdengine_statistics stats;
};
// Atomic (relaxed) accessors of ctx->atomic.current_disk_space.
// increase / decrease evaluate to the value after the update.
#define ctx_current_disk_space_get(ctx) \
    (__atomic_load_n(&(ctx)->atomic.current_disk_space, __ATOMIC_RELAXED))
#define ctx_current_disk_space_increase(ctx, size) \
    (__atomic_add_fetch(&(ctx)->atomic.current_disk_space, (size), __ATOMIC_RELAXED))
#define ctx_current_disk_space_decrease(ctx, size) \
    (__atomic_sub_fetch(&(ctx)->atomic.current_disk_space, (size), __ATOMIC_RELAXED))
// Account one read request of `bytes` bytes in the statistics of `ctx`.
// Both counters are updated with relaxed atomic adds; the results are not used.
static inline void ctx_io_read_op_bytes(struct rrdengine_instance *ctx, size_t bytes) {
    (void)__atomic_fetch_add(&ctx->stats.io_read_bytes, bytes, __ATOMIC_RELAXED);
    (void)__atomic_fetch_add(&ctx->stats.io_read_requests, 1, __ATOMIC_RELAXED);
}
// Account one write request of `bytes` bytes in the statistics of `ctx`.
// Both counters are updated with relaxed atomic adds; the results are not used.
static inline void ctx_io_write_op_bytes(struct rrdengine_instance *ctx, size_t bytes) {
    (void)__atomic_fetch_add(&ctx->stats.io_write_bytes, bytes, __ATOMIC_RELAXED);
    (void)__atomic_fetch_add(&ctx->stats.io_write_requests, 1, __ATOMIC_RELAXED);
}
// Count one I/O error: first on the instance (ctx->stats.io_errors),
// then on the process-wide counter (global_io_errors).
static inline void ctx_io_error(struct rrdengine_instance *ctx) {
    (void)__atomic_fetch_add(&ctx->stats.io_errors, 1, __ATOMIC_RELAXED);
    rrd_stat_atomic_add(&global_io_errors, 1);
}
// Count one file-system error: first on the instance (ctx->stats.fs_errors),
// then on the process-wide counter (global_fs_errors).
static inline void ctx_fs_error(struct rrdengine_instance *ctx) {
    (void)__atomic_fetch_add(&ctx->stats.fs_errors, 1, __ATOMIC_RELAXED);
    rrd_stat_atomic_add(&global_fs_errors, 1);
}
// Atomic (relaxed) accessors of ctx->atomic.last_fileno and ctx->atomic.last_flush_fileno.
// ctx_last_fileno_increment() evaluates to the value after the increment.
#define ctx_last_fileno_get(ctx) \
    (__atomic_load_n(&(ctx)->atomic.last_fileno, __ATOMIC_RELAXED))
#define ctx_last_fileno_increment(ctx) \
    (__atomic_add_fetch(&(ctx)->atomic.last_fileno, 1, __ATOMIC_RELAXED))
#define ctx_last_flush_fileno_get(ctx) \
    (__atomic_load_n(&(ctx)->atomic.last_flush_fileno, __ATOMIC_RELAXED))
// Raise ctx->atomic.last_flush_fileno to `fileno`, never lowering it.
// Lock-free: the compare-and-exchange is retried until either the stored value is
// already >= fileno (nothing to do) or our value has been stored.
static inline void ctx_last_flush_fileno_set(struct rrdengine_instance *ctx, unsigned fileno) {
    unsigned current = ctx_last_flush_fileno_get(ctx);

    while(current < fileno) {
        // on failure the builtin reloads `current` with the value found in memory
        if(__atomic_compare_exchange_n(&ctx->atomic.last_flush_fileno, &current, fileno,
                                       false, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
            break;
        }
    }
}
// True while the context accepts queries: neither quiesce.enabled nor quiesce.exit_mode is set.
// Both flags are read with relaxed atomic loads; exit_mode is read only when enabled is false.
#define ctx_is_available_for_queries(ctx) \
    (!__atomic_load_n(&(ctx)->quiesce.enabled, __ATOMIC_RELAXED) && \
     !__atomic_load_n(&(ctx)->quiesce.exit_mode, __ATOMIC_RELAXED))
// Allocation helpers for page and extent buffers. The *_free() calls take the size too.
// NOTE(review): presumably the same size that was given to the matching *_alloc() - confirm.
void *dbengine_page_alloc(size_t size);
void dbengine_page_free(void *page, size_t size);
void *dbengine_extent_alloc(size_t size);
void dbengine_extent_free(void *extent, size_t size);
// NOTE(review): presumably compares atomic.current_disk_space with config.max_disk_space - confirm
bool rrdeng_ctx_exceeded_disk_quota(struct rrdengine_instance *ctx);
// Set up / tear down the files of an instance; init_rrd_files() returns an int status.
// NOTE(review): presumably 0 on success - confirm
int init_rrd_files(struct rrdengine_instance *ctx);
void finalize_rrd_files(struct rrdengine_instance *ctx);
bool rrdeng_dbengine_spawn(struct rrdengine_instance *ctx);
// Entry point of the dbengine event loop; `arg` is an opaque pointer (thread-function signature).
void dbengine_event_loop(void *arg);
// Callback types accepted by rrdeng_enq_cmd(), each receiving the command.
typedef void (*enqueue_callback_t)(struct rrdeng_cmd *cmd);
typedef void (*dequeue_callback_t)(struct rrdeng_cmd *cmd);
// Callbacks matching enqueue_callback_t / dequeue_callback_t.
void rrdeng_enqueue_epdl_cmd(struct rrdeng_cmd *cmd);
void rrdeng_dequeue_epdl_cmd(struct rrdeng_cmd *cmd);
// Callback type accepted by rrdeng_req_cmd(): given `data`, it returns a command.
typedef struct rrdeng_cmd *(*requeue_callback_t)(void *data);
void rrdeng_req_cmd(requeue_callback_t get_cmd_cb, void *data, STORAGE_PRIORITY priority);
// Enqueue a command for `ctx`:
//   opcode      - what to do (enum rrdeng_opcode)
//   data        - opaque payload of the command
//   completion  - completion object of the caller
//                 NOTE(review): presumably optional and signalled when the command is done - confirm
//   priority    - storage priority of the command
//   enqueue_cb, dequeue_cb - callbacks of the types declared above
//                 NOTE(review): presumably invoked when the command enters / leaves the queue - confirm
void rrdeng_enq_cmd(struct rrdengine_instance *ctx, enum rrdeng_opcode opcode, void *data,
struct completion *completion, enum storage_priority priority,
enqueue_callback_t enqueue_cb, dequeue_callback_t dequeue_cb);
// Route the pages of a PDC, asynchronously or synchronously as the names say.
void pdc_route_asynchronously(struct rrdengine_instance *ctx, struct page_details_control *pdc);
void pdc_route_synchronously(struct rrdengine_instance *ctx, struct page_details_control *pdc);
// Reference counting of a PDC (see PDC.refcount).
// NOTE(review): the bool result presumably tells whether the PDC was destroyed - confirm
void pdc_acquire(PDC *pdc);
bool pdc_release_and_destroy_if_unreferenced(PDC *pdc, bool worker, bool router);
uint64_t rrdeng_target_data_file_size(struct rrdengine_instance *ctx);
// Returns a page descriptor object; allocation is implemented elsewhere.
struct page_descr_with_data *page_descriptor_get(void);
// Result of validate_page() / validate_extent_page_descr(), returned by value.
// NOTE(review): presumably the other members are meaningful only when `is_valid` is true - confirm.
typedef struct validated_page_descriptor {
// time range and collection step of the page, in seconds
time_t start_time_s;
time_t end_time_s;
time_t update_every_s;
size_t page_length;
size_t point_size;
size_t entries;
uint8_t type;
bool is_valid;
} VALIDATED_PAGE_DESCRIPTOR;
// Sentinel pointer value (all bits set) standing for an empty page; never dereference it.
// The replacement list is fully parenthesized so that the macro behaves as a single
// primary expression wherever it is used (e.g. as the operand of sizeof).
#define DBENGINE_EMPTY_PAGE ((void *)(-1))

// Number of points of a page, derived from its time range:
// one point every update_every_s seconds, both start_time_s and end_time_s included.
// When update_every_s is zero the page is treated as holding a single point.
// Note: the arguments are evaluated more than once - do not pass expressions with side effects.
#define page_entries_by_time(start_time_s, end_time_s, update_every_s) \
    ((update_every_s) ? (((end_time_s) - ((start_time_s) - (update_every_s))) / (update_every_s)) : 1)

// Number of points of a page, derived from its size in bytes.
// point_size_in_bytes must not be zero (the macro divides by it).
#define page_entries_by_size(page_length_in_bytes, point_size_in_bytes) \
    ((page_length_in_bytes) / (point_size_in_bytes))
// Validate the description of a page and return the outcome as a VALIDATED_PAGE_DESCRIPTOR
// (see its `is_valid` member). Times are in seconds, as the _s suffixes indicate.
// NOTE(review): the exact rules (e.g. how now_s, overwrite_zero_update_every_s and
// have_read_error influence the result, and what `msg` is used for) are implemented
// elsewhere - check the definition before relying on them.
VALIDATED_PAGE_DESCRIPTOR validate_page(uuid_t *uuid,
time_t start_time_s,
time_t end_time_s,
time_t update_every_s,
size_t page_length,
uint8_t page_type,
size_t entries,
time_t now_s,
time_t overwrite_zero_update_every_s,
bool have_read_error,
const char *msg,
RRDENG_COLLECT_PAGE_FLAGS flags);
// Same kind of validation, taking the page description from a struct rrdeng_extent_page_descr.
VALIDATED_PAGE_DESCRIPTOR validate_extent_page_descr(const struct rrdeng_extent_page_descr *descr, time_t now_s, time_t overwrite_zero_update_every_s, bool have_read_error);
// Write a representation of `flags` into the buffer `wb`.
// NOTE(review): presumably human readable text for logs - confirm
void collect_page_flags_to_buffer(BUFFER *wb, RRDENG_COLLECT_PAGE_FLAGS flags);
// Three-way result of comparing the time range of a page with a wanted time range:
// negative when the page is in the past, zero when in range, positive when in the future.
typedef enum {
    PAGE_IS_IN_THE_PAST   = -1,
    PAGE_IS_IN_RANGE      =  0,
    PAGE_IS_IN_THE_FUTURE =  1,
} TIME_RANGE_COMPARE;

// Compare the page range [page_first_time_s, page_last_time_s] with the wanted range
// [wanted_start_time_s, wanted_end_time_s]; all times are in seconds.
TIME_RANGE_COMPARE is_page_in_time_range(time_t page_first_time_s, time_t page_last_time_s, time_t wanted_start_time_s, time_t wanted_end_time_s);
// The latest timestamp (in seconds) accepted for collected data:
// the current realtime clock plus one second of tolerance.
static inline time_t max_acceptable_collected_time(void) {
    time_t limit_s = now_realtime_sec() + 1;
    return limit_s;
}
// Delete a datafile of the given instance.
// NOTE(review): `update_retention` and `worker` are flags whose exact effect is defined by the
// implementation (presumably: refresh retention info afterwards / called from a worker) - confirm.
void datafile_delete(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile, bool update_retention, bool worker);
static inline int journal_metric_uuid_compare(const void *key, const void *metric) {
return uuid_memcmp((uuid_t *)key, &(((struct journal_metric_list *) metric)->uuid));
}
#endif /* NETDATA_RRDENGINE_H */
|