1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
|
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright 2023 Red Hat
*/
#ifndef VDO_SLAB_DEPOT_H
#define VDO_SLAB_DEPOT_H
#include <linux/atomic.h>
#include <linux/dm-kcopyd.h>
#include <linux/list.h>
#include "numeric.h"
#include "admin-state.h"
#include "completion.h"
#include "data-vio.h"
#include "encodings.h"
#include "physical-zone.h"
#include "priority-table.h"
#include "recovery-journal.h"
#include "statistics.h"
#include "types.h"
#include "vio.h"
#include "wait-queue.h"
/*
* A slab_depot is responsible for managing all of the slabs and block allocators of a VDO. It has
* a single array of slabs in order to eliminate the need for additional math in order to compute
* which physical zone a PBN is in. It also has a block_allocator per zone.
*
* Each physical zone has a single dedicated queue and thread for performing all updates to the
* slabs assigned to that zone. The concurrency guarantees of this single-threaded model allow the
* code to omit more fine-grained locking for the various slab structures. Each physical zone
* maintains a separate copy of the slab summary to remove the need for explicit locking on that
* structure as well.
*
* Load operations must be performed on the admin thread. Normal operations, such as allocations
* and reference count updates, must be performed on the appropriate physical zone thread. Requests
* from the recovery journal to commit slab journal tail blocks must be scheduled from the recovery
* journal thread to run on the appropriate physical zone thread. Save operations must be launched
* from the same admin thread as the original load operation.
*/
enum {
	/*
	 * The number of vios in the vio pool is proportional to the throughput of the VDO; a
	 * larger pool allows more concurrent metadata I/O.
	 */
	BLOCK_ALLOCATOR_VIO_POOL_SIZE = 128,
};

/*
 * Represents the possible reference status of a single physical block.
 */
enum reference_status {
	RS_FREE, /* this block is free */
	RS_SINGLE, /* this block is singly-referenced */
	RS_SHARED, /* this block is shared (more than one reference) */
	RS_PROVISIONAL /* this block is provisionally allocated */
};
struct vdo_slab;
/* A lock held by a slab journal block on the recovery journal. */
struct journal_lock {
	/* The number of outstanding holders of this lock */
	u16 count;
	/* The recovery journal sequence number at which this lock was taken */
	sequence_number_t recovery_start;
};
/* The in-memory state of the journal of a single slab. */
struct slab_journal {
	/* A waiter object for getting a VIO pool entry */
	struct vdo_waiter resource_waiter;
	/* A waiter object for updating the slab summary */
	struct vdo_waiter slab_summary_waiter;
	/* A waiter object for getting a vio with which to flush */
	struct vdo_waiter flush_waiter;
	/* The queue of VIOs waiting to make an entry */
	struct vdo_wait_queue entry_waiters;
	/* The parent slab reference of this journal */
	struct vdo_slab *slab;

	/* Whether a tail block commit is pending */
	bool waiting_to_commit;
	/* Whether the journal is updating the slab summary */
	bool updating_slab_summary;
	/* Whether the journal is adding entries from the entry_waiters queue */
	bool adding_entries;
	/* Whether a partial write is in progress */
	bool partial_write_in_progress;

	/* The oldest block in the journal on disk */
	sequence_number_t head;
	/* The oldest block in the journal which may not be reaped */
	sequence_number_t unreapable;
	/* The end of the half-open interval of the active journal */
	sequence_number_t tail;
	/* The next journal block to be committed */
	sequence_number_t next_commit;
	/* The tail sequence number that is written in the slab summary */
	sequence_number_t summarized;
	/* The tail sequence number that was last summarized in slab summary */
	sequence_number_t last_summarized;

	/* The sequence number of the recovery journal lock */
	sequence_number_t recovery_lock;

	/*
	 * The number of entries which fit in a single block. Can't use the constant because unit
	 * tests change this number.
	 */
	journal_entry_count_t entries_per_block;
	/*
	 * The number of full entries which fit in a single block. Can't use the constant because
	 * unit tests change this number.
	 */
	journal_entry_count_t full_entries_per_block;

	/* The recovery journal of the VDO (slab journal holds locks on it) */
	struct recovery_journal *recovery_journal;

	/* The statistics shared by all slab journals in our physical zone */
	struct slab_journal_statistics *events;
	/* A list of the VIO pool entries for outstanding journal block writes */
	struct list_head uncommitted_blocks;

	/*
	 * The current tail block header state. This will be packed into the block just before it
	 * is written.
	 */
	struct slab_journal_block_header tail_header;
	/* A pointer to a block-sized buffer holding the packed block data */
	struct packed_slab_journal_block *block;

	/* The number of blocks in the on-disk journal */
	block_count_t size;
	/* The number of blocks at which to start pushing reference blocks */
	block_count_t flushing_threshold;
	/* The number of blocks at which all reference blocks should be writing */
	block_count_t flushing_deadline;
	/* The number of blocks at which to wait for reference blocks to write */
	block_count_t blocking_threshold;
	/* The number of blocks at which to scrub the slab before coming online */
	block_count_t scrubbing_threshold;

	/* This list entry is for block_allocator to keep a queue of dirty journals */
	struct list_head dirty_entry;

	/* The lock for the oldest unreaped block of the journal */
	struct journal_lock *reap_lock;
	/* The locks for each on disk block */
	struct journal_lock *locks;
};
/*
 * A reference_block covers a fixed portion of a slab's reference counters. Blocks are used as a
 * proxy, permitting saves of partial refcounts: each block can be written independently as it
 * becomes dirty.
 */
struct reference_block {
	/* This block waits on the ref_counts to tell it to write */
	struct vdo_waiter waiter;
	/* The slab to which this reference_block belongs */
	struct vdo_slab *slab;
	/* The number of references in this block that represent allocations */
	block_size_t allocated_count;
	/* The slab journal block on which this block must hold a lock */
	sequence_number_t slab_journal_lock;
	/* The slab journal block which should be released when this block is committed */
	sequence_number_t slab_journal_lock_to_release;
	/* The point up to which each sector is accurate on disk */
	struct journal_point commit_points[VDO_SECTORS_PER_BLOCK];
	/* Whether this block has been modified since it was written to disk */
	bool is_dirty;
	/* Whether this block is currently writing */
	bool is_writing;
};
/* The search_cursor represents the saved position of a free block search within a slab. */
struct search_cursor {
	/* The reference block containing the current search index */
	struct reference_block *block;
	/* The position at which to start searching for the next free counter */
	slab_block_number index;
	/* The position just past the last valid counter in the current block */
	slab_block_number end_index;

	/* A pointer to the first reference block in the slab */
	struct reference_block *first_block;
	/* A pointer to the last reference block in the slab */
	struct reference_block *last_block;
};
/* The rebuild/scrubbing status of a slab. */
enum slab_rebuild_status {
	/* The slab is clean and needs no rebuilding */
	VDO_SLAB_REBUILT,
	/* Journal entries are being replayed into the slab */
	VDO_SLAB_REPLAYING,
	/* The slab must be scrubbed before it can be used */
	VDO_SLAB_REQUIRES_SCRUBBING,
	/* The slab must be scrubbed ahead of normal-priority slabs */
	VDO_SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING,
	/* The slab is currently being rebuilt (scrubbed) */
	VDO_SLAB_REBUILDING,
};
/*
* This is the type declaration for the vdo_slab type. A vdo_slab currently consists of a run of
* 2^23 data blocks, but that will soon change to dedicate a small number of those blocks for
* metadata storage for the reference counts and slab journal for the slab.
*
* A reference count is maintained for each physical block number. The vast majority of blocks have
* a very small reference count (usually 0 or 1). For references less than or equal to MAXIMUM_REFS
* (254) the reference count is stored in counters[pbn].
*/
/* A slab: its identity, journal, administrative state, and reference counts. */
struct vdo_slab {
	/* A list entry to queue this slab in a block_allocator list */
	struct list_head allocq_entry;

	/* The struct block_allocator that owns this slab */
	struct block_allocator *allocator;

	/* The journal for this slab */
	struct slab_journal journal;

	/* The slab number of this slab */
	slab_count_t slab_number;
	/* The offset in the allocator partition of the first block in this slab */
	physical_block_number_t start;
	/* The offset of the first block past the end of this slab */
	physical_block_number_t end;
	/* The starting translated PBN of the slab journal */
	physical_block_number_t journal_origin;
	/* The starting translated PBN of the reference counts */
	physical_block_number_t ref_counts_origin;

	/* The administrative state of the slab */
	struct admin_state state;
	/* The status of the slab */
	enum slab_rebuild_status status;
	/* Whether the slab was ever queued for scrubbing */
	bool was_queued_for_scrubbing;

	/* The priority at which this slab has been queued for allocation */
	u8 priority;

	/* Fields beyond this point are the reference counts for the data blocks in this slab. */
	/* The size of the counters array */
	u32 block_count;
	/* The number of free blocks */
	u32 free_blocks;
	/* The array of reference counts */
	vdo_refcount_t *counters; /* use vdo_allocate() to align data ptr */

	/* The saved block pointer and array indexes for the free block search */
	struct search_cursor search_cursor;

	/* A list of the dirty blocks waiting to be written out */
	struct vdo_wait_queue dirty_blocks;
	/* The number of blocks which are currently writing */
	size_t active_count;

	/* A waiter object for updating the slab summary */
	struct vdo_waiter summary_waiter;

	/* The latest slab journal for which there has been a reference count update */
	struct journal_point slab_journal_point;

	/* The number of reference count blocks */
	u32 reference_block_count;
	/* reference count block array */
	struct reference_block *reference_blocks;
};
/* The sequential steps of draining a block_allocator. */
enum block_allocator_drain_step {
	/* The drain has been initiated */
	VDO_DRAIN_ALLOCATOR_START,
	/* Drain the slab scrubber */
	VDO_DRAIN_ALLOCATOR_STEP_SCRUBBER,
	/* Drain the slabs themselves */
	VDO_DRAIN_ALLOCATOR_STEP_SLABS,
	/* Drain the slab summary */
	VDO_DRAIN_ALLOCATOR_STEP_SUMMARY,
	/* All drain steps are complete */
	VDO_DRAIN_ALLOCATOR_STEP_FINISHED,
};
/* The component of a block_allocator responsible for scrubbing unrecovered slabs. */
struct slab_scrubber {
	/* The queue of slabs to scrub first */
	struct list_head high_priority_slabs;
	/* The queue of slabs to scrub once there are no high_priority_slabs */
	struct list_head slabs;
	/* The queue of VIOs waiting for a slab to be scrubbed */
	struct vdo_wait_queue waiters;

	/*
	 * The number of slabs that are unrecovered or being scrubbed. This field is modified by
	 * the physical zone thread, but is queried by other threads.
	 */
	slab_count_t slab_count;

	/* The administrative state of the scrubber */
	struct admin_state admin_state;
	/* Whether to only scrub high-priority slabs */
	bool high_priority_only;
	/* The slab currently being scrubbed */
	struct vdo_slab *slab;
	/* The vio for loading slab journal blocks */
	struct vio vio;
};
/* A sub-structure for applying actions in parallel to all an allocator's slabs. */
/* A sub-structure for applying actions in parallel to all an allocator's slabs. */
struct slab_actor {
	/* The number of slabs still performing the current slab action */
	slab_count_t slab_action_count;
	/* The method to call when the slab action has been completed by all slabs */
	vdo_action_fn callback;
};
/* A slab_iterator is a structure for iterating over a set of slabs. */
struct slab_iterator {
	/* The array of slabs being iterated over */
	struct vdo_slab **slabs;
	/* The next slab the iterator will return, or NULL-equivalent when exhausted
	 * (NOTE(review): termination condition not visible here — confirm in the .c) */
	struct vdo_slab *next;
	/* The slab number at which iteration stops */
	slab_count_t end;
	/* The amount by which the slab index advances on each step */
	slab_count_t stride;
};
/*
* The slab_summary provides hints during load and recovery about the state of the slabs in order
* to avoid the need to read the slab journals in their entirety before a VDO can come online.
*
* The information in the summary for each slab includes the rough number of free blocks (which is
* used to prioritize scrubbing), the cleanliness of a slab (so that clean slabs containing free
* space will be used on restart), and the location of the tail block of the slab's journal.
*
* The slab_summary has its own partition at the end of the volume which is sized to allow for a
* complete copy of the summary for each of up to 16 physical zones.
*
* During resize, the slab_summary moves its backing partition and is saved once moved; the
* slab_summary is not permitted to overwrite the previous recovery journal space.
*
* The slab_summary does not have its own version information, but relies on the VDO volume version
* number.
*/
/*
 * A slab status is a very small structure for use in determining the ordering of slabs in the
 * scrubbing process.
 */
struct slab_status {
	/* The number of the slab this status describes */
	slab_count_t slab_number;
	/* Whether the slab is clean (does not need scrubbing) */
	bool is_clean;
	/* A rough measure of how empty the slab is, used to prioritize scrubbing */
	u8 emptiness;
};
/* One on-disk block's worth of slab summary entries, and the state for writing it. */
struct slab_summary_block {
	/* The block_allocator to which this block belongs */
	struct block_allocator *allocator;
	/* The index of this block in its zone's summary */
	block_count_t index;
	/* Whether this block has a write outstanding */
	bool writing;
	/* Ring of updates waiting on the outstanding write */
	struct vdo_wait_queue current_update_waiters;
	/* Ring of updates waiting on the next write */
	struct vdo_wait_queue next_update_waiters;
	/* The active slab_summary_entry array for this block */
	struct slab_summary_entry *entries;
	/* The vio used to write this block */
	struct vio vio;
	/* The packed entries, one block long, backing the vio */
	char *outgoing_entries;
};
/*
* The statistics for all the slab summary zones owned by this slab summary. These fields are all
* mutated only by their physical zone threads, but are read by other threads when gathering
* statistics for the entire depot.
*/
struct atomic_slab_summary_statistics {
	/* Number of slab summary blocks written (atomic: read cross-thread for stats) */
	atomic64_t blocks_written;
};
/* The per-physical-zone allocator of data blocks, and its portion of the slab summary. */
struct block_allocator {
	/* The completion embedded in this allocator (see vdo_as_block_allocator()) */
	struct vdo_completion completion;
	/* The slab depot for this allocator */
	struct slab_depot *depot;
	/* The nonce of the VDO */
	nonce_t nonce;
	/* The physical zone number of this allocator */
	zone_count_t zone_number;
	/* The thread ID for this allocator's physical zone */
	thread_id_t thread_id;
	/* The number of slabs in this allocator */
	slab_count_t slab_count;
	/* The number of the last slab owned by this allocator */
	slab_count_t last_slab;
	/* The reduced priority level used to preserve unopened slabs */
	unsigned int unopened_slab_priority;
	/* The state of this allocator */
	struct admin_state state;
	/* The actor for applying an action to all slabs */
	struct slab_actor slab_actor;

	/* The slab from which blocks are currently being allocated */
	struct vdo_slab *open_slab;
	/* A priority queue containing all slabs available for allocation */
	struct priority_table *prioritized_slabs;
	/* The slab scrubber */
	struct slab_scrubber scrubber;
	/* What phase of the close operation the allocator is to perform */
	enum block_allocator_drain_step drain_step;

	/*
	 * These statistics are all mutated only by the physical zone thread, but are read by other
	 * threads when gathering statistics for the entire depot.
	 */
	/*
	 * The count of allocated blocks in this zone. Not in block_allocator_statistics for
	 * historical reasons.
	 */
	u64 allocated_blocks;
	/* Statistics for this block allocator */
	struct block_allocator_statistics statistics;
	/* Cumulative statistics for the slab journals in this zone */
	struct slab_journal_statistics slab_journal_statistics;
	/* Cumulative statistics for the reference counters in this zone */
	struct ref_counts_statistics ref_counts_statistics;

	/*
	 * This is the head of a queue of slab journals which have entries in their tail blocks
	 * which have not yet started to commit. When the recovery journal is under space pressure,
	 * slab journals which have uncommitted entries holding a lock on the recovery journal head
	 * are forced to commit their blocks early. This list is kept in order, with the tail
	 * containing the slab journal holding the most recent recovery journal lock.
	 */
	struct list_head dirty_slab_journals;

	/* The vio pool for reading and writing block allocator metadata */
	struct vio_pool *vio_pool;
	/* The dm_kcopyd client for erasing slab journals */
	struct dm_kcopyd_client *eraser;
	/* Iterator over the slabs to be erased */
	struct slab_iterator slabs_to_erase;

	/* The portion of the slab summary managed by this allocator */
	/* The state of the slab summary */
	struct admin_state summary_state;
	/* The number of outstanding summary writes */
	block_count_t summary_write_count;
	/* The array (owned by the blocks) of all entries */
	struct slab_summary_entry *summary_entries;
	/* The array of slab_summary_blocks */
	struct slab_summary_block *summary_blocks;
};
/* Determines how slabs should be queued when the depot is loaded (see load_type below). */
enum slab_depot_load_type {
	/* A normal (clean) load */
	VDO_SLAB_DEPOT_NORMAL_LOAD,
	/* A load as part of recovery */
	VDO_SLAB_DEPOT_RECOVERY_LOAD,
	/* A load as part of a rebuild */
	VDO_SLAB_DEPOT_REBUILD_LOAD
};
/* The depot: all slabs, all block allocators, and the slab summary for the whole VDO. */
struct slab_depot {
	/* The number of physical zones (there is one block_allocator per zone) */
	zone_count_t zone_count;
	/* The zone count from a previous configuration (assumption — confirm in the .c) */
	zone_count_t old_zone_count;
	/* The vdo to which this depot belongs */
	struct vdo *vdo;
	/* The configuration of the slabs in this depot */
	struct slab_config slab_config;
	/* The manager used to apply actions across all zones of the depot */
	struct action_manager *action_manager;

	/* The first physical block managed by the depot */
	physical_block_number_t first_block;
	/* The block just past the last physical block managed by the depot */
	physical_block_number_t last_block;
	/* The origin of the depot's partition (assumption — confirm) */
	physical_block_number_t origin;

	/* slab_size == (1 << slab_size_shift) */
	unsigned int slab_size_shift;

	/* Determines how slabs should be queued during load */
	enum slab_depot_load_type load_type;

	/* The state for notifying slab journals to release recovery journal */
	sequence_number_t active_release_request;
	sequence_number_t new_release_request;

	/* State variables for scrubbing complete handling */
	atomic_t zones_to_scrub;

	/* Array of pointers to individually allocated slabs */
	struct vdo_slab **slabs;
	/* The number of slabs currently allocated and stored in 'slabs' */
	slab_count_t slab_count;

	/* Array of pointers to a larger set of slabs (used during resize) */
	struct vdo_slab **new_slabs;
	/* The number of slabs currently allocated and stored in 'new_slabs' */
	slab_count_t new_slab_count;
	/* The size that 'new_slabs' was allocated for */
	block_count_t new_size;

	/* The last block before resize, for rollback */
	physical_block_number_t old_last_block;
	/* The last block after resize, for resize */
	physical_block_number_t new_last_block;

	/* The statistics for the slab summary */
	struct atomic_slab_summary_statistics summary_statistics;
	/* The start of the slab summary partition */
	physical_block_number_t summary_origin;
	/* The number of bits to shift to get a 7-bit fullness hint */
	unsigned int hint_shift;
	/* The slab summary entries for all of the zones the partition can hold */
	struct slab_summary_entry *summary_entries;

	/* The block allocators for this depot */
	struct block_allocator allocators[];
};
struct reference_updater;

/* Attempt to replay a recovery journal entry into a slab's journal; used during recovery. */
bool __must_check vdo_attempt_replay_into_slab(struct vdo_slab *slab,
					       physical_block_number_t pbn,
					       enum journal_operation operation,
					       bool increment,
					       struct journal_point *recovery_point,
					       struct vdo_completion *parent);

/* Adjust the reference count of a block directly; used only during rebuild. */
int __must_check vdo_adjust_reference_count_for_rebuild(struct slab_depot *depot,
							physical_block_number_t pbn,
							enum journal_operation operation);
/**
 * vdo_as_block_allocator() - Convert a generic vdo_completion to the block_allocator
 *                            containing it.
 * @completion: The completion to convert; must have type VDO_BLOCK_ALLOCATOR_COMPLETION
 *              (asserted before the conversion).
 *
 * Return: The block_allocator in which @completion is embedded.
 */
static inline struct block_allocator *vdo_as_block_allocator(struct vdo_completion *completion)
{
	struct block_allocator *allocator;

	vdo_assert_completion_type(completion, VDO_BLOCK_ALLOCATOR_COMPLETION);
	allocator = container_of(completion, struct block_allocator, completion);
	return allocator;
}
/* Provisionally reference a block so it cannot be allocated while @lock is held. */
int __must_check vdo_acquire_provisional_reference(struct vdo_slab *slab,
						   physical_block_number_t pbn,
						   struct pbn_lock *lock);

/* Allocate a free block from the allocator's zone, returning its PBN. */
int __must_check vdo_allocate_block(struct block_allocator *allocator,
				    physical_block_number_t *block_number_ptr);

/* Queue a waiter to be notified when a clean slab becomes available. */
int vdo_enqueue_clean_slab_waiter(struct block_allocator *allocator,
				  struct vdo_waiter *waiter);

/* Apply a reference count update described by @updater. */
void vdo_modify_reference_count(struct vdo_completion *completion,
				struct reference_updater *updater);

/* Release a single reference on the block at @pbn. */
int __must_check vdo_release_block_reference(struct block_allocator *allocator,
					     physical_block_number_t pbn);

/* Notify the depot that all slab journals have been recovered. */
void vdo_notify_slab_journals_are_recovered(struct vdo_completion *completion);

/* Dump the state of an allocator to the log for debugging. */
void vdo_dump_block_allocator(const struct block_allocator *allocator);

/* Decode a saved slab_depot state into a new in-memory depot. */
int __must_check vdo_decode_slab_depot(struct slab_depot_state_2_0 state,
				       struct vdo *vdo,
				       struct partition *summary_partition,
				       struct slab_depot **depot_ptr);

/* Free a slab_depot and all of its components. */
void vdo_free_slab_depot(struct slab_depot *depot);

/* Record the current state of the depot for encoding into the super block. */
struct slab_depot_state_2_0 __must_check vdo_record_slab_depot(const struct slab_depot *depot);

/* Allocate the reference counters for all slabs in the depot. */
int __must_check vdo_allocate_reference_counters(struct slab_depot *depot);

/* Get the slab which contains @pbn. */
struct vdo_slab * __must_check vdo_get_slab(const struct slab_depot *depot,
					    physical_block_number_t pbn);

/* Get the maximum number of increments that may be applied to the block at @pbn. */
u8 __must_check vdo_get_increment_limit(struct slab_depot *depot,
					physical_block_number_t pbn);

/* Check whether @pbn is a valid data block address within the depot. */
bool __must_check vdo_is_physical_data_block(const struct slab_depot *depot,
					     physical_block_number_t pbn);

/* Get the total number of allocated data blocks across all zones. */
block_count_t __must_check vdo_get_slab_depot_allocated_blocks(const struct slab_depot *depot);

/* Get the total number of data blocks in the depot. */
block_count_t __must_check vdo_get_slab_depot_data_blocks(const struct slab_depot *depot);

/* Gather the depot's statistics into @stats. */
void vdo_get_slab_depot_statistics(const struct slab_depot *depot,
				   struct vdo_statistics *stats);

/* Asynchronously load the depot; must be called from the admin thread. */
void vdo_load_slab_depot(struct slab_depot *depot,
			 const struct admin_state_code *operation,
			 struct vdo_completion *parent, void *context);

/* Prepare the depot's allocators to allocate, queuing slabs according to @load_type. */
void vdo_prepare_slab_depot_to_allocate(struct slab_depot *depot,
					enum slab_depot_load_type load_type,
					struct vdo_completion *parent);

/* Update the depot's size after a resize. */
void vdo_update_slab_depot_size(struct slab_depot *depot);

/* Allocate the new slabs needed to grow the depot into @partition. */
int __must_check vdo_prepare_to_grow_slab_depot(struct slab_depot *depot,
						const struct partition *partition);

/* Put the previously prepared new slabs into service. */
void vdo_use_new_slabs(struct slab_depot *depot, struct vdo_completion *parent);

/* Discard any new slabs prepared for a resize which did not complete. */
void vdo_abandon_new_slabs(struct slab_depot *depot);

/* Asynchronously drain the depot; must be launched from the admin thread. */
void vdo_drain_slab_depot(struct slab_depot *depot,
			  const struct admin_state_code *operation,
			  struct vdo_completion *parent);

/* Asynchronously resume a drained depot. */
void vdo_resume_slab_depot(struct slab_depot *depot, struct vdo_completion *parent);

/* Ask slab journals to commit tail blocks locking @recovery_block_number in the recovery
 * journal; scheduled from the recovery journal thread (see file header comment). */
void vdo_commit_oldest_slab_journal_tail_blocks(struct slab_depot *depot,
						sequence_number_t recovery_block_number);

/* Asynchronously scrub all slabs which require scrubbing. */
void vdo_scrub_all_unrecovered_slabs(struct slab_depot *depot,
				     struct vdo_completion *parent);

/* Dump the state of the depot to the log for debugging. */
void vdo_dump_slab_depot(const struct slab_depot *depot);
#endif /* VDO_SLAB_DEPOT_H */
|