/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2023 Red Hat
 */

#ifndef VDO_BLOCK_MAP_H
#define VDO_BLOCK_MAP_H

#include <linux/list.h>

#include "numeric.h"

#include "admin-state.h"
#include "completion.h"
#include "encodings.h"
#include "int-map.h"
#include "statistics.h"
#include "types.h"
#include "vio.h"
#include "wait-queue.h"

/*
* The block map is responsible for tracking all the logical to physical mappings of a VDO. It
* consists of a collection of 60 radix trees gradually allocated as logical addresses are used.
* Each tree is assigned to a logical zone such that it is easy to compute which zone must handle
* each logical address. Each logical zone also has a dedicated portion of the leaf page cache.
*
* Each logical zone has a single dedicated queue and thread for performing all updates to the
* radix trees assigned to that zone. The concurrency guarantees of this single-threaded model
* allow the code to omit more fine-grained locking for the block map structures.
*
* Load operations must be performed on the admin thread. Normal operations, such as reading and
* updating mappings, must be performed on the appropriate logical zone thread. Save operations
* must be launched from the same admin thread as the original load operation.
*/
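
/*
 * An illustrative sketch (an assumption based on the declarations below, not
 * code from this file) of how an operation is routed to its logical zone
 * thread, after which the single-threaded zone model makes finer-grained
 * locking unnecessary:
 *
 *	zone_count_t zone_number = vdo_compute_logical_zone(data_vio);
 *	thread_id_t thread_id = block_map->zones[zone_number].thread_id;
 *	// enqueue the operation on thread_id; all reads and updates of that
 *	// zone's trees and page cache then run serialized on that thread
 */
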
enum {
BLOCK_MAP_VIO_POOL_SIZE = 64,
};

/*
* Generation counter for page references.
*/
typedef u32 vdo_page_generation;

extern const struct block_map_entry UNMAPPED_BLOCK_MAP_ENTRY;

/* The VDO Page Cache abstraction. */
struct vdo_page_cache {
/* the VDO which owns this cache */
struct vdo *vdo;
/* number of pages in cache */
page_count_t page_count;
/* number of pages to write in the current batch */
page_count_t pages_in_batch;
/* Whether the VDO is doing a read-only rebuild */
bool rebuilding;
/* array of page information entries */
struct page_info *infos;
/* raw memory for pages */
char *pages;
	/* cache of the most recently found page info */
struct page_info *last_found;
/* map of page number to info */
struct int_map *page_map;
/* main LRU list (all infos) */
struct list_head lru_list;
/* free page list (oldest first) */
struct list_head free_list;
/* outgoing page list */
struct list_head outgoing_list;
/* number of read I/O operations pending */
page_count_t outstanding_reads;
/* number of write I/O operations pending */
page_count_t outstanding_writes;
/* number of pages covered by the current flush */
page_count_t pages_in_flush;
/* number of pages waiting to be included in the next flush */
page_count_t pages_to_flush;
/* number of discards in progress */
unsigned int discard_count;
	/* how many VPCs are waiting for a free page */
unsigned int waiter_count;
/* queue of waiters who want a free page */
struct vdo_wait_queue free_waiters;
/*
* Statistics are only updated on the logical zone thread, but are accessed from other
* threads.
*/
struct block_map_statistics stats;
/* counter for pressure reports */
u32 pressure_report;
/* the block map zone to which this cache belongs */
struct block_map_zone *zone;
};

/*
 * The state of a page buffer. If the page buffer is free, no particular page is bound to it;
 * otherwise it is bound to a particular page whose absolute pbn is in the pbn field. If the page
 * is resident or dirty, the page data is stable and may be accessed. Otherwise, the page is in
 * flight (incoming or outgoing) and its data should not be accessed.
*
* @note Update the static data in get_page_state_name() if you change this enumeration.
*/
enum vdo_page_buffer_state {
/* this page buffer is not being used */
PS_FREE,
/* this page is being read from store */
PS_INCOMING,
/* attempt to load this page failed */
PS_FAILED,
	/* this page is valid and unmodified */
PS_RESIDENT,
/* this page is valid and modified */
PS_DIRTY,
/* this page is being written and should not be used */
PS_OUTGOING,
/* not a state */
PAGE_STATE_COUNT,
} __packed;
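
/*
 * Typical transitions (an illustrative sketch inferred from the states above,
 * not a normative diagram): PS_FREE -> PS_INCOMING when a load is launched;
 * PS_INCOMING -> PS_RESIDENT on success, or PS_FAILED on a read error;
 * PS_RESIDENT -> PS_DIRTY when the page is modified; PS_DIRTY -> PS_OUTGOING
 * while the page is being written, returning to PS_RESIDENT on completion.
 */
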
/*
 * The write status of a page
*/
enum vdo_page_write_status {
WRITE_STATUS_NORMAL,
WRITE_STATUS_DISCARD,
WRITE_STATUS_DEFERRED,
} __packed;

/* Per-page-slot information. */
struct page_info {
/* Preallocated page struct vio */
struct vio *vio;
/* back-link for references */
struct vdo_page_cache *cache;
/* the pbn of the page */
physical_block_number_t pbn;
/* page is busy (temporarily locked) */
u16 busy;
	/* the write status of the page */
enum vdo_page_write_status write_status;
/* page state */
enum vdo_page_buffer_state state;
/* queue of completions awaiting this item */
struct vdo_wait_queue waiting;
/* state linked list entry */
struct list_head state_entry;
/* LRU entry */
struct list_head lru_entry;
/*
* The earliest recovery journal block containing uncommitted updates to the block map page
* associated with this page_info. A reference (lock) is held on that block to prevent it
* from being reaped. When this value changes, the reference on the old value must be
* released and a reference on the new value must be acquired.
*/
sequence_number_t recovery_lock;
};
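
/*
 * A sketch of the recovery_lock protocol described above (the helper names
 * here are hypothetical; the real reference counting is done by the block map
 * and recovery journal implementations):
 *
 *	if (new_lock != info->recovery_lock) {
 *		release_journal_block_reference(info->recovery_lock);
 *		acquire_journal_block_reference(new_lock);
 *		info->recovery_lock = new_lock;
 *	}
 */
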
/*
* A completion awaiting a specific page. Also a live reference into the page once completed, until
* freed.
*/
struct vdo_page_completion {
/* The generic completion */
struct vdo_completion completion;
/* The cache involved */
struct vdo_page_cache *cache;
/* The waiter for the pending list */
struct vdo_waiter waiter;
/* The absolute physical block number of the page on disk */
physical_block_number_t pbn;
/* Whether the page may be modified */
bool writable;
/* Whether the page is available */
bool ready;
/* The info structure for the page, only valid when ready */
struct page_info *info;
};

struct forest;

struct tree_page {
struct vdo_waiter waiter;
/* Dirty list entry */
struct list_head entry;
/* If dirty, the tree zone flush generation in which it was last dirtied. */
u8 generation;
/* Whether this page is an interior tree page being written out. */
bool writing;
/* If writing, the tree zone flush generation of the copy being written. */
u8 writing_generation;
/*
* Sequence number of the earliest recovery journal block containing uncommitted updates to
* this page
*/
sequence_number_t recovery_lock;
	/* The value of recovery_lock when this page last started writing */
sequence_number_t writing_recovery_lock;
char page_buffer[VDO_BLOCK_SIZE];
};

enum block_map_page_type {
VDO_TREE_PAGE,
VDO_CACHE_PAGE,
};

typedef struct list_head dirty_era_t[2];

struct dirty_lists {
/* The number of periods after which an element will be expired */
block_count_t maximum_age;
/* The oldest period which has unexpired elements */
sequence_number_t oldest_period;
/* One more than the current period */
sequence_number_t next_period;
/* The offset in the array of lists of the oldest period */
block_count_t offset;
/* Expired pages */
dirty_era_t expired;
/* The lists of dirty pages */
dirty_era_t eras[];
};
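
/*
 * A sketch of how the ring of eras above might be consumed (an assumption;
 * the authoritative logic lives in the block map implementation, not here):
 * with offset tracking where oldest_period currently falls in the array, the
 * dirty list for a live period would plausibly be found at
 *
 *	eras[period % maximum_age]
 *
 * while pages from periods older than oldest_period are spliced onto the
 * expired lists as the window advances.
 */
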
struct block_map_zone {
zone_count_t zone_number;
thread_id_t thread_id;
struct admin_state state;
struct block_map *block_map;
	/* Dirty pages, by era */
struct dirty_lists *dirty_lists;
struct vdo_page_cache page_cache;
data_vio_count_t active_lookups;
struct int_map *loading_pages;
struct vio_pool *vio_pool;
/* The tree page which has issued or will be issuing a flush */
struct tree_page *flusher;
struct vdo_wait_queue flush_waiters;
/* The generation after the most recent flush */
u8 generation;
u8 oldest_generation;
/* The counts of dirty pages in each generation */
u32 dirty_page_counts[256];
};

struct block_map {
struct vdo *vdo;
struct action_manager *action_manager;
/* The absolute PBN of the first root of the tree part of the block map */
physical_block_number_t root_origin;
block_count_t root_count;
/* The era point we are currently distributing to the zones */
sequence_number_t current_era_point;
/* The next era point */
sequence_number_t pending_era_point;
	/* The number of entries in the block map */
block_count_t entry_count;
nonce_t nonce;
struct recovery_journal *journal;
/* The trees for finding block map pages */
struct forest *forest;
/* The expanded trees awaiting growth */
struct forest *next_forest;
/* The number of entries after growth */
block_count_t next_entry_count;
zone_count_t zone_count;
struct block_map_zone zones[];
};

/**
* typedef vdo_entry_callback_fn - A function to be called for each allocated PBN when traversing
* the forest.
* @pbn: A PBN of a tree node.
* @completion: The parent completion of the traversal.
*
* Return: VDO_SUCCESS or an error.
*/
typedef int (*vdo_entry_callback_fn)(physical_block_number_t pbn,
				     struct vdo_completion *completion);

static inline struct vdo_page_completion *as_vdo_page_completion(struct vdo_completion *completion)
{
vdo_assert_completion_type(completion, VDO_PAGE_COMPLETION);
return container_of(completion, struct vdo_page_completion, completion);
}

void vdo_release_page_completion(struct vdo_completion *completion);

void vdo_get_page(struct vdo_page_completion *page_completion,
		  struct block_map_zone *zone, physical_block_number_t pbn,
		  bool writable, void *parent, vdo_action_fn callback,
		  vdo_action_fn error_handler, bool requeue);
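
/*
 * An illustrative sketch (an assumption, not code from the VDO sources) of
 * the get/use/release pattern these functions support; the callback name is
 * hypothetical and runs once the requested page is available:
 *
 *	static void my_page_ready(struct vdo_completion *completion)
 *	{
 *		struct block_map_page *page;
 *
 *		if (vdo_get_cached_page(completion, &page) == VDO_SUCCESS) {
 *			// ... examine or, if requested writable, modify ...
 *			vdo_request_page_write(completion);
 *		}
 *
 *		// Drop the reference acquired by vdo_get_page().
 *		vdo_release_page_completion(completion);
 *	}
 */
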
void vdo_request_page_write(struct vdo_completion *completion);

int __must_check vdo_get_cached_page(struct vdo_completion *completion,
				     struct block_map_page **page_ptr);

int __must_check vdo_invalidate_page_cache(struct vdo_page_cache *cache);

static inline struct block_map_page * __must_check
vdo_as_block_map_page(struct tree_page *tree_page)
{
	return (struct block_map_page *) tree_page->page_buffer;
}

bool vdo_copy_valid_page(char *buffer, nonce_t nonce,
			 physical_block_number_t pbn,
			 struct block_map_page *page);

void vdo_find_block_map_slot(struct data_vio *data_vio);

physical_block_number_t vdo_find_block_map_page_pbn(struct block_map *map,
						    page_number_t page_number);

void vdo_write_tree_page(struct tree_page *page, struct block_map_zone *zone);

void vdo_traverse_forest(struct block_map *map, vdo_entry_callback_fn callback,
			 struct vdo_completion *completion);

int __must_check vdo_decode_block_map(struct block_map_state_2_0 state,
				      block_count_t logical_blocks, struct vdo *vdo,
				      struct recovery_journal *journal, nonce_t nonce,
				      page_count_t cache_size, block_count_t maximum_age,
				      struct block_map **map_ptr);

void vdo_drain_block_map(struct block_map *map, const struct admin_state_code *operation,
			 struct vdo_completion *parent);

void vdo_resume_block_map(struct block_map *map, struct vdo_completion *parent);

int __must_check vdo_prepare_to_grow_block_map(struct block_map *map,
					       block_count_t new_logical_blocks);

void vdo_grow_block_map(struct block_map *map, struct vdo_completion *parent);

void vdo_abandon_block_map_growth(struct block_map *map);

void vdo_free_block_map(struct block_map *map);

struct block_map_state_2_0 __must_check vdo_record_block_map(const struct block_map *map);

void vdo_initialize_block_map_from_journal(struct block_map *map,
					   struct recovery_journal *journal);

zone_count_t vdo_compute_logical_zone(struct data_vio *data_vio);

void vdo_advance_block_map_era(struct block_map *map,
			       sequence_number_t recovery_block_number);

void vdo_update_block_map_page(struct block_map_page *page, struct data_vio *data_vio,
			       physical_block_number_t pbn,
			       enum block_mapping_state mapping_state,
			       sequence_number_t *recovery_lock);

void vdo_get_mapped_block(struct data_vio *data_vio);

void vdo_put_mapped_block(struct data_vio *data_vio);

struct block_map_statistics __must_check vdo_get_block_map_statistics(struct block_map *map);

/**
* vdo_convert_maximum_age() - Convert the maximum age to reflect the new recovery journal format
* @age: The configured maximum age
*
* Return: The converted age
*
* In the old recovery journal format, each journal block held 311 entries, and every write bio
* made two entries. The old maximum age was half the usable journal length. In the new format,
* each block holds only 217 entries, but each bio only makes one entry. We convert the configured
* age so that the number of writes in a block map era is the same in the old and new formats. This
* keeps the bound on the amount of work required to recover the block map from the recovery
* journal the same across the format change. It also keeps the amortization of block map page
* writes to write bios the same.
*/
static inline block_count_t vdo_convert_maximum_age(block_count_t age)
{
return DIV_ROUND_UP(age * RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK,
2 * RECOVERY_JOURNAL_ENTRIES_PER_BLOCK);
}
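
/*
 * A worked example using the entry counts quoted above: a configured age of
 * 10 old-format blocks covers 10 * 311 = 3110 entries, i.e. 1555 write bios.
 * At one entry per bio in the new format, that requires
 * DIV_ROUND_UP(3110, 2 * 217) = DIV_ROUND_UP(3110, 434) = 8 blocks, so the
 * converted age is 8.
 */
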
#endif /* VDO_BLOCK_MAP_H */