From 3afb00d3f86d3d924f88b56fa8285d4e9db85852 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 7 Aug 2024 15:17:52 +0200 Subject: Merging upstream version 6.10.3. Signed-off-by: Daniel Baumann --- fs/btrfs/extent_map.c | 381 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 311 insertions(+), 70 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 24a048210b..b4c9a6aa11 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -8,6 +8,7 @@ #include "extent_map.h" #include "compression.h" #include "btrfs_inode.h" +#include "disk-io.h" static struct kmem_cache *extent_map_cache; @@ -76,6 +77,14 @@ static u64 range_end(u64 start, u64 len) return start + len; } +static void dec_evictable_extent_maps(struct btrfs_inode *inode) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + + if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(inode->root))) + percpu_counter_dec(&fs_info->evictable_extent_maps); +} + static int tree_insert(struct rb_root_cached *root, struct extent_map *em) { struct rb_node **p = &root->rb_root.rb_node; @@ -223,8 +232,9 @@ static bool mergeable_maps(const struct extent_map *prev, const struct extent_ma return next->block_start == prev->block_start; } -static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) +static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) { + struct extent_map_tree *tree = &inode->extent_tree; struct extent_map *merge = NULL; struct rb_node *rb; @@ -252,14 +262,13 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->len += merge->len; em->block_len += merge->block_len; em->block_start = merge->block_start; - em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; - em->mod_start = merge->mod_start; em->generation = max(em->generation, merge->generation); em->flags |= EXTENT_FLAG_MERGED; rb_erase_cached(&merge->rb_node, &tree->map); RB_CLEAR_NODE(&merge->rb_node); free_extent_map(merge); + dec_evictable_extent_maps(inode); } } @@ -271,10 +280,10 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->block_len += merge->block_len; rb_erase_cached(&merge->rb_node, &tree->map); RB_CLEAR_NODE(&merge->rb_node); - em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; em->generation = max(em->generation, merge->generation); em->flags |= EXTENT_FLAG_MERGED; free_extent_map(merge); + dec_evictable_extent_maps(inode); } } @@ -300,7 +309,6 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) struct extent_map_tree *tree = &inode->extent_tree; int ret = 0; struct extent_map *em; - bool prealloc = false; write_lock(&tree->lock); em = lookup_extent_mapping(tree, start, len); @@ -325,20 +333,8 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) em->generation = gen; em->flags &= ~EXTENT_FLAG_PINNED; - em->mod_start = em->start; - em->mod_len = em->len; - if (em->flags & EXTENT_FLAG_FILLING) { - prealloc = true; - em->flags &= ~EXTENT_FLAG_FILLING; - } - - try_merge_map(tree, em); - - if (prealloc) { - em->mod_start = em->start; - em->mod_len = em->len; - } + try_merge_map(inode, em); out: write_unlock(&tree->lock); @@ -347,58 +343,62 @@ out: } -void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) +void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em) { - lockdep_assert_held_write(&tree->lock); + lockdep_assert_held_write(&inode->extent_tree.lock); em->flags &= ~EXTENT_FLAG_LOGGING; if (extent_map_in_tree(em)) - try_merge_map(tree, em); + try_merge_map(inode, em); } -static inline void setup_extent_mapping(struct extent_map_tree *tree, +static inline void setup_extent_mapping(struct btrfs_inode *inode, struct extent_map *em, int modified) { refcount_inc(&em->refs); - em->mod_start = em->start; - em->mod_len = em->len; ASSERT(list_empty(&em->list)); if (modified) - list_add(&em->list, &tree->modified_extents); + list_add(&em->list, &inode->extent_tree.modified_extents); else - try_merge_map(tree, em); + try_merge_map(inode, em); } /* - * Add new extent map to the extent tree + * Add a new extent map to an inode's extent map tree. * - * @tree: tree to insert new map in + * @inode: the target inode * @em: map to insert * @modified: indicate whether the given @em should be added to the * modified list, which indicates the extent needs to be logged * - * Insert @em into @tree or perform a simple forward/backward merge with - * existing mappings. The extent_map struct passed in will be inserted - * into the tree directly, with an additional reference taken, or a - * reference dropped if the merge attempt was successful. + * Insert @em into the @inode's extent map tree or perform a simple + * forward/backward merge with existing mappings. The extent_map struct passed + * in will be inserted into the tree directly, with an additional reference + * taken, or a reference dropped if the merge attempt was successful. */ -static int add_extent_mapping(struct extent_map_tree *tree, +static int add_extent_mapping(struct btrfs_inode *inode, struct extent_map *em, int modified) { - int ret = 0; + struct extent_map_tree *tree = &inode->extent_tree; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; lockdep_assert_held_write(&tree->lock); ret = tree_insert(&tree->map, em); if (ret) - goto out; + return ret; - setup_extent_mapping(tree, em, modified); -out: - return ret; + setup_extent_mapping(inode, em, modified); + + if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(root))) + percpu_counter_inc(&fs_info->evictable_extent_maps); + + return 0; } static struct extent_map * @@ -464,16 +464,18 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree, } /* - * Remove an extent_map from the extent tree. + * Remove an extent_map from its inode's extent tree. * - * @tree: extent tree to remove from + * @inode: the inode the extent map belongs to * @em: extent map being removed * - * Remove @em from @tree. No reference counts are dropped, and no checks - * are done to see if the range is in use. + * Remove @em from the extent tree of @inode. No reference counts are dropped, + * and no checks are done to see if the range is in use. */ -void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) +void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em) { + struct extent_map_tree *tree = &inode->extent_tree; + lockdep_assert_held_write(&tree->lock); WARN_ON(em->flags & EXTENT_FLAG_PINNED); @@ -481,13 +483,17 @@ void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) if (!(em->flags & EXTENT_FLAG_LOGGING)) list_del_init(&em->list); RB_CLEAR_NODE(&em->rb_node); + + dec_evictable_extent_maps(inode); } -static void replace_extent_mapping(struct extent_map_tree *tree, +static void replace_extent_mapping(struct btrfs_inode *inode, struct extent_map *cur, struct extent_map *new, int modified) { + struct extent_map_tree *tree = &inode->extent_tree; + lockdep_assert_held_write(&tree->lock); WARN_ON(cur->flags & EXTENT_FLAG_PINNED); @@ -497,7 +503,7 @@ static void replace_extent_mapping(struct extent_map_tree *tree, rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map); RB_CLEAR_NODE(&cur->rb_node); - setup_extent_mapping(tree, new, modified); + setup_extent_mapping(inode, new, modified); } static struct extent_map *next_extent_map(const struct extent_map *em) @@ -526,7 +532,7 @@ static struct extent_map *prev_extent_map(struct extent_map *em) * and an extent that you want to insert, deal with overlap and insert * the best fitted new extent into the tree. */ -static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, +static noinline int merge_extent_mapping(struct btrfs_inode *inode, struct extent_map *existing, struct extent_map *em, u64 map_start) @@ -560,14 +566,13 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, em->block_start += start_diff; em->block_len = em->len; } - return add_extent_mapping(em_tree, em, 0); + return add_extent_mapping(inode, em, 0); } /* - * Add extent mapping into em_tree. + * Add extent mapping into an inode's extent map tree. * - * @fs_info: the filesystem - * @em_tree: extent tree into which we want to insert the extent mapping + * @inode: target inode * @em_in: extent we are inserting * @start: start of the logical range btrfs_get_extent() is requesting * @len: length of the logical range btrfs_get_extent() is requesting @@ -575,8 +580,8 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, * Note that @em_in's range may be different from [start, start+len), * but they must be overlapped. * - * Insert @em_in into @em_tree. In case there is an overlapping range, handle - * the -EEXIST by either: + * Insert @em_in into the inode's extent map tree. In case there is an + * overlapping range, handle the -EEXIST by either: * a) Returning the existing extent in @em_in if @start is within the * existing em. * b) Merge the existing extent with @em_in passed in. @@ -584,12 +589,12 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, * Return 0 on success, otherwise -EEXIST. * */ -int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, - struct extent_map_tree *em_tree, +int btrfs_add_extent_mapping(struct btrfs_inode *inode, struct extent_map **em_in, u64 start, u64 len) { int ret; struct extent_map *em = *em_in; + struct btrfs_fs_info *fs_info = inode->root->fs_info; /* * Tree-checker should have rejected any inline extent with non-zero @@ -598,7 +603,7 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, if (em->block_start == EXTENT_MAP_INLINE) ASSERT(em->start == 0); - ret = add_extent_mapping(em_tree, em, 0); + ret = add_extent_mapping(inode, em, 0); /* it is possible that someone inserted the extent into the tree * while we had the lock dropped. It is also possible that * an overlapping map exists in the tree @@ -606,7 +611,7 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, if (ret == -EEXIST) { struct extent_map *existing; - existing = search_extent_mapping(em_tree, start, len); + existing = search_extent_mapping(&inode->extent_tree, start, len); trace_btrfs_handle_em_exist(fs_info, existing, em, start, len); @@ -627,8 +632,7 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, * The existing extent map is the one nearest to * the [start, start + len) range which overlaps */ - ret = merge_extent_mapping(em_tree, existing, - em, start); + ret = merge_extent_mapping(inode, existing, em, start); if (WARN_ON(ret)) { free_extent_map(em); *em_in = NULL; @@ -650,8 +654,10 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, * if needed. This avoids searching the tree, from the root down to the first * extent map, before each deletion. */ -static void drop_all_extent_maps_fast(struct extent_map_tree *tree) +static void drop_all_extent_maps_fast(struct btrfs_inode *inode) { + struct extent_map_tree *tree = &inode->extent_tree; + write_lock(&tree->lock); while (!RB_EMPTY_ROOT(&tree->map.rb_root)) { struct extent_map *em; @@ -660,7 +666,7 @@ static void drop_all_extent_maps_fast(struct extent_map_tree *tree) node = rb_first_cached(&tree->map); em = rb_entry(node, struct extent_map, rb_node); em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING); - remove_extent_mapping(tree, em); + remove_extent_mapping(inode, em); free_extent_map(em); cond_resched_rwlock_write(&tree->lock); } @@ -693,7 +699,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, WARN_ON(end < start); if (end == (u64)-1) { if (start == 0 && !skip_pinned) { - drop_all_extent_maps_fast(em_tree); + drop_all_extent_maps_fast(inode); return; } len = (u64)-1; @@ -790,7 +796,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, split->generation = gen; split->flags = flags; - replace_extent_mapping(em_tree, em, split, modified); + replace_extent_mapping(inode, em, split, modified); free_extent_map(split); split = split2; split2 = NULL; @@ -831,13 +837,11 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, } if (extent_map_in_tree(em)) { - replace_extent_mapping(em_tree, em, split, - modified); + replace_extent_mapping(inode, em, split, modified); } else { int ret; - ret = add_extent_mapping(em_tree, split, - modified); + ret = add_extent_mapping(inode, split, modified); /* Logic error, shouldn't happen. */ ASSERT(ret == 0); if (WARN_ON(ret != 0) && modified) @@ -872,7 +876,7 @@ remove_em: ASSERT(!split); btrfs_set_inode_full_sync(inode); } - remove_extent_mapping(em_tree, em); + remove_extent_mapping(inode, em); } /* @@ -927,7 +931,7 @@ int btrfs_replace_extent_map_range(struct btrfs_inode *inode, do { btrfs_drop_extent_map_range(inode, new_em->start, end, false); write_lock(&tree->lock); - ret = add_extent_mapping(tree, new_em, modified); + ret = add_extent_mapping(inode, new_em, modified); write_unlock(&tree->lock); } while (ret == -EEXIST); @@ -991,7 +995,7 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, split_pre->flags = flags; split_pre->generation = em->generation; - replace_extent_mapping(em_tree, em, split_pre, 1); + replace_extent_mapping(inode, em, split_pre, 1); /* * Now we only have an extent_map at: @@ -1008,7 +1012,7 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, split_mid->ram_bytes = split_mid->len; split_mid->flags = flags; split_mid->generation = em->generation; - add_extent_mapping(em_tree, split_mid, 1); + add_extent_mapping(inode, split_mid, 1); /* Once for us */ free_extent_map(em); @@ -1023,3 +1027,240 @@ out_free_pre: free_extent_map(split_pre); return ret; } + +struct btrfs_em_shrink_ctx { + long nr_to_scan; + long scanned; + u64 last_ino; + u64 last_root; +}; + +static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx) +{ + const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info); + struct extent_map_tree *tree = &inode->extent_tree; + long nr_dropped = 0; + struct rb_node *node; + + /* + * Take the mmap lock so that we serialize with the inode logging phase + * of fsync because we may need to set the full sync flag on the inode, + * in case we have to remove extent maps in the tree's list of modified + * extents. If we set the full sync flag in the inode while an fsync is + * in progress, we may risk missing new extents because before the flag + * is set, fsync decides to only wait for writeback to complete and then + * during inode logging it sees the flag set and uses the subvolume tree + * to find new extents, which may not be there yet because ordered + * extents haven't completed yet. + * + * We also do a try lock because otherwise we could deadlock. This is + * because the shrinker for this filesystem may be invoked while we are + * in a path that is holding the mmap lock in write mode. For example in + * a reflink operation while COWing an extent buffer, when allocating + * pages for a new extent buffer and under memory pressure, the shrinker + * may be invoked, and therefore we would deadlock by attempting to read + * lock the mmap lock while we are holding already a write lock on it. + */ + if (!down_read_trylock(&inode->i_mmap_lock)) + return 0; + + /* + * We want to be fast because we can be called from any path trying to + * allocate memory, so if the lock is busy we don't want to spend time + * waiting for it - either some task is about to do IO for the inode or + * we may have another task shrinking extent maps, here in this code, so + * skip this inode. + */ + if (!write_trylock(&tree->lock)) { + up_read(&inode->i_mmap_lock); + return 0; + } + + node = rb_first_cached(&tree->map); + while (node) { + struct extent_map *em; + + em = rb_entry(node, struct extent_map, rb_node); + node = rb_next(node); + ctx->scanned++; + + if (em->flags & EXTENT_FLAG_PINNED) + goto next; + + /* + * If the inode is in the list of modified extents (new) and its + * generation is the same (or is greater than) the current fs + * generation, it means it was not yet persisted so we have to + * set the full sync flag so that the next fsync will not miss + * it. + */ + if (!list_empty(&em->list) && em->generation >= cur_fs_gen) + btrfs_set_inode_full_sync(inode); + + remove_extent_mapping(inode, em); + trace_btrfs_extent_map_shrinker_remove_em(inode, em); + /* Drop the reference for the tree. */ + free_extent_map(em); + nr_dropped++; +next: + if (ctx->scanned >= ctx->nr_to_scan) + break; + + /* + * Stop if we need to reschedule or there's contention on the + * lock. This is to avoid slowing other tasks trying to take the + * lock and because the shrinker might be called during a memory + * allocation path and we want to avoid taking a very long time + * and slowing down all sorts of tasks. + */ + if (need_resched() || rwlock_needbreak(&tree->lock)) + break; + } + write_unlock(&tree->lock); + up_read(&inode->i_mmap_lock); + + return nr_dropped; +} + +static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx) +{ + struct btrfs_inode *inode; + long nr_dropped = 0; + u64 min_ino = ctx->last_ino + 1; + + inode = btrfs_find_first_inode(root, min_ino); + while (inode) { + nr_dropped += btrfs_scan_inode(inode, ctx); + + min_ino = btrfs_ino(inode) + 1; + ctx->last_ino = btrfs_ino(inode); + btrfs_add_delayed_iput(inode); + + if (ctx->scanned >= ctx->nr_to_scan) + break; + + /* + * We may be called from memory allocation paths, so we don't + * want to take too much time and slowdown tasks. + */ + if (need_resched()) + break; + + inode = btrfs_find_first_inode(root, min_ino); + } + + if (inode) { + /* + * There are still inodes in this root or we happened to process + * the last one and reached the scan limit. In either case set + * the current root to this one, so we'll resume from the next + * inode if there is one or we will find out this was the last + * one and move to the next root. + */ + ctx->last_root = btrfs_root_id(root); + } else { + /* + * No more inodes in this root, set extent_map_shrinker_last_ino to 0 so + * that when processing the next root we start from its first inode. + */ + ctx->last_ino = 0; + ctx->last_root = btrfs_root_id(root) + 1; + } + + return nr_dropped; +} + +long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) +{ + struct btrfs_em_shrink_ctx ctx; + u64 start_root_id; + u64 next_root_id; + bool cycled = false; + long nr_dropped = 0; + + ctx.scanned = 0; + ctx.nr_to_scan = nr_to_scan; + + /* + * In case we have multiple tasks running this shrinker, make the next + * one start from the next inode in case it starts before we finish. + */ + spin_lock(&fs_info->extent_map_shrinker_lock); + ctx.last_ino = fs_info->extent_map_shrinker_last_ino; + fs_info->extent_map_shrinker_last_ino++; + ctx.last_root = fs_info->extent_map_shrinker_last_root; + spin_unlock(&fs_info->extent_map_shrinker_lock); + + start_root_id = ctx.last_root; + next_root_id = ctx.last_root; + + if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) { + s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); + + trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, + nr, ctx.last_root, + ctx.last_ino); + } + + /* + * We may be called from memory allocation paths, so we don't want to + * take too much time and slowdown tasks, so stop if we need reschedule. + */ + while (ctx.scanned < ctx.nr_to_scan && !need_resched()) { + struct btrfs_root *root; + unsigned long count; + + spin_lock(&fs_info->fs_roots_radix_lock); + count = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)&root, + (unsigned long)next_root_id, 1); + if (count == 0) { + spin_unlock(&fs_info->fs_roots_radix_lock); + if (start_root_id > 0 && !cycled) { + next_root_id = 0; + ctx.last_root = 0; + ctx.last_ino = 0; + cycled = true; + continue; + } + break; + } + next_root_id = btrfs_root_id(root) + 1; + root = btrfs_grab_root(root); + spin_unlock(&fs_info->fs_roots_radix_lock); + + if (!root) + continue; + + if (is_fstree(btrfs_root_id(root))) + nr_dropped += btrfs_scan_root(root, &ctx); + + btrfs_put_root(root); + } + + /* + * In case of multiple tasks running this extent map shrinking code this + * isn't perfect but it's simple and silences things like KCSAN. It's + * not possible to know which task made more progress because we can + * cycle back to the first root and first inode if it's not the first + * time the shrinker ran, see the above logic. Also a task that started + * later may finish ealier than another task and made less progress. So + * make this simple and update to the progress of the last task that + * finished, with the occasional possiblity of having two consecutive + * runs of the shrinker process the same inodes. + */ + spin_lock(&fs_info->extent_map_shrinker_lock); + fs_info->extent_map_shrinker_last_ino = ctx.last_ino; + fs_info->extent_map_shrinker_last_root = ctx.last_root; + spin_unlock(&fs_info->extent_map_shrinker_lock); + + if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) { + s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); + + trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, + nr, ctx.last_root, + ctx.last_ino); + } + + return nr_dropped; +} -- cgit v1.2.3