 fs/btrfs/extent_map.c | 381 ++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 311 insertions(+), 70 deletions(-)
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 24a048210b..b4c9a6aa11 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -8,6 +8,7 @@
#include "extent_map.h"
#include "compression.h"
#include "btrfs_inode.h"
+#include "disk-io.h"
static struct kmem_cache *extent_map_cache;
@@ -76,6 +77,14 @@ static u64 range_end(u64 start, u64 len)
return start + len;
}
+static void dec_evictable_extent_maps(struct btrfs_inode *inode)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+
+ if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(inode->root)))
+ percpu_counter_dec(&fs_info->evictable_extent_maps);
+}
+
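A minimal sketch of the assumed lifecycle of the counter this helper decrements (the init/destroy calls live outside this file and are not part of this diff; the wrapper names are illustrative):

	#include <linux/percpu_counter.h>

	/* Mount path (assumed): create the per-CPU counter at zero. */
	static int extent_map_counter_init(struct btrfs_fs_info *fs_info)
	{
		return percpu_counter_init(&fs_info->evictable_extent_maps, 0,
					   GFP_KERNEL);
	}

	/* Readers, such as the shrinker added below, take an approximate
	 * positive sum across all CPUs. */
	static s64 extent_map_counter_read(struct btrfs_fs_info *fs_info)
	{
		return percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
	}

	/* Unmount path (assumed). */
	static void extent_map_counter_exit(struct btrfs_fs_info *fs_info)
	{
		percpu_counter_destroy(&fs_info->evictable_extent_maps);
	}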
static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
{
struct rb_node **p = &root->rb_root.rb_node;
@@ -223,8 +232,9 @@ static bool mergeable_maps(const struct extent_map *prev, const struct extent_ma
return next->block_start == prev->block_start;
}
-static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
+static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
{
+ struct extent_map_tree *tree = &inode->extent_tree;
struct extent_map *merge = NULL;
struct rb_node *rb;
@@ -252,14 +262,13 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
em->len += merge->len;
em->block_len += merge->block_len;
em->block_start = merge->block_start;
- em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
- em->mod_start = merge->mod_start;
em->generation = max(em->generation, merge->generation);
em->flags |= EXTENT_FLAG_MERGED;
rb_erase_cached(&merge->rb_node, &tree->map);
RB_CLEAR_NODE(&merge->rb_node);
free_extent_map(merge);
+ dec_evictable_extent_maps(inode);
}
}
@@ -271,10 +280,10 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
em->block_len += merge->block_len;
rb_erase_cached(&merge->rb_node, &tree->map);
RB_CLEAR_NODE(&merge->rb_node);
- em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
em->generation = max(em->generation, merge->generation);
em->flags |= EXTENT_FLAG_MERGED;
free_extent_map(merge);
+ dec_evictable_extent_maps(inode);
}
}
@@ -300,7 +309,6 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen)
struct extent_map_tree *tree = &inode->extent_tree;
int ret = 0;
struct extent_map *em;
- bool prealloc = false;
write_lock(&tree->lock);
em = lookup_extent_mapping(tree, start, len);
@@ -325,20 +333,8 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen)
em->generation = gen;
em->flags &= ~EXTENT_FLAG_PINNED;
- em->mod_start = em->start;
- em->mod_len = em->len;
- if (em->flags & EXTENT_FLAG_FILLING) {
- prealloc = true;
- em->flags &= ~EXTENT_FLAG_FILLING;
- }
-
- try_merge_map(tree, em);
-
- if (prealloc) {
- em->mod_start = em->start;
- em->mod_len = em->len;
- }
+ try_merge_map(inode, em);
out:
write_unlock(&tree->lock);
@@ -347,58 +343,62 @@ out:
}
-void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
+void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em)
{
- lockdep_assert_held_write(&tree->lock);
+ lockdep_assert_held_write(&inode->extent_tree.lock);
em->flags &= ~EXTENT_FLAG_LOGGING;
if (extent_map_in_tree(em))
- try_merge_map(tree, em);
+ try_merge_map(inode, em);
}
-static inline void setup_extent_mapping(struct extent_map_tree *tree,
+static inline void setup_extent_mapping(struct btrfs_inode *inode,
struct extent_map *em,
int modified)
{
refcount_inc(&em->refs);
- em->mod_start = em->start;
- em->mod_len = em->len;
ASSERT(list_empty(&em->list));
if (modified)
- list_add(&em->list, &tree->modified_extents);
+ list_add(&em->list, &inode->extent_tree.modified_extents);
else
- try_merge_map(tree, em);
+ try_merge_map(inode, em);
}
/*
- * Add new extent map to the extent tree
+ * Add a new extent map to an inode's extent map tree.
*
- * @tree: tree to insert new map in
+ * @inode: the target inode
* @em: map to insert
* @modified: indicate whether the given @em should be added to the
* modified list, which indicates the extent needs to be logged
*
- * Insert @em into @tree or perform a simple forward/backward merge with
- * existing mappings. The extent_map struct passed in will be inserted
- * into the tree directly, with an additional reference taken, or a
- * reference dropped if the merge attempt was successful.
+ * Insert @em into the @inode's extent map tree or perform a simple
+ * forward/backward merge with existing mappings. The extent_map struct passed
+ * in will be inserted into the tree directly, with an additional reference
+ * taken, or a reference dropped if the merge attempt was successful.
*/
-static int add_extent_mapping(struct extent_map_tree *tree,
+static int add_extent_mapping(struct btrfs_inode *inode,
struct extent_map *em, int modified)
{
- int ret = 0;
+ struct extent_map_tree *tree = &inode->extent_tree;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret;
lockdep_assert_held_write(&tree->lock);
ret = tree_insert(&tree->map, em);
if (ret)
- goto out;
+ return ret;
- setup_extent_mapping(tree, em, modified);
-out:
- return ret;
+ setup_extent_mapping(inode, em, modified);
+
+ if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(root)))
+ percpu_counter_inc(&fs_info->evictable_extent_maps);
+
+ return 0;
}
static struct extent_map *
@@ -464,16 +464,18 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
}
/*
- * Remove an extent_map from the extent tree.
+ * Remove an extent_map from its inode's extent tree.
*
- * @tree: extent tree to remove from
+ * @inode: the inode the extent map belongs to
* @em: extent map being removed
*
- * Remove @em from @tree. No reference counts are dropped, and no checks
- * are done to see if the range is in use.
+ * Remove @em from the extent tree of @inode. No reference counts are dropped,
+ * and no checks are done to see if the range is in use.
*/
-void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
+void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em)
{
+ struct extent_map_tree *tree = &inode->extent_tree;
+
lockdep_assert_held_write(&tree->lock);
WARN_ON(em->flags & EXTENT_FLAG_PINNED);
@@ -481,13 +483,17 @@ void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
if (!(em->flags & EXTENT_FLAG_LOGGING))
list_del_init(&em->list);
RB_CLEAR_NODE(&em->rb_node);
+
+ dec_evictable_extent_maps(inode);
}
-static void replace_extent_mapping(struct extent_map_tree *tree,
+static void replace_extent_mapping(struct btrfs_inode *inode,
struct extent_map *cur,
struct extent_map *new,
int modified)
{
+ struct extent_map_tree *tree = &inode->extent_tree;
+
lockdep_assert_held_write(&tree->lock);
WARN_ON(cur->flags & EXTENT_FLAG_PINNED);
@@ -497,7 +503,7 @@ static void replace_extent_mapping(struct extent_map_tree *tree,
rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map);
RB_CLEAR_NODE(&cur->rb_node);
- setup_extent_mapping(tree, new, modified);
+ setup_extent_mapping(inode, new, modified);
}
static struct extent_map *next_extent_map(const struct extent_map *em)
@@ -526,7 +532,7 @@ static struct extent_map *prev_extent_map(struct extent_map *em)
* and an extent that you want to insert, deal with overlap and insert
* the best fitted new extent into the tree.
*/
-static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
+static noinline int merge_extent_mapping(struct btrfs_inode *inode,
struct extent_map *existing,
struct extent_map *em,
u64 map_start)
@@ -560,14 +566,13 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
em->block_start += start_diff;
em->block_len = em->len;
}
- return add_extent_mapping(em_tree, em, 0);
+ return add_extent_mapping(inode, em, 0);
}
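When an overlapping mapping already exists, the incoming extent map is trimmed so that only the non-overlapping remainder is inserted; the hunk above shows the tail of that logic, where the on-disk start moves by the same delta as the file offset. A simplified userspace sketch with hypothetical numbers (the start/len adjustment itself happens just above the visible context):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t em_start = 8192;       /* incoming map: file offset 8K */
		uint64_t em_len = 16384;        /* ...16K long                  */
		uint64_t block_start = 1048576; /* ...backed by disk offset 1M  */
		uint64_t start_diff = 4096;     /* first 4K is already mapped   */

		/* Shift the logical and physical starts together, shrink the
		 * length, and insert only the remainder. */
		em_start += start_diff;
		em_len -= start_diff;
		block_start += start_diff;

		printf("insert file [%llu, %llu) -> disk %llu\n",
		       (unsigned long long)em_start,
		       (unsigned long long)(em_start + em_len),
		       (unsigned long long)block_start);
		return 0;
	}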
/*
- * Add extent mapping into em_tree.
+ * Add extent mapping into an inode's extent map tree.
*
- * @fs_info: the filesystem
- * @em_tree: extent tree into which we want to insert the extent mapping
+ * @inode: target inode
* @em_in: extent we are inserting
* @start: start of the logical range btrfs_get_extent() is requesting
* @len: length of the logical range btrfs_get_extent() is requesting
@@ -575,8 +580,8 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
* Note that @em_in's range may be different from [start, start+len),
* but they must be overlapped.
*
- * Insert @em_in into @em_tree. In case there is an overlapping range, handle
- * the -EEXIST by either:
+ * Insert @em_in into the inode's extent map tree. In case there is an
+ * overlapping range, handle the -EEXIST by either:
* a) Returning the existing extent in @em_in if @start is within the
* existing em.
* b) Merge the existing extent with @em_in passed in.
@@ -584,12 +589,12 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
* Return 0 on success, otherwise -EEXIST.
*
*/
-int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
- struct extent_map_tree *em_tree,
+int btrfs_add_extent_mapping(struct btrfs_inode *inode,
struct extent_map **em_in, u64 start, u64 len)
{
int ret;
struct extent_map *em = *em_in;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
/*
* Tree-checker should have rejected any inline extent with non-zero
@@ -598,7 +603,7 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
if (em->block_start == EXTENT_MAP_INLINE)
ASSERT(em->start == 0);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = add_extent_mapping(inode, em, 0);
/* it is possible that someone inserted the extent into the tree
* while we had the lock dropped. It is also possible that
* an overlapping map exists in the tree
@@ -606,7 +611,7 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
if (ret == -EEXIST) {
struct extent_map *existing;
- existing = search_extent_mapping(em_tree, start, len);
+ existing = search_extent_mapping(&inode->extent_tree, start, len);
trace_btrfs_handle_em_exist(fs_info, existing, em, start, len);
@@ -627,8 +632,7 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
* The existing extent map is the one nearest to
* the [start, start + len) range which overlaps
*/
- ret = merge_extent_mapping(em_tree, existing,
- em, start);
+ ret = merge_extent_mapping(inode, existing, em, start);
if (WARN_ON(ret)) {
free_extent_map(em);
*em_in = NULL;
@@ -650,8 +654,10 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
* if needed. This avoids searching the tree, from the root down to the first
* extent map, before each deletion.
*/
-static void drop_all_extent_maps_fast(struct extent_map_tree *tree)
+static void drop_all_extent_maps_fast(struct btrfs_inode *inode)
{
+ struct extent_map_tree *tree = &inode->extent_tree;
+
write_lock(&tree->lock);
while (!RB_EMPTY_ROOT(&tree->map.rb_root)) {
struct extent_map *em;
@@ -660,7 +666,7 @@ static void drop_all_extent_maps_fast(struct extent_map_tree *tree)
node = rb_first_cached(&tree->map);
em = rb_entry(node, struct extent_map, rb_node);
em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING);
- remove_extent_mapping(tree, em);
+ remove_extent_mapping(inode, em);
free_extent_map(em);
cond_resched_rwlock_write(&tree->lock);
}
@@ -693,7 +699,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
WARN_ON(end < start);
if (end == (u64)-1) {
if (start == 0 && !skip_pinned) {
- drop_all_extent_maps_fast(em_tree);
+ drop_all_extent_maps_fast(inode);
return;
}
len = (u64)-1;
@@ -790,7 +796,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
split->generation = gen;
split->flags = flags;
- replace_extent_mapping(em_tree, em, split, modified);
+ replace_extent_mapping(inode, em, split, modified);
free_extent_map(split);
split = split2;
split2 = NULL;
@@ -831,13 +837,11 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
}
if (extent_map_in_tree(em)) {
- replace_extent_mapping(em_tree, em, split,
- modified);
+ replace_extent_mapping(inode, em, split, modified);
} else {
int ret;
- ret = add_extent_mapping(em_tree, split,
- modified);
+ ret = add_extent_mapping(inode, split, modified);
/* Logic error, shouldn't happen. */
ASSERT(ret == 0);
if (WARN_ON(ret != 0) && modified)
@@ -872,7 +876,7 @@ remove_em:
ASSERT(!split);
btrfs_set_inode_full_sync(inode);
}
- remove_extent_mapping(em_tree, em);
+ remove_extent_mapping(inode, em);
}
/*
@@ -927,7 +931,7 @@ int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
do {
btrfs_drop_extent_map_range(inode, new_em->start, end, false);
write_lock(&tree->lock);
- ret = add_extent_mapping(tree, new_em, modified);
+ ret = add_extent_mapping(inode, new_em, modified);
write_unlock(&tree->lock);
} while (ret == -EEXIST);
@@ -991,7 +995,7 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
split_pre->flags = flags;
split_pre->generation = em->generation;
- replace_extent_mapping(em_tree, em, split_pre, 1);
+ replace_extent_mapping(inode, em, split_pre, 1);
/*
* Now we only have an extent_map at:
@@ -1008,7 +1012,7 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
split_mid->ram_bytes = split_mid->len;
split_mid->flags = flags;
split_mid->generation = em->generation;
- add_extent_mapping(em_tree, split_mid, 1);
+ add_extent_mapping(inode, split_mid, 1);
/* Once for us */
free_extent_map(em);
@@ -1023,3 +1027,240 @@ out_free_pre:
free_extent_map(split_pre);
return ret;
}
+
+struct btrfs_em_shrink_ctx {
+ long nr_to_scan;
+ long scanned;
+ u64 last_ino;
+ u64 last_root;
+};
+
+static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
+{
+ const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
+ struct extent_map_tree *tree = &inode->extent_tree;
+ long nr_dropped = 0;
+ struct rb_node *node;
+
+ /*
+ * Take the mmap lock so that we serialize with the inode logging phase
+ * of fsync because we may need to set the full sync flag on the inode,
+ * in case we have to remove extent maps in the tree's list of modified
+ * extents. If we set the full sync flag in the inode while an fsync is
+ * in progress, we may risk missing new extents because before the flag
+ * is set, fsync decides to only wait for writeback to complete and then
+ * during inode logging it sees the flag set and uses the subvolume tree
+ * to find new extents, which may not be there yet because ordered
+ * extents haven't completed yet.
+ *
+ * We also do a try lock because otherwise we could deadlock. This is
+ * because the shrinker for this filesystem may be invoked while we are
+ * in a path that is holding the mmap lock in write mode. For example in
+ * a reflink operation while COWing an extent buffer, when allocating
+ * pages for a new extent buffer and under memory pressure, the shrinker
+ * may be invoked, and therefore we would deadlock by attempting to read
+ * lock the mmap lock while we are already holding a write lock on it.
+ */
+ if (!down_read_trylock(&inode->i_mmap_lock))
+ return 0;
+
+ /*
+ * We want to be fast because we can be called from any path trying to
+ * allocate memory, so if the lock is busy we don't want to spend time
+ * waiting for it - either some task is about to do IO for the inode or
+ * we may have another task shrinking extent maps, here in this code, so
+ * skip this inode.
+ */
+ if (!write_trylock(&tree->lock)) {
+ up_read(&inode->i_mmap_lock);
+ return 0;
+ }
+
+ node = rb_first_cached(&tree->map);
+ while (node) {
+ struct extent_map *em;
+
+ em = rb_entry(node, struct extent_map, rb_node);
+ node = rb_next(node);
+ ctx->scanned++;
+
+ if (em->flags & EXTENT_FLAG_PINNED)
+ goto next;
+
+ /*
+ * If the extent map is in the list of modified extents (new) and its
+ * generation is the same (or is greater than) the current fs
+ * generation, it means it was not yet persisted so we have to
+ * set the full sync flag so that the next fsync will not miss
+ * it.
+ */
+ if (!list_empty(&em->list) && em->generation >= cur_fs_gen)
+ btrfs_set_inode_full_sync(inode);
+
+ remove_extent_mapping(inode, em);
+ trace_btrfs_extent_map_shrinker_remove_em(inode, em);
+ /* Drop the reference for the tree. */
+ free_extent_map(em);
+ nr_dropped++;
+next:
+ if (ctx->scanned >= ctx->nr_to_scan)
+ break;
+
+ /*
+ * Stop if we need to reschedule or there's contention on the
+ * lock. This is to avoid slowing other tasks trying to take the
+ * lock and because the shrinker might be called during a memory
+ * allocation path and we want to avoid taking a very long time
+ * and slowing down all sorts of tasks.
+ */
+ if (need_resched() || rwlock_needbreak(&tree->lock))
+ break;
+ }
+ write_unlock(&tree->lock);
+ up_read(&inode->i_mmap_lock);
+
+ return nr_dropped;
+}
+
+static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
+{
+ struct btrfs_inode *inode;
+ long nr_dropped = 0;
+ u64 min_ino = ctx->last_ino + 1;
+
+ inode = btrfs_find_first_inode(root, min_ino);
+ while (inode) {
+ nr_dropped += btrfs_scan_inode(inode, ctx);
+
+ min_ino = btrfs_ino(inode) + 1;
+ ctx->last_ino = btrfs_ino(inode);
+ btrfs_add_delayed_iput(inode);
+
+ if (ctx->scanned >= ctx->nr_to_scan)
+ break;
+
+ /*
+ * We may be called from memory allocation paths, so we don't
+ * want to take too much time and slow down tasks.
+ */
+ if (need_resched())
+ break;
+
+ inode = btrfs_find_first_inode(root, min_ino);
+ }
+
+ if (inode) {
+ /*
+ * There are still inodes in this root or we happened to process
+ * the last one and reached the scan limit. In either case set
+ * the current root to this one, so we'll resume from the next
+ * inode if there is one or we will find out this was the last
+ * one and move to the next root.
+ */
+ ctx->last_root = btrfs_root_id(root);
+ } else {
+ /*
+ * No more inodes in this root, set extent_map_shrinker_last_ino to 0 so
+ * that when processing the next root we start from its first inode.
+ */
+ ctx->last_ino = 0;
+ ctx->last_root = btrfs_root_id(root) + 1;
+ }
+
+ return nr_dropped;
+}
+
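btrfs_scan_root() advances a (last_root, last_ino) cursor so that successive shrinker invocations resume where the previous one stopped instead of always rescanning from the first inode. A compact userspace analogue of that resume rule (the fixed inode layout and all names here are hypothetical):

	#include <stdio.h>

	/* Stand-in for btrfs_find_first_inode(): this root has inodes 256..258. */
	static int next_ino(unsigned long min_ino, unsigned long *ino)
	{
		if (min_ino > 258)
			return 0;
		*ino = (min_ino < 256) ? 256 : min_ino;
		return 1;
	}

	int main(void)
	{
		unsigned long last_root = 5, last_ino = 0, ino;
		int budget = 2; /* nr_to_scan analogue */

		while (budget-- > 0 && next_ino(last_ino + 1, &ino)) {
			printf("scan root %lu ino %lu\n", last_root, ino);
			last_ino = ino;
		}
		if (!next_ino(last_ino + 1, &ino)) {
			/* Root exhausted: reset the inode cursor and move on
			 * to the next root. */
			last_ino = 0;
			last_root++;
		}
		printf("resume at root %lu, ino > %lu\n", last_root, last_ino);
		return 0;
	}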
+long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
+{
+ struct btrfs_em_shrink_ctx ctx;
+ u64 start_root_id;
+ u64 next_root_id;
+ bool cycled = false;
+ long nr_dropped = 0;
+
+ ctx.scanned = 0;
+ ctx.nr_to_scan = nr_to_scan;
+
+ /*
+ * In case we have multiple tasks running this shrinker, make the next
+ * one start from the next inode in case it starts before we finish.
+ */
+ spin_lock(&fs_info->extent_map_shrinker_lock);
+ ctx.last_ino = fs_info->extent_map_shrinker_last_ino;
+ fs_info->extent_map_shrinker_last_ino++;
+ ctx.last_root = fs_info->extent_map_shrinker_last_root;
+ spin_unlock(&fs_info->extent_map_shrinker_lock);
+
+ start_root_id = ctx.last_root;
+ next_root_id = ctx.last_root;
+
+ if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
+ s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
+
+ trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan,
+ nr, ctx.last_root,
+ ctx.last_ino);
+ }
+
+ /*
+ * We may be called from memory allocation paths, so we don't want to
+ * take too much time and slow down tasks, so stop if we need to reschedule.
+ */
+ while (ctx.scanned < ctx.nr_to_scan && !need_resched()) {
+ struct btrfs_root *root;
+ unsigned long count;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ count = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+ (void **)&root,
+ (unsigned long)next_root_id, 1);
+ if (count == 0) {
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ if (start_root_id > 0 && !cycled) {
+ next_root_id = 0;
+ ctx.last_root = 0;
+ ctx.last_ino = 0;
+ cycled = true;
+ continue;
+ }
+ break;
+ }
+ next_root_id = btrfs_root_id(root) + 1;
+ root = btrfs_grab_root(root);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+
+ if (!root)
+ continue;
+
+ if (is_fstree(btrfs_root_id(root)))
+ nr_dropped += btrfs_scan_root(root, &ctx);
+
+ btrfs_put_root(root);
+ }
+
+ /*
+ * In case of multiple tasks running this extent map shrinking code this
+ * isn't perfect but it's simple and silences things like KCSAN. It's
+ * not possible to know which task made more progress because we can
+ * cycle back to the first root and first inode if it's not the first
+ * time the shrinker ran, see the above logic. Also a task that started
+ * later may finish earlier than another task and have made less progress. So
+ * make this simple and update to the progress of the last task that
+ * finished, with the occasional possibility of having two consecutive
+ * runs of the shrinker process the same inodes.
+ */
+ spin_lock(&fs_info->extent_map_shrinker_lock);
+ fs_info->extent_map_shrinker_last_ino = ctx.last_ino;
+ fs_info->extent_map_shrinker_last_root = ctx.last_root;
+ spin_unlock(&fs_info->extent_map_shrinker_lock);
+
+ if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) {
+ s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
+
+ trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped,
+ nr, ctx.last_root,
+ ctx.last_ino);
+ }
+
+ return nr_dropped;
+}
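btrfs_free_extent_maps() has no caller within this file. A hedged sketch of how it could be driven from the VFS cache-shrinking hooks (the super_operations fields are the real VFS API; these wrappers are illustrative and not taken from this patch):

	static long btrfs_nr_cached_objects(struct super_block *sb,
					    struct shrink_control *sc)
	{
		/* Report how many extent maps are eligible for eviction. */
		return percpu_counter_sum_positive(
				&btrfs_sb(sb)->evictable_extent_maps);
	}

	static long btrfs_free_cached_objects(struct super_block *sb,
					      struct shrink_control *sc)
	{
		/* The VFS sizes sc->nr_to_scan from the count above. */
		return btrfs_free_extent_maps(btrfs_sb(sb), sc->nr_to_scan);
	}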