diff options
Diffstat (limited to 'fs/btrfs')
115 files changed, 1929 insertions, 1066 deletions
diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c index 1925a0919c..79026917db 100644 --- a/fs/btrfs/accessors.c +++ b/fs/btrfs/accessors.c @@ -5,7 +5,8 @@ #include <asm/unaligned.h> #include "messages.h" -#include "ctree.h" +#include "extent_io.h" +#include "fs.h" #include "accessors.h" static bool check_setget_bounds(const struct extent_buffer *eb, @@ -63,8 +64,8 @@ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \ const unsigned long oil = get_eb_offset_in_folio(token->eb, \ member_offset);\ - const int unit_size = folio_size(token->eb->folios[0]); \ - const int unit_shift = folio_shift(token->eb->folios[0]); \ + const int unit_size = token->eb->folio_size; \ + const int unit_shift = token->eb->folio_shift; \ const int size = sizeof(u##bits); \ u8 lebytes[sizeof(u##bits)]; \ const int part = unit_size - oil; \ @@ -94,7 +95,7 @@ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ const unsigned long idx = get_eb_folio_index(eb, member_offset);\ const unsigned long oil = get_eb_offset_in_folio(eb, \ member_offset);\ - const int unit_size = folio_size(eb->folios[0]); \ + const int unit_size = eb->folio_size; \ char *kaddr = folio_address(eb->folios[idx]); \ const int size = sizeof(u##bits); \ const int part = unit_size - oil; \ @@ -117,8 +118,8 @@ void btrfs_set_token_##bits(struct btrfs_map_token *token, \ const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \ const unsigned long oil = get_eb_offset_in_folio(token->eb, \ member_offset);\ - const int unit_size = folio_size(token->eb->folios[0]); \ - const int unit_shift = folio_shift(token->eb->folios[0]); \ + const int unit_size = token->eb->folio_size; \ + const int unit_shift = token->eb->folio_shift; \ const int size = sizeof(u##bits); \ u8 lebytes[sizeof(u##bits)]; \ const int part = unit_size - oil; \ @@ -151,7 +152,7 @@ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ const unsigned long idx = get_eb_folio_index(eb, member_offset);\ const unsigned long oil = get_eb_offset_in_folio(eb, \ member_offset);\ - const int unit_size = folio_size(eb->folios[0]); \ + const int unit_size = eb->folio_size; \ char *kaddr = folio_address(eb->folios[idx]); \ const int size = sizeof(u##bits); \ const int part = unit_size - oil; \ diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index ed7aa32972..6fce3e8d3d 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -3,8 +3,17 @@ #ifndef BTRFS_ACCESSORS_H #define BTRFS_ACCESSORS_H -#include <linux/stddef.h> #include <asm/unaligned.h> +#include <linux/stddef.h> +#include <linux/types.h> +#include <linux/align.h> +#include <linux/build_bug.h> +#include <linux/compiler.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <uapi/linux/btrfs_tree.h> + +struct extent_buffer; struct btrfs_map_token { struct extent_buffer *eb; @@ -844,45 +853,6 @@ static inline void btrfs_set_balance_sys(struct extent_buffer *eb, write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); } -static inline void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, - const struct btrfs_disk_balance_args *disk) -{ - memset(cpu, 0, sizeof(*cpu)); - - cpu->profiles = le64_to_cpu(disk->profiles); - cpu->usage = le64_to_cpu(disk->usage); - cpu->devid = le64_to_cpu(disk->devid); - cpu->pstart = le64_to_cpu(disk->pstart); - cpu->pend = le64_to_cpu(disk->pend); - cpu->vstart = le64_to_cpu(disk->vstart); - cpu->vend = le64_to_cpu(disk->vend); - cpu->target = le64_to_cpu(disk->target); - cpu->flags = le64_to_cpu(disk->flags); - cpu->limit = le64_to_cpu(disk->limit); - cpu->stripes_min = le32_to_cpu(disk->stripes_min); - cpu->stripes_max = le32_to_cpu(disk->stripes_max); -} - -static inline void btrfs_cpu_balance_args_to_disk( - struct btrfs_disk_balance_args *disk, - const struct btrfs_balance_args *cpu) -{ - memset(disk, 0, sizeof(*disk)); - - disk->profiles = cpu_to_le64(cpu->profiles); - disk->usage = cpu_to_le64(cpu->usage); - disk->devid = cpu_to_le64(cpu->devid); - disk->pstart = cpu_to_le64(cpu->pstart); - disk->pend = cpu_to_le64(cpu->pend); - disk->vstart = cpu_to_le64(cpu->vstart); - disk->vend = cpu_to_le64(cpu->vend); - disk->target = cpu_to_le64(cpu->target); - disk->flags = cpu_to_le64(cpu->flags); - disk->limit = cpu_to_le64(cpu->limit); - disk->stripes_min = cpu_to_le32(cpu->stripes_min); - disk->stripes_max = cpu_to_le32(cpu->stripes_max); -} - /* struct btrfs_super_block */ BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 7427449a04..e0ba00d64e 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -12,7 +12,6 @@ #include <linux/sched/mm.h> #include <linux/slab.h> #include "ctree.h" -#include "btrfs_inode.h" #include "xattr.h" #include "acl.h" diff --git a/fs/btrfs/acl.h b/fs/btrfs/acl.h index a270e71ec0..48b9ddae4a 100644 --- a/fs/btrfs/acl.h +++ b/fs/btrfs/acl.h @@ -3,8 +3,15 @@ #ifndef BTRFS_ACL_H #define BTRFS_ACL_H +struct posix_acl; +struct inode; +struct btrfs_trans_handle; + #ifdef CONFIG_BTRFS_FS_POSIX_ACL +struct mnt_idmap; +struct dentry; + struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); int btrfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); @@ -13,6 +20,10 @@ int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, #else +#include <linux/errno.h> + +struct btrfs_trans_handle; + #define btrfs_get_acl NULL #define btrfs_set_acl NULL static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 9e261aac67..361a866c19 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -11,7 +11,6 @@ #include <linux/freezer.h> #include <trace/events/btrfs.h> #include "async-thread.h" -#include "ctree.h" enum { WORK_DONE_BIT, diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 62b8a0d578..04c2f31758 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -7,11 +7,14 @@ #ifndef BTRFS_ASYNC_THREAD_H #define BTRFS_ASYNC_THREAD_H +#include <linux/compiler_types.h> #include <linux/workqueue.h> +#include <linux/list.h> struct btrfs_fs_info; struct btrfs_workqueue; struct btrfs_work; + typedef void (*btrfs_func_t)(struct btrfs_work *arg); typedef void (*btrfs_ordered_func_t)(struct btrfs_work *arg, bool); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index fc0930dd92..58110c9686 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -198,10 +198,7 @@ static struct kmem_cache *btrfs_prelim_ref_cache; int __init btrfs_prelim_ref_init(void) { btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref", - sizeof(struct prelim_ref), - 0, - SLAB_MEM_SPREAD, - NULL); + sizeof(struct prelim_ref), 0, 0, NULL); if (!btrfs_prelim_ref_cache) return -ENOMEM; return 0; @@ -1036,8 +1033,6 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx, slot = path->slots[0]; item_size = btrfs_item_size(leaf, slot); - BUG_ON(item_size < sizeof(*ei)); - ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); if (ctx->check_extent_item) { @@ -1435,8 +1430,10 @@ again: if (ret < 0) goto out; if (ret == 0) { - /* This shouldn't happen, indicates a bug or fs corruption. */ - ASSERT(ret != 0); + /* + * Key with offset -1 found, there would have to exist an extent + * item with such offset, but this is out of the valid range. + */ ret = -EUCLEAN; goto out; } @@ -2225,6 +2222,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; + if (ret == 0) { + /* + * Key with offset -1 found, there would have to exist an extent + * item with such offset, but this is out of the valid range. + */ + return -EUCLEAN; + } ret = btrfs_previous_extent_item(extent_root, path, 0); if (ret) { @@ -2247,7 +2251,6 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, eb = path->nodes[0]; item_size = btrfs_item_size(eb, path->slots[0]); - BUG_ON(item_size < sizeof(*ei)); ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); flags = btrfs_extent_flags(eb, ei); @@ -2844,6 +2847,16 @@ struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_inf return ret; } +static void btrfs_backref_iter_release(struct btrfs_backref_iter *iter) +{ + iter->bytenr = 0; + iter->item_ptr = 0; + iter->cur_ptr = 0; + iter->end_ptr = 0; + btrfs_release_path(iter->path); + memset(&iter->cur_key, 0, sizeof(iter->cur_key)); +} + int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) { struct btrfs_fs_info *fs_info = iter->fs_info; @@ -2862,6 +2875,10 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) if (ret < 0) return ret; if (ret == 0) { + /* + * Key with offset -1 found, there would have to exist an extent + * item with such offset, but this is out of the valid range. + */ ret = -EUCLEAN; goto release; } @@ -2932,6 +2949,14 @@ release: return ret; } +static bool btrfs_backref_iter_is_inline_ref(struct btrfs_backref_iter *iter) +{ + if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY || + iter->cur_key.type == BTRFS_METADATA_ITEM_KEY) + return true; + return false; +} + /* * Go to the next backref item of current bytenr, can be either inlined or * keyed. @@ -2944,7 +2969,7 @@ release: */ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter) { - struct extent_buffer *eb = btrfs_backref_get_eb(iter); + struct extent_buffer *eb = iter->path->nodes[0]; struct btrfs_root *extent_root; struct btrfs_path *path = iter->path; struct btrfs_extent_inline_ref *iref; @@ -3032,6 +3057,19 @@ struct btrfs_backref_node *btrfs_backref_alloc_node( return node; } +void btrfs_backref_free_node(struct btrfs_backref_cache *cache, + struct btrfs_backref_node *node) +{ + if (node) { + ASSERT(list_empty(&node->list)); + ASSERT(list_empty(&node->lower)); + ASSERT(node->eb == NULL); + cache->nr_nodes--; + btrfs_put_root(node->root); + kfree(node); + } +} + struct btrfs_backref_edge *btrfs_backref_alloc_edge( struct btrfs_backref_cache *cache) { @@ -3043,6 +3081,52 @@ struct btrfs_backref_edge *btrfs_backref_alloc_edge( return edge; } +void btrfs_backref_free_edge(struct btrfs_backref_cache *cache, + struct btrfs_backref_edge *edge) +{ + if (edge) { + cache->nr_edges--; + kfree(edge); + } +} + +void btrfs_backref_unlock_node_buffer(struct btrfs_backref_node *node) +{ + if (node->locked) { + btrfs_tree_unlock(node->eb); + node->locked = 0; + } +} + +void btrfs_backref_drop_node_buffer(struct btrfs_backref_node *node) +{ + if (node->eb) { + btrfs_backref_unlock_node_buffer(node); + free_extent_buffer(node->eb); + node->eb = NULL; + } +} + +/* + * Drop the backref node from cache without cleaning up its children + * edges. + * + * This can only be called on node without parent edges. + * The children edges are still kept as is. + */ +void btrfs_backref_drop_node(struct btrfs_backref_cache *tree, + struct btrfs_backref_node *node) +{ + ASSERT(list_empty(&node->upper)); + + btrfs_backref_drop_node_buffer(node); + list_del_init(&node->list); + list_del_init(&node->lower); + if (!RB_EMPTY_NODE(&node->rb_node)) + rb_erase(&node->rb_node, &tree->rb_root); + btrfs_backref_free_node(tree, node); +} + /* * Drop the backref node from cache, also cleaning up all its * upper edges and any uncached nodes in the path. @@ -3114,6 +3198,19 @@ void btrfs_backref_release_cache(struct btrfs_backref_cache *cache) ASSERT(!cache->nr_edges); } +void btrfs_backref_link_edge(struct btrfs_backref_edge *edge, + struct btrfs_backref_node *lower, + struct btrfs_backref_node *upper, + int link_which) +{ + ASSERT(upper && lower && upper->level == lower->level + 1); + edge->node[LOWER] = lower; + edge->node[UPPER] = upper; + if (link_which & LINK_LOWER) + list_add_tail(&edge->list[LOWER], &lower->upper); + if (link_which & LINK_UPPER) + list_add_tail(&edge->list[UPPER], &upper->lower); +} /* * Handle direct tree backref * @@ -3422,7 +3519,7 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans, int type; cond_resched(); - eb = btrfs_backref_get_eb(iter); + eb = iter->path->nodes[0]; key.objectid = iter->bytenr; if (btrfs_backref_iter_is_inline_ref(iter)) { diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index ab4ca0eda6..e8c22cccb5 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -6,11 +6,23 @@ #ifndef BTRFS_BACKREF_H #define BTRFS_BACKREF_H -#include <linux/btrfs.h> +#include <linux/types.h> +#include <linux/rbtree.h> +#include <linux/list.h> +#include <linux/slab.h> +#include <uapi/linux/btrfs.h> +#include <uapi/linux/btrfs_tree.h> #include "messages.h" -#include "ulist.h" +#include "locking.h" #include "disk-io.h" #include "extent_io.h" +#include "ctree.h" + +struct extent_inode_elem; +struct ulist; +struct btrfs_extent_item; +struct btrfs_trans_handle; +struct btrfs_fs_info; /* * Used by implementations of iterate_extent_inodes_t (see definition below) to @@ -271,22 +283,6 @@ struct btrfs_backref_iter { struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info); -static inline void btrfs_backref_iter_free(struct btrfs_backref_iter *iter) -{ - if (!iter) - return; - btrfs_free_path(iter->path); - kfree(iter); -} - -static inline struct extent_buffer *btrfs_backref_get_eb( - struct btrfs_backref_iter *iter) -{ - if (!iter) - return NULL; - return iter->path->nodes[0]; -} - /* * For metadata with EXTENT_ITEM key (non-skinny) case, the first inline data * is btrfs_tree_block_info, without a btrfs_extent_inline_ref header. @@ -306,25 +302,6 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr); int btrfs_backref_iter_next(struct btrfs_backref_iter *iter); -static inline bool btrfs_backref_iter_is_inline_ref( - struct btrfs_backref_iter *iter) -{ - if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY || - iter->cur_key.type == BTRFS_METADATA_ITEM_KEY) - return true; - return false; -} - -static inline void btrfs_backref_iter_release(struct btrfs_backref_iter *iter) -{ - iter->bytenr = 0; - iter->item_ptr = 0; - iter->cur_ptr = 0; - iter->end_ptr = 0; - btrfs_release_path(iter->path); - memset(&iter->cur_key, 0, sizeof(iter->cur_key)); -} - /* * Backref cache related structures * @@ -452,83 +429,22 @@ struct btrfs_backref_edge *btrfs_backref_alloc_edge( #define LINK_LOWER (1 << 0) #define LINK_UPPER (1 << 1) -static inline void btrfs_backref_link_edge(struct btrfs_backref_edge *edge, - struct btrfs_backref_node *lower, - struct btrfs_backref_node *upper, - int link_which) -{ - ASSERT(upper && lower && upper->level == lower->level + 1); - edge->node[LOWER] = lower; - edge->node[UPPER] = upper; - if (link_which & LINK_LOWER) - list_add_tail(&edge->list[LOWER], &lower->upper); - if (link_which & LINK_UPPER) - list_add_tail(&edge->list[UPPER], &upper->lower); -} - -static inline void btrfs_backref_free_node(struct btrfs_backref_cache *cache, - struct btrfs_backref_node *node) -{ - if (node) { - ASSERT(list_empty(&node->list)); - ASSERT(list_empty(&node->lower)); - ASSERT(node->eb == NULL); - cache->nr_nodes--; - btrfs_put_root(node->root); - kfree(node); - } -} - -static inline void btrfs_backref_free_edge(struct btrfs_backref_cache *cache, - struct btrfs_backref_edge *edge) -{ - if (edge) { - cache->nr_edges--; - kfree(edge); - } -} - -static inline void btrfs_backref_unlock_node_buffer( - struct btrfs_backref_node *node) -{ - if (node->locked) { - btrfs_tree_unlock(node->eb); - node->locked = 0; - } -} -static inline void btrfs_backref_drop_node_buffer( - struct btrfs_backref_node *node) -{ - if (node->eb) { - btrfs_backref_unlock_node_buffer(node); - free_extent_buffer(node->eb); - node->eb = NULL; - } -} - -/* - * Drop the backref node from cache without cleaning up its children - * edges. - * - * This can only be called on node without parent edges. - * The children edges are still kept as is. - */ -static inline void btrfs_backref_drop_node(struct btrfs_backref_cache *tree, - struct btrfs_backref_node *node) -{ - ASSERT(list_empty(&node->upper)); - - btrfs_backref_drop_node_buffer(node); - list_del_init(&node->list); - list_del_init(&node->lower); - if (!RB_EMPTY_NODE(&node->rb_node)) - rb_erase(&node->rb_node, &tree->rb_root); - btrfs_backref_free_node(tree, node); -} +void btrfs_backref_link_edge(struct btrfs_backref_edge *edge, + struct btrfs_backref_node *lower, + struct btrfs_backref_node *upper, + int link_which); +void btrfs_backref_free_node(struct btrfs_backref_cache *cache, + struct btrfs_backref_node *node); +void btrfs_backref_free_edge(struct btrfs_backref_cache *cache, + struct btrfs_backref_edge *edge); +void btrfs_backref_unlock_node_buffer(struct btrfs_backref_node *node); +void btrfs_backref_drop_node_buffer(struct btrfs_backref_node *node); void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache, struct btrfs_backref_node *node); +void btrfs_backref_drop_node(struct btrfs_backref_cache *tree, + struct btrfs_backref_node *node); void btrfs_backref_release_cache(struct btrfs_backref_cache *cache); diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 928f512cdb..e3a57196b0 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -11,7 +11,6 @@ #include "raid56.h" #include "async-thread.h" #include "dev-replace.h" -#include "rcu-string.h" #include "zoned.h" #include "file-item.h" #include "raid-stripe-tree.h" @@ -509,8 +508,6 @@ static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, if (!bioc) { /* Single mirror read/write fast path. */ btrfs_bio(bio)->mirror_num = mirror_num; - if (bio_op(bio) != REQ_OP_READ) - btrfs_bio(bio)->orig_physical = smap->physical; bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT; if (bio_op(bio) != REQ_OP_READ) btrfs_bio(bio)->orig_physical = smap->physical; @@ -611,8 +608,20 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free) static bool should_async_write(struct btrfs_bio *bbio) { + bool auto_csum_mode = true; + +#ifdef CONFIG_BTRFS_DEBUG + struct btrfs_fs_devices *fs_devices = bbio->fs_info->fs_devices; + enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode); + + if (csum_mode == BTRFS_OFFLOAD_CSUM_FORCE_OFF) + return false; + + auto_csum_mode = (csum_mode == BTRFS_OFFLOAD_CSUM_AUTO); +#endif + /* Submit synchronously if the checksum implementation is fast. */ - if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags)) + if (auto_csum_mode && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags)) return false; /* @@ -732,7 +741,9 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) ret = btrfs_bio_csum(bbio); if (ret) goto fail_put_bio; - } else if (use_append) { + } else if (use_append || + (btrfs_is_zoned(fs_info) && inode && + inode->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_alloc_dummy_sum(bbio); if (ret) goto fail_put_bio; diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index bbaed31716..d9dd527609 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -7,12 +7,14 @@ #ifndef BTRFS_BIO_H #define BTRFS_BIO_H +#include <linux/types.h> #include <linux/bio.h> #include <linux/workqueue.h> #include "tree-checker.h" struct btrfs_bio; struct btrfs_fs_info; +struct btrfs_inode; #define BTRFS_BIO_INLINE_CSUM_SIZE 64 diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index be70acea87..1a66be33bb 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -418,7 +418,7 @@ struct btrfs_caching_control *btrfs_get_caching_control( return ctl; } -void btrfs_put_caching_control(struct btrfs_caching_control *ctl) +static void btrfs_put_caching_control(struct btrfs_caching_control *ctl) { if (refcount_dec_and_test(&ctl->count)) kfree(ctl); @@ -1063,7 +1063,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, bool remove_rsv = false; block_group = btrfs_lookup_block_group(fs_info, map->start); - BUG_ON(!block_group); + if (!block_group) + return -ENOENT; + BUG_ON(!block_group->ro); trace_btrfs_remove_block_group(block_group); @@ -1429,7 +1431,7 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans, * group in pinned_extents before we were able to clear the whole block * group range from pinned_extents. This means that task can lookup for * the block group after we unpinned it from pinned_extents and removed - * it, leading to a BUG_ON() at unpin_extent_range(). + * it, leading to an error at unpin_extent_range(). */ mutex_lock(&fs_info->unused_bg_unpin_mutex); if (prev_trans) { @@ -1522,6 +1524,13 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * outstanding allocations in this block group. We do * the ro check in case balance is currently acting on * this block group. + * + * Also bail out if this is the only block group for its + * type, because otherwise we would lose profile + * information from fs_info->avail_*_alloc_bits and the + * next block group of this type would be created with a + * "single" profile (even if we're in a raid fs) because + * fs_info->avail_*_alloc_bits would be 0. */ trace_btrfs_skip_unused_block_group(block_group); spin_unlock(&block_group->lock); @@ -1776,6 +1785,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) container_of(work, struct btrfs_fs_info, reclaim_bgs_work); struct btrfs_block_group *bg; struct btrfs_space_info *space_info; + LIST_HEAD(retry_list); if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return; @@ -1912,8 +1922,11 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) } next: - if (ret) - btrfs_mark_bg_to_reclaim(bg); + if (ret) { + /* Refcount held by the reclaim_bgs list after splice. */ + btrfs_get_block_group(bg); + list_add_tail(&bg->bg_list, &retry_list); + } btrfs_put_block_group(bg); mutex_unlock(&fs_info->reclaim_bgs_lock); @@ -1933,6 +1946,9 @@ next: spin_unlock(&fs_info->unused_bgs_lock); mutex_unlock(&fs_info->reclaim_bgs_lock); end: + spin_lock(&fs_info->unused_bgs_lock); + list_splice_tail(&retry_list, &fs_info->reclaim_bgs); + spin_unlock(&fs_info->unused_bgs_lock); btrfs_exclop_finish(fs_info); sb_end_write(fs_info->sb); } diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 962b119839..85e2d4cd12 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -3,9 +3,22 @@ #ifndef BTRFS_BLOCK_GROUP_H #define BTRFS_BLOCK_GROUP_H +#include <linux/atomic.h> +#include <linux/mutex.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/refcount.h> +#include <linux/wait.h> +#include <linux/sizes.h> +#include <linux/rwsem.h> +#include <linux/rbtree.h> +#include <uapi/linux/btrfs_tree.h> #include "free-space-cache.h" struct btrfs_chunk_map; +struct btrfs_fs_info; +struct btrfs_inode; +struct btrfs_trans_handle; enum btrfs_disk_cache_state { BTRFS_DC_WRITTEN, @@ -297,7 +310,6 @@ void btrfs_wait_nocow_writers(struct btrfs_block_group *bg); void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache, u64 num_bytes); int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait); -void btrfs_put_caching_control(struct btrfs_caching_control *ctl); struct btrfs_caching_control *btrfs_get_caching_control( struct btrfs_block_group *cache); int btrfs_add_new_free_space(struct btrfs_block_group *block_group, diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 1043a81423..95c174f9fd 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -6,7 +6,6 @@ #include "space-info.h" #include "transaction.h" #include "block-group.h" -#include "disk-io.h" #include "fs.h" #include "accessors.h" diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 43a9a6b5a7..1f53b967d0 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -3,8 +3,15 @@ #ifndef BTRFS_BLOCK_RSV_H #define BTRFS_BLOCK_RSV_H +#include <linux/types.h> +#include <linux/compiler.h> +#include <linux/spinlock.h> + struct btrfs_trans_handle; struct btrfs_root; +struct btrfs_space_info; +struct btrfs_block_rsv; +struct btrfs_fs_info; enum btrfs_reserve_flush_enum; /* diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 7f7c5a92d2..100020ca46 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -8,13 +8,32 @@ #include <linux/hash.h> #include <linux/refcount.h> +#include <linux/spinlock.h> +#include <linux/mutex.h> +#include <linux/rwsem.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/compiler.h> #include <linux/fscrypt.h> +#include <linux/lockdep.h> +#include <uapi/linux/btrfs_tree.h> #include <trace/events/btrfs.h> +#include "block-rsv.h" +#include "btrfs_inode.h" #include "extent_map.h" #include "extent_io.h" +#include "extent-io-tree.h" #include "ordered-data.h" #include "delayed-inode.h" +struct extent_state; +struct posix_acl; +struct iov_iter; +struct writeback_control; +struct btrfs_root; +struct btrfs_fs_info; +struct btrfs_trans_handle; + /* * Since we search a directory based on f_pos (struct dir_context::pos) we have * to start at 2 since '.' and '..' have f_pos of 0 and 1 respectively, so @@ -41,7 +60,6 @@ enum { */ BTRFS_INODE_NEEDS_FULL_SYNC, BTRFS_INODE_COPY_EVERYTHING, - BTRFS_INODE_IN_DELALLOC_LIST, BTRFS_INODE_HAS_PROPS, BTRFS_INODE_SNAPSHOT_FLUSH, /* @@ -428,7 +446,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, u64 *ram_bytes, bool nowait, bool strict); -void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode); +void btrfs_del_delalloc_inode(struct btrfs_inode *inode); struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); int btrfs_unlink_inode(struct btrfs_trans_handle *trans, @@ -490,8 +508,7 @@ struct inode *btrfs_iget_path(struct super_block *s, u64 ino, struct btrfs_root *root, struct btrfs_path *path); struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 len); + struct page *page, u64 start, u64 len); int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 0041613a36..b2b9400995 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -25,8 +25,6 @@ #include "misc.h" #include "ctree.h" #include "fs.h" -#include "disk-io.h" -#include "transaction.h" #include "btrfs_inode.h" #include "bio.h" #include "ordered-data.h" @@ -34,8 +32,7 @@ #include "extent_io.h" #include "extent_map.h" #include "subpage.h" -#include "zoned.h" -#include "file-item.h" +#include "messages.h" #include "super.h" static struct bio_set btrfs_compressed_bioset; @@ -1479,11 +1476,6 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, /* * Compression heuristic. * - * For now is's a naive and optimistic 'return true', we'll extend the logic to - * quickly (compared to direct compression) detect data characteristics - * (compressible/incompressible) to avoid wasting CPU time on incompressible - * data. - * * The following types of analysis can be performed: * - detect mostly zero data * - detect data with low "byte set" size (text, etc) diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index afd7e50d07..4691a84ca8 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -7,10 +7,18 @@ #define BTRFS_COMPRESSION_H #include <linux/sizes.h> +#include <linux/mm.h> +#include <linux/list.h> +#include <linux/workqueue.h> +#include <linux/wait.h> #include "bio.h" +struct address_space; +struct page; +struct inode; struct btrfs_inode; struct btrfs_ordered_extent; +struct btrfs_bio; /* * We want to make sure that amount of RAM required to uncompress an extent is @@ -32,8 +40,6 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); #define BTRFS_ZLIB_DEFAULT_LEVEL 3 -struct page; - struct compressed_bio { /* Number of compressed pages in the array */ unsigned int nr_pages; @@ -169,7 +175,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zstd_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen); void zstd_init_workspace_manager(void); void zstd_cleanup_workspace_manager(void); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e65e012bac..aaf53fd843 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -820,7 +820,7 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot, } while (low < high) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; unsigned long oil; unsigned long offset; struct btrfs_disk_key *tmp; @@ -4280,6 +4280,10 @@ void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans, /* * Given a key and some data, insert items into the tree. * This does all the path init required, making room in the tree if needed. + * + * Returns: 0 on success + * -EEXIST if the first key already exists + * < 0 on other errors */ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -5082,9 +5086,7 @@ int btrfs_previous_extent_item(struct btrfs_root *root, int __init btrfs_ctree_init(void) { - btrfs_path_cachep = kmem_cache_create("btrfs_path", - sizeof(struct btrfs_path), 0, - SLAB_MEM_SPREAD, NULL); + btrfs_path_cachep = KMEM_CACHE(btrfs_path, 0); if (!btrfs_path_cachep) return -ENOMEM; return 0; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 70e828d331..c03c582460 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -7,25 +7,24 @@ #define BTRFS_CTREE_H #include <linux/pagemap.h> +#include <linux/spinlock.h> +#include <linux/rbtree.h> +#include <linux/mutex.h> +#include <linux/wait.h> +#include <linux/list.h> +#include <linux/atomic.h> +#include <linux/xarray.h> +#include <linux/refcount.h> +#include <uapi/linux/btrfs_tree.h> #include "locking.h" #include "fs.h" #include "accessors.h" +#include "extent-io-tree.h" +struct extent_buffer; +struct btrfs_block_rsv; struct btrfs_trans_handle; -struct btrfs_transaction; -struct btrfs_pending_snapshot; -struct btrfs_delayed_ref_root; -struct btrfs_space_info; struct btrfs_block_group; -struct btrfs_ordered_sum; -struct btrfs_ref; -struct btrfs_bio; -struct btrfs_ioctl_encoded_io_args; -struct btrfs_device; -struct btrfs_fs_devices; -struct btrfs_balance_control; -struct btrfs_delayed_root; -struct reloc_control; /* Read ahead values for struct btrfs_path.reada */ enum { @@ -478,8 +477,7 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) return mapping_gfp_constraint(mapping, ~__GFP_FS); } -int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, - u64 start, u64 end); +void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 *actual_bytes); int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index a77be9896d..f015fa1b63 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -6,7 +6,6 @@ #include <linux/sched.h> #include "ctree.h" #include "disk-io.h" -#include "print-tree.h" #include "transaction.h" #include "locking.h" #include "accessors.h" @@ -521,7 +520,7 @@ static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, * keep_locks set and lowest_level is 1, regardless of the value of * path->slots[1]. */ - BUG_ON(path->locks[1] == 0); + ASSERT(path->locks[1] != 0); ret = btrfs_realloc_node(trans, root, path->nodes[1], 0, &last_ret, @@ -861,20 +860,21 @@ out: * NOTE: Caller should also wait for page writeback after the cluster is * prepared, here we don't do writeback wait for each page. */ -static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t index) +static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t index) { struct address_space *mapping = inode->vfs_inode.i_mapping; gfp_t mask = btrfs_alloc_write_mask(mapping); u64 page_start = (u64)index << PAGE_SHIFT; u64 page_end = page_start + PAGE_SIZE - 1; struct extent_state *cached_state = NULL; - struct page *page; + struct folio *folio; int ret; again: - page = find_or_create_page(mapping, index, mask); - if (!page) - return ERR_PTR(-ENOMEM); + folio = __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); + if (IS_ERR(folio)) + return folio; /* * Since we can defragment files opened read-only, we can encounter @@ -884,16 +884,16 @@ again: * executables that explicitly enable them, so this isn't very * restrictive. */ - if (PageCompound(page)) { - unlock_page(page); - put_page(page); + if (folio_test_large(folio)) { + folio_unlock(folio); + folio_put(folio); return ERR_PTR(-ETXTBSY); } - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return ERR_PTR(ret); } @@ -908,17 +908,17 @@ again: if (!ordered) break; - unlock_page(page); + folio_unlock(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); - lock_page(page); + folio_lock(folio); /* - * We unlocked the page above, so we need check if it was + * We unlocked the folio above, so we need check if it was * released or not. */ - if (page->mapping != mapping || !PagePrivate(page)) { - unlock_page(page); - put_page(page); + if (folio->mapping != mapping || !folio->private) { + folio_unlock(folio); + folio_put(folio); goto again; } } @@ -927,21 +927,21 @@ again: * Now the page range has no ordered extent any more. Read the page to * make it uptodate. */ - if (!PageUptodate(page)) { - btrfs_read_folio(NULL, page_folio(page)); - lock_page(page); - if (page->mapping != mapping || !PagePrivate(page)) { - unlock_page(page); - put_page(page); + if (!folio_test_uptodate(folio)) { + btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (folio->mapping != mapping || !folio->private) { + folio_unlock(folio); + folio_put(folio); goto again; } - if (!PageUptodate(page)) { - unlock_page(page); - put_page(page); + if (!folio_test_uptodate(folio)) { + folio_unlock(folio); + folio_put(folio); return ERR_PTR(-EIO); } } - return page; + return folio; } struct defrag_target_range { @@ -1162,7 +1162,7 @@ static_assert(PAGE_ALIGNED(CLUSTER_SIZE)); */ static int defrag_one_locked_target(struct btrfs_inode *inode, struct defrag_target_range *target, - struct page **pages, int nr_pages, + struct folio **folios, int nr_pages, struct extent_state **cached_state) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1171,7 +1171,7 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, const u64 len = target->len; unsigned long last_index = (start + len - 1) >> PAGE_SHIFT; unsigned long start_index = start >> PAGE_SHIFT; - unsigned long first_index = page_index(pages[0]); + unsigned long first_index = folios[0]->index; int ret = 0; int i; @@ -1188,8 +1188,8 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, /* Update the page status */ for (i = start_index - first_index; i <= last_index - first_index; i++) { - ClearPageChecked(pages[i]); - btrfs_folio_clamp_set_dirty(fs_info, page_folio(pages[i]), start, len); + folio_clear_checked(folios[i]); + btrfs_folio_clamp_set_dirty(fs_info, folios[i], start, len); } btrfs_delalloc_release_extents(inode, len); extent_changeset_free(data_reserved); @@ -1205,7 +1205,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, struct defrag_target_range *entry; struct defrag_target_range *tmp; LIST_HEAD(target_list); - struct page **pages; + struct folio **folios; const u32 sectorsize = inode->root->fs_info->sectorsize; u64 last_index = (start + len - 1) >> PAGE_SHIFT; u64 start_index = start >> PAGE_SHIFT; @@ -1216,21 +1216,21 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE); ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize)); - pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); - if (!pages) + folios = kcalloc(nr_pages, sizeof(struct folio *), GFP_NOFS); + if (!folios) return -ENOMEM; /* Prepare all pages */ for (i = 0; i < nr_pages; i++) { - pages[i] = defrag_prepare_one_page(inode, start_index + i); - if (IS_ERR(pages[i])) { - ret = PTR_ERR(pages[i]); - pages[i] = NULL; - goto free_pages; + folios[i] = defrag_prepare_one_folio(inode, start_index + i); + if (IS_ERR(folios[i])) { + ret = PTR_ERR(folios[i]); + nr_pages = i; + goto free_folios; } } for (i = 0; i < nr_pages; i++) - wait_on_page_writeback(pages[i]); + folio_wait_writeback(folios[i]); /* Lock the pages range */ lock_extent(&inode->io_tree, start_index << PAGE_SHIFT, @@ -1250,7 +1250,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, goto unlock_extent; list_for_each_entry(entry, &target_list, list) { - ret = defrag_one_locked_target(inode, entry, pages, nr_pages, + ret = defrag_one_locked_target(inode, entry, folios, nr_pages, &cached_state); if (ret < 0) break; @@ -1264,14 +1264,12 @@ unlock_extent: unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT, (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, &cached_state); -free_pages: +free_folios: for (i = 0; i < nr_pages; i++) { - if (pages[i]) { - unlock_page(pages[i]); - put_page(pages[i]); - } + folio_unlock(folios[i]); + folio_put(folios[i]); } - kfree(pages); + kfree(folios); return ret; } @@ -1512,9 +1510,7 @@ void __cold btrfs_auto_defrag_exit(void) int __init btrfs_auto_defrag_init(void) { btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", - sizeof(struct inode_defrag), 0, - SLAB_MEM_SPREAD, - NULL); + sizeof(struct inode_defrag), 0, 0, NULL); if (!btrfs_inode_defrag_cachep) return -ENOMEM; diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h index 5a62763528..878528e086 100644 --- a/fs/btrfs/defrag.h +++ b/fs/btrfs/defrag.h @@ -3,6 +3,16 @@ #ifndef BTRFS_DEFRAG_H #define BTRFS_DEFRAG_H +#include <linux/types.h> +#include <linux/compiler_types.h> + +struct inode; +struct file_ra_state; +struct btrfs_fs_info; +struct btrfs_root; +struct btrfs_trans_handle; +struct btrfs_ioctl_defrag_range_args; + int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag); diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index acf9f4b6c0..b3527efd0b 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -6,9 +6,7 @@ #include "block-rsv.h" #include "btrfs_inode.h" #include "space-info.h" -#include "transaction.h" #include "qgroup.h" -#include "block-group.h" #include "fs.h" /* diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h index c5d573f236..ce4f889e4f 100644 --- a/fs/btrfs/delalloc-space.h +++ b/fs/btrfs/delalloc-space.h @@ -3,7 +3,11 @@ #ifndef BTRFS_DELALLOC_SPACE_H #define BTRFS_DELALLOC_SPACE_H +#include <linux/types.h> + struct extent_changeset; +struct btrfs_inode; +struct btrfs_fs_info; int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); int btrfs_check_data_free_space(struct btrfs_inode *inode, diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index ab5a833e7d..121ab890bd 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -28,11 +28,7 @@ static struct kmem_cache *delayed_node_cache; int __init btrfs_delayed_inode_init(void) { - delayed_node_cache = kmem_cache_create("btrfs_delayed_node", - sizeof(struct btrfs_delayed_node), - 0, - SLAB_MEM_SPREAD, - NULL); + delayed_node_cache = KMEM_CACHE(btrfs_delayed_node, 0); if (!delayed_node_cache) return -ENOMEM; return 0; @@ -43,6 +39,17 @@ void __cold btrfs_delayed_inode_exit(void) kmem_cache_destroy(delayed_node_cache); } +void btrfs_init_delayed_root(struct btrfs_delayed_root *delayed_root) +{ + atomic_set(&delayed_root->items, 0); + atomic_set(&delayed_root->items_seq, 0); + delayed_root->nodes = 0; + spin_lock_init(&delayed_root->lock); + init_waitqueue_head(&delayed_root->wait); + INIT_LIST_HEAD(&delayed_root->node_list); + INIT_LIST_HEAD(&delayed_root->prepare_list); +} + static inline void btrfs_init_delayed_node( struct btrfs_delayed_node *delayed_node, struct btrfs_root *root, u64 inode_id) @@ -430,8 +437,6 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) delayed_root = delayed_node->root->fs_info->delayed_root; - BUG_ON(!delayed_root); - if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM) root = &delayed_node->ins_root; else @@ -980,7 +985,7 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) if (delayed_node && test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { - BUG_ON(!delayed_node->root); + ASSERT(delayed_node->root); clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); delayed_node->count--; diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 5cceb31bbd..64e115d974 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -7,15 +7,23 @@ #ifndef BTRFS_DELAYED_INODE_H #define BTRFS_DELAYED_INODE_H +#include <linux/types.h> #include <linux/rbtree.h> #include <linux/spinlock.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/wait.h> +#include <linux/fs.h> #include <linux/atomic.h> #include <linux/refcount.h> #include "ctree.h" +struct btrfs_disk_key; +struct btrfs_fs_info; +struct btrfs_inode; +struct btrfs_root; +struct btrfs_trans_handle; + enum btrfs_delayed_item_type { BTRFS_DELAYED_INSERTION_ITEM, BTRFS_DELAYED_DELETION_ITEM @@ -98,18 +106,7 @@ struct btrfs_delayed_item { char data[] __counted_by(data_len); }; -static inline void btrfs_init_delayed_root( - struct btrfs_delayed_root *delayed_root) -{ - atomic_set(&delayed_root->items, 0); - atomic_set(&delayed_root->items_seq, 0); - delayed_root->nodes = 0; - spin_lock_init(&delayed_root->lock); - init_waitqueue_head(&delayed_root->wait); - INIT_LIST_HEAD(&delayed_root->node_list); - INIT_LIST_HEAD(&delayed_root->prepare_list); -} - +void btrfs_init_delayed_root(struct btrfs_delayed_root *delayed_root); int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *dir, diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 891ea2fa26..e44e62cf76 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -1004,6 +1004,52 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info, INIT_LIST_HEAD(&ref->add_list); } +void btrfs_init_generic_ref(struct btrfs_ref *generic_ref, int action, u64 bytenr, + u64 len, u64 parent, u64 owning_root) +{ + generic_ref->action = action; + generic_ref->bytenr = bytenr; + generic_ref->len = len; + generic_ref->parent = parent; + generic_ref->owning_root = owning_root; +} + +void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 root, + u64 mod_root, bool skip_qgroup) +{ +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + /* If @real_root not set, use @root as fallback */ + generic_ref->real_root = mod_root ?: root; +#endif + generic_ref->tree_ref.level = level; + generic_ref->tree_ref.ref_root = root; + generic_ref->type = BTRFS_REF_METADATA; + if (skip_qgroup || !(is_fstree(root) && + (!mod_root || is_fstree(mod_root)))) + generic_ref->skip_qgroup = true; + else + generic_ref->skip_qgroup = false; + +} + +void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ref_root, u64 ino, + u64 offset, u64 mod_root, bool skip_qgroup) +{ +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + /* If @real_root not set, use @root as fallback */ + generic_ref->real_root = mod_root ?: ref_root; +#endif + generic_ref->data_ref.ref_root = ref_root; + generic_ref->data_ref.ino = ino; + generic_ref->data_ref.offset = offset; + generic_ref->type = BTRFS_REF_DATA; + if (skip_qgroup || !(is_fstree(ref_root) && + (!mod_root || is_fstree(mod_root)))) + generic_ref->skip_qgroup = true; + else + generic_ref->skip_qgroup = false; +} + /* * add a delayed tree ref. This does all of the accounting required * to make sure the delayed ref is eventually processed before this @@ -1220,6 +1266,25 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, return 0; } +void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) +{ + if (refcount_dec_and_test(&ref->refs)) { + WARN_ON(!RB_EMPTY_NODE(&ref->ref_node)); + switch (ref->type) { + case BTRFS_TREE_BLOCK_REF_KEY: + case BTRFS_SHARED_BLOCK_REF_KEY: + kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + case BTRFS_SHARED_DATA_REF_KEY: + kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); + break; + default: + BUG(); + } + } +} + /* * This does a simple search for the head node for a given extent. Returns the * head node if found, or NULL if not. @@ -1242,31 +1307,19 @@ void __cold btrfs_delayed_ref_exit(void) int __init btrfs_delayed_ref_init(void) { - btrfs_delayed_ref_head_cachep = kmem_cache_create( - "btrfs_delayed_ref_head", - sizeof(struct btrfs_delayed_ref_head), 0, - SLAB_MEM_SPREAD, NULL); + btrfs_delayed_ref_head_cachep = KMEM_CACHE(btrfs_delayed_ref_head, 0); if (!btrfs_delayed_ref_head_cachep) goto fail; - btrfs_delayed_tree_ref_cachep = kmem_cache_create( - "btrfs_delayed_tree_ref", - sizeof(struct btrfs_delayed_tree_ref), 0, - SLAB_MEM_SPREAD, NULL); + btrfs_delayed_tree_ref_cachep = KMEM_CACHE(btrfs_delayed_tree_ref, 0); if (!btrfs_delayed_tree_ref_cachep) goto fail; - btrfs_delayed_data_ref_cachep = kmem_cache_create( - "btrfs_delayed_data_ref", - sizeof(struct btrfs_delayed_data_ref), 0, - SLAB_MEM_SPREAD, NULL); + btrfs_delayed_data_ref_cachep = KMEM_CACHE(btrfs_delayed_data_ref, 0); if (!btrfs_delayed_data_ref_cachep) goto fail; - btrfs_delayed_extent_op_cachep = kmem_cache_create( - "btrfs_delayed_extent_op", - sizeof(struct btrfs_delayed_extent_op), 0, - SLAB_MEM_SPREAD, NULL); + btrfs_delayed_extent_op_cachep = KMEM_CACHE(btrfs_delayed_extent_op, 0); if (!btrfs_delayed_extent_op_cachep) goto fail; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 62d679d40f..b291147cb8 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -6,7 +6,17 @@ #ifndef BTRFS_DELAYED_REF_H #define BTRFS_DELAYED_REF_H +#include <linux/types.h> #include <linux/refcount.h> +#include <linux/list.h> +#include <linux/rbtree.h> +#include <linux/mutex.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <uapi/linux/btrfs_tree.h> + +struct btrfs_trans_handle; +struct btrfs_fs_info; /* these are the possible values of struct btrfs_delayed_ref_node->action */ enum btrfs_delayed_ref_action { @@ -308,53 +318,12 @@ static inline u64 btrfs_calc_delayed_ref_csum_bytes(const struct btrfs_fs_info * return btrfs_calc_metadata_size(fs_info, num_csum_items); } -static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref, - int action, u64 bytenr, u64 len, - u64 parent, u64 owning_root) -{ - generic_ref->action = action; - generic_ref->bytenr = bytenr; - generic_ref->len = len; - generic_ref->parent = parent; - generic_ref->owning_root = owning_root; -} - -static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, - u64 root, u64 mod_root, bool skip_qgroup) -{ -#ifdef CONFIG_BTRFS_FS_REF_VERIFY - /* If @real_root not set, use @root as fallback */ - generic_ref->real_root = mod_root ?: root; -#endif - generic_ref->tree_ref.level = level; - generic_ref->tree_ref.ref_root = root; - generic_ref->type = BTRFS_REF_METADATA; - if (skip_qgroup || !(is_fstree(root) && - (!mod_root || is_fstree(mod_root)))) - generic_ref->skip_qgroup = true; - else - generic_ref->skip_qgroup = false; - -} - -static inline void btrfs_init_data_ref(struct btrfs_ref *generic_ref, - u64 ref_root, u64 ino, u64 offset, u64 mod_root, - bool skip_qgroup) -{ -#ifdef CONFIG_BTRFS_FS_REF_VERIFY - /* If @real_root not set, use @root as fallback */ - generic_ref->real_root = mod_root ?: ref_root; -#endif - generic_ref->data_ref.ref_root = ref_root; - generic_ref->data_ref.ino = ino; - generic_ref->data_ref.offset = offset; - generic_ref->type = BTRFS_REF_DATA; - if (skip_qgroup || !(is_fstree(ref_root) && - (!mod_root || is_fstree(mod_root)))) - generic_ref->skip_qgroup = true; - else - generic_ref->skip_qgroup = false; -} +void btrfs_init_generic_ref(struct btrfs_ref *generic_ref, int action, u64 bytenr, + u64 len, u64 parent, u64 owning_root); +void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 root, + u64 mod_root, bool skip_qgroup); +void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ref_root, u64 ino, + u64 offset, u64 mod_root, bool skip_qgroup); static inline struct btrfs_delayed_extent_op * btrfs_alloc_delayed_extent_op(void) @@ -369,24 +338,7 @@ btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op) kmem_cache_free(btrfs_delayed_extent_op_cachep, op); } -static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) -{ - if (refcount_dec_and_test(&ref->refs)) { - WARN_ON(!RB_EMPTY_NODE(&ref->ref_node)); - switch (ref->type) { - case BTRFS_TREE_BLOCK_REF_KEY: - case BTRFS_SHARED_BLOCK_REF_KEY: - kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); - break; - case BTRFS_EXTENT_DATA_REF_KEY: - case BTRFS_SHARED_DATA_REF_KEY: - kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); - break; - default: - BUG(); - } - } -} +void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref); static inline u64 btrfs_ref_head_to_space_flags( struct btrfs_delayed_ref_head *head_ref) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 79c4293ddf..7696beec4c 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -11,10 +11,8 @@ #include <linux/math64.h> #include "misc.h" #include "ctree.h" -#include "extent_map.h" #include "disk-io.h" #include "transaction.h" -#include "print-tree.h" #include "volumes.h" #include "async-thread.h" #include "dev-replace.h" @@ -246,7 +244,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct block_device *bdev; u64 devid = BTRFS_DEV_REPLACE_DEVID; int ret = 0; @@ -257,13 +255,13 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return -EINVAL; } - bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE, + bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE, fs_info->bdev_holder, NULL); - if (IS_ERR(bdev_handle)) { + if (IS_ERR(bdev_file)) { btrfs_err(fs_info, "target device %s is invalid!", device_path); - return PTR_ERR(bdev_handle); + return PTR_ERR(bdev_file); } - bdev = bdev_handle->bdev; + bdev = file_bdev(bdev_file); if (!btrfs_check_device_zone_type(fs_info, bdev)) { btrfs_err(fs_info, @@ -314,7 +312,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, device->commit_bytes_used = device->bytes_used; device->fs_info = fs_info; device->bdev = bdev; - device->bdev_handle = bdev_handle; + device->bdev_file = bdev_file; set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); device->dev_stats_valid = 1; @@ -335,7 +333,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return 0; error: - bdev_release(bdev_handle); + fput(bdev_file); return ret; } @@ -1000,8 +998,7 @@ error: btrfs_sysfs_remove_device(src_device); btrfs_sysfs_update_devid(tgt_device); if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state)) - btrfs_scratch_superblocks(fs_info, src_device->bdev, - src_device->name->str); + btrfs_scratch_superblocks(fs_info, src_device); /* write back the superblocks */ trans = btrfs_start_transaction(root, 0); diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 675082ccec..23e480efe5 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -6,11 +6,15 @@ #ifndef BTRFS_DEV_REPLACE_H #define BTRFS_DEV_REPLACE_H +#include <linux/types.h> +#include <linux/compiler_types.h> + struct btrfs_ioctl_dev_replace_args; struct btrfs_fs_info; struct btrfs_trans_handle; struct btrfs_dev_replace; struct btrfs_block_group; +struct btrfs_device; int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info); int btrfs_run_dev_replace(struct btrfs_trans_handle *trans); diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h index e40a226373..00b3d83d75 100644 --- a/fs/btrfs/dir-item.h +++ b/fs/btrfs/dir-item.h @@ -3,9 +3,15 @@ #ifndef BTRFS_DIR_ITEM_H #define BTRFS_DIR_ITEM_H +#include <linux/types.h> #include <linux/crc32c.h> struct fscrypt_str; +struct btrfs_fs_info; +struct btrfs_key; +struct btrfs_path; +struct btrfs_root; +struct btrfs_trans_handle; int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, const struct fscrypt_str *name); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2567821224..3a47eec87b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -29,7 +29,6 @@ #include "tree-log.h" #include "free-space-cache.h" #include "free-space-tree.h" -#include "rcu-string.h" #include "dev-replace.h" #include "raid56.h" #include "sysfs.h" @@ -193,7 +192,7 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, struct folio *folio = eb->folios[i]; u64 start = max_t(u64, eb->start, folio_pos(folio)); u64 end = min_t(u64, eb->start + eb->len, - folio_pos(folio) + folio_size(folio)); + folio_pos(folio) + eb->folio_size); u32 len = end - start; ret = btrfs_repair_io_failure(fs_info, 0, start, len, @@ -1245,6 +1244,7 @@ void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info) btrfs_err(fs_info, "leaked root %s refcount %d", btrfs_root_name(&root->root_key, buf), refcount_read(&root->refs)); + WARN_ON_ONCE(1); while (refcount_read(&root->refs) > 1) btrfs_put_root(root); btrfs_put_root(root); @@ -2240,7 +2240,7 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) struct btrfs_key location; int ret; - BUG_ON(!fs_info->tree_root); + ASSERT(fs_info->tree_root); ret = load_global_roots(tree_root); if (ret) @@ -4544,18 +4544,10 @@ static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info) { struct rb_node *node; - struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs; struct btrfs_delayed_ref_node *ref; - delayed_refs = &trans->delayed_refs; - spin_lock(&delayed_refs->lock); - if (atomic_read(&delayed_refs->num_entries) == 0) { - spin_unlock(&delayed_refs->lock); - btrfs_debug(fs_info, "delayed_refs has NO entry"); - return; - } - while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) { struct btrfs_delayed_ref_head *head; struct rb_node *n; @@ -4629,7 +4621,7 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) struct inode *inode = NULL; btrfs_inode = list_first_entry(&splice, struct btrfs_inode, delalloc_inodes); - __btrfs_del_delalloc_inode(root, btrfs_inode); + btrfs_del_delalloc_inode(btrfs_inode); spin_unlock(&root->delalloc_lock); /* @@ -4928,7 +4920,14 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) goto error; - BUG_ON(ret == 0); /* Corruption */ + if (ret == 0) { + /* + * Key with offset -1 found, there would have to exist a root + * with such id, but this is out of valid range. + */ + ret = -EUCLEAN; + goto error; + } if (path->slots[0] > 0) { slot = path->slots[0] - 1; l = path->nodes[0]; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index eb3473d1c1..76eb53fe7a 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -6,6 +6,22 @@ #ifndef BTRFS_DISK_IO_H #define BTRFS_DISK_IO_H +#include <linux/sizes.h> +#include <linux/compiler_types.h> +#include "ctree.h" +#include "fs.h" + +struct block_device; +struct super_block; +struct extent_buffer; +struct btrfs_device; +struct btrfs_fs_devices; +struct btrfs_fs_info; +struct btrfs_super_block; +struct btrfs_trans_handle; +struct btrfs_tree_parent_check; +struct btrfs_transaction; + #define BTRFS_SUPER_MIRROR_MAX 3 #define BTRFS_SUPER_MIRROR_SHIFT 12 @@ -25,10 +41,6 @@ static inline u64 btrfs_sb_offset(int mirror) return BTRFS_SUPER_INFO_OFFSET; } -struct btrfs_device; -struct btrfs_fs_devices; -struct btrfs_tree_parent_check; - void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info); void btrfs_init_fs_info(struct btrfs_fs_info *fs_info); struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 1ac00ee795..8398d345ec 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -5,7 +5,6 @@ #include "ctree.h" #include "disk-io.h" #include "btrfs_inode.h" -#include "print-tree.h" #include "export.h" #include "accessors.h" #include "super.h" diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h index eba6bc4f5a..464582273a 100644 --- a/fs/btrfs/export.h +++ b/fs/btrfs/export.h @@ -4,6 +4,10 @@ #define BTRFS_EXPORT_H #include <linux/exportfs.h> +#include <linux/types.h> + +struct dentry; +struct super_block; extern const struct export_operations btrfs_export_ops; diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index e3ee5449cc..c09b428823 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -6,7 +6,6 @@ #include "ctree.h" #include "extent-io-tree.h" #include "btrfs_inode.h" -#include "misc.h" static struct kmem_cache *extent_state_cache; @@ -48,6 +47,7 @@ static inline void btrfs_extent_state_leak_debug_check(void) extent_state_in_tree(state), refcount_read(&state->refs)); list_del(&state->leak_list); + WARN_ON_ONCE(1); kmem_cache_free(extent_state_cache, state); } } @@ -1883,8 +1883,8 @@ void __cold extent_state_free_cachep(void) int __init extent_state_init_cachep(void) { extent_state_cache = kmem_cache_create("btrfs_extent_state", - sizeof(struct extent_state), 0, - SLAB_MEM_SPREAD, NULL); + sizeof(struct extent_state), 0, 0, + NULL); if (!extent_state_cache) return -ENOMEM; diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index ebe6390d65..9d3a52d8f5 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -3,9 +3,16 @@ #ifndef BTRFS_EXTENT_IO_TREE_H #define BTRFS_EXTENT_IO_TREE_H +#include <linux/rbtree.h> +#include <linux/spinlock.h> +#include <linux/refcount.h> +#include <linux/list.h> +#include <linux/wait.h> #include "misc.h" struct extent_changeset; +struct btrfs_fs_info; +struct btrfs_inode; /* Bits for the extent state */ enum { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8e8cc11112..257d044bca 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -18,7 +18,7 @@ #include <linux/crc32c.h> #include "ctree.h" #include "extent-tree.h" -#include "tree-log.h" +#include "transaction.h" #include "disk-io.h" #include "print-tree.h" #include "volumes.h" @@ -26,14 +26,11 @@ #include "locking.h" #include "free-space-cache.h" #include "free-space-tree.h" -#include "sysfs.h" #include "qgroup.h" #include "ref-verify.h" #include "space-info.h" #include "block-rsv.h" -#include "delalloc-space.h" #include "discard.h" -#include "rcu-string.h" #include "zoned.h" #include "dev-replace.h" #include "fs.h" @@ -2399,7 +2396,14 @@ static noinline int check_committed_ref(struct btrfs_root *root, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) goto out; - BUG_ON(ret == 0); /* Corruption */ + if (ret == 0) { + /* + * Key with offset -1 found, there would have to exist an extent + * item with such offset, but this is out of the valid range. + */ + ret = -EUCLEAN; + goto out; + } ret = -ENOENT; if (path->slots[0] == 0) @@ -2780,6 +2784,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, u64 total_unpinned = 0; u64 empty_cluster = 0; bool readonly; + int ret = 0; while (start <= end) { readonly = false; @@ -2789,7 +2794,11 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, btrfs_put_block_group(cache); total_unpinned = 0; cache = btrfs_lookup_block_group(fs_info, start); - BUG_ON(!cache); /* Logic error */ + if (cache == NULL) { + /* Logic error, something removed the block group. */ + ret = -EUCLEAN; + goto out; + } cluster = fetch_cluster_info(fs_info, cache->space_info, @@ -2858,7 +2867,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, if (cache) btrfs_put_block_group(cache); - return 0; +out: + return ret; } int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) @@ -2888,7 +2898,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) end + 1 - start, NULL); clear_extent_dirty(unpin, start, end, &cached_state); - unpin_extent_range(fs_info, start, end, true); + ret = unpin_extent_range(fs_info, start, end, true); + BUG_ON(ret); mutex_unlock(&fs_info->unused_bg_unpin_mutex); free_extent_state(cached_state); cond_resched(); @@ -3447,16 +3458,25 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, u64 parent, int last_ref) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_ref generic_ref = { 0 }; struct btrfs_block_group *bg; int ret; - btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, - buf->start, buf->len, parent, btrfs_header_owner(buf)); - btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), - root_id, 0, false); - if (root_id != BTRFS_TREE_LOG_OBJECTID) { + struct btrfs_ref generic_ref = { 0 }; + + /* + * Assert that the extent buffer is not cleared due to + * EXTENT_BUFFER_ZONED_ZEROOUT. Please refer + * btrfs_clear_buffer_dirty() and btree_csum_one_bio() for + * detail. + */ + ASSERT(btrfs_header_bytenr(buf) != 0); + + btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, + buf->start, buf->len, parent, + btrfs_header_owner(buf)); + btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), + root_id, 0, false); btrfs_ref_tree_mod(fs_info, &generic_ref); ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL); BUG_ON(ret); /* -ENOMEM */ @@ -4950,7 +4970,7 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 root_objectid = root->root_key.objectid; u64 owning_root = root_objectid; - BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); + ASSERT(root_objectid != BTRFS_TREE_LOG_OBJECTID); if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root)) owning_root = root->relocation_src_root; @@ -6167,10 +6187,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, return ret; } -int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, - u64 start, u64 end) +/* + * Unpin the extent range in an error context and don't add the space back. + * Errors are not propagated further. + */ +void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end) { - return unpin_extent_range(fs_info, start, end, false); + unpin_extent_range(fs_info, start, end, false); } /* diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 2e066035cc..af9f8800d5 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -3,11 +3,21 @@ #ifndef BTRFS_EXTENT_TREE_H #define BTRFS_EXTENT_TREE_H +#include <linux/types.h> #include "misc.h" #include "block-group.h" +#include "locking.h" +struct extent_buffer; struct btrfs_free_cluster; +struct btrfs_fs_info; +struct btrfs_root; +struct btrfs_path; +struct btrfs_ref; +struct btrfs_disk_key; struct btrfs_delayed_ref_head; +struct btrfs_delayed_ref_root; +struct btrfs_extent_inline_ref; enum btrfs_extent_allocation_policy { BTRFS_EXTENT_ALLOC_CLUSTERED, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b5175b9929..41173701f1 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -14,7 +14,6 @@ #include <linux/pagevec.h> #include <linux/prefetch.h> #include <linux/fsverity.h> -#include "misc.h" #include "extent_io.h" #include "extent-io-tree.h" #include "extent_map.h" @@ -22,7 +21,6 @@ #include "btrfs_inode.h" #include "bio.h" #include "locking.h" -#include "rcu-string.h" #include "backref.h" #include "disk-io.h" #include "subpage.h" @@ -78,10 +76,11 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) eb = list_first_entry(&fs_info->allocated_ebs, struct extent_buffer, leak_list); pr_err( - "BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n", + "BTRFS: buffer leak start %llu len %u refs %d bflags %lu owner %llu\n", eb->start, eb->len, atomic_read(&eb->refs), eb->bflags, btrfs_header_owner(eb)); list_del(&eb->leak_list); + WARN_ON_ONCE(1); kmem_cache_free(extent_buffer_cache, eb); } spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); @@ -147,8 +146,8 @@ static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret) int __init extent_buffer_init_cachep(void) { extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", - sizeof(struct extent_buffer), 0, - SLAB_MEM_SPREAD, NULL); + sizeof(struct extent_buffer), 0, 0, + NULL); if (!extent_buffer_cache) return -ENOMEM; @@ -462,16 +461,15 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) */ static void end_bbio_data_write(struct btrfs_bio *bbio) { + struct btrfs_fs_info *fs_info = bbio->fs_info; struct bio *bio = &bbio->bio; int error = blk_status_to_errno(bio->bi_status); struct folio_iter fi; + const u32 sectorsize = fs_info->sectorsize; ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_folio_all(fi, bio) { struct folio *folio = fi.folio; - struct inode *inode = folio->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - const u32 sectorsize = fs_info->sectorsize; u64 start = folio_pos(folio) + fi.offset; u32 len = fi.length; @@ -593,22 +591,17 @@ static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) */ static void end_bbio_data_read(struct btrfs_bio *bbio) { + struct btrfs_fs_info *fs_info = bbio->fs_info; struct bio *bio = &bbio->bio; struct processed_extent processed = { 0 }; struct folio_iter fi; - /* - * The offset to the beginning of a bio, since one bio can never be - * larger than UINT_MAX, u32 here is enough. - */ - u32 bio_offset = 0; + const u32 sectorsize = fs_info->sectorsize; ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_folio_all(fi, &bbio->bio) { bool uptodate = !bio->bi_status; struct folio *folio = fi.folio; struct inode *inode = folio->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - const u32 sectorsize = fs_info->sectorsize; u64 start; u64 end; u32 len; @@ -667,10 +660,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) end_page_read(folio_page(folio, 0), uptodate, start, len); endio_readpage_release_extent(&processed, BTRFS_I(inode), start, end, uptodate); - - ASSERT(bio_offset + len > bio_offset); - bio_offset += len; - } /* Release the last extent */ endio_readpage_release_extent(&processed, NULL, 0, 0, false); @@ -728,6 +717,8 @@ static int alloc_eb_folio_array(struct extent_buffer *eb, gfp_t extra_gfp) for (int i = 0; i < num_pages; i++) eb->folios[i] = page_folio(page_array[i]); + eb->folio_size = PAGE_SIZE; + eb->folio_shift = PAGE_SHIFT; return 0; } @@ -964,13 +955,14 @@ void clear_page_extent_mapped(struct page *page) folio_detach_private(folio); } -static struct extent_map * -__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, +static struct extent_map *__get_extent_map(struct inode *inode, struct page *page, u64 start, u64 len, struct extent_map **em_cached) { struct extent_map *em; - if (em_cached && *em_cached) { + ASSERT(em_cached); + + if (*em_cached) { em = *em_cached; if (extent_map_in_tree(em) && start >= em->start && start < extent_map_end(em)) { @@ -982,8 +974,8 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, *em_cached = NULL; } - em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len); - if (em_cached && !IS_ERR(em)) { + em = btrfs_get_extent(BTRFS_I(inode), page, start, len); + if (!IS_ERR(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); *em_cached = em; @@ -1045,8 +1037,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, end_page_read(page, true, cur, iosize); break; } - em = __get_extent_map(inode, page, pg_offset, cur, - end - cur + 1, em_cached); + em = __get_extent_map(inode, page, cur, end - cur + 1, em_cached); if (IS_ERR(em)) { unlock_extent(tree, cur, end, NULL); end_page_read(page, false, cur, end + 1 - cur); @@ -1155,11 +1146,14 @@ int btrfs_read_folio(struct file *file, struct folio *folio) u64 start = page_offset(page); u64 end = start + PAGE_SIZE - 1; struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ }; + struct extent_map *em_cached = NULL; int ret; btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - ret = btrfs_do_readpage(page, NULL, &bio_ctrl, NULL); + ret = btrfs_do_readpage(page, &em_cached, &bio_ctrl, NULL); + free_extent_map(em_cached); + /* * If btrfs_do_readpage() failed we will want to submit the assembled * bio to do the cleanup. @@ -1177,6 +1171,8 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, struct btrfs_inode *inode = page_to_inode(pages[0]); int index; + ASSERT(em_cached); + btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); for (index = 0; index < nr_pages; index++) { @@ -1365,7 +1361,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, continue; } - em = btrfs_get_extent(inode, NULL, 0, cur, len); + em = btrfs_get_extent(inode, NULL, cur, len); if (IS_ERR(em)) { ret = PTR_ERR_OR_ZERO(em); goto out_error; @@ -1733,10 +1729,10 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, folio_lock(folio); folio_clear_dirty_for_io(folio); folio_start_writeback(folio); - ret = bio_add_folio(&bbio->bio, folio, folio_size(folio), 0); + ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); ASSERT(ret); wbc_account_cgroup_owner(wbc, folio_page(folio, 0), - folio_size(folio)); + eb->folio_size); wbc->nr_to_write -= folio_nr_pages(folio); folio_unlock(folio); } @@ -2746,7 +2742,7 @@ static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo, static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path) { - struct extent_buffer *clone; + struct extent_buffer *clone = path->nodes[0]; struct btrfs_key key; int slot; int ret; @@ -2755,29 +2751,51 @@ static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *p if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) return 0; + /* + * Add a temporary extra ref to an already cloned extent buffer to + * prevent btrfs_next_leaf() freeing it, we want to reuse it to avoid + * the cost of allocating a new one. + */ + ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags)); + atomic_inc(&clone->refs); + ret = btrfs_next_leaf(inode->root, path); if (ret != 0) - return ret; + goto out; /* * Don't bother with cloning if there are no more file extent items for * our inode. */ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) - return 1; + if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) { + ret = 1; + goto out; + } + /* + * Important to preserve the start field, for the optimizations when + * checking if extents are shared (see extent_fiemap()). + * + * We must set ->start before calling copy_extent_buffer_full(). If we + * are on sub-pagesize blocksize, we use ->start to determine the offset + * into the folio where our eb exists, and if we update ->start after + * the fact then any subsequent reads of the eb may read from a + * different offset in the folio than where we originally copied into. + */ + clone->start = path->nodes[0]->start; /* See the comment at fiemap_search_slot() about why we clone. */ - clone = btrfs_clone_extent_buffer(path->nodes[0]); - if (!clone) - return -ENOMEM; + copy_extent_buffer_full(clone, path->nodes[0]); slot = path->slots[0]; btrfs_release_path(path); path->nodes[0] = clone; path->slots[0] = slot; +out: + if (ret) + free_extent_buffer(clone); - return 0; + return ret; } /* @@ -3644,6 +3662,8 @@ static struct extent_buffer *grab_extent_buffer( struct folio *folio = page_folio(page); struct extent_buffer *exists; + lockdep_assert_held(&page->mapping->i_private_lock); + /* * For subpage case, we completely rely on radix tree to ensure we * don't try to insert two ebs for the same bytenr. So here we always @@ -3711,13 +3731,14 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) * The caller needs to free the existing folios and retry using the same order. */ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, + struct btrfs_subpage *prealloc, struct extent_buffer **found_eb_ret) { struct btrfs_fs_info *fs_info = eb->fs_info; struct address_space *mapping = fs_info->btree_inode->i_mapping; const unsigned long index = eb->start >> PAGE_SHIFT; - struct folio *existing_folio; + struct folio *existing_folio = NULL; int ret; ASSERT(found_eb_ret); @@ -3729,30 +3750,31 @@ retry: ret = filemap_add_folio(mapping, eb->folios[i], index + i, GFP_NOFS | __GFP_NOFAIL); if (!ret) - return 0; + goto finish; existing_folio = filemap_lock_folio(mapping, index + i); /* The page cache only exists for a very short time, just retry. */ - if (IS_ERR(existing_folio)) + if (IS_ERR(existing_folio)) { + existing_folio = NULL; goto retry; + } /* For now, we should only have single-page folios for btree inode. */ ASSERT(folio_nr_pages(existing_folio) == 1); - if (folio_size(existing_folio) != folio_size(eb->folios[0])) { + if (folio_size(existing_folio) != eb->folio_size) { folio_unlock(existing_folio); folio_put(existing_folio); return -EAGAIN; } - if (fs_info->nodesize < PAGE_SIZE) { - /* - * We're going to reuse the existing page, can drop our page - * and subpage structure now. - */ +finish: + spin_lock(&mapping->i_private_lock); + if (existing_folio && fs_info->nodesize < PAGE_SIZE) { + /* We're going to reuse the existing page, can drop our folio now. */ __free_page(folio_page(eb->folios[i], 0)); eb->folios[i] = existing_folio; - } else { + } else if (existing_folio) { struct extent_buffer *existing_eb; existing_eb = grab_extent_buffer(fs_info, @@ -3760,6 +3782,7 @@ retry: if (existing_eb) { /* The extent buffer still exists, we can use it directly. */ *found_eb_ret = existing_eb; + spin_unlock(&mapping->i_private_lock); folio_unlock(existing_folio); folio_put(existing_folio); return 1; @@ -3768,6 +3791,22 @@ retry: __free_page(folio_page(eb->folios[i], 0)); eb->folios[i] = existing_folio; } + eb->folio_size = folio_size(eb->folios[i]); + eb->folio_shift = folio_shift(eb->folios[i]); + /* Should not fail, as we have preallocated the memory. */ + ret = attach_extent_buffer_folio(eb, eb->folios[i], prealloc); + ASSERT(!ret); + /* + * To inform we have an extra eb under allocation, so that + * detach_extent_buffer_page() won't release the folio private when the + * eb hasn't been inserted into radix tree yet. + * + * The ref will be decreased when the eb releases the page, in + * detach_extent_buffer_page(). Thus needs no special handling in the + * error path. + */ + btrfs_folio_inc_eb_refs(fs_info, eb->folios[i]); + spin_unlock(&mapping->i_private_lock); return 0; } @@ -3779,7 +3818,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, int attached = 0; struct extent_buffer *eb; struct extent_buffer *existing_eb = NULL; - struct address_space *mapping = fs_info->btree_inode->i_mapping; struct btrfs_subpage *prealloc = NULL; u64 lockdep_owner = owner_root; bool page_contig = true; @@ -3845,7 +3883,7 @@ reallocate: for (int i = 0; i < num_folios; i++) { struct folio *folio; - ret = attach_eb_folio_to_filemap(eb, i, &existing_eb); + ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb); if (ret > 0) { ASSERT(existing_eb); goto out; @@ -3882,22 +3920,6 @@ reallocate: * and free the allocated page. */ folio = eb->folios[i]; - spin_lock(&mapping->i_private_lock); - /* Should not fail, as we have preallocated the memory */ - ret = attach_extent_buffer_folio(eb, folio, prealloc); - ASSERT(!ret); - /* - * To inform we have extra eb under allocation, so that - * detach_extent_buffer_page() won't release the folio private - * when the eb hasn't yet been inserted into radix tree. - * - * The ref will be decreased when the eb released the page, in - * detach_extent_buffer_page(). - * Thus needs no special handling in error path. - */ - btrfs_folio_inc_eb_refs(fs_info, folio); - spin_unlock(&mapping->i_private_lock); - WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len)); /* @@ -4169,6 +4191,7 @@ void set_extent_buffer_dirty(struct extent_buffer *eb) num_folios = num_extent_folios(eb); WARN_ON(atomic_read(&eb->refs) == 0); WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); + WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)); if (!was_dirty) { bool subpage = eb->fs_info->nodesize < PAGE_SIZE; @@ -4344,7 +4367,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, for (int i = 0; i < num_folios; i++) { struct folio *folio = eb->folios[i]; - ret = bio_add_folio(&bbio->bio, folio, folio_size(folio), 0); + ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); ASSERT(ret); } } @@ -4364,7 +4387,7 @@ static bool report_eb_range(const struct extent_buffer *eb, unsigned long start, unsigned long len) { btrfs_warn(eb->fs_info, - "access to eb bytenr %llu len %lu out of range start %lu len %lu", + "access to eb bytenr %llu len %u out of range start %lu len %lu", eb->start, eb->len, start, len); WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); @@ -4393,7 +4416,7 @@ static inline int check_eb_range(const struct extent_buffer *eb, void read_extent_buffer(const struct extent_buffer *eb, void *dstv, unsigned long start, unsigned long len) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; size_t cur; size_t offset; char *dst = (char *)dstv; @@ -4433,7 +4456,7 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, void __user *dstv, unsigned long start, unsigned long len) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; size_t cur; size_t offset; char __user *dst = (char __user *)dstv; @@ -4473,7 +4496,7 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, unsigned long start, unsigned long len) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; size_t cur; size_t offset; char *kaddr; @@ -4544,7 +4567,7 @@ static void __write_extent_buffer(const struct extent_buffer *eb, const void *srcv, unsigned long start, unsigned long len, bool use_memmove) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; size_t cur; size_t offset; char *kaddr; @@ -4593,7 +4616,7 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, static void memset_extent_buffer(const struct extent_buffer *eb, int c, unsigned long start, unsigned long len) { - const int unit_size = folio_size(eb->folios[0]); + const int unit_size = eb->folio_size; unsigned long cur = start; if (eb->addr) { @@ -4624,7 +4647,7 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, void copy_extent_buffer_full(const struct extent_buffer *dst, const struct extent_buffer *src) { - const int unit_size = folio_size(src->folios[0]); + const int unit_size = src->folio_size; unsigned long cur = 0; ASSERT(dst->len == src->len); @@ -4646,7 +4669,7 @@ void copy_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - const int unit_size = folio_size(dst->folios[0]); + const int unit_size = dst->folio_size; u64 dst_len = dst->len; size_t cur; size_t offset; @@ -4702,10 +4725,10 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb, * the bitmap item in the extent buffer + the offset of the byte in the * bitmap item. */ - offset = start + offset_in_folio(eb->folios[0], eb->start) + byte_offset; + offset = start + offset_in_eb_folio(eb, eb->start) + byte_offset; - *folio_index = offset >> folio_shift(eb->folios[0]); - *folio_offset = offset_in_folio(eb->folios[0], offset); + *folio_index = offset >> eb->folio_shift; + *folio_offset = offset_in_eb_folio(eb, offset); } /* @@ -4819,7 +4842,7 @@ void memcpy_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - const int unit_size = folio_size(dst->folios[0]); + const int unit_size = dst->folio_size; unsigned long cur_off = 0; if (check_eb_range(dst, dst_offset, len) || diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 2c9d6570b0..e3530d427e 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -7,11 +7,32 @@ #include <linux/refcount.h> #include <linux/fiemap.h> #include <linux/btrfs_tree.h> +#include <linux/spinlock.h> +#include <linux/atomic.h> +#include <linux/rwsem.h> +#include <linux/list.h> +#include <linux/slab.h> #include "compression.h" +#include "messages.h" #include "ulist.h" #include "misc.h" +struct page; +struct file; +struct folio; +struct inode; +struct fiemap_extent_info; +struct readahead_control; +struct address_space; +struct writeback_control; +struct extent_io_tree; +struct extent_map_tree; +struct btrfs_block_group; +struct btrfs_fs_info; +struct btrfs_inode; +struct btrfs_root; struct btrfs_trans_handle; +struct btrfs_tree_parent_check; enum { EXTENT_BUFFER_UPTODATE, @@ -63,11 +84,6 @@ enum { #define BITMAP_LAST_BYTE_MASK(nbits) \ (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) -struct btrfs_root; -struct btrfs_inode; -struct btrfs_fs_info; -struct extent_io_tree; -struct btrfs_tree_parent_check; int __init extent_buffer_init_cachep(void); void __cold extent_buffer_free_cachep(void); @@ -75,7 +91,8 @@ void __cold extent_buffer_free_cachep(void); #define INLINE_EXTENT_BUFFER_PAGES (BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE) struct extent_buffer { u64 start; - unsigned long len; + u32 len; + u32 folio_size; unsigned long bflags; struct btrfs_fs_info *fs_info; @@ -90,6 +107,7 @@ struct extent_buffer { int read_mirror; /* >= 0 if eb belongs to a log tree, -1 otherwise */ s8 log_index; + u8 folio_shift; struct rcu_head rcu_head; struct rw_semaphore lock; @@ -113,6 +131,13 @@ struct btrfs_eb_write_context { struct btrfs_block_group *zoned_bg; }; +static inline unsigned long offset_in_eb_folio(const struct extent_buffer *eb, + u64 start) +{ + ASSERT(eb->folio_size); + return start & (eb->folio_size - 1); +} + /* * Get the correct offset inside the page of extent buffer. * @@ -151,13 +176,13 @@ static inline unsigned long get_eb_folio_index(const struct extent_buffer *eb, * the folio_shift would be large enough to always make us * return 0 as index. * 1.2) Several page sized folios - * The folio_shift() would be PAGE_SHIFT, giving us the correct + * The folio_shift would be PAGE_SHIFT, giving us the correct * index. * * 2) sectorsize < PAGE_SIZE and nodesize < PAGE_SIZE case * The folio would only be page sized, and always give us 0 as index. */ - return offset >> folio_shift(eb->folios[0]); + return offset >> eb->folio_shift; } /* @@ -205,8 +230,6 @@ static inline void extent_changeset_free(struct extent_changeset *changeset) kfree(changeset); } -struct extent_map_tree; - int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 54e111b5fa..24a048210b 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -5,7 +5,6 @@ #include <linux/spinlock.h> #include "messages.h" #include "ctree.h" -#include "volumes.h" #include "extent_map.h" #include "compression.h" #include "btrfs_inode.h" @@ -16,8 +15,7 @@ static struct kmem_cache *extent_map_cache; int __init extent_map_init(void) { extent_map_cache = kmem_cache_create("btrfs_extent_map", - sizeof(struct extent_map), 0, - SLAB_MEM_SPREAD, NULL); + sizeof(struct extent_map), 0, 0, NULL); if (!extent_map_cache) return -ENOMEM; return 0; @@ -539,7 +537,8 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, u64 end; u64 start_diff; - BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); + if (map_start < em->start || map_start >= extent_map_end(em)) + return -EINVAL; if (existing->start > map_start) { next = existing; @@ -630,13 +629,13 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, */ ret = merge_extent_mapping(em_tree, existing, em, start); - if (ret) { + if (WARN_ON(ret)) { free_extent_map(em); *em_in = NULL; - WARN_ONCE(ret, -"unexpected error %d: merge existing(start %llu len %llu) with em(start %llu len %llu)\n", - ret, existing->start, existing->len, - orig_start, orig_len); + btrfs_warn(fs_info, +"extent map merge error existing [%llu, %llu) with em [%llu, %llu) start %llu", + existing->start, extent_map_end(existing), + orig_start, orig_start + orig_len, start); } free_extent_map(existing); } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index e380fc08bb..c5a098c99c 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -3,10 +3,18 @@ #ifndef BTRFS_EXTENT_MAP_H #define BTRFS_EXTENT_MAP_H +#include <linux/compiler_types.h> +#include <linux/rwlock_types.h> #include <linux/rbtree.h> +#include <linux/list.h> #include <linux/refcount.h> +#include "misc.h" +#include "extent_map.h" #include "compression.h" +struct btrfs_inode; +struct btrfs_fs_info; + #define EXTENT_MAP_LAST_BYTE ((u64)-4) #define EXTENT_MAP_HOLE ((u64)-3) #define EXTENT_MAP_INLINE ((u64)-2) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 81ac1d474b..e58fb5347e 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -10,17 +10,14 @@ #include <linux/sched/mm.h> #include <crypto/hash.h> #include "messages.h" -#include "misc.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "bio.h" -#include "print-tree.h" #include "compression.h" #include "fs.h" #include "accessors.h" #include "file-item.h" -#include "super.h" #define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) * 2) / \ @@ -179,7 +176,6 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, sizeof(*item)); if (ret < 0) goto out; - BUG_ON(ret); /* Can't happen */ leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -1229,8 +1225,6 @@ insert: ins_size); if (ret < 0) goto out; - if (WARN_ON(ret != 0)) - goto out; leaf = path->nodes[0]; csum: item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 04bd2d34ef..15c05cc0fc 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -3,8 +3,21 @@ #ifndef BTRFS_FILE_ITEM_H #define BTRFS_FILE_ITEM_H +#include <linux/list.h> +#include <uapi/linux/btrfs_tree.h> #include "accessors.h" +struct extent_map; +struct btrfs_file_extent_item; +struct btrfs_fs_info; +struct btrfs_path; +struct btrfs_bio; +struct btrfs_trans_handle; +struct btrfs_root; +struct btrfs_ordered_sum; +struct btrfs_path; +struct btrfs_inode; + #define BTRFS_FILE_EXTENT_INLINE_DATA_START \ (offsetof(struct btrfs_file_extent_item, disk_bytenr)) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 616790d6e5..f9d7607239 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -22,10 +22,8 @@ #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "print-tree.h" #include "tree-log.h" #include "locking.h" -#include "volumes.h" #include "qgroup.h" #include "compression.h" #include "delalloc-space.h" @@ -1912,6 +1910,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out_release_extents; } + btrfs_init_log_ctx_scratch_eb(&ctx); + /* * We use start here because we will need to wait on the IO to complete * in btrfs_sync_log, which could require joining a transaction (for @@ -1931,6 +1931,15 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trans->in_fsync = true; ret = btrfs_log_dentry_safe(trans, dentry, &ctx); + /* + * Scratch eb no longer needed, release before syncing log or commit + * transaction, to avoid holding unnecessary memory during such long + * operations. + */ + if (ctx.scratch_eb) { + free_extent_buffer(ctx.scratch_eb); + ctx.scratch_eb = NULL; + } btrfs_release_log_ctx_extents(&ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */ @@ -2006,6 +2015,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = btrfs_commit_transaction(trans); out: + free_extent_buffer(ctx.scratch_eb); ASSERT(list_empty(&ctx.list)); ASSERT(list_empty(&ctx.conflict_inodes)); err = file_check_and_advance_wb_err(file); @@ -2176,7 +2186,7 @@ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len) struct extent_map *em; int ret = 0; - em = btrfs_get_extent(inode, NULL, 0, + em = btrfs_get_extent(inode, NULL, round_down(*start, fs_info->sectorsize), round_up(*len, fs_info->sectorsize)); if (IS_ERR(em)) @@ -2835,7 +2845,7 @@ static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode, int ret; offset = round_down(offset, sectorsize); - em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize); + em = btrfs_get_extent(inode, NULL, offset, sectorsize); if (IS_ERR(em)) return PTR_ERR(em); @@ -2866,7 +2876,7 @@ static int btrfs_zero_range(struct inode *inode, u64 bytes_to_reserve = 0; bool space_reserved = false; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start, + em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, alloc_end - alloc_start); if (IS_ERR(em)) { ret = PTR_ERR(em); @@ -2909,8 +2919,7 @@ static int btrfs_zero_range(struct inode *inode, if (BTRFS_BYTES_TO_BLKS(fs_info, offset) == BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) { - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start, - sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, sectorsize); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; @@ -3005,7 +3014,7 @@ reserve_space: } ret = btrfs_prealloc_file_range(inode, mode, alloc_start, alloc_end - alloc_start, - i_blocksize(inode), + fs_info->sectorsize, offset + len, &alloc_hint); unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); @@ -3126,7 +3135,7 @@ static long btrfs_fallocate(struct file *file, int mode, /* First, check if we exceed the qgroup limit */ while (cur_offset < alloc_end) { - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, + em = btrfs_get_extent(BTRFS_I(inode), NULL, cur_offset, alloc_end - cur_offset); if (IS_ERR(em)) { ret = PTR_ERR(em); @@ -3177,7 +3186,7 @@ static long btrfs_fallocate(struct file *file, int mode, if (!ret) { ret = btrfs_prealloc_file_range(inode, mode, range->start, - range->len, i_blocksize(inode), + range->len, blocksize, offset + len, &alloc_hint); /* * btrfs_prealloc_file_range() releases space even diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h index 82b34fbb29..77aaca208c 100644 --- a/fs/btrfs/file.h +++ b/fs/btrfs/file.h @@ -3,6 +3,21 @@ #ifndef BTRFS_FILE_H #define BTRFS_FILE_H +#include <linux/types.h> + +struct file; +struct extent_state; +struct kiocb; +struct iov_iter; +struct page; +struct btrfs_ioctl_encoded_io_args; +struct btrfs_drop_extents_args; +struct btrfs_inode; +struct btrfs_root; +struct btrfs_path; +struct btrfs_replace_extent_info; +struct btrfs_trans_handle; + extern const struct file_operations btrfs_file_operations; int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 815bb146b1..c8a05d5eb9 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -19,9 +19,7 @@ #include "transaction.h" #include "disk-io.h" #include "extent_io.h" -#include "volumes.h" #include "space-info.h" -#include "delalloc-space.h" #include "block-group.h" #include "discard.h" #include "subpage.h" @@ -2621,7 +2619,7 @@ static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl, } } -int __btrfs_add_free_space(struct btrfs_block_group *block_group, +static int __btrfs_add_free_space(struct btrfs_block_group *block_group, u64 offset, u64 bytes, enum btrfs_trim_state trim_state) { @@ -4156,15 +4154,13 @@ out: int __init btrfs_free_space_init(void) { - btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", - sizeof(struct btrfs_free_space), 0, - SLAB_MEM_SPREAD, NULL); + btrfs_free_space_cachep = KMEM_CACHE(btrfs_free_space, 0); if (!btrfs_free_space_cachep) return -ENOMEM; btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", PAGE_SIZE, PAGE_SIZE, - SLAB_MEM_SPREAD, NULL); + 0, NULL); if (!btrfs_free_space_bitmap_cachep) { kmem_cache_destroy(btrfs_free_space_cachep); return -ENOMEM; diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 33b4da3271..83774bfd7b 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -6,6 +6,19 @@ #ifndef BTRFS_FREE_SPACE_CACHE_H #define BTRFS_FREE_SPACE_CACHE_H +#include <linux/rbtree.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/mutex.h> +#include "fs.h" + +struct inode; +struct page; +struct btrfs_fs_info; +struct btrfs_path; +struct btrfs_trans_handle; +struct btrfs_trim_block_group; + /* * This is the trim state of an extent or bitmap. * @@ -114,8 +127,6 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans, void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, struct btrfs_free_space_ctl *ctl); -int __btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, - u64 size, enum btrfs_trim_state trim_state); int btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size); int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 7b598b0707..90f2938bd7 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1176,12 +1176,16 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) BTRFS_FREE_SPACE_TREE_OBJECTID); if (IS_ERR(free_space_root)) { ret = PTR_ERR(free_space_root); - goto abort; + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + goto out_clear; } ret = btrfs_global_root_insert(free_space_root); if (ret) { btrfs_put_root(free_space_root); - goto abort; + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + goto out_clear; } node = rb_first_cached(&fs_info->block_group_cache_tree); @@ -1189,8 +1193,11 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) block_group = rb_entry(node, struct btrfs_block_group, cache_node); ret = populate_free_space_tree(trans, block_group); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + goto out_clear; + } node = rb_next(node); } @@ -1206,11 +1213,9 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); return ret; -abort: +out_clear: clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); return ret; } @@ -1273,12 +1278,18 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); ret = clear_free_space_tree(trans, free_space_root); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } ret = btrfs_del_root(trans, &free_space_root->root_key); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } btrfs_global_root_delete(free_space_root); @@ -1295,11 +1306,6 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) btrfs_put_root(free_space_root); return btrfs_commit_transaction(trans); - -abort: - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - return ret; } int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) @@ -1322,8 +1328,11 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); ret = clear_free_space_tree(trans, free_space_root); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } node = rb_first_cached(&fs_info->block_group_cache_tree); while (node) { @@ -1332,8 +1341,11 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) block_group = rb_entry(node, struct btrfs_block_group, cache_node); ret = populate_free_space_tree(trans, block_group); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } node = rb_next(node); } @@ -1344,10 +1356,6 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) ret = btrfs_commit_transaction(trans); clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); return ret; -abort: - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - return ret; } static int __add_block_group_free_space(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h index 6d5551d0ce..e6c6d6f4f2 100644 --- a/fs/btrfs/free-space-tree.h +++ b/fs/btrfs/free-space-tree.h @@ -6,7 +6,13 @@ #ifndef BTRFS_FREE_SPACE_TREE_H #define BTRFS_FREE_SPACE_TREE_H +#include <linux/bits.h> + struct btrfs_caching_control; +struct btrfs_fs_info; +struct btrfs_path; +struct btrfs_block_group; +struct btrfs_trans_handle; /* * The default size for new free space bitmap items. The last bitmap in a block diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index bd59cf0aae..93f5c57ea4 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -4,13 +4,50 @@ #define BTRFS_FS_H #include <linux/blkdev.h> -#include <linux/fs.h> -#include <linux/btrfs_tree.h> #include <linux/sizes.h> +#include <linux/time64.h> +#include <linux/compiler.h> +#include <linux/math.h> +#include <linux/atomic.h> +#include <linux/blkdev.h> +#include <linux/percpu_counter.h> +#include <linux/completion.h> +#include <linux/lockdep.h> +#include <linux/spinlock.h> +#include <linux/mutex.h> +#include <linux/rwlock_types.h> +#include <linux/rwsem.h> +#include <linux/semaphore.h> +#include <linux/list.h> +#include <linux/radix-tree.h> +#include <linux/workqueue.h> +#include <linux/wait.h> +#include <linux/wait_bit.h> +#include <linux/sched.h> +#include <linux/rbtree.h> +#include <uapi/linux/btrfs.h> +#include <uapi/linux/btrfs_tree.h> #include "extent-io-tree.h" -#include "extent_map.h" #include "async-thread.h" #include "block-rsv.h" +#include "fs.h" + +struct inode; +struct super_block; +struct kobject; +struct reloc_control; +struct crypto_shash; +struct ulist; +struct btrfs_device; +struct btrfs_block_group; +struct btrfs_root; +struct btrfs_fs_devices; +struct btrfs_transaction; +struct btrfs_delayed_root; +struct btrfs_balance_control; +struct btrfs_subpage_info; +struct btrfs_stripe_hash_table; +struct btrfs_space_info; #define BTRFS_MAX_EXTENT_SIZE SZ_128M @@ -732,10 +769,13 @@ struct btrfs_fs_info { /* Reclaim partially filled block groups in the background */ struct work_struct reclaim_bgs_work; + /* Protected by unused_bgs_lock. */ struct list_head reclaim_bgs; int bg_reclaim_threshold; + /* Protects the lists unused_bgs and reclaim_bgs. */ spinlock_t unused_bgs_lock; + /* Protected by unused_bgs_lock. */ struct list_head unused_bgs; struct mutex unused_bg_unpin_mutex; /* Protect block groups that are going to be deleted */ @@ -933,6 +973,8 @@ void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation op); +int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args); + /* Compatibility and incompatibility defines */ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, const char *name); diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 7d734830e5..9c1394c0a6 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -9,7 +9,6 @@ #include "inode-item.h" #include "disk-io.h" #include "transaction.h" -#include "print-tree.h" #include "space-info.h" #include "accessors.h" #include "extent-tree.h" diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h index 4337bb26f4..c4aded8270 100644 --- a/fs/btrfs/inode-item.h +++ b/fs/btrfs/inode-item.h @@ -6,14 +6,15 @@ #include <linux/types.h> #include <linux/crc32c.h> +struct fscrypt_str; +struct extent_buffer; struct btrfs_trans_handle; struct btrfs_root; struct btrfs_path; struct btrfs_key; struct btrfs_inode_extref; struct btrfs_inode; -struct extent_buffer; -struct fscrypt_str; +struct btrfs_truncate_control; /* * Return this if we need to call truncate_block for the last bit of the diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d3c534c1bf..7fed887e70 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -39,14 +39,12 @@ #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "print-tree.h" #include "ordered-data.h" #include "xattr.h" #include "tree-log.h" #include "bio.h" #include "compression.h" #include "locking.h" -#include "free-space-cache.h" #include "props.h" #include "qgroup.h" #include "delalloc-space.h" @@ -740,7 +738,8 @@ static noinline int add_async_extent(struct async_chunk *cow, struct async_extent *async_extent; async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); - BUG_ON(!async_extent); /* -ENOMEM */ + if (!async_extent) + return -ENOMEM; async_extent->start = start; async_extent->ram_size = ram_size; async_extent->compressed_size = compressed_size; @@ -1027,8 +1026,9 @@ again: * The async work queues will take care of doing actual allocation on * disk for these compressed pages, and will submit the bios. */ - add_async_extent(async_chunk, start, total_in, total_compressed, pages, - nr_pages, compress_type); + ret = add_async_extent(async_chunk, start, total_in, total_compressed, pages, + nr_pages, compress_type); + BUG_ON(ret); if (start + total_in < end) { start += total_in; cond_resched(); @@ -1040,8 +1040,9 @@ mark_incompressible: if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress) inode->flags |= BTRFS_INODE_NOCOMPRESS; cleanup_and_bail_uncompressed: - add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, - BTRFS_COMPRESS_NONE); + ret = add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, + BTRFS_COMPRESS_NONE); + BUG_ON(ret); free_pages: if (pages) { for (i = 0; i < nr_pages; i++) { @@ -2301,6 +2302,8 @@ void btrfs_split_delalloc_extent(struct btrfs_inode *inode, struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 size; + lockdep_assert_held(&inode->io_tree.lock); + /* not delalloc, ignore it */ if (!(orig->state & EXTENT_DELALLOC)) return; @@ -2339,6 +2342,8 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state u64 new_size, old_size; u32 num_extents; + lockdep_assert_held(&inode->io_tree.lock); + /* not delalloc, ignore it */ if (!(other->state & EXTENT_DELALLOC)) return; @@ -2386,55 +2391,50 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state spin_unlock(&inode->lock); } -static void btrfs_add_delalloc_inodes(struct btrfs_root *root, - struct btrfs_inode *inode) +static void btrfs_add_delalloc_inode(struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; spin_lock(&root->delalloc_lock); - if (list_empty(&inode->delalloc_inodes)) { - list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes); - set_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags); - root->nr_delalloc_inodes++; - if (root->nr_delalloc_inodes == 1) { - spin_lock(&fs_info->delalloc_root_lock); - BUG_ON(!list_empty(&root->delalloc_root)); - list_add_tail(&root->delalloc_root, - &fs_info->delalloc_roots); - spin_unlock(&fs_info->delalloc_root_lock); - } + ASSERT(list_empty(&inode->delalloc_inodes)); + list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes); + root->nr_delalloc_inodes++; + if (root->nr_delalloc_inodes == 1) { + spin_lock(&fs_info->delalloc_root_lock); + ASSERT(list_empty(&root->delalloc_root)); + list_add_tail(&root->delalloc_root, &fs_info->delalloc_roots); + spin_unlock(&fs_info->delalloc_root_lock); } spin_unlock(&root->delalloc_lock); } -void __btrfs_del_delalloc_inode(struct btrfs_root *root, - struct btrfs_inode *inode) +void btrfs_del_delalloc_inode(struct btrfs_inode *inode) { + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; + lockdep_assert_held(&root->delalloc_lock); + + /* + * We may be called after the inode was already deleted from the list, + * namely in the transaction abort path btrfs_destroy_delalloc_inodes(), + * and then later through btrfs_clear_delalloc_extent() while the inode + * still has ->delalloc_bytes > 0. + */ if (!list_empty(&inode->delalloc_inodes)) { list_del_init(&inode->delalloc_inodes); - clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &inode->runtime_flags); root->nr_delalloc_inodes--; if (!root->nr_delalloc_inodes) { ASSERT(list_empty(&root->delalloc_inodes)); spin_lock(&fs_info->delalloc_root_lock); - BUG_ON(list_empty(&root->delalloc_root)); + ASSERT(!list_empty(&root->delalloc_root)); list_del_init(&root->delalloc_root); spin_unlock(&fs_info->delalloc_root_lock); } } } -static void btrfs_del_delalloc_inode(struct btrfs_root *root, - struct btrfs_inode *inode) -{ - spin_lock(&root->delalloc_lock); - __btrfs_del_delalloc_inode(root, inode); - spin_unlock(&root->delalloc_lock); -} - /* * Properly track delayed allocation bytes in the inode and to maintain the * list of inodes that have pending delalloc work to be done. @@ -2444,6 +2444,8 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s { struct btrfs_fs_info *fs_info = inode->root->fs_info; + lockdep_assert_held(&inode->io_tree.lock); + if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC)) WARN_ON(1); /* @@ -2452,10 +2454,9 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s * bit, which is only set or cleared with irqs on */ if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { - struct btrfs_root *root = inode->root; u64 len = state->end + 1 - state->start; + u64 prev_delalloc_bytes; u32 num_extents = count_max_extents(fs_info, len); - bool do_list = !btrfs_is_free_space_inode(inode); spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, num_extents); @@ -2468,13 +2469,20 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s percpu_counter_add_batch(&fs_info->delalloc_bytes, len, fs_info->delalloc_batch); spin_lock(&inode->lock); + prev_delalloc_bytes = inode->delalloc_bytes; inode->delalloc_bytes += len; if (bits & EXTENT_DEFRAG) inode->defrag_bytes += len; - if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &inode->runtime_flags)) - btrfs_add_delalloc_inodes(root, inode); spin_unlock(&inode->lock); + + /* + * We don't need to be under the protection of the inode's lock, + * because we are called while holding the inode's io_tree lock + * and are therefore protected against concurrent calls of this + * function and btrfs_clear_delalloc_extent(). + */ + if (!btrfs_is_free_space_inode(inode) && prev_delalloc_bytes == 0) + btrfs_add_delalloc_inode(inode); } if (!(state->state & EXTENT_DELALLOC_NEW) && @@ -2496,6 +2504,8 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(fs_info, len); + lockdep_assert_held(&inode->io_tree.lock); + if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) { spin_lock(&inode->lock); inode->defrag_bytes -= len; @@ -2509,7 +2519,7 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, */ if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = inode->root; - bool do_list = !btrfs_is_free_space_inode(inode); + u64 new_delalloc_bytes; spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, -num_extents); @@ -2529,7 +2539,8 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, return; if (!btrfs_is_data_reloc_root(root) && - do_list && !(state->state & EXTENT_NORESERVE) && + !btrfs_is_free_space_inode(inode) && + !(state->state & EXTENT_NORESERVE) && (bits & EXTENT_CLEAR_DATA_RESV)) btrfs_free_reserved_data_space_noquota(fs_info, len); @@ -2537,11 +2548,20 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, fs_info->delalloc_batch); spin_lock(&inode->lock); inode->delalloc_bytes -= len; - if (do_list && inode->delalloc_bytes == 0 && - test_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &inode->runtime_flags)) - btrfs_del_delalloc_inode(root, inode); + new_delalloc_bytes = inode->delalloc_bytes; spin_unlock(&inode->lock); + + /* + * We don't need to be under the protection of the inode's lock, + * because we are called while holding the inode's io_tree lock + * and are therefore protected against concurrent calls of this + * function and btrfs_set_delalloc_extent(). + */ + if (!btrfs_is_free_space_inode(inode) && new_delalloc_bytes == 0) { + spin_lock(&root->delalloc_lock); + btrfs_del_delalloc_inode(inode); + spin_unlock(&root->delalloc_lock); + } } if ((state->state & EXTENT_DELALLOC_NEW) && @@ -2631,7 +2651,7 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, u64 em_len; int ret = 0; - em = btrfs_get_extent(inode, NULL, 0, search_start, search_len); + em = btrfs_get_extent(inode, NULL, search_start, search_len); if (IS_ERR(em)) return PTR_ERR(em); @@ -4387,7 +4407,14 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) goto out; - BUG_ON(ret == 0); + if (ret == 0) { + /* + * Key with offset -1 found, there would have to exist a root + * with such id, but this is out of valid range. + */ + ret = -EUCLEAN; + goto out; + } ret = 0; if (path->slots[0] > 0) { @@ -4710,7 +4737,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, u32 blocksize = fs_info->sectorsize; pgoff_t index = from >> PAGE_SHIFT; unsigned offset = from & (blocksize - 1); - struct page *page; + struct folio *folio; gfp_t mask = btrfs_alloc_write_mask(mapping); size_t write_bytes = blocksize; int ret = 0; @@ -4742,8 +4769,9 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, goto out; } again: - page = find_or_create_page(mapping, index, mask); - if (!page) { + folio = __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); + if (IS_ERR(folio)) { btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize, true); btrfs_delalloc_release_extents(inode, blocksize); @@ -4751,15 +4779,15 @@ again: goto out; } - if (!PageUptodate(page)) { - ret = btrfs_read_folio(NULL, page_folio(page)); - lock_page(page); - if (page->mapping != mapping) { - unlock_page(page); - put_page(page); + if (!folio_test_uptodate(folio)) { + ret = btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (folio->mapping != mapping) { + folio_unlock(folio); + folio_put(folio); goto again; } - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { ret = -EIO; goto out_unlock; } @@ -4771,19 +4799,19 @@ again: * folio private, but left the page in the mapping. Set the page mapped * here to make sure it's properly set for the subpage stuff. */ - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) goto out_unlock; - wait_on_page_writeback(page); + folio_wait_writeback(folio); lock_extent(io_tree, block_start, block_end, &cached_state); ordered = btrfs_lookup_ordered_extent(inode, block_start); if (ordered) { unlock_extent(io_tree, block_start, block_end, &cached_state); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; @@ -4804,15 +4832,16 @@ again: if (!len) len = blocksize - offset; if (front) - memzero_page(page, (block_start - page_offset(page)), - offset); + folio_zero_range(folio, block_start - folio_pos(folio), + offset); else - memzero_page(page, (block_start - page_offset(page)) + offset, - len); + folio_zero_range(folio, + (block_start - folio_pos(folio)) + offset, + len); } - btrfs_folio_clear_checked(fs_info, page_folio(page), block_start, + btrfs_folio_clear_checked(fs_info, folio, block_start, block_end + 1 - block_start); - btrfs_folio_set_dirty(fs_info, page_folio(page), block_start, + btrfs_folio_set_dirty(fs_info, folio, block_start, block_end + 1 - block_start); unlock_extent(io_tree, block_start, block_end, &cached_state); @@ -4829,8 +4858,8 @@ out_unlock: block_start, blocksize, true); } btrfs_delalloc_release_extents(inode, blocksize); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); out: if (only_release_metadata) btrfs_check_nocow_unlock(inode); @@ -4922,8 +4951,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) &cached_state); cur_offset = hole_start; while (1) { - em = btrfs_get_extent(inode, NULL, 0, cur_offset, - block_end - cur_offset); + em = btrfs_get_extent(inode, NULL, cur_offset, block_end - cur_offset); if (IS_ERR(em)) { err = PTR_ERR(em); em = NULL; @@ -5549,7 +5577,6 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p) BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; BTRFS_I(inode)->location.offset = 0; BTRFS_I(inode)->root = btrfs_grab_root(args->root); - BUG_ON(args->root && !BTRFS_I(inode)->root); if (args->root && args->root == args->root->fs_info->tree_root && args->ino != BTRFS_BTREE_INODE_OBJECTID) @@ -6772,7 +6799,6 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path * * @inode: file to search in * @page: page to read extent data into if the extent is inline - * @pg_offset: offset into @page to copy to * @start: file offset * @len: length of range starting at @start * @@ -6786,8 +6812,7 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path * Return: ERR_PTR on error, non-NULL extent_map on success. */ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 len) + struct page *page, u64 start, u64 len) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret = 0; @@ -6930,7 +6955,6 @@ next: * ensured by tree-checker and inline extent creation path. * Thus all members representing file offsets should be zero. */ - ASSERT(pg_offset == 0); ASSERT(extent_start == 0); ASSERT(em->start == 0); @@ -7571,7 +7595,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, if (ret < 0) goto err; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); + em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto unlock_err; @@ -8716,7 +8740,7 @@ int __init btrfs_init_cachep(void) { btrfs_inode_cachep = kmem_cache_create("btrfs_inode", sizeof(struct btrfs_inode), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT, + SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, init_once); if (!btrfs_inode_cachep) goto fail; @@ -10180,7 +10204,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, cond_resched(); } - em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1); + em = btrfs_get_extent(inode, NULL, start, lockend - start + 1); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out_unlock_extent; @@ -10760,7 +10784,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, struct btrfs_block_group *bg; u64 len = isize - start; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); + em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8851ba7a1e..0493272a76 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -34,11 +34,9 @@ #include "export.h" #include "transaction.h" #include "btrfs_inode.h" -#include "print-tree.h" #include "volumes.h" #include "locking.h" #include "backref.h" -#include "rcu-string.h" #include "send.h" #include "dev-replace.h" #include "props.h" @@ -47,9 +45,7 @@ #include "tree-log.h" #include "compression.h" #include "space-info.h" -#include "delalloc-space.h" #include "block-group.h" -#include "subpage.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" @@ -231,6 +227,20 @@ static int check_fsflags_compatible(struct btrfs_fs_info *fs_info, return 0; } +int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args) +{ + if (memchr(vol_args->name, 0, sizeof(vol_args->name)) == NULL) + return -ENAMETOOLONG; + return 0; +} + +static int btrfs_check_ioctl_vol_args2_subvol_name(const struct btrfs_ioctl_vol_args_v2 *vol_args2) +{ + if (memchr(vol_args2->name, 0, sizeof(vol_args2->name)) == NULL) + return -ENAMETOOLONG; + return 0; +} + /* * Set flags/xflags from the internal inode flags. The remaining items of * fsxattr are zeroed. @@ -929,7 +939,9 @@ static int btrfs_may_delete(struct mnt_idmap *idmap, if (d_really_is_negative(victim)) return -ENOENT; - BUG_ON(d_inode(victim->d_parent) != dir); + /* The @victim is not inside @dir. */ + if (d_inode(victim->d_parent) != dir) + return -EINVAL; audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); @@ -1147,7 +1159,10 @@ static noinline int btrfs_ioctl_resize(struct file *file, ret = PTR_ERR(vol_args); goto out_drop; } - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args_path(vol_args); + if (ret < 0) + goto out_free; + sizestr = vol_args->name; cancel = (strcmp("cancel", sizestr) == 0); ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_RESIZE, cancel); @@ -1347,12 +1362,15 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args_path(vol_args); + if (ret < 0) + goto out; ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file), vol_args->name, vol_args->fd, subvol, false, NULL); +out: kfree(vol_args); return ret; } @@ -1371,7 +1389,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); - vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args); + if (ret < 0) + goto free_args; if (vol_args->flags & ~BTRFS_SUBVOL_CREATE_ARGS_MASK) { ret = -EOPNOTSUPP; @@ -2391,7 +2411,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, * name, same as v1 currently does. */ if (!(vol_args2->flags & BTRFS_SUBVOL_SPEC_BY_ID)) { - vol_args2->name[BTRFS_SUBVOL_NAME_MAX] = 0; + err = btrfs_check_ioctl_vol_args2_subvol_name(vol_args2); + if (err < 0) + goto out; subvol_name = vol_args2->name; err = mnt_want_write_file(file); @@ -2475,7 +2497,10 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (IS_ERR(vol_args)) return PTR_ERR(vol_args); - vol_args->name[BTRFS_PATH_NAME_MAX] = 0; + err = btrfs_check_ioctl_vol_args_path(vol_args); + if (err < 0) + goto out; + subvol_name = vol_args->name; err = mnt_want_write_file(file); @@ -2686,12 +2711,16 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) goto out; } - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args_path(vol_args); + if (ret < 0) + goto out_free; + ret = btrfs_init_new_device(fs_info, vol_args->name); if (!ret) btrfs_info(fs_info, "disk added %s", vol_args->name); +out_free: kfree(vol_args); out: if (restore_op) @@ -2707,7 +2736,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_ioctl_vol_args_v2 *vol_args; - struct bdev_handle *bdev_handle = NULL; + struct file *bdev_file = NULL; int ret; bool cancel = false; @@ -2723,7 +2752,10 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) goto out; } - vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args); + if (ret < 0) + goto out; + if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) { args.devid = vol_args->devid; } else if (!strcmp("cancel", vol_args->name)) { @@ -2744,7 +2776,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) goto err_drop; /* Exclusive operation is now claimed */ - ret = btrfs_rm_device(fs_info, &args, &bdev_handle); + ret = btrfs_rm_device(fs_info, &args, &bdev_file); btrfs_exclop_finish(fs_info); @@ -2758,8 +2790,8 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) } err_drop: mnt_drop_write_file(file); - if (bdev_handle) - bdev_release(bdev_handle); + if (bdev_file) + fput(bdev_file); out: btrfs_put_dev_args_from_path(&args); kfree(vol_args); @@ -2772,7 +2804,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_ioctl_vol_args *vol_args; - struct bdev_handle *bdev_handle = NULL; + struct file *bdev_file = NULL; int ret; bool cancel = false; @@ -2783,7 +2815,10 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) if (IS_ERR(vol_args)) return PTR_ERR(vol_args); - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args_path(vol_args); + if (ret < 0) + goto out_free; + if (!strcmp("cancel", vol_args->name)) { cancel = true; } else { @@ -2799,17 +2834,18 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, cancel); if (ret == 0) { - ret = btrfs_rm_device(fs_info, &args, &bdev_handle); + ret = btrfs_rm_device(fs_info, &args, &bdev_file); if (!ret) btrfs_info(fs_info, "disk deleted %s", vol_args->name); btrfs_exclop_finish(fs_info); } mnt_drop_write_file(file); - if (bdev_handle) - bdev_release(bdev_handle); + if (bdev_file) + fput(bdev_file); out: btrfs_put_dev_args_from_path(&args); +out_free: kfree(vol_args); return ret; } diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index d51b9a2f2f..2c5dc25ec6 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -3,6 +3,15 @@ #ifndef BTRFS_IOCTL_H #define BTRFS_IOCTL_H +#include <linux/types.h> + +struct file; +struct dentry; +struct mnt_idmap; +struct fileattr; +struct btrfs_fs_info; +struct btrfs_ioctl_balance_args; + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 74d8e2003f..99ccab86bb 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -13,7 +13,6 @@ #include "ctree.h" #include "extent_io.h" #include "locking.h" -#include "accessors.h" /* * Lockdep class keys for extent_buffer->lock's in this root. For a given @@ -85,7 +84,7 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int { struct btrfs_lockdep_keyset *ks; - BUG_ON(level >= ARRAY_SIZE(ks->keys)); + ASSERT(level < ARRAY_SIZE(ks->keys)); /* Find the matching keyset, id 0 is the default entry */ for (ks = btrfs_lockdep_keysets; ks->id; ks++) diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 7d6ee1e609..9576f485a3 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -8,8 +8,14 @@ #include <linux/atomic.h> #include <linux/wait.h> +#include <linux/lockdep.h> #include <linux/percpu_counter.h> #include "extent_io.h" +#include "locking.h" + +struct extent_buffer; +struct btrfs_path; +struct btrfs_root; #define BTRFS_WRITE_LOCK 1 #define BTRFS_READ_LOCK 2 @@ -157,8 +163,6 @@ enum btrfs_lockdep_trans_states { static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES, "too many lock subclasses defined"); -struct btrfs_path; - void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest); void btrfs_tree_lock(struct extent_buffer *eb); void btrfs_tree_unlock(struct extent_buffer *eb); diff --git a/fs/btrfs/lru_cache.h b/fs/btrfs/lru_cache.h index 00328c856b..e32906ab6f 100644 --- a/fs/btrfs/lru_cache.h +++ b/fs/btrfs/lru_cache.h @@ -3,8 +3,10 @@ #ifndef BTRFS_LRU_CACHE_H #define BTRFS_LRU_CACHE_H +#include <linux/types.h> #include <linux/maple_tree.h> #include <linux/list.h> +#include "lru_cache.h" /* * A cache entry. This is meant to be embedded in a structure of a user of @@ -50,11 +52,6 @@ struct btrfs_lru_cache { #define btrfs_lru_cache_for_each_entry_safe(cache, entry, tmp) \ list_for_each_entry_safe_reverse((entry), (tmp), &(cache)->lru_list, lru_list) -static inline unsigned int btrfs_lru_cache_size(const struct btrfs_lru_cache *cache) -{ - return cache->size; -} - static inline struct btrfs_lru_cache_entry *btrfs_lru_cache_lru_entry( struct btrfs_lru_cache *cache) { diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index cdada48658..210d9c82e2 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -3,13 +3,11 @@ #include "fs.h" #include "messages.h" #include "discard.h" -#include "transaction.h" -#include "space-info.h" #include "super.h" #ifdef CONFIG_PRINTK -#define STATE_STRING_PREFACE ": state " +#define STATE_STRING_PREFACE " state " #define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT + 1) /* diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index 40f2d9f1a1..dde4904aea 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -3,6 +3,8 @@ #ifndef BTRFS_MISC_H #define BTRFS_MISC_H +#include <linux/types.h> +#include <linux/bitmap.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/math64.h> diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 81f67ebf74..c2a42bcde9 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -19,7 +19,6 @@ #include "qgroup.h" #include "subpage.h" #include "file.h" -#include "super.h" static struct kmem_cache *btrfs_ordered_extent_cache; @@ -1237,10 +1236,7 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( int __init ordered_data_init(void) { - btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", - sizeof(struct btrfs_ordered_extent), 0, - SLAB_MEM_SPREAD, - NULL); + btrfs_ordered_extent_cache = KMEM_CACHE(btrfs_ordered_extent, 0); if (!btrfs_ordered_extent_cache) return -ENOMEM; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 127ef8bf0f..34413fc5b4 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -6,6 +6,21 @@ #ifndef BTRFS_ORDERED_DATA_H #define BTRFS_ORDERED_DATA_H +#include <linux/types.h> +#include <linux/list.h> +#include <linux/refcount.h> +#include <linux/completion.h> +#include <linux/rbtree.h> +#include <linux/wait.h> +#include "async-thread.h" + +struct inode; +struct page; +struct extent_state; +struct btrfs_inode; +struct btrfs_root; +struct btrfs_fs_info; + struct btrfs_ordered_sum { /* * Logical start address and length for of the blocks covered by diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c index 7a1b021b56..6195a2215b 100644 --- a/fs/btrfs/orphan.c +++ b/fs/btrfs/orphan.c @@ -4,7 +4,6 @@ */ #include "ctree.h" -#include "disk-io.h" #include "orphan.h" int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/orphan.h b/fs/btrfs/orphan.h index 3faab5cbb5..aa54a88a60 100644 --- a/fs/btrfs/orphan.h +++ b/fs/btrfs/orphan.h @@ -3,6 +3,11 @@ #ifndef BTRFS_ORPHAN_H #define BTRFS_ORPHAN_H +#include <linux/types.h> + +struct btrfs_trans_handle; +struct btrfs_root; + int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h index c42bc666d5..8504bf1702 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -9,6 +9,9 @@ /* Buffer size to contain tree name and possibly additional data (offset) */ #define BTRFS_ROOT_NAME_BUF_LEN 48 +struct extent_buffer; +struct btrfs_key; + void btrfs_print_leaf(const struct extent_buffer *l); void btrfs_print_tree(const struct extent_buffer *c, bool follow); const char *btrfs_root_name(const struct btrfs_key *key, char *buf); diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index ac4a0af2b5..2a9b7b029e 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -4,6 +4,7 @@ */ #include <linux/hashtable.h> +#include <linux/xattr.h> #include "messages.h" #include "props.h" #include "btrfs_inode.h" diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h index 6e283196e3..f60cd89feb 100644 --- a/fs/btrfs/props.h +++ b/fs/btrfs/props.h @@ -6,7 +6,12 @@ #ifndef BTRFS_PROPS_H #define BTRFS_PROPS_H -#include "ctree.h" +#include <linux/compiler_types.h> + +struct inode; +struct btrfs_inode; +struct btrfs_path; +struct btrfs_trans_handle; int __init btrfs_props_init(void); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index cacc12d0ff..1167899a16 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -468,6 +468,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) } if (!qgroup) { struct btrfs_qgroup *prealloc; + struct btrfs_root *tree_root = fs_info->tree_root; prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL); if (!prealloc) { @@ -475,6 +476,25 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) goto out; } qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset); + /* + * If a qgroup exists for a subvolume ID, it is possible + * that subvolume has been deleted, in which case + * re-using that ID would lead to incorrect accounting. + * + * Ensure that we skip any such subvol ids. + * + * We don't need to lock because this is only called + * during mount before we start doing things like creating + * subvolumes. + */ + if (is_fstree(qgroup->qgroupid) && + qgroup->qgroupid > tree_root->free_objectid) + /* + * Don't need to check against BTRFS_LAST_FREE_OBJECTID, + * as it will get checked on the next call to + * btrfs_get_free_objectid. + */ + tree_root->free_objectid = qgroup->qgroupid + 1; } ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); if (ret < 0) @@ -1324,7 +1344,7 @@ static int flush_reservations(struct btrfs_fs_info *fs_info) trans = btrfs_join_transaction(fs_info->tree_root); if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_commit_transaction(trans); + ret = btrfs_commit_transaction(trans); return ret; } @@ -2500,8 +2520,8 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *eb = root_eb; struct btrfs_path *path = NULL; - BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL); - BUG_ON(root_eb == NULL); + ASSERT(0 <= root_level && root_level < BTRFS_MAX_LEVEL); + ASSERT(root_eb != NULL); if (!btrfs_qgroup_full_accounting(fs_info)) return 0; @@ -2856,8 +2876,6 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, if (nr_old_roots == 0 && nr_new_roots == 0) goto out_free; - BUG_ON(!fs_info->quota_root); - trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr, num_bytes, nr_old_roots, nr_new_roots); @@ -3131,13 +3149,69 @@ static int qgroup_auto_inherit(struct btrfs_fs_info *fs_info, qgids = res->qgroups; list_for_each_entry(qg_list, &inode_qg->groups, next_group) - qgids[i] = qg_list->group->qgroupid; + qgids[i++] = qg_list->group->qgroupid; *inherit = res; return 0; } /* + * Check if we can skip rescan when inheriting qgroups. If @src has a single + * @parent, and that @parent is owning all its bytes exclusively, we can skip + * the full rescan, by just adding nodesize to the @parent's excl/rfer. + * + * Return <0 for fatal errors (like srcid/parentid has no qgroup). + * Return 0 if a quick inherit is done. + * Return >0 if a quick inherit is not possible, and a full rescan is needed. + */ +static int qgroup_snapshot_quick_inherit(struct btrfs_fs_info *fs_info, + u64 srcid, u64 parentid) +{ + struct btrfs_qgroup *src; + struct btrfs_qgroup *parent; + struct btrfs_qgroup_list *list; + int nr_parents = 0; + + src = find_qgroup_rb(fs_info, srcid); + if (!src) + return -ENOENT; + parent = find_qgroup_rb(fs_info, parentid); + if (!parent) + return -ENOENT; + + /* + * Source has no parent qgroup, but our new qgroup would have one. + * Qgroup numbers would become inconsistent. + */ + if (list_empty(&src->groups)) + return 1; + + list_for_each_entry(list, &src->groups, next_group) { + /* The parent is not the same, quick update is not possible. */ + if (list->group->qgroupid != parentid) + return 1; + nr_parents++; + /* + * More than one parent qgroup, we can't be sure about accounting + * consistency. + */ + if (nr_parents > 1) + return 1; + } + + /* + * The parent is not exclusively owning all its bytes. We're not sure + * if the source has any bytes not fully owned by the parent. + */ + if (parent->excl != parent->rfer) + return 1; + + parent->excl += fs_info->nodesize; + parent->rfer += fs_info->nodesize; + return 0; +} + +/* * Copy the accounting information between qgroups. This is necessary * when a snapshot or a subvolume is created. Throwing an error will * cause a transaction abort so we take extra care here to only error @@ -3305,6 +3379,13 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, qgroup_dirty(fs_info, dstgroup); qgroup_dirty(fs_info, srcgroup); + + /* + * If the source qgroup has parent but the new one doesn't, + * we need a full rescan. + */ + if (!inherit && !list_empty(&srcgroup->groups)) + need_rescan = true; } if (!inherit) @@ -3319,14 +3400,16 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, if (ret) goto unlock; } + if (srcid) { + /* Check if we can do a quick inherit. */ + ret = qgroup_snapshot_quick_inherit(fs_info, srcid, *i_qgroups); + if (ret < 0) + goto unlock; + if (ret > 0) + need_rescan = true; + ret = 0; + } ++i_qgroups; - - /* - * If we're doing a snapshot, and adding the snapshot to a new - * qgroup, the numbers are guaranteed to be incorrect. - */ - if (srcid) - need_rescan = true; } for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) { @@ -3763,14 +3846,14 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, /* we're resuming qgroup rescan at mount time */ if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)) { - btrfs_warn(fs_info, + btrfs_debug(fs_info, "qgroup rescan init failed, qgroup rescan is not queued"); ret = -EINVAL; } else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) { - btrfs_warn(fs_info, + btrfs_debug(fs_info, "qgroup rescan init failed, qgroup is not enabled"); - ret = -EINVAL; + ret = -ENOTCONN; } if (ret) @@ -3781,14 +3864,12 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, if (init_flags) { if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { - btrfs_warn(fs_info, - "qgroup rescan is already in progress"); ret = -EINPROGRESS; } else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) { - btrfs_warn(fs_info, + btrfs_debug(fs_info, "qgroup rescan init failed, qgroup is not enabled"); - ret = -EINVAL; + ret = -ENOTCONN; } else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) { /* Quota disable is in progress */ ret = -EBUSY; diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 45a7d8920d..706640be0e 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -6,12 +6,22 @@ #ifndef BTRFS_QGROUP_H #define BTRFS_QGROUP_H +#include <linux/types.h> #include <linux/spinlock.h> #include <linux/rbtree.h> #include <linux/kobject.h> -#include "ulist.h" -#include "delayed-ref.h" -#include "misc.h" +#include <linux/list.h> +#include <uapi/linux/btrfs_tree.h> + +struct extent_buffer; +struct extent_changeset; +struct btrfs_delayed_extent_op; +struct btrfs_fs_info; +struct btrfs_root; +struct btrfs_ioctl_quota_ctl_args; +struct btrfs_trans_handle; +struct btrfs_delayed_ref_root; +struct btrfs_inode; /* * Btrfs qgroup overview @@ -321,7 +331,6 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, struct btrfs_qgroup_limit *limit); int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); -struct btrfs_delayed_extent_op; int btrfs_qgroup_trace_extent_nolock( struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 9589362acf..6af6b4b9a3 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -11,7 +11,6 @@ #include "disk-io.h" #include "raid-stripe-tree.h" #include "volumes.h" -#include "misc.h" #include "print-tree.h" int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length) diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h index cdb58b38fc..c9c258f849 100644 --- a/fs/btrfs/raid-stripe-tree.h +++ b/fs/btrfs/raid-stripe-tree.h @@ -6,6 +6,10 @@ #ifndef BTRFS_RAID_STRIPE_TREE_H #define BTRFS_RAID_STRIPE_TREE_H +#include <linux/types.h> +#include <uapi/linux/btrfs_tree.h> +#include "fs.h" + #define BTRFS_RST_SUPP_BLOCK_GROUP_MASK (BTRFS_BLOCK_GROUP_DUP | \ BTRFS_BLOCK_GROUP_RAID1_MASK | \ BTRFS_BLOCK_GROUP_RAID0 | \ @@ -13,6 +17,7 @@ struct btrfs_io_context; struct btrfs_io_stripe; +struct btrfs_fs_info; struct btrfs_ordered_extent; struct btrfs_trans_handle; diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 792c8e17c3..6f4a9cfeea 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -14,7 +14,6 @@ #include <linux/raid/xor.h> #include <linux/mm.h> #include "messages.h" -#include "misc.h" #include "ctree.h" #include "disk-io.h" #include "volumes.h" @@ -918,6 +917,13 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, */ ASSERT(stripe_nsectors <= BITS_PER_LONG); + /* + * Real stripes must be between 2 (2 disks RAID5, aka RAID1) and 256 + * (limited by u8). + */ + ASSERT(real_stripes >= 2); + ASSERT(real_stripes <= U8_MAX); + rbio = kzalloc(sizeof(*rbio), GFP_NOFS); if (!rbio) return ERR_PTR(-ENOMEM); @@ -955,6 +961,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, ASSERT(btrfs_nr_parity_stripes(bioc->map_type)); rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type); + ASSERT(rbio->nr_data > 0); return rbio; } @@ -1181,6 +1188,26 @@ static inline void bio_list_put(struct bio_list *bio_list) bio_put(bio); } +static void assert_rbio(struct btrfs_raid_bio *rbio) +{ + if (!IS_ENABLED(CONFIG_BTRFS_DEBUG) || + !IS_ENABLED(CONFIG_BTRFS_ASSERT)) + return; + + /* + * At least two stripes (2 disks RAID5), and since real_stripes is U8, + * we won't go beyond 256 disks anyway. + */ + ASSERT(rbio->real_stripes >= 2); + ASSERT(rbio->nr_data > 0); + + /* + * This is another check to make sure nr data stripes is smaller + * than total stripes. + */ + ASSERT(rbio->nr_data < rbio->real_stripes); +} + /* Generate PQ for one vertical stripe. */ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) { @@ -1212,6 +1239,7 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; + assert_rbio(rbio); raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, pointers); } else { @@ -2473,6 +2501,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) } if (has_qstripe) { + assert_rbio(rbio); /* RAID6, call the library function to fill in our P/Q */ raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, pointers); diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 4702136888..0d7b4c2fb6 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -7,9 +7,18 @@ #ifndef BTRFS_RAID56_H #define BTRFS_RAID56_H +#include <linux/types.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/bio.h> +#include <linux/refcount.h> #include <linux/workqueue.h> #include "volumes.h" +struct page; +struct sector_ptr; +struct btrfs_fs_info; + enum btrfs_rbio_ops { BTRFS_RBIO_WRITE, BTRFS_RBIO_READ_REBUILD, diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h index 5c2b66d155..1c2d7cb1fe 100644 --- a/fs/btrfs/rcu-string.h +++ b/fs/btrfs/rcu-string.h @@ -6,6 +6,12 @@ #ifndef BTRFS_RCU_STRING_H #define BTRFS_RCU_STRING_H +#include <linux/types.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/rcupdate.h> +#include <linux/printk.h> + struct rcu_string { struct rcu_head rcu; char str[]; diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h index 855de37719..3511e1a5c9 100644 --- a/fs/btrfs/ref-verify.h +++ b/fs/btrfs/ref-verify.h @@ -6,7 +6,16 @@ #ifndef BTRFS_REF_VERIFY_H #define BTRFS_REF_VERIFY_H +#include <linux/types.h> +#include <linux/rbtree_types.h> + +struct btrfs_fs_info; +struct btrfs_ref; + #ifdef CONFIG_BTRFS_FS_REF_VERIFY + +#include <linux/spinlock.h> + int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info); void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info); int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/reflink.h b/fs/btrfs/reflink.h index ecb309b4da..1e291f7d85 100644 --- a/fs/btrfs/reflink.h +++ b/fs/btrfs/reflink.h @@ -3,7 +3,9 @@ #ifndef BTRFS_REFLINK_H #define BTRFS_REFLINK_H -#include <linux/fs.h> +#include <linux/types.h> + +struct file; loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 2fca67f2b3..f96f267fb4 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -523,7 +523,8 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree( if (handle_useless_nodes(rc, node)) node = NULL; out: - btrfs_backref_iter_free(iter); + btrfs_free_path(iter->path); + kfree(iter); btrfs_free_path(path); if (err) { btrfs_backref_error_cleanup(cache, node); diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h index 5fb60f2deb..788c86d863 100644 --- a/fs/btrfs/relocation.h +++ b/fs/btrfs/relocation.h @@ -3,6 +3,15 @@ #ifndef BTRFS_RELOCATION_H #define BTRFS_RELOCATION_H +#include <linux/types.h> + +struct extent_buffer; +struct btrfs_fs_info; +struct btrfs_root; +struct btrfs_trans_handle; +struct btrfs_ordered_extent; +struct btrfs_pending_snapshot; + int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start); int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 5260677ad5..7007f9e0c9 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -10,7 +10,6 @@ #include "messages.h" #include "transaction.h" #include "disk-io.h" -#include "print-tree.h" #include "qgroup.h" #include "space-info.h" #include "accessors.h" @@ -82,7 +81,14 @@ int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key, if (ret > 0) goto out; } else { - BUG_ON(ret == 0); /* Logical error */ + /* + * Key with offset -1 found, there would have to exist a root + * with such id, but this is out of the valid range. + */ + if (ret == 0) { + ret = -EUCLEAN; + goto out; + } if (path->slots[0] == 0) goto out; path->slots[0]--; @@ -323,8 +329,11 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, key, path, -1, 1); if (ret < 0) goto out; - - BUG_ON(ret != 0); + if (ret != 0) { + /* The root must exist but we did not find it by the key. */ + ret = -EUCLEAN; + goto out; + } ret = btrfs_del_item(trans, root, path); out: diff --git a/fs/btrfs/root-tree.h b/fs/btrfs/root-tree.h index de0175cbdb..8f5739e732 100644 --- a/fs/btrfs/root-tree.h +++ b/fs/btrfs/root-tree.h @@ -3,7 +3,17 @@ #ifndef BTRFS_ROOT_TREE_H #define BTRFS_ROOT_TREE_H +#include <linux/types.h> + struct fscrypt_str; +struct extent_buffer; +struct btrfs_key; +struct btrfs_root; +struct btrfs_root_item; +struct btrfs_path; +struct btrfs_fs_info; +struct btrfs_block_rsv; +struct btrfs_trans_handle; int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index d89d5c0a8c..4b22cfe9a9 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1390,8 +1390,15 @@ static int find_first_extent_item(struct btrfs_root *extent_root, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; + if (ret == 0) { + /* + * Key with offset -1 found, there would have to exist an extent + * item with such offset, but this is out of the valid range. + */ + btrfs_release_path(path); + return -EUCLEAN; + } - ASSERT(ret > 0); /* * Here we intentionally pass 0 as @min_objectid, as there could be * an extent item starting before @search_start. diff --git a/fs/btrfs/scrub.h b/fs/btrfs/scrub.h index 7639103ebf..f0df597b75 100644 --- a/fs/btrfs/scrub.h +++ b/fs/btrfs/scrub.h @@ -3,6 +3,12 @@ #ifndef BTRFS_SCRUB_H #define BTRFS_SCRUB_H +#include <linux/types.h> + +struct btrfs_fs_info; +struct btrfs_device; +struct btrfs_scrub_progress; + int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, int readonly, int is_dev_replace); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index e818766915..50b4a76ac8 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -25,7 +25,6 @@ #include "btrfs_inode.h" #include "transaction.h" #include "compression.h" -#include "xattr.h" #include "print-tree.h" #include "accessors.h" #include "dir-item.h" @@ -777,7 +776,12 @@ static int begin_cmd(struct send_ctx *sctx, int cmd) if (WARN_ON(!sctx->send_buf)) return -EINVAL; - BUG_ON(sctx->send_size); + if (unlikely(sctx->send_size != 0)) { + btrfs_err(sctx->send_root->fs_info, + "send: command header buffer not empty cmd %d offset %llu", + cmd, sctx->send_off); + return -EINVAL; + } sctx->send_size += sizeof(*hdr); hdr = (struct btrfs_cmd_header *)sctx->send_buf; @@ -1414,7 +1418,7 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, struct btrfs_lru_cache_entry *raw_entry; struct backref_cache_entry *entry; - if (btrfs_lru_cache_size(&sctx->backref_cache) == 0) + if (sctx->backref_cache.size == 0) return false; /* @@ -1512,7 +1516,7 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, * transaction handle or holding fs_info->commit_root_sem, so no need * to take any lock here. */ - if (btrfs_lru_cache_size(&sctx->backref_cache) == 1) + if (sctx->backref_cache.size == 1) sctx->backref_cache_last_reloc_trans = fs_info->last_reloc_trans; } @@ -2817,8 +2821,7 @@ static int cache_dir_utimes(struct send_ctx *sctx, u64 dir, u64 gen) static int trim_dir_utimes_cache(struct send_ctx *sctx) { - while (btrfs_lru_cache_size(&sctx->dir_utimes_cache) > - SEND_MAX_DIR_UTIMES_CACHE_SIZE) { + while (sctx->dir_utimes_cache.size > SEND_MAX_DIR_UTIMES_CACHE_SIZE) { struct btrfs_lru_cache_entry *lru; int ret; @@ -4190,7 +4193,13 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * This should never happen as the root dir always has the same ref * which is always '..' */ - BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID); + if (unlikely(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID)) { + btrfs_err(fs_info, + "send: unexpected inode %llu in process_recorded_refs()", + sctx->cur_ino); + ret = -EINVAL; + goto out; + } valid_path = fs_path_alloc(); if (!valid_path) { @@ -6466,21 +6475,18 @@ static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, if (sctx->cur_ino != key->objectid || !need_send_hole(sctx)) return 0; - if (sctx->cur_inode_last_extent == (u64)-1) { - ret = get_last_extent(sctx, key->offset - 1); - if (ret) - return ret; - } - - if (path->slots[0] == 0 && - sctx->cur_inode_last_extent < key->offset) { - /* - * We might have skipped entire leafs that contained only - * file extent items for our current inode. These leafs have - * a generation number smaller (older) than the one in the - * current leaf and the leaf our last extent came from, and - * are located between these 2 leafs. - */ + /* + * Get last extent's end offset (exclusive) if we haven't determined it + * yet (we're processing the first file extent item that is new), or if + * we're at the first slot of a leaf and the last extent's end is less + * than the current extent's offset, because we might have skipped + * entire leaves that contained only file extent items for our current + * inode. These leaves have a generation number smaller (older) than the + * one in the current leaf and the leaf our last extent came from, and + * are located between these 2 leaves. + */ + if ((sctx->cur_inode_last_extent == (u64)-1) || + (path->slots[0] == 0 && sctx->cur_inode_last_extent < key->offset)) { ret = get_last_extent(sctx, key->offset - 1); if (ret) return ret; @@ -7437,8 +7443,8 @@ static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen u64 reada_done = 0; lockdep_assert_held_read(&parent->fs_info->commit_root_sem); + ASSERT(*level != 0); - BUG_ON(*level == 0); eb = btrfs_read_node_slot(parent, slot); if (IS_ERR(eb)) return PTR_ERR(eb); diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 4f5509cb18..dd1c9f02b0 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -8,6 +8,11 @@ #define BTRFS_SEND_H #include <linux/types.h> +#include <linux/sizes.h> +#include <linux/align.h> + +struct inode; +struct btrfs_ioctl_send_args; #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" /* Conditional support for the upcoming protocol version. */ @@ -25,9 +30,6 @@ #define BTRFS_SEND_BUF_SIZE_V1 SZ_64K #define BTRFS_SEND_BUF_SIZE_V2 ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE) -struct inode; -struct btrfs_ioctl_send_args; - enum btrfs_tlv_type { BTRFS_TLV_U8, BTRFS_TLV_U16, diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 3b54eb5834..d620323d08 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -9,7 +9,6 @@ #include "ordered-data.h" #include "transaction.h" #include "block-group.h" -#include "zoned.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 92c595fed1..a733458fd1 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -4,8 +4,17 @@ #define BTRFS_SPACE_INFO_H #include <trace/events/btrfs.h> +#include <linux/spinlock.h> +#include <linux/list.h> +#include <linux/kobject.h> +#include <linux/lockdep.h> +#include <linux/wait.h> +#include <linux/rwsem.h> #include "volumes.h" +struct btrfs_fs_info; +struct btrfs_block_group; + /* * Different levels for to flush space when doing space reservations. * diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 0e49dab8da..54736f6238 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -111,6 +111,9 @@ void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sector subpage_info->checked_offset = cur; cur += nr_bits; + subpage_info->locked_offset = cur; + cur += nr_bits; + subpage_info->total_nr_bits = cur; } @@ -237,28 +240,58 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, start + len <= folio_pos(folio) + PAGE_SIZE); } +#define subpage_calc_start_bit(fs_info, folio, name, start, len) \ +({ \ + unsigned int start_bit; \ + \ + btrfs_subpage_assert(fs_info, folio, start, len); \ + start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \ + start_bit += fs_info->subpage_info->name##_offset; \ + start_bit; \ +}) + void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage = folio_get_private(folio); + const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); const int nbits = len >> fs_info->sectorsize_bits; + unsigned long flags; + btrfs_subpage_assert(fs_info, folio, start, len); + spin_lock_irqsave(&subpage->lock, flags); + /* + * Even though it's just for reading the page, no one should have + * locked the subpage range. + */ + ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); + bitmap_set(subpage->bitmaps, start_bit, nbits); atomic_add(nbits, &subpage->readers); + spin_unlock_irqrestore(&subpage->lock, flags); } void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage = folio_get_private(folio); + const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); const int nbits = len >> fs_info->sectorsize_bits; + unsigned long flags; bool is_data; bool last; btrfs_subpage_assert(fs_info, folio, start, len); is_data = is_data_inode(folio->mapping->host); + + spin_lock_irqsave(&subpage->lock, flags); + + /* The range should have already been locked. */ + ASSERT(bitmap_test_range_all_set(subpage->bitmaps, start_bit, nbits)); ASSERT(atomic_read(&subpage->readers) >= nbits); + + bitmap_clear(subpage->bitmaps, start_bit, nbits); last = atomic_sub_and_test(nbits, &subpage->readers); /* @@ -270,6 +303,7 @@ void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, */ if (is_data && last) folio_unlock(folio); + spin_unlock_irqrestore(&subpage->lock, flags); } static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) @@ -290,28 +324,38 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) orig_start + orig_len) - *start; } -void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) +static void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage = folio_get_private(folio); + const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); const int nbits = (len >> fs_info->sectorsize_bits); + unsigned long flags; int ret; btrfs_subpage_assert(fs_info, folio, start, len); + spin_lock_irqsave(&subpage->lock, flags); ASSERT(atomic_read(&subpage->readers) == 0); + ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); + bitmap_set(subpage->bitmaps, start_bit, nbits); ret = atomic_add_return(nbits, &subpage->writers); ASSERT(ret == nbits); + spin_unlock_irqrestore(&subpage->lock, flags); } -bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) +static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage = folio_get_private(folio); + const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); const int nbits = (len >> fs_info->sectorsize_bits); + unsigned long flags; + bool last; btrfs_subpage_assert(fs_info, folio, start, len); + spin_lock_irqsave(&subpage->lock, flags); /* * We have call sites passing @lock_page into * extent_clear_unlock_delalloc() for compression path. @@ -319,11 +363,18 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, * This @locked_page is locked by plain lock_page(), thus its * subpage::writers is 0. Handle them in a special way. */ - if (atomic_read(&subpage->writers) == 0) + if (atomic_read(&subpage->writers) == 0) { + spin_unlock_irqrestore(&subpage->lock, flags); return true; + } ASSERT(atomic_read(&subpage->writers) >= nbits); - return atomic_sub_and_test(nbits, &subpage->writers); + /* The target range should have been locked. */ + ASSERT(bitmap_test_range_all_set(subpage->bitmaps, start_bit, nbits)); + bitmap_clear(subpage->bitmaps, start_bit, nbits); + last = atomic_sub_and_test(nbits, &subpage->writers); + spin_unlock_irqrestore(&subpage->lock, flags); + return last; } /* @@ -365,16 +416,6 @@ void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, folio_unlock(folio); } -#define subpage_calc_start_bit(fs_info, folio, name, start, len) \ -({ \ - unsigned int start_bit; \ - \ - btrfs_subpage_assert(fs_info, folio, start, len); \ - start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \ - start_bit += fs_info->subpage_info->name##_offset; \ - start_bit; \ -}) - #define subpage_test_bitmap_all_set(fs_info, subpage, name) \ bitmap_test_range_all_set(subpage->bitmaps, \ fs_info->subpage_info->name##_offset, \ @@ -751,6 +792,7 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, GET_SUBPAGE_BITMAP(subpage, subpage_info, writeback, &writeback_bitmap); GET_SUBPAGE_BITMAP(subpage, subpage_info, ordered, &ordered_bitmap); GET_SUBPAGE_BITMAP(subpage, subpage_info, checked, &checked_bitmap); + GET_SUBPAGE_BITMAP(subpage, subpage_info, locked, &checked_bitmap); spin_unlock_irqrestore(&subpage->lock, flags); dump_page(folio_page(folio, 0), "btrfs subpage dump"); diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 793c2b314a..b6dc013b0f 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -4,6 +4,11 @@ #define BTRFS_SUBPAGE_H #include <linux/spinlock.h> +#include <linux/atomic.h> + +struct address_space; +struct folio; +struct btrfs_fs_info; /* * Extra info for subpapge bitmap. @@ -28,7 +33,7 @@ struct btrfs_subpage_info { unsigned int total_nr_bits; /* - * *_start indicates where the bitmap starts, the length is always + * *_offset indicates where the bitmap starts, the length is always * @bitmap_size, which is calculated from PAGE_SIZE / sectorsize. */ unsigned int uptodate_offset; @@ -36,6 +41,16 @@ struct btrfs_subpage_info { unsigned int writeback_offset; unsigned int ordered_offset; unsigned int checked_offset; + + /* + * For locked bitmaps, normally it's subpage representation for folio + * Locked flag, but metadata is different: + * + * - Metadata doesn't really lock the folio + * It's just to prevent page::private get cleared before the last + * end_page_read(). + */ + unsigned int locked_offset; }; /* @@ -93,10 +108,6 @@ void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); -void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); -bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index c45fdaf24c..fa6964de34 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -34,13 +34,11 @@ #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "print-tree.h" #include "props.h" #include "xattr.h" #include "bio.h" #include "export.h" #include "compression.h" -#include "rcu-string.h" #include "dev-replace.h" #include "free-space-cache.h" #include "backref.h" @@ -121,6 +119,7 @@ enum { Opt_thread_pool, Opt_treelog, Opt_user_subvol_rm_allowed, + Opt_norecovery, /* Rescue options */ Opt_rescue, @@ -247,6 +246,8 @@ static const struct fs_parameter_spec btrfs_fs_parameters[] = { __fsparam(NULL, "nologreplay", Opt_nologreplay, fs_param_deprecated, NULL), /* Deprecated, with alias rescue=usebackuproot */ __fsparam(NULL, "usebackuproot", Opt_usebackuproot, fs_param_deprecated, NULL), + /* For compatibility only, alias for "rescue=nologreplay". */ + fsparam_flag("norecovery", Opt_norecovery), /* Debugging options. */ fsparam_flag_no("enospc_debug", Opt_enospc_debug), @@ -440,6 +441,11 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) "'nologreplay' is deprecated, use 'rescue=nologreplay' instead"); btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY); break; + case Opt_norecovery: + btrfs_info(NULL, +"'norecovery' is for compatibility only, recommended to use 'rescue=nologreplay'"); + btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY); + break; case Opt_flushoncommit: if (result.negated) btrfs_clear_opt(ctx->mount_opt, FLUSHONCOMMIT); @@ -2203,7 +2209,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, vol = memdup_user((void __user *)arg, sizeof(*vol)); if (IS_ERR(vol)) return PTR_ERR(vol); - vol->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_check_ioctl_vol_args_path(vol); + if (ret < 0) + goto out; switch (cmd) { case BTRFS_IOC_SCAN_DEV: @@ -2245,6 +2253,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, break; } +out: kfree(vol); return ret; } diff --git a/fs/btrfs/super.h b/fs/btrfs/super.h index f18253ca28..cbcab434b5 100644 --- a/fs/btrfs/super.h +++ b/fs/btrfs/super.h @@ -3,6 +3,13 @@ #ifndef BTRFS_SUPER_H #define BTRFS_SUPER_H +#include <linux/types.h> +#include <linux/fs.h> +#include "fs.h" + +struct super_block; +struct btrfs_fs_info; + bool btrfs_check_options(struct btrfs_fs_info *info, unsigned long *mount_opt, unsigned long flags); int btrfs_sync_fs(struct super_block *sb, int wait); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 84c05246ff..c6387a8ddb 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -421,7 +421,7 @@ BTRFS_ATTR(static_feature, supported_sectorsizes, static ssize_t acl_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { - return sysfs_emit(buf, "%d\n", !!IS_ENABLED(CONFIG_BTRFS_FS_POSIX_ACL)); + return sysfs_emit(buf, "%d\n", IS_ENABLED(CONFIG_BTRFS_FS_POSIX_ACL)); } BTRFS_ATTR(static_feature, acl, acl_show); @@ -1228,11 +1228,12 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); + const enum btrfs_read_policy policy = READ_ONCE(fs_devices->read_policy); ssize_t ret = 0; int i; for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { - if (fs_devices->read_policy == i) + if (policy == i) ret += sysfs_emit_at(buf, ret, "%s[%s]", (ret == 0 ? "" : " "), btrfs_read_policy_name[i]); @@ -1256,8 +1257,8 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { if (sysfs_streq(buf, btrfs_read_policy_name[i])) { - if (i != fs_devices->read_policy) { - fs_devices->read_policy = i; + if (i != READ_ONCE(fs_devices->read_policy)) { + WRITE_ONCE(fs_devices->read_policy, i); btrfs_info(fs_devices->fs_info, "read policy set to '%s'", btrfs_read_policy_name[i]); @@ -1306,6 +1307,47 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj, BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show, btrfs_bg_reclaim_threshold_store); +#ifdef CONFIG_BTRFS_DEBUG +static ssize_t btrfs_offload_csum_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); + + switch (READ_ONCE(fs_devices->offload_csum_mode)) { + case BTRFS_OFFLOAD_CSUM_AUTO: + return sysfs_emit(buf, "auto\n"); + case BTRFS_OFFLOAD_CSUM_FORCE_ON: + return sysfs_emit(buf, "1\n"); + case BTRFS_OFFLOAD_CSUM_FORCE_OFF: + return sysfs_emit(buf, "0\n"); + default: + WARN_ON(1); + return -EINVAL; + } +} + +static ssize_t btrfs_offload_csum_store(struct kobject *kobj, + struct kobj_attribute *a, const char *buf, + size_t len) +{ + struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); + int ret; + bool val; + + ret = kstrtobool(buf, &val); + if (ret == 0) + WRITE_ONCE(fs_devices->offload_csum_mode, + val ? BTRFS_OFFLOAD_CSUM_FORCE_ON : BTRFS_OFFLOAD_CSUM_FORCE_OFF); + else if (ret == -EINVAL && sysfs_streq(buf, "auto")) + WRITE_ONCE(fs_devices->offload_csum_mode, BTRFS_OFFLOAD_CSUM_AUTO); + else + return -EINVAL; + + return len; +} +BTRFS_ATTR_RW(, offload_csum, btrfs_offload_csum_show, btrfs_offload_csum_store); +#endif + /* * Per-filesystem information and stats. * @@ -1325,6 +1367,9 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, bg_reclaim_threshold), BTRFS_ATTR_PTR(, commit_stats), BTRFS_ATTR_PTR(, temp_fsid), +#ifdef CONFIG_BTRFS_DEBUG + BTRFS_ATTR_PTR(, offload_csum), +#endif NULL, }; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 86c7eef128..e6a284c598 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -3,8 +3,17 @@ #ifndef BTRFS_SYSFS_H #define BTRFS_SYSFS_H +#include <linux/types.h> +#include <linux/compiler_types.h> #include <linux/kobject.h> +struct btrfs_fs_info; +struct btrfs_device; +struct btrfs_fs_devices; +struct btrfs_block_group; +struct btrfs_space_info; +struct btrfs_qgroup; + enum btrfs_feature_set { FEAT_COMPAT, FEAT_COMPAT_RO, diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 9957de9f78..99da9d34b7 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -258,7 +258,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) /* First with no extents */ BTRFS_I(inode)->root = root; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize); if (IS_ERR(em)) { em = NULL; test_err("got an error when we shouldn't have"); @@ -278,7 +278,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) */ setup_file_extents(root, sectorsize); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, (u64)-1); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -316,7 +316,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -339,7 +339,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Regular extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -367,7 +367,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* The next 3 are split extents */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -396,7 +396,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -418,7 +418,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -452,7 +452,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Prealloc extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -481,7 +481,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* The next 3 are a half written prealloc extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -511,7 +511,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -544,7 +544,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -579,7 +579,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Now for the compressed extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -613,7 +613,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Split compressed extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -648,7 +648,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -675,7 +675,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -710,7 +710,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* A hole between regular extents but no hole extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset + 6, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -737,7 +737,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, SZ_4M); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -770,7 +770,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -850,7 +850,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) insert_inode_item_key(root); insert_extent(root, sectorsize, sectorsize, sectorsize, 0, sectorsize, sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 2 * sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -872,7 +872,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) } free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize, 2 * sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, sectorsize, 2 * sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index b93cdf5f17..85f359e0e0 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -23,12 +23,10 @@ #include "qgroup.h" #include "block-group.h" #include "space-info.h" -#include "zoned.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" -#include "defrag.h" #include "dir-item.h" #include "uuid-tree.h" #include "ioctl.h" @@ -1958,19 +1956,6 @@ static void update_super_roots(struct btrfs_fs_info *fs_info) super->uuid_tree_generation = root_item->generation; } -int btrfs_transaction_in_commit(struct btrfs_fs_info *info) -{ - struct btrfs_transaction *trans; - int ret = 0; - - spin_lock(&info->trans_lock); - trans = info->running_transaction; - if (trans) - ret = (trans->state >= TRANS_STATE_COMMIT_START); - spin_unlock(&info->trans_lock); - return ret; -} - int btrfs_transaction_blocked(struct btrfs_fs_info *info) { struct btrfs_transaction *trans; @@ -2685,9 +2670,7 @@ void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, int __init btrfs_transaction_init(void) { - btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", - sizeof(struct btrfs_trans_handle), 0, - SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); + btrfs_trans_handle_cachep = KMEM_CACHE(btrfs_trans_handle, SLAB_TEMPORARY); if (!btrfs_trans_handle_cachep) return -ENOMEM; return 0; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 2bf8bbdfd0..4e451ab173 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -6,12 +6,27 @@ #ifndef BTRFS_TRANSACTION_H #define BTRFS_TRANSACTION_H +#include <linux/atomic.h> #include <linux/refcount.h> +#include <linux/list.h> +#include <linux/time64.h> +#include <linux/mutex.h> +#include <linux/wait.h> #include "btrfs_inode.h" #include "delayed-ref.h" -#include "ctree.h" +#include "extent-io-tree.h" +#include "block-rsv.h" +#include "messages.h" #include "misc.h" +struct dentry; +struct inode; +struct btrfs_pending_snapshot; +struct btrfs_fs_info; +struct btrfs_root_item; +struct btrfs_root; +struct btrfs_path; + /* Radix-tree tag for roots that are part of the trasaction. */ #define BTRFS_ROOT_TRANS_TAG 0 @@ -262,7 +277,6 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages, int mark); int btrfs_wait_tree_log_extents(struct btrfs_root *root, int mark); int btrfs_transaction_blocked(struct btrfs_fs_info *info); -int btrfs_transaction_in_commit(struct btrfs_fs_info *info); void btrfs_put_transaction(struct btrfs_transaction *transaction); void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 75c78a5556..32604e9b31 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -21,7 +21,6 @@ #include "messages.h" #include "ctree.h" #include "tree-checker.h" -#include "disk-io.h" #include "compression.h" #include "volumes.h" #include "misc.h" @@ -30,7 +29,6 @@ #include "file-item.h" #include "inode-item.h" #include "dir-item.h" -#include "raid-stripe-tree.h" #include "extent-tree.h" /* @@ -67,6 +65,7 @@ static void generic_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d, %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", @@ -94,6 +93,7 @@ static void file_extent_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d ino=%llu file_offset=%llu, %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", @@ -154,6 +154,7 @@ static void dir_item_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", @@ -649,6 +650,7 @@ static void block_group_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d bg_start=%llu bg_len=%llu, %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", @@ -1005,6 +1007,7 @@ static void dev_item_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(eb->fs_info, "corrupt %s: root=%llu block=%llu slot=%d devid=%llu %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", @@ -1260,6 +1263,7 @@ static void extent_err(const struct extent_buffer *eb, int slot, vaf.fmt = fmt; vaf.va = &args; + dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(eb->fs_info, "corrupt %s: block=%llu slot=%d extent bytenr=%llu len=%llu %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index 580ec4fde0..01669cfa65 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -6,10 +6,12 @@ #ifndef BTRFS_TREE_CHECKER_H #define BTRFS_TREE_CHECKER_H +#include <linux/types.h> #include <uapi/linux/btrfs_tree.h> struct extent_buffer; struct btrfs_chunk; +struct btrfs_key; /* All the extra info needed to verify the parentness of a tree block. */ struct btrfs_tree_parent_check { diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 331fc74299..d4fc5fedd8 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -13,13 +13,11 @@ #include "tree-log.h" #include "disk-io.h" #include "locking.h" -#include "print-tree.h" #include "backref.h" #include "compression.h" #include "qgroup.h" #include "block-group.h" #include "space-info.h" -#include "zoned.h" #include "inode-item.h" #include "fs.h" #include "accessors.h" @@ -2820,6 +2818,52 @@ static void wait_for_writer(struct btrfs_root *root) finish_wait(&root->log_writer_wait, &wait); } +void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct inode *inode) +{ + ctx->log_ret = 0; + ctx->log_transid = 0; + ctx->log_new_dentries = false; + ctx->logging_new_name = false; + ctx->logging_new_delayed_dentries = false; + ctx->logged_before = false; + ctx->inode = inode; + INIT_LIST_HEAD(&ctx->list); + INIT_LIST_HEAD(&ctx->ordered_extents); + INIT_LIST_HEAD(&ctx->conflict_inodes); + ctx->num_conflict_inodes = 0; + ctx->logging_conflict_inodes = false; + ctx->scratch_eb = NULL; +} + +void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx) +{ + struct btrfs_inode *inode = BTRFS_I(ctx->inode); + + if (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) && + !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags)) + return; + + /* + * Don't care about allocation failure. This is just for optimization, + * if we fail to allocate here, we will try again later if needed. + */ + ctx->scratch_eb = alloc_dummy_extent_buffer(inode->root->fs_info, 0); +} + +void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx) +{ + struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_extent *tmp; + + ASSERT(inode_is_locked(ctx->inode)); + + list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { + list_del_init(&ordered->log_list); + btrfs_put_ordered_extent(ordered); + } +} + + static inline void btrfs_remove_log_ctx(struct btrfs_root *root, struct btrfs_log_ctx *ctx) { @@ -3619,6 +3663,30 @@ out: return ret; } +static int clone_leaf(struct btrfs_path *path, struct btrfs_log_ctx *ctx) +{ + const int slot = path->slots[0]; + + if (ctx->scratch_eb) { + copy_extent_buffer_full(ctx->scratch_eb, path->nodes[0]); + } else { + ctx->scratch_eb = btrfs_clone_extent_buffer(path->nodes[0]); + if (!ctx->scratch_eb) + return -ENOMEM; + } + + btrfs_release_path(path); + path->nodes[0] = ctx->scratch_eb; + path->slots[0] = slot; + /* + * Add extra ref to scratch eb so that it is not freed when callers + * release the path, so we can reuse it later if needed. + */ + atomic_inc(&ctx->scratch_eb->refs); + + return 0; +} + static int process_dir_items_leaf(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, @@ -3633,23 +3701,20 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, bool last_found = false; int batch_start = 0; int batch_size = 0; - int i; + int ret; /* * We need to clone the leaf, release the read lock on it, and use the * clone before modifying the log tree. See the comment at copy_items() * about why we need to do this. */ - src = btrfs_clone_extent_buffer(path->nodes[0]); - if (!src) - return -ENOMEM; + ret = clone_leaf(path, ctx); + if (ret < 0) + return ret; - i = path->slots[0]; - btrfs_release_path(path); - path->nodes[0] = src; - path->slots[0] = i; + src = path->nodes[0]; - for (; i < nritems; i++) { + for (int i = path->slots[0]; i < nritems; i++) { struct btrfs_dir_item *di; struct btrfs_key key; int ret; @@ -4259,17 +4324,16 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_path *dst_path, struct btrfs_path *src_path, int start_slot, int nr, int inode_only, - u64 logged_isize) + u64 logged_isize, struct btrfs_log_ctx *ctx) { struct btrfs_root *log = inode->root->log_root; struct btrfs_file_extent_item *extent; struct extent_buffer *src; - int ret = 0; + int ret; struct btrfs_key *ins_keys; u32 *ins_sizes; struct btrfs_item_batch batch; char *ins_data; - int i; int dst_index; const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM); const u64 i_size = i_size_read(&inode->vfs_inode); @@ -4302,14 +4366,11 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, * while the other is holding the delayed node's mutex and wants to * write lock the same subvolume leaf for flushing delayed items. */ - src = btrfs_clone_extent_buffer(src_path->nodes[0]); - if (!src) - return -ENOMEM; + ret = clone_leaf(src_path, ctx); + if (ret < 0) + return ret; - i = src_path->slots[0]; - btrfs_release_path(src_path); - src_path->nodes[0] = src; - src_path->slots[0] = i; + src = src_path->nodes[0]; ins_data = kmalloc(nr * sizeof(struct btrfs_key) + nr * sizeof(u32), GFP_NOFS); @@ -4324,7 +4385,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, batch.nr = 0; dst_index = 0; - for (i = 0; i < nr; i++) { + for (int i = 0; i < nr; i++) { const int src_slot = start_slot + i; struct btrfs_root *csum_root; struct btrfs_ordered_sum *sums; @@ -4431,7 +4492,7 @@ add_to_batch: goto out; dst_index = 0; - for (i = 0; i < nr; i++) { + for (int i = 0; i < nr; i++) { const int src_slot = start_slot + i; const int dst_slot = dst_path->slots[0] + dst_index; struct btrfs_key key; @@ -4704,7 +4765,8 @@ static int log_one_extent(struct btrfs_trans_handle *trans, */ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, - struct btrfs_path *path) + struct btrfs_path *path, + struct btrfs_log_ctx *ctx) { struct btrfs_root *root = inode->root; struct btrfs_key key; @@ -4770,7 +4832,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, if (slot >= btrfs_header_nritems(leaf)) { if (ins_nr > 0) { ret = copy_items(trans, inode, dst_path, path, - start_slot, ins_nr, 1, 0); + start_slot, ins_nr, 1, 0, ctx); if (ret < 0) goto out; ins_nr = 0; @@ -4794,18 +4856,23 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, path->slots[0]++; continue; } - if (!dropped_extents) { - /* - * Avoid logging extent items logged in past fsync calls - * and leading to duplicate keys in the log tree. - */ + /* + * Avoid overlapping items in the log tree. The first time we + * get here, get rid of everything from a past fsync. After + * that, if the current extent starts before the end of the last + * extent we copied, truncate the last one. This can happen if + * an ordered extent completion modifies the subvolume tree + * while btrfs_next_leaf() has the tree unlocked. + */ + if (!dropped_extents || key.offset < truncate_offset) { ret = truncate_inode_items(trans, root->log_root, inode, - truncate_offset, + min(key.offset, truncate_offset), BTRFS_EXTENT_DATA_KEY); if (ret) goto out; dropped_extents = true; } + truncate_offset = btrfs_file_extent_end(path); if (ins_nr == 0) start_slot = slot; ins_nr++; @@ -4820,7 +4887,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, } if (ins_nr > 0) ret = copy_items(trans, inode, dst_path, path, - start_slot, ins_nr, 1, 0); + start_slot, ins_nr, 1, 0, ctx); out: btrfs_release_path(path); btrfs_free_path(dst_path); @@ -4899,7 +4966,7 @@ process: write_unlock(&tree->lock); if (!ret) - ret = btrfs_log_prealloc_extents(trans, inode, path); + ret = btrfs_log_prealloc_extents(trans, inode, path, ctx); if (ret) return ret; @@ -4980,7 +5047,8 @@ static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, - struct btrfs_path *dst_path) + struct btrfs_path *dst_path, + struct btrfs_log_ctx *ctx) { struct btrfs_root *root = inode->root; int ret; @@ -5009,7 +5077,7 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, if (slot >= nritems) { if (ins_nr > 0) { ret = copy_items(trans, inode, dst_path, path, - start_slot, ins_nr, 1, 0); + start_slot, ins_nr, 1, 0, ctx); if (ret < 0) return ret; ins_nr = 0; @@ -5035,7 +5103,7 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, } if (ins_nr > 0) { ret = copy_items(trans, inode, dst_path, path, - start_slot, ins_nr, 1, 0); + start_slot, ins_nr, 1, 0, ctx); if (ret < 0) return ret; } @@ -5847,7 +5915,7 @@ again: } ret = copy_items(trans, inode, dst_path, path, ins_start_slot, ins_nr, - inode_only, logged_isize); + inode_only, logged_isize, ctx); if (ret < 0) return ret; ins_nr = 0; @@ -5866,7 +5934,7 @@ again: goto next_slot; ret = copy_items(trans, inode, dst_path, path, ins_start_slot, - ins_nr, inode_only, logged_isize); + ins_nr, inode_only, logged_isize, ctx); if (ret < 0) return ret; ins_nr = 0; @@ -5883,7 +5951,7 @@ again: } ret = copy_items(trans, inode, dst_path, path, ins_start_slot, - ins_nr, inode_only, logged_isize); + ins_nr, inode_only, logged_isize, ctx); if (ret < 0) return ret; ins_nr = 1; @@ -5898,7 +5966,7 @@ next_slot: if (ins_nr) { ret = copy_items(trans, inode, dst_path, path, ins_start_slot, ins_nr, inode_only, - logged_isize); + logged_isize, ctx); if (ret < 0) return ret; ins_nr = 0; @@ -5923,7 +5991,7 @@ next_key: } if (ins_nr) { ret = copy_items(trans, inode, dst_path, path, ins_start_slot, - ins_nr, inode_only, logged_isize); + ins_nr, inode_only, logged_isize, ctx); if (ret) return ret; } @@ -5934,7 +6002,7 @@ next_key: * lock the same leaf with btrfs_log_prealloc_extents() below. */ btrfs_release_path(path); - ret = btrfs_log_prealloc_extents(trans, inode, dst_path); + ret = btrfs_log_prealloc_extents(trans, inode, dst_path, ctx); } return ret; @@ -6526,7 +6594,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, btrfs_release_path(path); btrfs_release_path(dst_path); - ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); + ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx); if (ret) goto out_unlock; xattrs_logged = true; @@ -6553,7 +6621,7 @@ log_extents: * BTRFS_INODE_COPY_EVERYTHING set. */ if (!xattrs_logged && inode->logged_trans < trans->transid) { - ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); + ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx); if (ret) goto out_unlock; btrfs_release_path(path); @@ -7502,6 +7570,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, btrfs_init_log_ctx(&ctx, &inode->vfs_inode); ctx.logging_new_name = true; + btrfs_init_log_ctx_scratch_eb(&ctx); /* * We don't care about the return value. If we fail to log the new name * then we know the next attempt to sync the log will fallback to a full @@ -7510,6 +7579,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * inconsistent state after a rename operation. */ btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); + free_extent_buffer(ctx.scratch_eb); ASSERT(list_empty(&ctx.conflict_inodes)); out: /* diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index a550a8a375..22e9cbc815 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -6,10 +6,18 @@ #ifndef BTRFS_TREE_LOG_H #define BTRFS_TREE_LOG_H +#include <linux/list.h> +#include <linux/fs.h> #include "messages.h" #include "ctree.h" #include "transaction.h" +struct inode; +struct dentry; +struct btrfs_ordered_extent; +struct btrfs_root; +struct btrfs_trans_handle; + /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ #define BTRFS_NO_LOG_SYNC 256 @@ -36,37 +44,20 @@ struct btrfs_log_ctx { struct list_head conflict_inodes; int num_conflict_inodes; bool logging_conflict_inodes; + /* + * Used for fsyncs that need to copy items from the subvolume tree to + * the log tree (full sync flag set or copy everything flag set) to + * avoid allocating a temporary extent buffer while holding a lock on + * an extent buffer of the subvolume tree and under the log transaction. + * Also helps to avoid allocating and freeing a temporary extent buffer + * in case we need to process multiple leaves from the subvolume tree. + */ + struct extent_buffer *scratch_eb; }; -static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, - struct inode *inode) -{ - ctx->log_ret = 0; - ctx->log_transid = 0; - ctx->log_new_dentries = false; - ctx->logging_new_name = false; - ctx->logging_new_delayed_dentries = false; - ctx->logged_before = false; - ctx->inode = inode; - INIT_LIST_HEAD(&ctx->list); - INIT_LIST_HEAD(&ctx->ordered_extents); - INIT_LIST_HEAD(&ctx->conflict_inodes); - ctx->num_conflict_inodes = 0; - ctx->logging_conflict_inodes = false; -} - -static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx) -{ - struct btrfs_ordered_extent *ordered; - struct btrfs_ordered_extent *tmp; - - ASSERT(inode_is_locked(ctx->inode)); - - list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { - list_del_init(&ordered->log_list); - btrfs_put_ordered_extent(ordered); - } -} +void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct inode *inode); +void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx); +void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx); static inline void btrfs_set_log_full_commit(struct btrfs_trans_handle *trans) { diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 3df6153d5d..43b3accbed 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -44,7 +44,7 @@ struct tree_mod_elem { /* * Pull a new tree mod seq number for our operation. */ -static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) +static u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) { return atomic64_inc_return(&fs_info->tree_mod_seq); } @@ -170,8 +170,7 @@ static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info, * this until all tree mod log insertions are recorded in the rb tree and then * write unlock fs_info::tree_mod_log_lock. */ -static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb) +static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) return true; @@ -188,7 +187,7 @@ static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, } /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */ -static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, +static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) @@ -367,9 +366,9 @@ free_tms: return ret; } -static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, - struct tree_mod_elem **tm_list, - int nritems) +static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, + struct tree_mod_elem **tm_list, + int nritems) { int i, j; int ret; diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h index 94f10afeee..ff00c8e8a3 100644 --- a/fs/btrfs/tree-mod-log.h +++ b/fs/btrfs/tree-mod-log.h @@ -3,7 +3,13 @@ #ifndef BTRFS_TREE_MOD_LOG_H #define BTRFS_TREE_MOD_LOG_H -#include "ctree.h" +#include <linux/list.h> + +struct extent_buffer; +struct btrfs_fs_info; +struct btrfs_path; +struct btrfs_root; +struct btrfs_seq_list; /* Represents a tree mod log user. */ struct btrfs_seq_list { diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index b4ac2b0cd2..183863f4bf 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -7,7 +7,6 @@ #include <linux/slab.h> #include "messages.h" #include "ulist.h" -#include "ctree.h" /* * ulist is a generic data structure to hold a collection of unique u64 diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index b2cef187ea..8e200fe1a2 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -7,6 +7,7 @@ #ifndef BTRFS_ULIST_H #define BTRFS_ULIST_H +#include <linux/types.h> #include <linux/list.h> #include <linux/rbtree.h> diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 5be74f9e47..b0aff297d6 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -9,7 +9,6 @@ #include "ctree.h" #include "transaction.h" #include "disk-io.h" -#include "print-tree.h" #include "fs.h" #include "accessors.h" #include "uuid-tree.h" @@ -114,7 +113,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, ret = btrfs_insert_empty_item(trans, uuid_root, path, &key, sizeof(subid_le)); - if (ret >= 0) { + if (ret == 0) { /* Add an item for the type for the first time */ eb = path->nodes[0]; slot = path->slots[0]; diff --git a/fs/btrfs/uuid-tree.h b/fs/btrfs/uuid-tree.h index 5350c87fe2..080ede0227 100644 --- a/fs/btrfs/uuid-tree.h +++ b/fs/btrfs/uuid-tree.h @@ -3,6 +3,11 @@ #ifndef BTRFS_UUID_TREE_H #define BTRFS_UUID_TREE_H +#include <linux/types.h> + +struct btrfs_trans_handle; +struct btrfs_fs_info; + int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, u64 subid); int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 66e2270b0d..4042dd6437 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -14,7 +14,6 @@ #include "ctree.h" #include "btrfs_inode.h" #include "transaction.h" -#include "disk-io.h" #include "locking.h" #include "fs.h" #include "accessors.h" diff --git a/fs/btrfs/verity.h b/fs/btrfs/verity.h index 91c10f7d0a..d696659e43 100644 --- a/fs/btrfs/verity.h +++ b/fs/btrfs/verity.h @@ -3,8 +3,13 @@ #ifndef BTRFS_VERITY_H #define BTRFS_VERITY_H +struct inode; +struct btrfs_inode; + #ifdef CONFIG_FS_VERITY +#include <linux/fsverity.h> + extern const struct fsverity_operations btrfs_verityops; int btrfs_drop_verity_items(struct btrfs_inode *inode); @@ -12,6 +17,8 @@ int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size) #else +#include <linux/errno.h> + static inline int btrfs_drop_verity_items(struct btrfs_inode *inode) { return 0; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3ebadc5ec8..ef6bd2f425 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -14,10 +14,8 @@ #include <linux/namei.h> #include "misc.h" #include "ctree.h" -#include "extent_map.h" #include "disk-io.h" #include "transaction.h" -#include "print-tree.h" #include "volumes.h" #include "raid56.h" #include "rcu-string.h" @@ -468,39 +466,39 @@ static noinline struct btrfs_fs_devices *find_fsid( static int btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, - int flush, struct bdev_handle **bdev_handle, + int flush, struct file **bdev_file, struct btrfs_super_block **disk_super) { struct block_device *bdev; int ret; - *bdev_handle = bdev_open_by_path(device_path, flags, holder, NULL); + *bdev_file = bdev_file_open_by_path(device_path, flags, holder, NULL); - if (IS_ERR(*bdev_handle)) { - ret = PTR_ERR(*bdev_handle); + if (IS_ERR(*bdev_file)) { + ret = PTR_ERR(*bdev_file); goto error; } - bdev = (*bdev_handle)->bdev; + bdev = file_bdev(*bdev_file); if (flush) sync_blockdev(bdev); ret = set_blocksize(bdev, BTRFS_BDEV_BLOCKSIZE); if (ret) { - bdev_release(*bdev_handle); + fput(*bdev_file); goto error; } invalidate_bdev(bdev); *disk_super = btrfs_read_dev_super(bdev); if (IS_ERR(*disk_super)) { ret = PTR_ERR(*disk_super); - bdev_release(*bdev_handle); + fput(*bdev_file); goto error; } return 0; error: - *bdev_handle = NULL; + *bdev_file = NULL; return ret; } @@ -643,7 +641,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *device, blk_mode_t flags, void *holder) { - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct btrfs_super_block *disk_super; u64 devid; int ret; @@ -654,7 +652,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, return -EINVAL; ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, - &bdev_handle, &disk_super); + &bdev_file, &disk_super); if (ret) return ret; @@ -678,20 +676,20 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); fs_devices->seeding = true; } else { - if (bdev_read_only(bdev_handle->bdev)) + if (bdev_read_only(file_bdev(bdev_file))) clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); else set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); } - if (!bdev_nonrot(bdev_handle->bdev)) + if (!bdev_nonrot(file_bdev(bdev_file))) fs_devices->rotating = true; - if (bdev_max_discard_sectors(bdev_handle->bdev)) + if (bdev_max_discard_sectors(file_bdev(bdev_file))) fs_devices->discardable = true; - device->bdev_handle = bdev_handle; - device->bdev = bdev_handle->bdev; + device->bdev_file = bdev_file; + device->bdev = file_bdev(bdev_file); clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); if (device->devt != device->bdev->bd_dev) { @@ -716,7 +714,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, error_free_page: btrfs_release_disk_super(disk_super); - bdev_release(bdev_handle); + fput(bdev_file); return -EINVAL; } @@ -779,8 +777,9 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (same_fsid_diff_dev) { generate_random_uuid(fs_devices->fsid); fs_devices->temp_fsid = true; - pr_info("BTRFS: device %s using temp-fsid %pU\n", - path, fs_devices->fsid); + pr_info("BTRFS: device %s (%d:%d) using temp-fsid %pU\n", + path, MAJOR(path_devt), MINOR(path_devt), + fs_devices->fsid); } mutex_lock(&fs_devices->device_list_mutex); @@ -809,8 +808,9 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (fs_devices->opened) { btrfs_err(NULL, -"device %s belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)", - path, fs_devices->fsid, current->comm, +"device %s (%d:%d) belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)", + path, MAJOR(path_devt), MINOR(path_devt), + fs_devices->fsid, current->comm, task_pid_nr(current)); mutex_unlock(&fs_devices->device_list_mutex); return ERR_PTR(-EBUSY); @@ -836,13 +836,15 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (disk_super->label[0]) pr_info( - "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n", +"BTRFS: device label %s devid %llu transid %llu %s (%d:%d) scanned by %s (%d)\n", disk_super->label, devid, found_transid, path, + MAJOR(path_devt), MINOR(path_devt), current->comm, task_pid_nr(current)); else pr_info( - "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n", +"BTRFS: device fsid %pU devid %llu transid %llu %s (%d:%d) scanned by %s (%d)\n", disk_super->fsid, devid, found_transid, path, + MAJOR(path_devt), MINOR(path_devt), current->comm, task_pid_nr(current)); } else if (!device->name || strcmp(device->name->str, path)) { @@ -1025,10 +1027,10 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, if (device->devid == BTRFS_DEV_REPLACE_DEVID) continue; - if (device->bdev_handle) { - bdev_release(device->bdev_handle); + if (device->bdev_file) { + fput(device->bdev_file); device->bdev = NULL; - device->bdev_handle = NULL; + device->bdev_file = NULL; fs_devices->open_devices--; } if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { @@ -1073,7 +1075,7 @@ static void btrfs_close_bdev(struct btrfs_device *device) invalidate_bdev(device->bdev); } - bdev_release(device->bdev_handle); + fput(device->bdev_file); } static void btrfs_close_one_device(struct btrfs_device *device) @@ -1374,8 +1376,9 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, struct btrfs_super_block *disk_super; bool new_device_added = false; struct btrfs_device *device = NULL; - struct bdev_handle *bdev_handle; + struct file *bdev_file; u64 bytenr, bytenr_orig; + dev_t devt; int ret; lockdep_assert_held(&uuid_mutex); @@ -1397,31 +1400,30 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, * values temporarily, as the device paths of the fsid are the only * required information for assembling the volume. */ - bdev_handle = bdev_open_by_path(path, flags, NULL, NULL); - if (IS_ERR(bdev_handle)) - return ERR_CAST(bdev_handle); + bdev_file = bdev_file_open_by_path(path, flags, NULL, NULL); + if (IS_ERR(bdev_file)) + return ERR_CAST(bdev_file); bytenr_orig = btrfs_sb_offset(0); - ret = btrfs_sb_log_location_bdev(bdev_handle->bdev, 0, READ, &bytenr); + ret = btrfs_sb_log_location_bdev(file_bdev(bdev_file), 0, READ, &bytenr); if (ret) { device = ERR_PTR(ret); goto error_bdev_put; } - disk_super = btrfs_read_disk_super(bdev_handle->bdev, bytenr, + disk_super = btrfs_read_disk_super(file_bdev(bdev_file), bytenr, bytenr_orig); if (IS_ERR(disk_super)) { device = ERR_CAST(disk_super); goto error_bdev_put; } - if (btrfs_skip_registration(disk_super, path, bdev_handle->bdev->bd_dev, - mount_arg_dev)) { + devt = file_bdev(bdev_file)->bd_dev; + if (btrfs_skip_registration(disk_super, path, devt, mount_arg_dev)) { pr_debug("BTRFS: skip registering single non-seed device %s (%d:%d)\n", - path, MAJOR(bdev_handle->bdev->bd_dev), - MINOR(bdev_handle->bdev->bd_dev)); + path, MAJOR(devt), MINOR(devt)); - btrfs_free_stale_devices(bdev_handle->bdev->bd_dev, NULL); + btrfs_free_stale_devices(devt, NULL); device = NULL; goto free_disk_super; @@ -1435,7 +1437,7 @@ free_disk_super: btrfs_release_disk_super(disk_super); error_bdev_put: - bdev_release(bdev_handle); + fput(bdev_file); return device; } @@ -2086,11 +2088,10 @@ static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info, copy_num, ret); } -void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, - struct block_device *bdev, - const char *device_path) +void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_device *device) { int copy_num; + struct block_device *bdev = device->bdev; if (!bdev) return; @@ -2106,12 +2107,12 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, btrfs_kobject_uevent(bdev, KOBJ_CHANGE); /* Update ctime/mtime for device path for libblkid */ - update_dev_time(device_path); + update_dev_time(device->name->str); } int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, - struct bdev_handle **bdev_handle) + struct file **bdev_file) { struct btrfs_trans_handle *trans; struct btrfs_device *device; @@ -2220,7 +2221,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, btrfs_assign_next_active_device(device, NULL); - if (device->bdev_handle) { + if (device->bdev_file) { cur_devices->open_devices--; /* remove sysfs entry */ btrfs_sysfs_remove_device(device); @@ -2236,20 +2237,19 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, * free the device. * * We cannot call btrfs_close_bdev() here because we're holding the sb - * write lock, and bdev_release() will pull in the ->open_mutex on - * the block device and it's dependencies. Instead just flush the - * device and let the caller do the final bdev_release. + * write lock, and fput() on the block device will pull in the + * ->open_mutex on the block device and it's dependencies. Instead + * just flush the device and let the caller do the final bdev_release. */ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { - btrfs_scratch_superblocks(fs_info, device->bdev, - device->name->str); + btrfs_scratch_superblocks(fs_info, device); if (device->bdev) { sync_blockdev(device->bdev); invalidate_bdev(device->bdev); } } - *bdev_handle = device->bdev_handle; + *bdev_file = device->bdev_file; synchronize_rcu(); btrfs_free_device(device); @@ -2355,8 +2355,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) mutex_unlock(&fs_devices->device_list_mutex); - btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev, - tgtdev->name->str); + btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev); btrfs_close_bdev(tgtdev); synchronize_rcu(); @@ -2386,7 +2385,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, const char *path) { struct btrfs_super_block *disk_super; - struct bdev_handle *bdev_handle; + struct file *bdev_file; int ret; if (!path || !path[0]) @@ -2404,7 +2403,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, } ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0, - &bdev_handle, &disk_super); + &bdev_file, &disk_super); if (ret) { btrfs_put_dev_args_from_path(args); return ret; @@ -2417,7 +2416,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, else memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE); btrfs_release_disk_super(disk_super); - bdev_release(bdev_handle); + fput(bdev_file); return 0; } @@ -2637,7 +2636,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path struct btrfs_root *root = fs_info->dev_root; struct btrfs_trans_handle *trans; struct btrfs_device *device; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct super_block *sb = fs_info->sb; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_fs_devices *seed_devices = NULL; @@ -2650,12 +2649,12 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (sb_rdonly(sb) && !fs_devices->seeding) return -EROFS; - bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE, + bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE, fs_info->bdev_holder, NULL); - if (IS_ERR(bdev_handle)) - return PTR_ERR(bdev_handle); + if (IS_ERR(bdev_file)) + return PTR_ERR(bdev_file); - if (!btrfs_check_device_zone_type(fs_info, bdev_handle->bdev)) { + if (!btrfs_check_device_zone_type(fs_info, file_bdev(bdev_file))) { ret = -EINVAL; goto error; } @@ -2667,11 +2666,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path locked = true; } - sync_blockdev(bdev_handle->bdev); + sync_blockdev(file_bdev(bdev_file)); rcu_read_lock(); list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { - if (device->bdev == bdev_handle->bdev) { + if (device->bdev == file_bdev(bdev_file)) { ret = -EEXIST; rcu_read_unlock(); goto error; @@ -2687,8 +2686,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path } device->fs_info = fs_info; - device->bdev_handle = bdev_handle; - device->bdev = bdev_handle->bdev; + device->bdev_file = bdev_file; + device->bdev = file_bdev(bdev_file); ret = lookup_bdev(device_path, &device->devt); if (ret) goto error_free_device; @@ -2871,7 +2870,7 @@ error_free_zone: error_free_device: btrfs_free_device(device); error: - bdev_release(bdev_handle); + fput(bdev_file); if (locked) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); @@ -3545,6 +3544,44 @@ static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, return 0; } +static void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, + const struct btrfs_disk_balance_args *disk) +{ + memset(cpu, 0, sizeof(*cpu)); + + cpu->profiles = le64_to_cpu(disk->profiles); + cpu->usage = le64_to_cpu(disk->usage); + cpu->devid = le64_to_cpu(disk->devid); + cpu->pstart = le64_to_cpu(disk->pstart); + cpu->pend = le64_to_cpu(disk->pend); + cpu->vstart = le64_to_cpu(disk->vstart); + cpu->vend = le64_to_cpu(disk->vend); + cpu->target = le64_to_cpu(disk->target); + cpu->flags = le64_to_cpu(disk->flags); + cpu->limit = le64_to_cpu(disk->limit); + cpu->stripes_min = le32_to_cpu(disk->stripes_min); + cpu->stripes_max = le32_to_cpu(disk->stripes_max); +} + +static void btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, + const struct btrfs_balance_args *cpu) +{ + memset(disk, 0, sizeof(*disk)); + + disk->profiles = cpu_to_le64(cpu->profiles); + disk->usage = cpu_to_le64(cpu->usage); + disk->devid = cpu_to_le64(cpu->devid); + disk->pstart = cpu_to_le64(cpu->pstart); + disk->pend = cpu_to_le64(cpu->pend); + disk->vstart = cpu_to_le64(cpu->vstart); + disk->vend = cpu_to_le64(cpu->vend); + disk->target = cpu_to_le64(cpu->target); + disk->flags = cpu_to_le64(cpu->flags); + disk->limit = cpu_to_le64(cpu->limit); + disk->stripes_min = cpu_to_le32(cpu->stripes_min); + disk->stripes_max = cpu_to_le32(cpu->stripes_max); +} + static int insert_balance_item(struct btrfs_fs_info *fs_info, struct btrfs_balance_control *bctl) { @@ -3689,7 +3726,7 @@ static void reset_balance_state(struct btrfs_fs_info *fs_info) struct btrfs_balance_control *bctl = fs_info->balance_ctl; int ret; - BUG_ON(!fs_info->balance_ctl); + ASSERT(fs_info->balance_ctl); spin_lock(&fs_info->balance_lock); fs_info->balance_ctl = NULL; @@ -6009,6 +6046,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map, int first, int dev_replace_is_ongoing) { + const enum btrfs_read_policy policy = READ_ONCE(fs_info->fs_devices->read_policy); int i; int num_stripes; int preferred_mirror; @@ -6023,13 +6061,12 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, else num_stripes = map->num_stripes; - switch (fs_info->fs_devices->read_policy) { + switch (policy) { default: /* Shouldn't happen, just warn and use pid instead of failing */ - btrfs_warn_rl(fs_info, - "unknown read_policy type %u, reset to pid", - fs_info->fs_devices->read_policy); - fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID; + btrfs_warn_rl(fs_info, "unknown read_policy type %u, reset to pid", + policy); + WRITE_ONCE(fs_info->fs_devices->read_policy, BTRFS_READ_POLICY_PID); fallthrough; case BTRFS_READ_POLICY_PID: preferred_mirror = first + (current->pid % num_stripes); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 53f87f398d..93854609a4 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -6,13 +6,28 @@ #ifndef BTRFS_VOLUMES_H #define BTRFS_VOLUMES_H +#include <linux/blk_types.h> +#include <linux/sizes.h> +#include <linux/atomic.h> #include <linux/sort.h> -#include <linux/btrfs.h> -#include "async-thread.h" +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/log2.h> +#include <linux/kobject.h> +#include <linux/refcount.h> +#include <linux/completion.h> +#include <linux/rbtree.h> +#include <uapi/linux/btrfs.h> #include "messages.h" -#include "tree-checker.h" #include "rcu-string.h" +struct block_device; +struct bdev_handle; +struct btrfs_fs_info; +struct btrfs_block_group; +struct btrfs_trans_handle; +struct btrfs_zoned_device_info; + #define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) extern struct mutex uuid_mutex; @@ -77,7 +92,7 @@ enum btrfs_raid_types { #define BTRFS_DEV_STATE_FLUSH_SENT (4) #define BTRFS_DEV_STATE_NO_READA (5) -struct btrfs_zoned_device_info; +struct btrfs_fs_devices; struct btrfs_device { struct list_head dev_list; /* device_list_mutex */ @@ -90,7 +105,7 @@ struct btrfs_device { u64 generation; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct block_device *bdev; struct btrfs_zoned_device_info *zone_info; @@ -276,6 +291,25 @@ enum btrfs_read_policy { BTRFS_NR_READ_POLICY, }; +#ifdef CONFIG_BTRFS_DEBUG +/* + * Checksum mode - offload it to workqueues or do it synchronously in + * btrfs_submit_chunk(). + */ +enum btrfs_offload_csum_mode { + /* + * Choose offloading checksum or do it synchronously automatically. + * Do it synchronously if the checksum is fast, or offload to workqueues + * otherwise. + */ + BTRFS_OFFLOAD_CSUM_AUTO, + /* Always offload checksum to workqueues. */ + BTRFS_OFFLOAD_CSUM_FORCE_ON, + /* Never offload checksum to workqueues. */ + BTRFS_OFFLOAD_CSUM_FORCE_OFF, +}; +#endif + struct btrfs_fs_devices { u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ @@ -380,6 +414,11 @@ struct btrfs_fs_devices { /* Policy used to read the mirrored stripes. */ enum btrfs_read_policy read_policy; + +#ifdef CONFIG_BTRFS_DEBUG + /* Checksum mode - offload it or do it synchronously. */ + enum btrfs_offload_csum_mode offload_csum_mode; +#endif }; #define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \ @@ -557,8 +596,6 @@ static inline void btrfs_free_chunk_map(struct btrfs_chunk_map *map) } } -struct btrfs_balance_args; -struct btrfs_balance_progress; struct btrfs_balance_control { struct btrfs_balance_args data; struct btrfs_balance_args meta; @@ -661,7 +698,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args); int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, - struct bdev_handle **bdev_handle); + struct file **bdev_file); void __exit btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, @@ -780,9 +817,7 @@ void btrfs_commit_device_sizes(struct btrfs_transaction *trans); struct list_head * __attribute_const__ btrfs_get_fs_uuids(void); bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, struct btrfs_device *failing_dev); -void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, - struct block_device *bdev, - const char *device_path); +void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_device *device); enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags); int btrfs_bg_type_to_factor(u64 flags); diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 118118ca3e..b9376ea258 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -6,7 +6,11 @@ #ifndef BTRFS_XATTR_H #define BTRFS_XATTR_H -#include <linux/xattr.h> +struct dentry; +struct inode; +struct qstr; +struct xattr_handler; +struct btrfs_trans_handle; extern const struct xattr_handler * const btrfs_xattr_handlers[]; diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 8da66ea699..e5b3f20038 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -398,7 +398,7 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in, out: if (unlikely(to_copy != destlen)) { - pr_warn_ratelimited("BTRFS: infalte failed, decompressed=%lu expected=%zu\n", + pr_warn_ratelimited("BTRFS: inflate failed, decompressed=%lu expected=%zu\n", to_copy, destlen); ret = -EIO; } else { diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 9729fa29c7..459514da16 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -12,10 +12,8 @@ #include "rcu-string.h" #include "disk-io.h" #include "block-group.h" -#include "transaction.h" #include "dev-replace.h" #include "space-info.h" -#include "super.h" #include "fs.h" #include "accessors.h" #include "bio.h" @@ -824,11 +822,14 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones, reset = &zones[1]; if (reset && reset->cond != BLK_ZONE_COND_EMPTY) { + unsigned int nofs_flags; + ASSERT(sb_zone_is_full(reset)); + nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, - reset->start, reset->len, - GFP_NOFS); + reset->start, reset->len); + memalloc_nofs_restore(nofs_flags); if (ret) return ret; @@ -974,11 +975,14 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror) * explicit ZONE_FINISH is not necessary. */ if (zone->wp != zone->start + zone->capacity) { + unsigned int nofs_flags; int ret; + nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, zone->start, - zone->len, GFP_NOFS); + zone->len); + memalloc_nofs_restore(nofs_flags); if (ret) return ret; } @@ -996,11 +1000,13 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror) int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) { + unsigned int nofs_flags; sector_t zone_sectors; sector_t nr_sectors; u8 zone_sectors_shift; u32 sb_zone; u32 nr_zones; + int ret; zone_sectors = bdev_zone_sectors(bdev); zone_sectors_shift = ilog2(zone_sectors); @@ -1011,9 +1017,12 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) if (sb_zone + 1 >= nr_zones) return -ENOENT; - return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, - zone_start_sector(sb_zone, bdev), - zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS); + nofs_flags = memalloc_nofs_save(); + ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, + zone_start_sector(sb_zone, bdev), + zone_sectors * BTRFS_NR_SB_LOG_ZONES); + memalloc_nofs_restore(nofs_flags); + return ret; } /* @@ -1124,12 +1133,14 @@ static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos) int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, u64 length, u64 *bytes) { + unsigned int nofs_flags; int ret; *bytes = 0; + nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET, - physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT, - GFP_NOFS); + physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT); + memalloc_nofs_restore(nofs_flags); if (ret) return ret; @@ -1279,7 +1290,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, struct btrfs_chunk_map *map) { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - struct btrfs_device *device = map->stripes[zone_idx].dev; + struct btrfs_device *device; int dev_replace_is_ongoing = 0; unsigned int nofs_flag; struct blk_zone zone; @@ -1287,7 +1298,11 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, info->physical = map->stripes[zone_idx].physical; + down_read(&dev_replace->rwsem); + device = map->stripes[zone_idx].dev; + if (!device->bdev) { + up_read(&dev_replace->rwsem); info->alloc_offset = WP_MISSING_DEV; return 0; } @@ -1297,6 +1312,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, __set_bit(zone_idx, active); if (!btrfs_dev_is_sequential(device, info->physical)) { + up_read(&dev_replace->rwsem); info->alloc_offset = WP_CONVENTIONAL; return 0; } @@ -1304,11 +1320,9 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, /* This zone will be used for allocation, so mark this zone non-empty. */ btrfs_dev_clear_zone_empty(device, info->physical); - down_read(&dev_replace->rwsem); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) btrfs_dev_clear_zone_empty(dev_replace->tgtdev, info->physical); - up_read(&dev_replace->rwsem); /* * The group is mapped to a sequential zone. Get the zone write pointer @@ -1319,6 +1333,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, ret = btrfs_get_dev_zone(device, info->physical, &zone); memalloc_nofs_restore(nofs_flag); if (ret) { + up_read(&dev_replace->rwsem); if (ret != -EIO && ret != -EOPNOTSUPP) return ret; info->alloc_offset = WP_MISSING_DEV; @@ -1330,6 +1345,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, "zoned: unexpected conventional zone %llu on device %s (devid %llu)", zone.start << SECTOR_SHIFT, rcu_str_deref(device->name), device->devid); + up_read(&dev_replace->rwsem); return -EIO; } @@ -1357,6 +1373,8 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, break; } + up_read(&dev_replace->rwsem); + return 0; } @@ -2241,14 +2259,16 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ struct btrfs_device *device = map->stripes[i].dev; const u64 physical = map->stripes[i].physical; struct btrfs_zoned_device_info *zinfo = device->zone_info; + unsigned int nofs_flags; if (zinfo->max_active_zones == 0) continue; + nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, physical >> SECTOR_SHIFT, - zinfo->zone_size >> SECTOR_SHIFT, - GFP_NOFS); + zinfo->zone_size >> SECTOR_SHIFT); + memalloc_nofs_restore(nofs_flags); if (ret) { up_read(&dev_replace->rwsem); diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index f573bda496..77c4321e33 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -4,12 +4,27 @@ #define BTRFS_ZONED_H #include <linux/types.h> +#include <linux/atomic.h> #include <linux/blkdev.h> +#include <linux/blkzoned.h> +#include <linux/errno.h> +#include <linux/spinlock.h> +#include <linux/mutex.h> #include "messages.h" #include "volumes.h" #include "disk-io.h" #include "block-group.h" #include "btrfs_inode.h" +#include "fs.h" + +struct block_device; +struct extent_buffer; +struct btrfs_bio; +struct btrfs_ordered_extent; +struct btrfs_fs_info; +struct btrfs_space_info; +struct btrfs_eb_write_context; +struct btrfs_fs_devices; #define BTRFS_DEFAULT_RECLAIM_THRESH (75) diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 0d66db8bc1..92b3744b81 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -18,8 +18,9 @@ #include <linux/slab.h> #include <linux/zstd.h> #include "misc.h" +#include "fs.h" #include "compression.h" -#include "ctree.h" +#include "super.h" #define ZSTD_BTRFS_MAX_WINDOWLOG 17 #define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG) @@ -618,80 +619,48 @@ done: } int zstd_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); + struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb); + const u32 sectorsize = fs_info->sectorsize; zstd_dstream *stream; int ret = 0; - size_t ret2; - unsigned long total_out = 0; - unsigned long pg_offset = 0; + unsigned long to_copy = 0; stream = zstd_init_dstream( ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); if (!stream) { pr_warn("BTRFS: zstd_init_dstream failed\n"); - ret = -EIO; goto finish; } - destlen = min_t(size_t, destlen, PAGE_SIZE); - workspace->in_buf.src = data_in; workspace->in_buf.pos = 0; workspace->in_buf.size = srclen; workspace->out_buf.dst = workspace->buf; workspace->out_buf.pos = 0; - workspace->out_buf.size = PAGE_SIZE; - - ret2 = 1; - while (pg_offset < destlen - && workspace->in_buf.pos < workspace->in_buf.size) { - unsigned long buf_start; - unsigned long buf_offset; - unsigned long bytes; - - /* Check if the frame is over and we still need more input */ - if (ret2 == 0) { - pr_debug("BTRFS: zstd_decompress_stream ended early\n"); - ret = -EIO; - goto finish; - } - ret2 = zstd_decompress_stream(stream, &workspace->out_buf, - &workspace->in_buf); - if (zstd_is_error(ret2)) { - pr_debug("BTRFS: zstd_decompress_stream returned %d\n", - zstd_get_error_code(ret2)); - ret = -EIO; - goto finish; - } - - buf_start = total_out; - total_out += workspace->out_buf.pos; - workspace->out_buf.pos = 0; - - if (total_out <= start_byte) - continue; - - if (total_out > start_byte && buf_start < start_byte) - buf_offset = start_byte - buf_start; - else - buf_offset = 0; - - bytes = min_t(unsigned long, destlen - pg_offset, - workspace->out_buf.size - buf_offset); - - memcpy_to_page(dest_page, pg_offset, - workspace->out_buf.dst + buf_offset, bytes); - - pg_offset += bytes; + workspace->out_buf.size = sectorsize; + + /* + * Since both input and output buffers should not exceed one sector, + * one call should end the decompression. + */ + ret = zstd_decompress_stream(stream, &workspace->out_buf, &workspace->in_buf); + if (zstd_is_error(ret)) { + pr_warn_ratelimited("BTRFS: zstd_decompress_stream return %d\n", + zstd_get_error_code(ret)); + goto finish; } - ret = 0; + to_copy = workspace->out_buf.pos; + memcpy_to_page(dest_page, dest_pgoff, workspace->out_buf.dst, to_copy); finish: - if (pg_offset < destlen) { - memzero_page(dest_page, pg_offset, destlen - pg_offset); + /* Error or early end. */ + if (unlikely(to_copy < destlen)) { + ret = -EIO; + memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy); } return ret; } |