Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/accessors.c | 15
-rw-r--r--  fs/btrfs/accessors.h | 50
-rw-r--r--  fs/btrfs/acl.c | 1
-rw-r--r--  fs/btrfs/acl.h | 11
-rw-r--r--  fs/btrfs/async-thread.c | 1
-rw-r--r--  fs/btrfs/async-thread.h | 3
-rw-r--r--  fs/btrfs/backref.c | 167
-rw-r--r--  fs/btrfs/backref.h | 136
-rw-r--r--  fs/btrfs/bio.c | 21
-rw-r--r--  fs/btrfs/bio.h | 2
-rw-r--r--  fs/btrfs/block-group.c | 48
-rw-r--r--  fs/btrfs/block-group.h | 14
-rw-r--r--  fs/btrfs/block-rsv.c | 12
-rw-r--r--  fs/btrfs/block-rsv.h | 7
-rw-r--r--  fs/btrfs/btrfs_inode.h | 45
-rw-r--r--  fs/btrfs/compression.c | 131
-rw-r--r--  fs/btrfs/compression.h | 54
-rw-r--r--  fs/btrfs/ctree.c | 118
-rw-r--r--  fs/btrfs/ctree.h | 39
-rw-r--r--  fs/btrfs/defrag.c | 104
-rw-r--r--  fs/btrfs/defrag.h | 10
-rw-r--r--  fs/btrfs/delalloc-space.c | 2
-rw-r--r--  fs/btrfs/delalloc-space.h | 4
-rw-r--r--  fs/btrfs/delayed-inode.c | 23
-rw-r--r--  fs/btrfs/delayed-inode.h | 21
-rw-r--r--  fs/btrfs/delayed-ref.c | 362
-rw-r--r--  fs/btrfs/delayed-ref.h | 220
-rw-r--r--  fs/btrfs/dev-replace.c | 21
-rw-r--r--  fs/btrfs/dev-replace.h | 4
-rw-r--r--  fs/btrfs/dir-item.h | 6
-rw-r--r--  fs/btrfs/disk-io.c | 194
-rw-r--r--  fs/btrfs/disk-io.h | 20
-rw-r--r--  fs/btrfs/export.c | 9
-rw-r--r--  fs/btrfs/export.h | 4
-rw-r--r--  fs/btrfs/extent-io-tree.c | 64
-rw-r--r--  fs/btrfs/extent-io-tree.h | 7
-rw-r--r--  fs/btrfs/extent-tree.c | 458
-rw-r--r--  fs/btrfs/extent-tree.h | 18
-rw-r--r--  fs/btrfs/extent_io.c | 404
-rw-r--r--  fs/btrfs/extent_io.h | 54
-rw-r--r--  fs/btrfs/extent_map.c | 398
-rw-r--r--  fs/btrfs/extent_map.h | 75
-rw-r--r--  fs/btrfs/file-item.c | 94
-rw-r--r--  fs/btrfs/file-item.h | 16
-rw-r--r--  fs/btrfs/file.c | 434
-rw-r--r--  fs/btrfs/file.h | 15
-rw-r--r--  fs/btrfs/free-space-cache.c | 25
-rw-r--r--  fs/btrfs/free-space-cache.h | 15
-rw-r--r--  fs/btrfs/free-space-tree.c | 66
-rw-r--r--  fs/btrfs/free-space-tree.h | 6
-rw-r--r--  fs/btrfs/fs.h | 52
-rw-r--r--  fs/btrfs/inode-item.c | 17
-rw-r--r--  fs/btrfs/inode-item.h | 5
-rw-r--r--  fs/btrfs/inode.c | 1110
-rw-r--r--  fs/btrfs/ioctl.c | 162
-rw-r--r--  fs/btrfs/ioctl.h | 9
-rw-r--r--  fs/btrfs/locking.c | 29
-rw-r--r--  fs/btrfs/locking.h | 24
-rw-r--r--  fs/btrfs/lru_cache.h | 7
-rw-r--r--  fs/btrfs/lzo.c | 89
-rw-r--r--  fs/btrfs/messages.c | 4
-rw-r--r--  fs/btrfs/misc.h | 2
-rw-r--r--  fs/btrfs/ordered-data.c | 45
-rw-r--r--  fs/btrfs/ordered-data.h | 16
-rw-r--r--  fs/btrfs/orphan.c | 1
-rw-r--r--  fs/btrfs/orphan.h | 5
-rw-r--r--  fs/btrfs/print-tree.c | 2
-rw-r--r--  fs/btrfs/print-tree.h | 3
-rw-r--r--  fs/btrfs/props.c | 3
-rw-r--r--  fs/btrfs/props.h | 7
-rw-r--r--  fs/btrfs/qgroup.c | 218
-rw-r--r--  fs/btrfs/qgroup.h | 17
-rw-r--r--  fs/btrfs/raid-stripe-tree.c | 1
-rw-r--r--  fs/btrfs/raid-stripe-tree.h | 5
-rw-r--r--  fs/btrfs/raid56.c | 34
-rw-r--r--  fs/btrfs/raid56.h | 9
-rw-r--r--  fs/btrfs/rcu-string.h | 6
-rw-r--r--  fs/btrfs/ref-verify.c | 17
-rw-r--r--  fs/btrfs/ref-verify.h | 9
-rw-r--r--  fs/btrfs/reflink.c | 56
-rw-r--r--  fs/btrfs/reflink.h | 4
-rw-r--r--  fs/btrfs/relocation.c | 426
-rw-r--r--  fs/btrfs/relocation.h | 9
-rw-r--r--  fs/btrfs/root-tree.c | 20
-rw-r--r--  fs/btrfs/root-tree.h | 10
-rw-r--r--  fs/btrfs/scrub.c | 35
-rw-r--r--  fs/btrfs/scrub.h | 6
-rw-r--r--  fs/btrfs/send.c | 126
-rw-r--r--  fs/btrfs/send.h | 8
-rw-r--r--  fs/btrfs/space-info.c | 30
-rw-r--r--  fs/btrfs/space-info.h | 10
-rw-r--r--  fs/btrfs/subpage.c | 74
-rw-r--r--  fs/btrfs/subpage.h | 21
-rw-r--r--  fs/btrfs/super.c | 48
-rw-r--r--  fs/btrfs/super.h | 7
-rw-r--r--  fs/btrfs/sysfs.c | 61
-rw-r--r--  fs/btrfs/sysfs.h | 9
-rw-r--r--  fs/btrfs/tests/btrfs-tests.c | 3
-rw-r--r--  fs/btrfs/tests/extent-map-tests.c | 216
-rw-r--r--  fs/btrfs/tests/inode-tests.c | 40
-rw-r--r--  fs/btrfs/transaction.c | 103
-rw-r--r--  fs/btrfs/transaction.h | 18
-rw-r--r--  fs/btrfs/tree-checker.c | 10
-rw-r--r--  fs/btrfs/tree-checker.h | 2
-rw-r--r--  fs/btrfs/tree-log.c | 247
-rw-r--r--  fs/btrfs/tree-log.h | 49
-rw-r--r--  fs/btrfs/tree-mod-log.c | 15
-rw-r--r--  fs/btrfs/tree-mod-log.h | 8
-rw-r--r--  fs/btrfs/ulist.c | 1
-rw-r--r--  fs/btrfs/ulist.h | 1
-rw-r--r--  fs/btrfs/uuid-tree.c | 3
-rw-r--r--  fs/btrfs/uuid-tree.h | 5
-rw-r--r--  fs/btrfs/verity.c | 1
-rw-r--r--  fs/btrfs/verity.h | 7
-rw-r--r--  fs/btrfs/volumes.c | 211
-rw-r--r--  fs/btrfs/volumes.h | 67
-rw-r--r--  fs/btrfs/xattr.c | 10
-rw-r--r--  fs/btrfs/xattr.h | 6
-rw-r--r--  fs/btrfs/zlib.c | 114
-rw-r--r--  fs/btrfs/zoned.c | 52
-rw-r--r--  fs/btrfs/zoned.h | 15
-rw-r--r--  fs/btrfs/zstd.c | 157
122 files changed, 4800 insertions, 3457 deletions
diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c
index 1925a0919c..79026917db 100644
--- a/fs/btrfs/accessors.c
+++ b/fs/btrfs/accessors.c
@@ -5,7 +5,8 @@
#include <asm/unaligned.h>
#include "messages.h"
-#include "ctree.h"
+#include "extent_io.h"
+#include "fs.h"
#include "accessors.h"
static bool check_setget_bounds(const struct extent_buffer *eb,
@@ -63,8 +64,8 @@ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \
const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \
const unsigned long oil = get_eb_offset_in_folio(token->eb, \
member_offset);\
- const int unit_size = folio_size(token->eb->folios[0]); \
- const int unit_shift = folio_shift(token->eb->folios[0]); \
+ const int unit_size = token->eb->folio_size; \
+ const int unit_shift = token->eb->folio_shift; \
const int size = sizeof(u##bits); \
u8 lebytes[sizeof(u##bits)]; \
const int part = unit_size - oil; \
@@ -94,7 +95,7 @@ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \
const unsigned long idx = get_eb_folio_index(eb, member_offset);\
const unsigned long oil = get_eb_offset_in_folio(eb, \
member_offset);\
- const int unit_size = folio_size(eb->folios[0]); \
+ const int unit_size = eb->folio_size; \
char *kaddr = folio_address(eb->folios[idx]); \
const int size = sizeof(u##bits); \
const int part = unit_size - oil; \
@@ -117,8 +118,8 @@ void btrfs_set_token_##bits(struct btrfs_map_token *token, \
const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \
const unsigned long oil = get_eb_offset_in_folio(token->eb, \
member_offset);\
- const int unit_size = folio_size(token->eb->folios[0]); \
- const int unit_shift = folio_shift(token->eb->folios[0]); \
+ const int unit_size = token->eb->folio_size; \
+ const int unit_shift = token->eb->folio_shift; \
const int size = sizeof(u##bits); \
u8 lebytes[sizeof(u##bits)]; \
const int part = unit_size - oil; \
@@ -151,7 +152,7 @@ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \
const unsigned long idx = get_eb_folio_index(eb, member_offset);\
const unsigned long oil = get_eb_offset_in_folio(eb, \
member_offset);\
- const int unit_size = folio_size(eb->folios[0]); \
+ const int unit_size = eb->folio_size; \
char *kaddr = folio_address(eb->folios[idx]); \
const int size = sizeof(u##bits); \
const int part = unit_size - oil; \
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index ed7aa32972..6fce3e8d3d 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -3,8 +3,17 @@
#ifndef BTRFS_ACCESSORS_H
#define BTRFS_ACCESSORS_H
-#include <linux/stddef.h>
#include <asm/unaligned.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <linux/align.h>
+#include <linux/build_bug.h>
+#include <linux/compiler.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <uapi/linux/btrfs_tree.h>
+
+struct extent_buffer;
struct btrfs_map_token {
struct extent_buffer *eb;
@@ -844,45 +853,6 @@ static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
}
-static inline void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
- const struct btrfs_disk_balance_args *disk)
-{
- memset(cpu, 0, sizeof(*cpu));
-
- cpu->profiles = le64_to_cpu(disk->profiles);
- cpu->usage = le64_to_cpu(disk->usage);
- cpu->devid = le64_to_cpu(disk->devid);
- cpu->pstart = le64_to_cpu(disk->pstart);
- cpu->pend = le64_to_cpu(disk->pend);
- cpu->vstart = le64_to_cpu(disk->vstart);
- cpu->vend = le64_to_cpu(disk->vend);
- cpu->target = le64_to_cpu(disk->target);
- cpu->flags = le64_to_cpu(disk->flags);
- cpu->limit = le64_to_cpu(disk->limit);
- cpu->stripes_min = le32_to_cpu(disk->stripes_min);
- cpu->stripes_max = le32_to_cpu(disk->stripes_max);
-}
-
-static inline void btrfs_cpu_balance_args_to_disk(
- struct btrfs_disk_balance_args *disk,
- const struct btrfs_balance_args *cpu)
-{
- memset(disk, 0, sizeof(*disk));
-
- disk->profiles = cpu_to_le64(cpu->profiles);
- disk->usage = cpu_to_le64(cpu->usage);
- disk->devid = cpu_to_le64(cpu->devid);
- disk->pstart = cpu_to_le64(cpu->pstart);
- disk->pend = cpu_to_le64(cpu->pend);
- disk->vstart = cpu_to_le64(cpu->vstart);
- disk->vend = cpu_to_le64(cpu->vend);
- disk->target = cpu_to_le64(cpu->target);
- disk->flags = cpu_to_le64(cpu->flags);
- disk->limit = cpu_to_le64(cpu->limit);
- disk->stripes_min = cpu_to_le32(cpu->stripes_min);
- disk->stripes_max = cpu_to_le32(cpu->stripes_max);
-}
-
/* struct btrfs_super_block */
BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 7427449a04..e0ba00d64e 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -12,7 +12,6 @@
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include "ctree.h"
-#include "btrfs_inode.h"
#include "xattr.h"
#include "acl.h"
diff --git a/fs/btrfs/acl.h b/fs/btrfs/acl.h
index a270e71ec0..48b9ddae4a 100644
--- a/fs/btrfs/acl.h
+++ b/fs/btrfs/acl.h
@@ -3,8 +3,15 @@
#ifndef BTRFS_ACL_H
#define BTRFS_ACL_H
+struct posix_acl;
+struct inode;
+struct btrfs_trans_handle;
+
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+struct mnt_idmap;
+struct dentry;
+
struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu);
int btrfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
struct posix_acl *acl, int type);
@@ -13,6 +20,10 @@ int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
#else
+#include <linux/errno.h>
+
+struct btrfs_trans_handle;
+
#define btrfs_get_acl NULL
#define btrfs_set_acl NULL
static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 9e261aac67..361a866c19 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -11,7 +11,6 @@
#include <linux/freezer.h>
#include <trace/events/btrfs.h>
#include "async-thread.h"
-#include "ctree.h"
enum {
WORK_DONE_BIT,
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 62b8a0d578..04c2f31758 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -7,11 +7,14 @@
#ifndef BTRFS_ASYNC_THREAD_H
#define BTRFS_ASYNC_THREAD_H
+#include <linux/compiler_types.h>
#include <linux/workqueue.h>
+#include <linux/list.h>
struct btrfs_fs_info;
struct btrfs_workqueue;
struct btrfs_work;
+
typedef void (*btrfs_func_t)(struct btrfs_work *arg);
typedef void (*btrfs_ordered_func_t)(struct btrfs_work *arg, bool);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index fc0930dd92..a2de5c05f9 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -198,10 +198,7 @@ static struct kmem_cache *btrfs_prelim_ref_cache;
int __init btrfs_prelim_ref_init(void)
{
btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref",
- sizeof(struct prelim_ref),
- 0,
- SLAB_MEM_SPREAD,
- NULL);
+ sizeof(struct prelim_ref), 0, 0, NULL);
if (!btrfs_prelim_ref_cache)
return -ENOMEM;
return 0;
@@ -264,7 +261,7 @@ static void update_share_count(struct share_check *sc, int oldcount,
else if (oldcount < 1 && newcount > 0)
sc->share_count++;
- if (newref->root_id == sc->root->root_key.objectid &&
+ if (newref->root_id == btrfs_root_id(sc->root) &&
newref->wanted_disk_byte == sc->data_bytenr &&
newref->key_for_search.objectid == sc->inum)
sc->self_ref_count += newref->count;
@@ -772,7 +769,7 @@ static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx,
continue;
}
- if (sc && ref->root_id != sc->root->root_key.objectid) {
+ if (sc && ref->root_id != btrfs_root_id(sc->root)) {
free_pref(ref);
ret = BACKREF_FOUND_SHARED;
goto out;
@@ -922,40 +919,38 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
switch (node->type) {
case BTRFS_TREE_BLOCK_REF_KEY: {
/* NORMAL INDIRECT METADATA backref */
- struct btrfs_delayed_tree_ref *ref;
struct btrfs_key *key_ptr = NULL;
+ /* The owner of a tree block ref is the level. */
+ int level = btrfs_delayed_ref_owner(node);
if (head->extent_op && head->extent_op->update_key) {
btrfs_disk_key_to_cpu(&key, &head->extent_op->key);
key_ptr = &key;
}
- ref = btrfs_delayed_node_to_tree_ref(node);
- ret = add_indirect_ref(fs_info, preftrees, ref->root,
- key_ptr, ref->level + 1,
- node->bytenr, count, sc,
- GFP_ATOMIC);
+ ret = add_indirect_ref(fs_info, preftrees, node->ref_root,
+ key_ptr, level + 1, node->bytenr,
+ count, sc, GFP_ATOMIC);
break;
}
case BTRFS_SHARED_BLOCK_REF_KEY: {
- /* SHARED DIRECT METADATA backref */
- struct btrfs_delayed_tree_ref *ref;
-
- ref = btrfs_delayed_node_to_tree_ref(node);
+ /*
+ * SHARED DIRECT METADATA backref
+ *
+ * The owner of a tree block ref is the level.
+ */
+ int level = btrfs_delayed_ref_owner(node);
- ret = add_direct_ref(fs_info, preftrees, ref->level + 1,
- ref->parent, node->bytenr, count,
+ ret = add_direct_ref(fs_info, preftrees, level + 1,
+ node->parent, node->bytenr, count,
sc, GFP_ATOMIC);
break;
}
case BTRFS_EXTENT_DATA_REF_KEY: {
/* NORMAL INDIRECT DATA backref */
- struct btrfs_delayed_data_ref *ref;
- ref = btrfs_delayed_node_to_data_ref(node);
-
- key.objectid = ref->objectid;
+ key.objectid = btrfs_delayed_ref_owner(node);
key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = ref->offset;
+ key.offset = btrfs_delayed_ref_offset(node);
/*
* If we have a share check context and a reference for
@@ -975,18 +970,14 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
if (sc && count < 0)
sc->have_delayed_delete_refs = true;
- ret = add_indirect_ref(fs_info, preftrees, ref->root,
+ ret = add_indirect_ref(fs_info, preftrees, node->ref_root,
&key, 0, node->bytenr, count, sc,
GFP_ATOMIC);
break;
}
case BTRFS_SHARED_DATA_REF_KEY: {
/* SHARED DIRECT FULL backref */
- struct btrfs_delayed_data_ref *ref;
-
- ref = btrfs_delayed_node_to_data_ref(node);
-
- ret = add_direct_ref(fs_info, preftrees, 0, ref->parent,
+ ret = add_direct_ref(fs_info, preftrees, 0, node->parent,
node->bytenr, count, sc,
GFP_ATOMIC);
break;
@@ -1036,8 +1027,6 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx,
slot = path->slots[0];
item_size = btrfs_item_size(leaf, slot);
- BUG_ON(item_size < sizeof(*ei));
-
ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
if (ctx->check_extent_item) {
@@ -1435,8 +1424,10 @@ again:
if (ret < 0)
goto out;
if (ret == 0) {
- /* This shouldn't happen, indicates a bug or fs corruption. */
- ASSERT(ret != 0);
+ /*
+ * Key with offset -1 found, there would have to exist an extent
+ * item with such offset, but this is out of the valid range.
+ */
ret = -EUCLEAN;
goto out;
}
@@ -2225,6 +2216,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
return ret;
+ if (ret == 0) {
+ /*
+ * Key with offset -1 found, there would have to exist an extent
+ * item with such offset, but this is out of the valid range.
+ */
+ return -EUCLEAN;
+ }
ret = btrfs_previous_extent_item(extent_root, path, 0);
if (ret) {
@@ -2247,7 +2245,6 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
eb = path->nodes[0];
item_size = btrfs_item_size(eb, path->slots[0]);
- BUG_ON(item_size < sizeof(*ei));
ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
flags = btrfs_extent_flags(eb, ei);
@@ -2626,7 +2623,7 @@ static int iterate_inode_refs(u64 inum, struct inode_fs_paths *ipath)
btrfs_debug(fs_root->fs_info,
"following ref at offset %u for inode %llu in tree %llu",
cur, found_key.objectid,
- fs_root->root_key.objectid);
+ btrfs_root_id(fs_root));
ret = inode_to_path(parent, name_len,
(unsigned long)(iref + 1), eb, ipath);
if (ret)
@@ -2844,6 +2841,16 @@ struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_inf
return ret;
}
+static void btrfs_backref_iter_release(struct btrfs_backref_iter *iter)
+{
+ iter->bytenr = 0;
+ iter->item_ptr = 0;
+ iter->cur_ptr = 0;
+ iter->end_ptr = 0;
+ btrfs_release_path(iter->path);
+ memset(&iter->cur_key, 0, sizeof(iter->cur_key));
+}
+
int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
{
struct btrfs_fs_info *fs_info = iter->fs_info;
@@ -2862,6 +2869,10 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
if (ret < 0)
return ret;
if (ret == 0) {
+ /*
+ * Key with offset -1 found, there would have to exist an extent
+ * item with such offset, but this is out of the valid range.
+ */
ret = -EUCLEAN;
goto release;
}
@@ -2932,6 +2943,14 @@ release:
return ret;
}
+static bool btrfs_backref_iter_is_inline_ref(struct btrfs_backref_iter *iter)
+{
+ if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY ||
+ iter->cur_key.type == BTRFS_METADATA_ITEM_KEY)
+ return true;
+ return false;
+}
+
/*
* Go to the next backref item of current bytenr, can be either inlined or
* keyed.
@@ -2944,7 +2963,7 @@ release:
*/
int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
{
- struct extent_buffer *eb = btrfs_backref_get_eb(iter);
+ struct extent_buffer *eb = iter->path->nodes[0];
struct btrfs_root *extent_root;
struct btrfs_path *path = iter->path;
struct btrfs_extent_inline_ref *iref;
@@ -3032,6 +3051,19 @@ struct btrfs_backref_node *btrfs_backref_alloc_node(
return node;
}
+void btrfs_backref_free_node(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_node *node)
+{
+ if (node) {
+ ASSERT(list_empty(&node->list));
+ ASSERT(list_empty(&node->lower));
+ ASSERT(node->eb == NULL);
+ cache->nr_nodes--;
+ btrfs_put_root(node->root);
+ kfree(node);
+ }
+}
+
struct btrfs_backref_edge *btrfs_backref_alloc_edge(
struct btrfs_backref_cache *cache)
{
@@ -3043,6 +3075,52 @@ struct btrfs_backref_edge *btrfs_backref_alloc_edge(
return edge;
}
+void btrfs_backref_free_edge(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_edge *edge)
+{
+ if (edge) {
+ cache->nr_edges--;
+ kfree(edge);
+ }
+}
+
+void btrfs_backref_unlock_node_buffer(struct btrfs_backref_node *node)
+{
+ if (node->locked) {
+ btrfs_tree_unlock(node->eb);
+ node->locked = 0;
+ }
+}
+
+void btrfs_backref_drop_node_buffer(struct btrfs_backref_node *node)
+{
+ if (node->eb) {
+ btrfs_backref_unlock_node_buffer(node);
+ free_extent_buffer(node->eb);
+ node->eb = NULL;
+ }
+}
+
+/*
+ * Drop the backref node from cache without cleaning up its children
+ * edges.
+ *
+ * This can only be called on node without parent edges.
+ * The children edges are still kept as is.
+ */
+void btrfs_backref_drop_node(struct btrfs_backref_cache *tree,
+ struct btrfs_backref_node *node)
+{
+ ASSERT(list_empty(&node->upper));
+
+ btrfs_backref_drop_node_buffer(node);
+ list_del_init(&node->list);
+ list_del_init(&node->lower);
+ if (!RB_EMPTY_NODE(&node->rb_node))
+ rb_erase(&node->rb_node, &tree->rb_root);
+ btrfs_backref_free_node(tree, node);
+}
+
/*
* Drop the backref node from cache, also cleaning up all its
* upper edges and any uncached nodes in the path.
@@ -3114,6 +3192,19 @@ void btrfs_backref_release_cache(struct btrfs_backref_cache *cache)
ASSERT(!cache->nr_edges);
}
+void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
+ struct btrfs_backref_node *lower,
+ struct btrfs_backref_node *upper,
+ int link_which)
+{
+ ASSERT(upper && lower && upper->level == lower->level + 1);
+ edge->node[LOWER] = lower;
+ edge->node[UPPER] = upper;
+ if (link_which & LINK_LOWER)
+ list_add_tail(&edge->list[LOWER], &lower->upper);
+ if (link_which & LINK_UPPER)
+ list_add_tail(&edge->list[UPPER], &upper->lower);
+}
/*
* Handle direct tree backref
*
@@ -3264,7 +3355,7 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans,
if (btrfs_node_blockptr(eb, path->slots[level]) != cur->bytenr) {
btrfs_err(fs_info,
"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
- cur->bytenr, level - 1, root->root_key.objectid,
+ cur->bytenr, level - 1, btrfs_root_id(root),
tree_key->objectid, tree_key->type, tree_key->offset);
btrfs_put_root(root);
ret = -ENOENT;
@@ -3422,7 +3513,7 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans,
int type;
cond_resched();
- eb = btrfs_backref_get_eb(iter);
+ eb = iter->path->nodes[0];
key.objectid = iter->bytenr;
if (btrfs_backref_iter_is_inline_ref(iter)) {
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index ab4ca0eda6..e8c22cccb5 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -6,11 +6,23 @@
#ifndef BTRFS_BACKREF_H
#define BTRFS_BACKREF_H
-#include <linux/btrfs.h>
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <uapi/linux/btrfs.h>
+#include <uapi/linux/btrfs_tree.h>
#include "messages.h"
-#include "ulist.h"
+#include "locking.h"
#include "disk-io.h"
#include "extent_io.h"
+#include "ctree.h"
+
+struct extent_inode_elem;
+struct ulist;
+struct btrfs_extent_item;
+struct btrfs_trans_handle;
+struct btrfs_fs_info;
/*
* Used by implementations of iterate_extent_inodes_t (see definition below) to
@@ -271,22 +283,6 @@ struct btrfs_backref_iter {
struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info);
-static inline void btrfs_backref_iter_free(struct btrfs_backref_iter *iter)
-{
- if (!iter)
- return;
- btrfs_free_path(iter->path);
- kfree(iter);
-}
-
-static inline struct extent_buffer *btrfs_backref_get_eb(
- struct btrfs_backref_iter *iter)
-{
- if (!iter)
- return NULL;
- return iter->path->nodes[0];
-}
-
/*
* For metadata with EXTENT_ITEM key (non-skinny) case, the first inline data
* is btrfs_tree_block_info, without a btrfs_extent_inline_ref header.
@@ -306,25 +302,6 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr);
int btrfs_backref_iter_next(struct btrfs_backref_iter *iter);
-static inline bool btrfs_backref_iter_is_inline_ref(
- struct btrfs_backref_iter *iter)
-{
- if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY ||
- iter->cur_key.type == BTRFS_METADATA_ITEM_KEY)
- return true;
- return false;
-}
-
-static inline void btrfs_backref_iter_release(struct btrfs_backref_iter *iter)
-{
- iter->bytenr = 0;
- iter->item_ptr = 0;
- iter->cur_ptr = 0;
- iter->end_ptr = 0;
- btrfs_release_path(iter->path);
- memset(&iter->cur_key, 0, sizeof(iter->cur_key));
-}
-
/*
* Backref cache related structures
*
@@ -452,83 +429,22 @@ struct btrfs_backref_edge *btrfs_backref_alloc_edge(
#define LINK_LOWER (1 << 0)
#define LINK_UPPER (1 << 1)
-static inline void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
- struct btrfs_backref_node *lower,
- struct btrfs_backref_node *upper,
- int link_which)
-{
- ASSERT(upper && lower && upper->level == lower->level + 1);
- edge->node[LOWER] = lower;
- edge->node[UPPER] = upper;
- if (link_which & LINK_LOWER)
- list_add_tail(&edge->list[LOWER], &lower->upper);
- if (link_which & LINK_UPPER)
- list_add_tail(&edge->list[UPPER], &upper->lower);
-}
-
-static inline void btrfs_backref_free_node(struct btrfs_backref_cache *cache,
- struct btrfs_backref_node *node)
-{
- if (node) {
- ASSERT(list_empty(&node->list));
- ASSERT(list_empty(&node->lower));
- ASSERT(node->eb == NULL);
- cache->nr_nodes--;
- btrfs_put_root(node->root);
- kfree(node);
- }
-}
-
-static inline void btrfs_backref_free_edge(struct btrfs_backref_cache *cache,
- struct btrfs_backref_edge *edge)
-{
- if (edge) {
- cache->nr_edges--;
- kfree(edge);
- }
-}
-
-static inline void btrfs_backref_unlock_node_buffer(
- struct btrfs_backref_node *node)
-{
- if (node->locked) {
- btrfs_tree_unlock(node->eb);
- node->locked = 0;
- }
-}
-static inline void btrfs_backref_drop_node_buffer(
- struct btrfs_backref_node *node)
-{
- if (node->eb) {
- btrfs_backref_unlock_node_buffer(node);
- free_extent_buffer(node->eb);
- node->eb = NULL;
- }
-}
-
-/*
- * Drop the backref node from cache without cleaning up its children
- * edges.
- *
- * This can only be called on node without parent edges.
- * The children edges are still kept as is.
- */
-static inline void btrfs_backref_drop_node(struct btrfs_backref_cache *tree,
- struct btrfs_backref_node *node)
-{
- ASSERT(list_empty(&node->upper));
-
- btrfs_backref_drop_node_buffer(node);
- list_del_init(&node->list);
- list_del_init(&node->lower);
- if (!RB_EMPTY_NODE(&node->rb_node))
- rb_erase(&node->rb_node, &tree->rb_root);
- btrfs_backref_free_node(tree, node);
-}
+void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
+ struct btrfs_backref_node *lower,
+ struct btrfs_backref_node *upper,
+ int link_which);
+void btrfs_backref_free_node(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_node *node);
+void btrfs_backref_free_edge(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_edge *edge);
+void btrfs_backref_unlock_node_buffer(struct btrfs_backref_node *node);
+void btrfs_backref_drop_node_buffer(struct btrfs_backref_node *node);
void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,
struct btrfs_backref_node *node);
+void btrfs_backref_drop_node(struct btrfs_backref_cache *tree,
+ struct btrfs_backref_node *node);
void btrfs_backref_release_cache(struct btrfs_backref_cache *cache);
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 928f512cdb..e3a57196b0 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -11,7 +11,6 @@
#include "raid56.h"
#include "async-thread.h"
#include "dev-replace.h"
-#include "rcu-string.h"
#include "zoned.h"
#include "file-item.h"
#include "raid-stripe-tree.h"
@@ -509,8 +508,6 @@ static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
if (!bioc) {
/* Single mirror read/write fast path. */
btrfs_bio(bio)->mirror_num = mirror_num;
- if (bio_op(bio) != REQ_OP_READ)
- btrfs_bio(bio)->orig_physical = smap->physical;
bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT;
if (bio_op(bio) != REQ_OP_READ)
btrfs_bio(bio)->orig_physical = smap->physical;
@@ -611,8 +608,20 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free)
static bool should_async_write(struct btrfs_bio *bbio)
{
+ bool auto_csum_mode = true;
+
+#ifdef CONFIG_BTRFS_DEBUG
+ struct btrfs_fs_devices *fs_devices = bbio->fs_info->fs_devices;
+ enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode);
+
+ if (csum_mode == BTRFS_OFFLOAD_CSUM_FORCE_OFF)
+ return false;
+
+ auto_csum_mode = (csum_mode == BTRFS_OFFLOAD_CSUM_AUTO);
+#endif
+
/* Submit synchronously if the checksum implementation is fast. */
- if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags))
+ if (auto_csum_mode && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags))
return false;
/*
@@ -732,7 +741,9 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
ret = btrfs_bio_csum(bbio);
if (ret)
goto fail_put_bio;
- } else if (use_append) {
+ } else if (use_append ||
+ (btrfs_is_zoned(fs_info) && inode &&
+ inode->flags & BTRFS_INODE_NODATASUM)) {
ret = btrfs_alloc_dummy_sum(bbio);
if (ret)
goto fail_put_bio;
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index bbaed31716..d9dd527609 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -7,12 +7,14 @@
#ifndef BTRFS_BIO_H
#define BTRFS_BIO_H
+#include <linux/types.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
#include "tree-checker.h"
struct btrfs_bio;
struct btrfs_fs_info;
+struct btrfs_inode;
#define BTRFS_BIO_INLINE_CSUM_SIZE 64
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index be70acea87..350cd1201c 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -418,7 +418,7 @@ struct btrfs_caching_control *btrfs_get_caching_control(
return ctl;
}
-void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
+static void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
{
if (refcount_dec_and_test(&ctl->count))
kfree(ctl);
@@ -1063,7 +1063,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
bool remove_rsv = false;
block_group = btrfs_lookup_block_group(fs_info, map->start);
- BUG_ON(!block_group);
+ if (!block_group)
+ return -ENOENT;
+
BUG_ON(!block_group->ro);
trace_btrfs_remove_block_group(block_group);
@@ -1214,8 +1216,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
block_group->space_info->total_bytes -= block_group->length;
block_group->space_info->bytes_readonly -=
(block_group->length - block_group->zone_unusable);
- block_group->space_info->bytes_zone_unusable -=
- block_group->zone_unusable;
+ btrfs_space_info_update_bytes_zone_unusable(fs_info, block_group->space_info,
+ -block_group->zone_unusable);
block_group->space_info->disk_total -= block_group->length * factor;
spin_unlock(&block_group->space_info->lock);
@@ -1387,7 +1389,8 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
if (btrfs_is_zoned(cache->fs_info)) {
/* Migrate zone_unusable bytes to readonly */
sinfo->bytes_readonly += cache->zone_unusable;
- sinfo->bytes_zone_unusable -= cache->zone_unusable;
+ btrfs_space_info_update_bytes_zone_unusable(cache->fs_info, sinfo,
+ -cache->zone_unusable);
cache->zone_unusable = 0;
}
cache->ro++;
@@ -1429,7 +1432,7 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
* group in pinned_extents before we were able to clear the whole block
* group range from pinned_extents. This means that task can lookup for
* the block group after we unpinned it from pinned_extents and removed
- * it, leading to a BUG_ON() at unpin_extent_range().
+ * it, leading to an error at unpin_extent_range().
*/
mutex_lock(&fs_info->unused_bg_unpin_mutex);
if (prev_trans) {
@@ -1522,6 +1525,13 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* outstanding allocations in this block group. We do
* the ro check in case balance is currently acting on
* this block group.
+ *
+ * Also bail out if this is the only block group for its
+ * type, because otherwise we would lose profile
+ * information from fs_info->avail_*_alloc_bits and the
+ * next block group of this type would be created with a
+ * "single" profile (even if we're in a raid fs) because
+ * fs_info->avail_*_alloc_bits would be 0.
*/
trace_btrfs_skip_unused_block_group(block_group);
spin_unlock(&block_group->lock);
@@ -1776,6 +1786,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
struct btrfs_block_group *bg;
struct btrfs_space_info *space_info;
+ LIST_HEAD(retry_list);
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
return;
@@ -1912,8 +1923,20 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
}
next:
- if (ret)
- btrfs_mark_bg_to_reclaim(bg);
+ if (ret) {
+ /* Refcount held by the reclaim_bgs list after splice. */
+ spin_lock(&fs_info->unused_bgs_lock);
+ /*
+ * This block group might be added to the unused list
+ * during the above process. Move it back to the
+ * reclaim list otherwise.
+ */
+ if (list_empty(&bg->bg_list)) {
+ btrfs_get_block_group(bg);
+ list_add_tail(&bg->bg_list, &retry_list);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+ }
btrfs_put_block_group(bg);
mutex_unlock(&fs_info->reclaim_bgs_lock);
@@ -1933,6 +1956,9 @@ next:
spin_unlock(&fs_info->unused_bgs_lock);
mutex_unlock(&fs_info->reclaim_bgs_lock);
end:
+ spin_lock(&fs_info->unused_bgs_lock);
+ list_splice_tail(&retry_list, &fs_info->reclaim_bgs);
+ spin_unlock(&fs_info->unused_bgs_lock);
btrfs_exclop_finish(fs_info);
sb_end_write(fs_info->sb);
}
@@ -3009,9 +3035,11 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
if (btrfs_is_zoned(cache->fs_info)) {
/* Migrate zone_unusable bytes back */
cache->zone_unusable =
- (cache->alloc_offset - cache->used) +
+ (cache->alloc_offset - cache->used - cache->pinned -
+ cache->reserved) +
(cache->length - cache->zone_capacity);
- sinfo->bytes_zone_unusable += cache->zone_unusable;
+ btrfs_space_info_update_bytes_zone_unusable(cache->fs_info, sinfo,
+ cache->zone_unusable);
sinfo->bytes_readonly -= cache->zone_unusable;
}
num_bytes = cache->length - cache->reserved -
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 962b119839..85e2d4cd12 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -3,9 +3,22 @@
#ifndef BTRFS_BLOCK_GROUP_H
#define BTRFS_BLOCK_GROUP_H
+#include <linux/atomic.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/refcount.h>
+#include <linux/wait.h>
+#include <linux/sizes.h>
+#include <linux/rwsem.h>
+#include <linux/rbtree.h>
+#include <uapi/linux/btrfs_tree.h>
#include "free-space-cache.h"
struct btrfs_chunk_map;
+struct btrfs_fs_info;
+struct btrfs_inode;
+struct btrfs_trans_handle;
enum btrfs_disk_cache_state {
BTRFS_DC_WRITTEN,
@@ -297,7 +310,6 @@ void btrfs_wait_nocow_writers(struct btrfs_block_group *bg);
void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
u64 num_bytes);
int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait);
-void btrfs_put_caching_control(struct btrfs_caching_control *ctl);
struct btrfs_caching_control *btrfs_get_caching_control(
struct btrfs_block_group *cache);
int btrfs_add_new_free_space(struct btrfs_block_group *block_group,
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 1043a81423..b299b82d67 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -6,7 +6,6 @@
#include "space-info.h"
#include "transaction.h"
#include "block-group.h"
-#include "disk-io.h"
#include "fs.h"
#include "accessors.h"
@@ -342,9 +341,9 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
read_lock(&fs_info->global_root_lock);
rbtree_postorder_for_each_entry_safe(root, tmp, &fs_info->global_root_tree,
rb_node) {
- if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID ||
- root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID ||
- root->root_key.objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) {
+ if (btrfs_root_id(root) == BTRFS_EXTENT_TREE_OBJECTID ||
+ btrfs_root_id(root) == BTRFS_CSUM_TREE_OBJECTID ||
+ btrfs_root_id(root) == BTRFS_FREE_SPACE_TREE_OBJECTID) {
num_bytes += btrfs_root_used(&root->root_item);
min_items++;
}
@@ -407,7 +406,7 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- switch (root->root_key.objectid) {
+ switch (btrfs_root_id(root)) {
case BTRFS_CSUM_TREE_OBJECTID:
case BTRFS_EXTENT_TREE_OBJECTID:
case BTRFS_FREE_SPACE_TREE_OBJECTID:
@@ -469,8 +468,7 @@ static struct btrfs_block_rsv *get_block_rsv(
if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
(root == fs_info->uuid_root) ||
- (trans->adding_csums &&
- root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID))
+ (trans->adding_csums && btrfs_root_id(root) == BTRFS_CSUM_TREE_OBJECTID))
block_rsv = trans->block_rsv;
if (!block_rsv)
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index 43a9a6b5a7..1f53b967d0 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -3,8 +3,15 @@
#ifndef BTRFS_BLOCK_RSV_H
#define BTRFS_BLOCK_RSV_H
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <linux/spinlock.h>
+
struct btrfs_trans_handle;
struct btrfs_root;
+struct btrfs_space_info;
+struct btrfs_block_rsv;
+struct btrfs_fs_info;
enum btrfs_reserve_flush_enum;
/*
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 7f7c5a92d2..6ed495ca7a 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -8,13 +8,32 @@
#include <linux/hash.h>
#include <linux/refcount.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/compiler.h>
#include <linux/fscrypt.h>
+#include <linux/lockdep.h>
+#include <uapi/linux/btrfs_tree.h>
#include <trace/events/btrfs.h>
+#include "block-rsv.h"
+#include "btrfs_inode.h"
#include "extent_map.h"
#include "extent_io.h"
+#include "extent-io-tree.h"
#include "ordered-data.h"
#include "delayed-inode.h"
+struct extent_state;
+struct posix_acl;
+struct iov_iter;
+struct writeback_control;
+struct btrfs_root;
+struct btrfs_fs_info;
+struct btrfs_trans_handle;
+
/*
* Since we search a directory based on f_pos (struct dir_context::pos) we have
* to start at 2 since '.' and '..' have f_pos of 0 and 1 respectively, so
@@ -41,7 +60,6 @@ enum {
*/
BTRFS_INODE_NEEDS_FULL_SYNC,
BTRFS_INODE_COPY_EVERYTHING,
- BTRFS_INODE_IN_DELALLOC_LIST,
BTRFS_INODE_HAS_PROPS,
BTRFS_INODE_SNAPSHOT_FLUSH,
/*
@@ -71,6 +89,16 @@ enum {
BTRFS_INODE_FREE_SPACE_INODE,
/* Set when there are no capabilities in XATTs for the inode. */
BTRFS_INODE_NO_CAP_XATTR,
+ /*
+ * Set if an error happened when doing a COW write before submitting a
+ * bio or during writeback. Used for both buffered writes and direct IO
+ * writes. This is to signal a fast fsync that it has to wait for
+ * ordered extents to complete and therefore not log extent maps that
+ * point to unwritten extents (when an ordered extent completes and it
+ * has the BTRFS_ORDERED_IOERR flag set, it drops extent maps in its
+ * range).
+ */
+ BTRFS_INODE_COW_WRITE_ERROR,
};
/* in memory btrfs inode */
@@ -363,9 +391,11 @@ static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode)
}
/*
- * Should be called while holding the inode's VFS lock in exclusive mode or in a
- * context where no one else can access the inode concurrently (during inode
- * creation or when loading an inode from disk).
+ * Should be called while holding the inode's VFS lock in exclusive mode, or
+ * while holding the inode's mmap lock (struct btrfs_inode::i_mmap_lock) in
+ * either shared or exclusive mode, or in a context where no one else can access
+ * the inode concurrently (during inode creation or when loading an inode from
+ * disk).
*/
static inline void btrfs_set_inode_full_sync(struct btrfs_inode *inode)
{
@@ -428,7 +458,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
u64 *ram_bytes, bool nowait, bool strict);
-void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode);
+void btrfs_del_delalloc_inode(struct btrfs_inode *inode);
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index);
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
@@ -478,7 +508,6 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state
void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
struct extent_state *orig, u64 split);
void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
-vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
void btrfs_evict_inode(struct inode *inode);
struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
@@ -490,8 +519,7 @@ struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
struct btrfs_root *root, struct btrfs_path *path);
struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root);
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset,
- u64 start, u64 len);
+ struct page *page, u64 start, u64 len);
int btrfs_update_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode);
int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
@@ -527,6 +555,7 @@ ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
size_t done_before);
struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
size_t done_before);
+struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino);
extern const struct dentry_operations btrfs_dentry_operations;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 0041613a36..7c22f5b8c5 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -25,8 +25,6 @@
#include "misc.h"
#include "ctree.h"
#include "fs.h"
-#include "disk-io.h"
-#include "transaction.h"
#include "btrfs_inode.h"
#include "bio.h"
#include "ordered-data.h"
@@ -34,8 +32,7 @@
#include "extent_io.h"
#include "extent_map.h"
#include "subpage.h"
-#include "zoned.h"
-#include "file-item.h"
+#include "messages.h"
#include "super.h"
static struct bio_set btrfs_compressed_bioset;
@@ -93,20 +90,20 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len)
}
static int compression_compress_pages(int type, struct list_head *ws,
- struct address_space *mapping, u64 start, struct page **pages,
- unsigned long *out_pages, unsigned long *total_in,
- unsigned long *total_out)
+ struct address_space *mapping, u64 start,
+ struct folio **folios, unsigned long *out_folios,
+ unsigned long *total_in, unsigned long *total_out)
{
switch (type) {
case BTRFS_COMPRESS_ZLIB:
- return zlib_compress_pages(ws, mapping, start, pages,
- out_pages, total_in, total_out);
+ return zlib_compress_folios(ws, mapping, start, folios,
+ out_folios, total_in, total_out);
case BTRFS_COMPRESS_LZO:
- return lzo_compress_pages(ws, mapping, start, pages,
- out_pages, total_in, total_out);
+ return lzo_compress_folios(ws, mapping, start, folios,
+ out_folios, total_in, total_out);
case BTRFS_COMPRESS_ZSTD:
- return zstd_compress_pages(ws, mapping, start, pages,
- out_pages, total_in, total_out);
+ return zstd_compress_folios(ws, mapping, start, folios,
+ out_folios, total_in, total_out);
case BTRFS_COMPRESS_NONE:
default:
/*
@@ -118,7 +115,7 @@ static int compression_compress_pages(int type, struct list_head *ws,
* Not a big deal, just need to inform caller that we
* haven't allocated any pages yet.
*/
- *out_pages = 0;
+ *out_folios = 0;
return -E2BIG;
}
}
@@ -161,11 +158,11 @@ static int compression_decompress(int type, struct list_head *ws,
}
}
-static void btrfs_free_compressed_pages(struct compressed_bio *cb)
+static void btrfs_free_compressed_folios(struct compressed_bio *cb)
{
- for (unsigned int i = 0; i < cb->nr_pages; i++)
- btrfs_free_compr_page(cb->compressed_pages[i]);
- kfree(cb->compressed_pages);
+ for (unsigned int i = 0; i < cb->nr_folios; i++)
+ btrfs_free_compr_folio(cb->compressed_folios[i]);
+ kfree(cb->compressed_folios);
}
static int btrfs_decompress_bio(struct compressed_bio *cb);
@@ -226,25 +223,25 @@ static unsigned long btrfs_compr_pool_scan(struct shrinker *sh, struct shrink_co
/*
* Common wrappers for page allocation from compression wrappers
*/
-struct page *btrfs_alloc_compr_page(void)
+struct folio *btrfs_alloc_compr_folio(void)
{
- struct page *page = NULL;
+ struct folio *folio = NULL;
spin_lock(&compr_pool.lock);
if (compr_pool.count > 0) {
- page = list_first_entry(&compr_pool.list, struct page, lru);
- list_del_init(&page->lru);
+ folio = list_first_entry(&compr_pool.list, struct folio, lru);
+ list_del_init(&folio->lru);
compr_pool.count--;
}
spin_unlock(&compr_pool.lock);
- if (page)
- return page;
+ if (folio)
+ return folio;
- return alloc_page(GFP_NOFS);
+ return folio_alloc(GFP_NOFS, 0);
}
-void btrfs_free_compr_page(struct page *page)
+void btrfs_free_compr_folio(struct folio *folio)
{
bool do_free = false;
@@ -252,7 +249,7 @@ void btrfs_free_compr_page(struct page *page)
if (compr_pool.count > compr_pool.thresh) {
do_free = true;
} else {
- list_add(&page->lru, &compr_pool.list);
+ list_add(&folio->lru, &compr_pool.list);
compr_pool.count++;
}
spin_unlock(&compr_pool.lock);
@@ -260,8 +257,8 @@ void btrfs_free_compr_page(struct page *page)
if (!do_free)
return;
- ASSERT(page_ref_count(page) == 1);
- put_page(page);
+ ASSERT(folio_ref_count(folio) == 1);
+ folio_put(folio);
}
static void end_bbio_comprssed_read(struct btrfs_bio *bbio)
@@ -272,7 +269,7 @@ static void end_bbio_comprssed_read(struct btrfs_bio *bbio)
if (!status)
status = errno_to_blk_status(btrfs_decompress_bio(cb));
- btrfs_free_compressed_pages(cb);
+ btrfs_free_compressed_folios(cb);
btrfs_bio_end_io(cb->orig_bbio, status);
bio_put(&bbio->bio);
}
@@ -326,7 +323,7 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work)
end_compressed_writeback(cb);
/* Note, our inode could be gone now */
- btrfs_free_compressed_pages(cb);
+ btrfs_free_compressed_folios(cb);
bio_put(&cb->bbio.bio);
}
@@ -345,17 +342,19 @@ static void end_bbio_comprssed_write(struct btrfs_bio *bbio)
queue_work(fs_info->compressed_write_workers, &cb->write_end_work);
}
-static void btrfs_add_compressed_bio_pages(struct compressed_bio *cb)
+static void btrfs_add_compressed_bio_folios(struct compressed_bio *cb)
{
struct bio *bio = &cb->bbio.bio;
u32 offset = 0;
while (offset < cb->compressed_len) {
+ int ret;
u32 len = min_t(u32, cb->compressed_len - offset, PAGE_SIZE);
/* Maximum compressed extent is smaller than bio size limit. */
- __bio_add_page(bio, cb->compressed_pages[offset >> PAGE_SHIFT],
- len, 0);
+ ret = bio_add_folio(bio, cb->compressed_folios[offset >> PAGE_SHIFT],
+ len, 0);
+ ASSERT(ret);
offset += len;
}
}
@@ -370,8 +369,8 @@ static void btrfs_add_compressed_bio_pages(struct compressed_bio *cb)
* the end io hooks.
*/
void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
- struct page **compressed_pages,
- unsigned int nr_pages,
+ struct folio **compressed_folios,
+ unsigned int nr_folios,
blk_opf_t write_flags,
bool writeback)
{
@@ -387,14 +386,14 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
end_bbio_comprssed_write);
cb->start = ordered->file_offset;
cb->len = ordered->num_bytes;
- cb->compressed_pages = compressed_pages;
+ cb->compressed_folios = compressed_folios;
cb->compressed_len = ordered->disk_num_bytes;
cb->writeback = writeback;
INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work);
- cb->nr_pages = nr_pages;
+ cb->nr_folios = nr_folios;
cb->bbio.bio.bi_iter.bi_sector = ordered->disk_bytenr >> SECTOR_SHIFT;
cb->bbio.ordered = ordered;
- btrfs_add_compressed_bio_pages(cb);
+ btrfs_add_compressed_bio_folios(cb);
btrfs_submit_bio(&cb->bbio, 0);
}
@@ -515,6 +514,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
put_page(page);
break;
}
+ add_size = min(em->start + em->len, page_end + 1) - cur;
free_extent_map(em);
if (page->index == end_index) {
@@ -527,7 +527,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
}
}
- add_size = min(em->start + em->len, page_end + 1) - cur;
ret = bio_add_page(orig_bio, page, add_size, offset_in_page(cur));
if (ret != add_size) {
unlock_extent(tree, cur, page_end, NULL);
@@ -602,14 +601,14 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
free_extent_map(em);
- cb->nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
- cb->compressed_pages = kcalloc(cb->nr_pages, sizeof(struct page *), GFP_NOFS);
- if (!cb->compressed_pages) {
+ cb->nr_folios = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
+ cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct page *), GFP_NOFS);
+ if (!cb->compressed_folios) {
ret = BLK_STS_RESOURCE;
goto out_free_bio;
}
- ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages, 0);
+ ret2 = btrfs_alloc_folio_array(cb->nr_folios, cb->compressed_folios, 0);
if (ret2) {
ret = BLK_STS_RESOURCE;
goto out_free_compressed_pages;
@@ -621,7 +620,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
/* include any pages we added in add_ra-bio_pages */
cb->len = bbio->bio.bi_iter.bi_size;
cb->bbio.bio.bi_iter.bi_sector = bbio->bio.bi_iter.bi_sector;
- btrfs_add_compressed_bio_pages(cb);
+ btrfs_add_compressed_bio_folios(cb);
if (memstall)
psi_memstall_leave(&pflags);
@@ -630,7 +629,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
return;
out_free_compressed_pages:
- kfree(cb->compressed_pages);
+ kfree(cb->compressed_folios);
out_free_bio:
bio_put(&cb->bbio.bio);
out:
@@ -977,6 +976,29 @@ static unsigned int btrfs_compress_set_level(int type, unsigned level)
return level;
}
+/* Wrapper around find_get_page(), with extra error message. */
+int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start,
+ struct folio **in_folio_ret)
+{
+ struct folio *in_folio;
+
+ /*
+ * The compressed write path should have the folio locked already, thus
+ * we only need to grab one reference.
+ */
+ in_folio = filemap_get_folio(mapping, start >> PAGE_SHIFT);
+ if (IS_ERR(in_folio)) {
+ struct btrfs_inode *inode = BTRFS_I(mapping->host);
+
+ btrfs_crit(inode->root->fs_info,
+ "failed to get page cache, root %lld ino %llu file offset %llu",
+ btrfs_root_id(inode->root), btrfs_ino(inode), start);
+ return -ENOENT;
+ }
+ *in_folio_ret = in_folio;
+ return 0;
+}
+
/*
* Given an address space and start and length, compress the bytes into @pages
* that are allocated on demand.
@@ -997,11 +1019,9 @@ static unsigned int btrfs_compress_set_level(int type, unsigned level)
* @total_out is an in/out parameter, must be set to the input length and will
* be also used to return the total number of compressed bytes
*/
-int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
- u64 start, struct page **pages,
- unsigned long *out_pages,
- unsigned long *total_in,
- unsigned long *total_out)
+int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping,
+ u64 start, struct folio **folios, unsigned long *out_folios,
+ unsigned long *total_in, unsigned long *total_out)
{
int type = btrfs_compress_type(type_level);
int level = btrfs_compress_level(type_level);
@@ -1010,8 +1030,8 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
level = btrfs_compress_set_level(type, level);
workspace = get_workspace(type, level);
- ret = compression_compress_pages(type, workspace, mapping, start, pages,
- out_pages, total_in, total_out);
+ ret = compression_compress_pages(type, workspace, mapping, start, folios,
+ out_folios, total_in, total_out);
put_workspace(type, workspace);
return ret;
}
@@ -1479,11 +1499,6 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
/*
* Compression heuristic.
*
- * For now is's a naive and optimistic 'return true', we'll extend the logic to
- * quickly (compared to direct compression) detect data characteristics
- * (compressible/incompressible) to avoid wasting CPU time on incompressible
- * data.
- *
* The following types of analysis can be performed:
* - detect mostly zero data
* - detect data with low "byte set" size (text, etc)
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index afd7e50d07..c20c1a1b09 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -7,10 +7,18 @@
#define BTRFS_COMPRESSION_H
#include <linux/sizes.h>
+#include <linux/mm.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/wait.h>
#include "bio.h"
+struct address_space;
+struct page;
+struct inode;
struct btrfs_inode;
struct btrfs_ordered_extent;
+struct btrfs_bio;
/*
* We want to make sure that amount of RAM required to uncompress an extent is
@@ -32,14 +40,12 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0);
#define BTRFS_ZLIB_DEFAULT_LEVEL 3
-struct page;
-
struct compressed_bio {
- /* Number of compressed pages in the array */
- unsigned int nr_pages;
+ /* Number of compressed folios in the array. */
+ unsigned int nr_folios;
- /* the pages with the compressed data on them */
- struct page **compressed_pages;
+ /* The folios with the compressed data on them. */
+ struct folio **compressed_folios;
/* starting offset in the inode for our pages */
u64 start;
@@ -79,27 +85,24 @@ static inline unsigned int btrfs_compress_level(unsigned int type_level)
int __init btrfs_init_compress(void);
void __cold btrfs_exit_compress(void);
-int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
- u64 start, struct page **pages,
- unsigned long *out_pages,
- unsigned long *total_in,
- unsigned long *total_out);
+int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping,
+ u64 start, struct folio **folios, unsigned long *out_folios,
+ unsigned long *total_in, unsigned long *total_out);
int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
unsigned long start_byte, size_t srclen, size_t destlen);
int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
struct compressed_bio *cb, u32 decompressed);
void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
- struct page **compressed_pages,
- unsigned int nr_pages,
- blk_opf_t write_flags,
- bool writeback);
+ struct folio **compressed_folios,
+ unsigned int nr_folios, blk_opf_t write_flags,
+ bool writeback);
void btrfs_submit_compressed_read(struct btrfs_bio *bbio);
unsigned int btrfs_compress_str2level(unsigned int type, const char *str);
-struct page *btrfs_alloc_compr_page(void);
-void btrfs_free_compr_page(struct page *page);
+struct folio *btrfs_alloc_compr_folio(void);
+void btrfs_free_compr_folio(struct folio *folio);
enum btrfs_compression_type {
BTRFS_COMPRESS_NONE = 0,
@@ -143,8 +146,11 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len);
int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end);
-int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
+int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start,
+ struct folio **in_folio_ret);
+
+int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out);
int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int zlib_decompress(struct list_head *ws, const u8 *data_in,
@@ -154,8 +160,8 @@ struct list_head *zlib_alloc_workspace(unsigned int level);
void zlib_free_workspace(struct list_head *ws);
struct list_head *zlib_get_workspace(unsigned int level);
-int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
+int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out);
int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int lzo_decompress(struct list_head *ws, const u8 *data_in,
@@ -164,12 +170,12 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in,
struct list_head *lzo_alloc_workspace(unsigned int level);
void lzo_free_workspace(struct list_head *ws);
-int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
+int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out);
int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int zstd_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
+ struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
void zstd_init_workspace_manager(void);
void zstd_cleanup_workspace_manager(void);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index e65e012bac..8a791b648a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -291,7 +291,7 @@ static void add_root_to_dirty_list(struct btrfs_root *root)
spin_lock(&fs_info->trans_lock);
if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
/* Want the extent tree to be the last on the list */
- if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ if (btrfs_root_id(root) == BTRFS_EXTENT_TREE_OBJECTID)
list_move_tail(&root->dirty_list,
&fs_info->dirty_cowonly_roots);
else
@@ -321,7 +321,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != fs_info->running_transaction->transid);
WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
- trans->transid != root->last_trans);
+ trans->transid != btrfs_get_root_last_trans(root));
level = btrfs_header_level(buf);
if (level == 0)
@@ -454,7 +454,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
}
} else {
refs = 1;
- if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID ||
btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
else
@@ -466,15 +466,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
if (refs > 1) {
- if ((owner == root->root_key.objectid ||
- root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
+ if ((owner == btrfs_root_id(root) ||
+ btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) &&
!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
ret = btrfs_inc_ref(trans, root, buf, 1);
if (ret)
return ret;
- if (root->root_key.objectid ==
- BTRFS_TREE_RELOC_OBJECTID) {
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
ret = btrfs_dec_ref(trans, root, buf, 0);
if (ret)
return ret;
@@ -485,8 +484,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
} else {
- if (root->root_key.objectid ==
- BTRFS_TREE_RELOC_OBJECTID)
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
ret = btrfs_inc_ref(trans, root, cow, 1);
else
ret = btrfs_inc_ref(trans, root, cow, 0);
@@ -500,8 +498,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
}
} else {
if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
- if (root->root_key.objectid ==
- BTRFS_TREE_RELOC_OBJECTID)
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
ret = btrfs_inc_ref(trans, root, cow, 1);
else
ret = btrfs_inc_ref(trans, root, cow, 0);
@@ -554,7 +551,7 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != fs_info->running_transaction->transid);
WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
- trans->transid != root->last_trans);
+ trans->transid != btrfs_get_root_last_trans(root));
level = btrfs_header_level(buf);
@@ -563,13 +560,13 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
else
btrfs_node_key(buf, &disk_key, 0);
- if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
if (parent)
parent_start = parent->start;
reloc_src_root = btrfs_header_owner(buf);
}
cow = btrfs_alloc_tree_block(trans, root, parent_start,
- root->root_key.objectid, &disk_key, level,
+ btrfs_root_id(root), &disk_key, level,
search_start, empty_size, reloc_src_root, nest);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -582,10 +579,10 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
BTRFS_HEADER_FLAG_RELOC);
- if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
else
- btrfs_set_header_owner(cow, root->root_key.objectid);
+ btrfs_set_header_owner(cow, btrfs_root_id(root));
write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
@@ -609,7 +606,7 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
if (buf == root->node) {
WARN_ON(parent && parent != buf);
- if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID ||
btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
parent_start = buf->start;
@@ -623,10 +620,16 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
atomic_inc(&cow->refs);
rcu_assign_pointer(root->node, cow);
- btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
- parent_start, last_ref);
+ ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
+ parent_start, last_ref);
free_extent_buffer(buf);
add_root_to_dirty_list(root);
+ if (ret < 0) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
} else {
WARN_ON(trans->transid != btrfs_header_generation(parent));
ret = btrfs_tree_mod_log_insert_key(parent, parent_slot,
@@ -651,8 +654,14 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
return ret;
}
}
- btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
- parent_start, last_ref);
+ ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
+ parent_start, last_ref);
+ if (ret < 0) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
}
if (unlock_orig)
btrfs_tree_unlock(buf);
@@ -685,7 +694,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
*/
if (btrfs_header_generation(buf) == trans->transid &&
!btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
- !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
+ !(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
!test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
return 0;
@@ -820,7 +829,7 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
}
while (low < high) {
- const int unit_size = folio_size(eb->folios[0]);
+ const int unit_size = eb->folio_size;
unsigned long oil;
unsigned long offset;
struct btrfs_disk_key *tmp;
@@ -986,9 +995,13 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
free_extent_buffer(mid);
root_sub_used_bytes(root);
- btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
+ ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
/* once for the root ptr */
free_extent_buffer_stale(mid);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
return 0;
}
if (btrfs_header_nritems(mid) >
@@ -1003,7 +1016,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
goto out;
}
- __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
+ btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT);
wret = btrfs_cow_block(trans, root, left,
parent, pslot - 1, &left,
BTRFS_NESTING_LEFT_COW);
@@ -1021,7 +1034,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
goto out;
}
- __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
+ btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT);
wret = btrfs_cow_block(trans, root, right,
parent, pslot + 1, &right,
BTRFS_NESTING_RIGHT_COW);
@@ -1056,10 +1069,14 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
goto out;
}
root_sub_used_bytes(root);
- btrfs_free_tree_block(trans, btrfs_root_id(root), right,
- 0, 1);
+ ret = btrfs_free_tree_block(trans, btrfs_root_id(root),
+ right, 0, 1);
free_extent_buffer_stale(right);
right = NULL;
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
} else {
struct btrfs_disk_key right_key;
btrfs_node_key(right, &right_key, 0);
@@ -1114,9 +1131,13 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
goto out;
}
root_sub_used_bytes(root);
- btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
+ ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
free_extent_buffer_stale(mid);
mid = NULL;
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
} else {
/* update the parent key to reflect our changes */
struct btrfs_disk_key mid_key;
@@ -1205,7 +1226,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
if (IS_ERR(left))
return PTR_ERR(left);
- __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
+ btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT);
left_nr = btrfs_header_nritems(left);
if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
@@ -1265,7 +1286,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
if (IS_ERR(right))
return PTR_ERR(right);
- __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
+ btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT);
right_nr = btrfs_header_nritems(right);
if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
@@ -1511,7 +1532,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
check.has_first_key = true;
check.level = parent_level - 1;
check.transid = gen;
- check.owner_root = root->root_key.objectid;
+ check.owner_root = btrfs_root_id(root);
/*
* If we need to read an extent buffer from disk and we are holding locks
@@ -1556,7 +1577,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
btrfs_release_path(p);
return -EIO;
}
- if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) {
+ if (btrfs_check_eb_owner(tmp, btrfs_root_id(root))) {
free_extent_buffer(tmp);
btrfs_release_path(p);
return -EUCLEAN;
@@ -2865,7 +2886,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
else
btrfs_node_key(lower, &lower_key, 0);
- c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ c = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root),
&lower_key, level, root->node->start, 0,
0, BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(c))
@@ -2886,7 +2907,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
old = root->node;
ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
if (ret < 0) {
- btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1);
+ int ret2;
+
+ ret2 = btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1);
+ if (ret2 < 0)
+ btrfs_abort_transaction(trans, ret2);
btrfs_tree_unlock(c);
free_extent_buffer(c);
return ret;
@@ -3009,7 +3034,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
mid = (c_nritems + 1) / 2;
btrfs_node_key(c, &disk_key, mid);
- split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ split = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root),
&disk_key, level, c->start, 0,
0, BTRFS_NESTING_SPLIT);
if (IS_ERR(split))
@@ -3267,7 +3292,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (IS_ERR(right))
return PTR_ERR(right);
- __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
+ btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT);
free_space = btrfs_leaf_free_space(right);
if (free_space < data_size)
@@ -3483,7 +3508,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
if (IS_ERR(left))
return PTR_ERR(left);
- __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
+ btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT);
free_space = btrfs_leaf_free_space(left);
if (free_space < data_size) {
@@ -3761,7 +3786,7 @@ again:
* BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
* use BTRFS_NESTING_NEW_ROOT.
*/
- right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ right = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root),
&disk_key, 0, l->start, 0, 0,
num_doubles ? BTRFS_NESTING_NEW_ROOT :
BTRFS_NESTING_SPLIT);
@@ -4280,6 +4305,10 @@ void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans,
/*
* Given a key and some data, insert items into the tree.
* This does all the path init required, making room in the tree if needed.
+ *
+ * Returns: 0 on success
+ * -EEXIST if the first key already exists
+ * < 0 on other errors
*/
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -4451,9 +4480,12 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
root_sub_used_bytes(root);
atomic_inc(&leaf->refs);
- btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
+ ret = btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
free_extent_buffer_stale(leaf);
- return 0;
+ if (ret < 0)
+ btrfs_abort_transaction(trans, ret);
+
+ return ret;
}
/*
* delete the item at the leaf level in path. If that empties
@@ -5082,9 +5114,7 @@ int btrfs_previous_extent_item(struct btrfs_root *root,
int __init btrfs_ctree_init(void)
{
- btrfs_path_cachep = kmem_cache_create("btrfs_path",
- sizeof(struct btrfs_path), 0,
- SLAB_MEM_SPREAD, NULL);
+ btrfs_path_cachep = KMEM_CACHE(btrfs_path, 0);
if (!btrfs_path_cachep)
return -ENOMEM;
return 0;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 70e828d331..a56209d275 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -7,25 +7,24 @@
#define BTRFS_CTREE_H
#include <linux/pagemap.h>
+#include <linux/spinlock.h>
+#include <linux/rbtree.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
+#include <linux/list.h>
+#include <linux/atomic.h>
+#include <linux/xarray.h>
+#include <linux/refcount.h>
+#include <uapi/linux/btrfs_tree.h>
#include "locking.h"
#include "fs.h"
#include "accessors.h"
+#include "extent-io-tree.h"
+struct extent_buffer;
+struct btrfs_block_rsv;
struct btrfs_trans_handle;
-struct btrfs_transaction;
-struct btrfs_pending_snapshot;
-struct btrfs_delayed_ref_root;
-struct btrfs_space_info;
struct btrfs_block_group;
-struct btrfs_ordered_sum;
-struct btrfs_ref;
-struct btrfs_bio;
-struct btrfs_ioctl_encoded_io_args;
-struct btrfs_device;
-struct btrfs_fs_devices;
-struct btrfs_balance_control;
-struct btrfs_delayed_root;
-struct reloc_control;
/* Read ahead values for struct btrfs_path.reada */
enum {
@@ -355,6 +354,16 @@ static inline void btrfs_set_root_last_log_commit(struct btrfs_root *root, int c
WRITE_ONCE(root->last_log_commit, commit_id);
}
+static inline u64 btrfs_get_root_last_trans(const struct btrfs_root *root)
+{
+ return READ_ONCE(root->last_trans);
+}
+
+static inline void btrfs_set_root_last_trans(struct btrfs_root *root, u64 transid)
+{
+ WRITE_ONCE(root->last_trans, transid);
+}
+
/*
* Structure that conveys information about an extent that is going to replace
* all the extents in a file range.
@@ -448,6 +457,7 @@ struct btrfs_file_private {
void *filldir_buf;
u64 last_index;
struct extent_state *llseek_cached_state;
+ bool fsync_skip_inode_lock;
};
static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info)
@@ -478,8 +488,7 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
return mapping_gfp_constraint(mapping, ~__GFP_FS);
}
-int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
- u64 start, u64 end);
+void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end);
int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
u64 num_bytes, u64 *actual_bytes);
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index a77be9896d..f664678c71 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -6,7 +6,6 @@
#include <linux/sched.h>
#include "ctree.h"
#include "disk-io.h"
-#include "print-tree.h"
#include "transaction.h"
#include "locking.h"
#include "accessors.h"
@@ -140,7 +139,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
if (trans)
transid = trans->transid;
else
- transid = inode->root->last_trans;
+ transid = btrfs_get_root_last_trans(root);
defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
if (!defrag)
@@ -148,7 +147,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
defrag->ino = btrfs_ino(inode);
defrag->transid = transid;
- defrag->root = root->root_key.objectid;
+ defrag->root = btrfs_root_id(root);
defrag->extent_thresh = extent_thresh;
spin_lock(&fs_info->defrag_inodes_lock);
@@ -521,7 +520,7 @@ static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
* keep_locks set and lowest_level is 1, regardless of the value of
* path->slots[1].
*/
- BUG_ON(path->locks[1] == 0);
+ ASSERT(path->locks[1] != 0);
ret = btrfs_realloc_node(trans, root,
path->nodes[1], 0,
&last_ret,
@@ -861,20 +860,21 @@ out:
* NOTE: Caller should also wait for page writeback after the cluster is
* prepared, here we don't do writeback wait for each page.
*/
-static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t index)
+static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t index)
{
struct address_space *mapping = inode->vfs_inode.i_mapping;
gfp_t mask = btrfs_alloc_write_mask(mapping);
u64 page_start = (u64)index << PAGE_SHIFT;
u64 page_end = page_start + PAGE_SIZE - 1;
struct extent_state *cached_state = NULL;
- struct page *page;
+ struct folio *folio;
int ret;
again:
- page = find_or_create_page(mapping, index, mask);
- if (!page)
- return ERR_PTR(-ENOMEM);
+ folio = __filemap_get_folio(mapping, index,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
+ if (IS_ERR(folio))
+ return folio;
/*
* Since we can defragment files opened read-only, we can encounter
@@ -884,16 +884,16 @@ again:
* executables that explicitly enable them, so this isn't very
* restrictive.
*/
- if (PageCompound(page)) {
- unlock_page(page);
- put_page(page);
+ if (folio_test_large(folio)) {
+ folio_unlock(folio);
+ folio_put(folio);
return ERR_PTR(-ETXTBSY);
}
- ret = set_page_extent_mapped(page);
+ ret = set_folio_extent_mapped(folio);
if (ret < 0) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return ERR_PTR(ret);
}
@@ -908,17 +908,17 @@ again:
if (!ordered)
break;
- unlock_page(page);
+ folio_unlock(folio);
btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
- lock_page(page);
+ folio_lock(folio);
/*
- * We unlocked the page above, so we need check if it was
+	 * We unlocked the folio above, so we need to check if it was
* released or not.
*/
- if (page->mapping != mapping || !PagePrivate(page)) {
- unlock_page(page);
- put_page(page);
+ if (folio->mapping != mapping || !folio->private) {
+ folio_unlock(folio);
+ folio_put(folio);
goto again;
}
}
@@ -927,21 +927,21 @@ again:
* Now the page range has no ordered extent any more. Read the page to
* make it uptodate.
*/
- if (!PageUptodate(page)) {
- btrfs_read_folio(NULL, page_folio(page));
- lock_page(page);
- if (page->mapping != mapping || !PagePrivate(page)) {
- unlock_page(page);
- put_page(page);
+ if (!folio_test_uptodate(folio)) {
+ btrfs_read_folio(NULL, folio);
+ folio_lock(folio);
+ if (folio->mapping != mapping || !folio->private) {
+ folio_unlock(folio);
+ folio_put(folio);
goto again;
}
- if (!PageUptodate(page)) {
- unlock_page(page);
- put_page(page);
+ if (!folio_test_uptodate(folio)) {
+ folio_unlock(folio);
+ folio_put(folio);
return ERR_PTR(-EIO);
}
}
- return page;
+ return folio;
}
struct defrag_target_range {
@@ -1162,7 +1162,7 @@ static_assert(PAGE_ALIGNED(CLUSTER_SIZE));
*/
static int defrag_one_locked_target(struct btrfs_inode *inode,
struct defrag_target_range *target,
- struct page **pages, int nr_pages,
+ struct folio **folios, int nr_pages,
struct extent_state **cached_state)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -1171,7 +1171,7 @@ static int defrag_one_locked_target(struct btrfs_inode *inode,
const u64 len = target->len;
unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
unsigned long start_index = start >> PAGE_SHIFT;
- unsigned long first_index = page_index(pages[0]);
+ unsigned long first_index = folios[0]->index;
int ret = 0;
int i;
@@ -1188,8 +1188,8 @@ static int defrag_one_locked_target(struct btrfs_inode *inode,
/* Update the page status */
for (i = start_index - first_index; i <= last_index - first_index; i++) {
- ClearPageChecked(pages[i]);
- btrfs_folio_clamp_set_dirty(fs_info, page_folio(pages[i]), start, len);
+ folio_clear_checked(folios[i]);
+ btrfs_folio_clamp_set_dirty(fs_info, folios[i], start, len);
}
btrfs_delalloc_release_extents(inode, len);
extent_changeset_free(data_reserved);
@@ -1205,7 +1205,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
struct defrag_target_range *entry;
struct defrag_target_range *tmp;
LIST_HEAD(target_list);
- struct page **pages;
+ struct folio **folios;
const u32 sectorsize = inode->root->fs_info->sectorsize;
u64 last_index = (start + len - 1) >> PAGE_SHIFT;
u64 start_index = start >> PAGE_SHIFT;
@@ -1216,21 +1216,21 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));
- pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
- if (!pages)
+ folios = kcalloc(nr_pages, sizeof(struct folio *), GFP_NOFS);
+ if (!folios)
return -ENOMEM;
/* Prepare all pages */
for (i = 0; i < nr_pages; i++) {
- pages[i] = defrag_prepare_one_page(inode, start_index + i);
- if (IS_ERR(pages[i])) {
- ret = PTR_ERR(pages[i]);
- pages[i] = NULL;
- goto free_pages;
+ folios[i] = defrag_prepare_one_folio(inode, start_index + i);
+ if (IS_ERR(folios[i])) {
+ ret = PTR_ERR(folios[i]);
+ nr_pages = i;
+ goto free_folios;
}
}
for (i = 0; i < nr_pages; i++)
- wait_on_page_writeback(pages[i]);
+ folio_wait_writeback(folios[i]);
/* Lock the pages range */
lock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
@@ -1250,7 +1250,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
goto unlock_extent;
list_for_each_entry(entry, &target_list, list) {
- ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
+ ret = defrag_one_locked_target(inode, entry, folios, nr_pages,
&cached_state);
if (ret < 0)
break;
@@ -1264,14 +1264,12 @@ unlock_extent:
unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
(last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
&cached_state);
-free_pages:
+free_folios:
for (i = 0; i < nr_pages; i++) {
- if (pages[i]) {
- unlock_page(pages[i]);
- put_page(pages[i]);
- }
+ folio_unlock(folios[i]);
+ folio_put(folios[i]);
}
- kfree(pages);
+ kfree(folios);
return ret;
}
@@ -1512,9 +1510,7 @@ void __cold btrfs_auto_defrag_exit(void)
int __init btrfs_auto_defrag_init(void)
{
btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
- sizeof(struct inode_defrag), 0,
- SLAB_MEM_SPREAD,
- NULL);
+ sizeof(struct inode_defrag), 0, 0, NULL);
if (!btrfs_inode_defrag_cachep)
return -ENOMEM;
diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h
index 5a62763528..878528e086 100644
--- a/fs/btrfs/defrag.h
+++ b/fs/btrfs/defrag.h
@@ -3,6 +3,16 @@
#ifndef BTRFS_DEFRAG_H
#define BTRFS_DEFRAG_H
+#include <linux/types.h>
+#include <linux/compiler_types.h>
+
+struct inode;
+struct file_ra_state;
+struct btrfs_fs_info;
+struct btrfs_root;
+struct btrfs_trans_handle;
+struct btrfs_ioctl_defrag_range_args;
+
int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_to_defrag);
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index acf9f4b6c0..b3527efd0b 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -6,9 +6,7 @@
#include "block-rsv.h"
#include "btrfs_inode.h"
#include "space-info.h"
-#include "transaction.h"
#include "qgroup.h"
-#include "block-group.h"
#include "fs.h"
/*
diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h
index c5d573f236..ce4f889e4f 100644
--- a/fs/btrfs/delalloc-space.h
+++ b/fs/btrfs/delalloc-space.h
@@ -3,7 +3,11 @@
#ifndef BTRFS_DELALLOC_SPACE_H
#define BTRFS_DELALLOC_SPACE_H
+#include <linux/types.h>
+
struct extent_changeset;
+struct btrfs_inode;
+struct btrfs_fs_info;
int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
int btrfs_check_data_free_space(struct btrfs_inode *inode,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index ab5a833e7d..95a0497fa8 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -28,11 +28,7 @@ static struct kmem_cache *delayed_node_cache;
int __init btrfs_delayed_inode_init(void)
{
- delayed_node_cache = kmem_cache_create("btrfs_delayed_node",
- sizeof(struct btrfs_delayed_node),
- 0,
- SLAB_MEM_SPREAD,
- NULL);
+ delayed_node_cache = KMEM_CACHE(btrfs_delayed_node, 0);
if (!delayed_node_cache)
return -ENOMEM;
return 0;
@@ -43,6 +39,17 @@ void __cold btrfs_delayed_inode_exit(void)
kmem_cache_destroy(delayed_node_cache);
}
+void btrfs_init_delayed_root(struct btrfs_delayed_root *delayed_root)
+{
+ atomic_set(&delayed_root->items, 0);
+ atomic_set(&delayed_root->items_seq, 0);
+ delayed_root->nodes = 0;
+ spin_lock_init(&delayed_root->lock);
+ init_waitqueue_head(&delayed_root->wait);
+ INIT_LIST_HEAD(&delayed_root->node_list);
+ INIT_LIST_HEAD(&delayed_root->prepare_list);
+}
+
static inline void btrfs_init_delayed_node(
struct btrfs_delayed_node *delayed_node,
struct btrfs_root *root, u64 inode_id)
@@ -430,8 +437,6 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
delayed_root = delayed_node->root->fs_info->delayed_root;
- BUG_ON(!delayed_root);
-
if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM)
root = &delayed_node->ins_root;
else
@@ -980,7 +985,7 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
if (delayed_node &&
test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
- BUG_ON(!delayed_node->root);
+ ASSERT(delayed_node->root);
clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
delayed_node->count--;
@@ -1646,7 +1651,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
if (unlikely(ret)) {
btrfs_err(trans->fs_info,
"err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
- index, node->root->root_key.objectid,
+ index, btrfs_root_id(node->root),
node->inode_id, ret);
btrfs_delayed_item_release_metadata(dir->root, item);
btrfs_release_delayed_item(item);
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 5cceb31bbd..64e115d974 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -7,15 +7,23 @@
#ifndef BTRFS_DELAYED_INODE_H
#define BTRFS_DELAYED_INODE_H
+#include <linux/types.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/wait.h>
+#include <linux/fs.h>
#include <linux/atomic.h>
#include <linux/refcount.h>
#include "ctree.h"
+struct btrfs_disk_key;
+struct btrfs_fs_info;
+struct btrfs_inode;
+struct btrfs_root;
+struct btrfs_trans_handle;
+
enum btrfs_delayed_item_type {
BTRFS_DELAYED_INSERTION_ITEM,
BTRFS_DELAYED_DELETION_ITEM
@@ -98,18 +106,7 @@ struct btrfs_delayed_item {
char data[] __counted_by(data_len);
};
-static inline void btrfs_init_delayed_root(
- struct btrfs_delayed_root *delayed_root)
-{
- atomic_set(&delayed_root->items, 0);
- atomic_set(&delayed_root->items_seq, 0);
- delayed_root->nodes = 0;
- spin_lock_init(&delayed_root->lock);
- init_waitqueue_head(&delayed_root->wait);
- INIT_LIST_HEAD(&delayed_root->node_list);
- INIT_LIST_HEAD(&delayed_root->prepare_list);
-}
-
+void btrfs_init_delayed_root(struct btrfs_delayed_root *delayed_root);
int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
const char *name, int name_len,
struct btrfs_inode *dir,
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 891ea2fa26..6cc80fb10d 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -16,8 +16,7 @@
#include "fs.h"
struct kmem_cache *btrfs_delayed_ref_head_cachep;
-struct kmem_cache *btrfs_delayed_tree_ref_cachep;
-struct kmem_cache *btrfs_delayed_data_ref_cachep;
+struct kmem_cache *btrfs_delayed_ref_node_cachep;
struct kmem_cache *btrfs_delayed_extent_op_cachep;
/*
* delayed back reference update tracking. For subvolume trees
@@ -305,50 +304,19 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
}
/*
- * compare two delayed tree backrefs with same bytenr and type
- */
-static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref1,
- struct btrfs_delayed_tree_ref *ref2)
-{
- if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
- if (ref1->root < ref2->root)
- return -1;
- if (ref1->root > ref2->root)
- return 1;
- } else {
- if (ref1->parent < ref2->parent)
- return -1;
- if (ref1->parent > ref2->parent)
- return 1;
- }
- return 0;
-}
-
-/*
* compare two delayed data backrefs with same bytenr and type
*/
-static int comp_data_refs(struct btrfs_delayed_data_ref *ref1,
- struct btrfs_delayed_data_ref *ref2)
+static int comp_data_refs(struct btrfs_delayed_ref_node *ref1,
+ struct btrfs_delayed_ref_node *ref2)
{
- if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) {
- if (ref1->root < ref2->root)
- return -1;
- if (ref1->root > ref2->root)
- return 1;
- if (ref1->objectid < ref2->objectid)
- return -1;
- if (ref1->objectid > ref2->objectid)
- return 1;
- if (ref1->offset < ref2->offset)
- return -1;
- if (ref1->offset > ref2->offset)
- return 1;
- } else {
- if (ref1->parent < ref2->parent)
- return -1;
- if (ref1->parent > ref2->parent)
- return 1;
- }
+ if (ref1->data_ref.objectid < ref2->data_ref.objectid)
+ return -1;
+ if (ref1->data_ref.objectid > ref2->data_ref.objectid)
+ return 1;
+ if (ref1->data_ref.offset < ref2->data_ref.offset)
+ return -1;
+ if (ref1->data_ref.offset > ref2->data_ref.offset)
+ return 1;
return 0;
}
@@ -362,13 +330,20 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1,
return -1;
if (ref1->type > ref2->type)
return 1;
- if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
- ref1->type == BTRFS_SHARED_BLOCK_REF_KEY)
- ret = comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref1),
- btrfs_delayed_node_to_tree_ref(ref2));
- else
- ret = comp_data_refs(btrfs_delayed_node_to_data_ref(ref1),
- btrfs_delayed_node_to_data_ref(ref2));
+ if (ref1->type == BTRFS_SHARED_BLOCK_REF_KEY ||
+ ref1->type == BTRFS_SHARED_DATA_REF_KEY) {
+ if (ref1->parent < ref2->parent)
+ return -1;
+ if (ref1->parent > ref2->parent)
+ return 1;
+ } else {
+ if (ref1->ref_root < ref2->ref_root)
+ return -1;
+ if (ref1->ref_root > ref2->ref_root)
+			return 1;
+ if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY)
+ ret = comp_data_refs(ref1, ref2);
+ }
if (ret)
return ret;
if (check_seq) {
@@ -828,18 +803,20 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
}
static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
+ struct btrfs_ref *generic_ref,
struct btrfs_qgroup_extent_record *qrecord,
- u64 bytenr, u64 num_bytes, u64 ref_root,
- u64 reserved, int action, bool is_data,
- bool is_system, u64 owning_root)
+ u64 reserved)
{
int count_mod = 1;
bool must_insert_reserved = false;
/* If reserved is provided, it must be a data extent. */
- BUG_ON(!is_data && reserved);
+ BUG_ON(generic_ref->type != BTRFS_REF_DATA && reserved);
- switch (action) {
+ switch (generic_ref->action) {
+ case BTRFS_ADD_DELAYED_REF:
+ /* count_mod is already set to 1. */
+ break;
case BTRFS_UPDATE_DELAYED_HEAD:
count_mod = 0;
break;
@@ -868,14 +845,14 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
}
refcount_set(&head_ref->refs, 1);
- head_ref->bytenr = bytenr;
- head_ref->num_bytes = num_bytes;
+ head_ref->bytenr = generic_ref->bytenr;
+ head_ref->num_bytes = generic_ref->num_bytes;
head_ref->ref_mod = count_mod;
head_ref->reserved_bytes = reserved;
head_ref->must_insert_reserved = must_insert_reserved;
- head_ref->owning_root = owning_root;
- head_ref->is_data = is_data;
- head_ref->is_system = is_system;
+ head_ref->owning_root = generic_ref->owning_root;
+ head_ref->is_data = (generic_ref->type == BTRFS_REF_DATA);
+ head_ref->is_system = (generic_ref->ref_root == BTRFS_CHUNK_TREE_OBJECTID);
head_ref->ref_tree = RB_ROOT_CACHED;
INIT_LIST_HEAD(&head_ref->ref_add_list);
RB_CLEAR_NODE(&head_ref->href_node);
@@ -885,12 +862,12 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
mutex_init(&head_ref->mutex);
if (qrecord) {
- if (ref_root && reserved) {
+ if (generic_ref->ref_root && reserved) {
qrecord->data_rsv = reserved;
- qrecord->data_rsv_refroot = ref_root;
+ qrecord->data_rsv_refroot = generic_ref->ref_root;
}
- qrecord->bytenr = bytenr;
- qrecord->num_bytes = num_bytes;
+ qrecord->bytenr = generic_ref->bytenr;
+ qrecord->num_bytes = generic_ref->num_bytes;
qrecord->old_roots = NULL;
}
}
@@ -982,89 +959,104 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
*/
static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_node *ref,
- u64 bytenr, u64 num_bytes, u64 ref_root,
- int action, u8 ref_type)
+ struct btrfs_ref *generic_ref)
{
+ int action = generic_ref->action;
u64 seq = 0;
if (action == BTRFS_ADD_DELAYED_EXTENT)
action = BTRFS_ADD_DELAYED_REF;
- if (is_fstree(ref_root))
+ if (is_fstree(generic_ref->ref_root))
seq = atomic64_read(&fs_info->tree_mod_seq);
refcount_set(&ref->refs, 1);
- ref->bytenr = bytenr;
- ref->num_bytes = num_bytes;
+ ref->bytenr = generic_ref->bytenr;
+ ref->num_bytes = generic_ref->num_bytes;
ref->ref_mod = 1;
ref->action = action;
ref->seq = seq;
- ref->type = ref_type;
+ ref->type = btrfs_ref_type(generic_ref);
+ ref->ref_root = generic_ref->ref_root;
+ ref->parent = generic_ref->parent;
RB_CLEAR_NODE(&ref->ref_node);
INIT_LIST_HEAD(&ref->add_list);
+
+ if (generic_ref->type == BTRFS_REF_DATA)
+ ref->data_ref = generic_ref->data_ref;
+ else
+ ref->tree_ref = generic_ref->tree_ref;
}
-/*
- * add a delayed tree ref. This does all of the accounting required
- * to make sure the delayed ref is eventually processed before this
- * transaction commits.
- */
-int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
- struct btrfs_ref *generic_ref,
- struct btrfs_delayed_extent_op *extent_op)
+void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root,
+ bool skip_qgroup)
+{
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ /* If @real_root not set, use @root as fallback */
+ generic_ref->real_root = mod_root ?: generic_ref->ref_root;
+#endif
+ generic_ref->tree_ref.level = level;
+ generic_ref->type = BTRFS_REF_METADATA;
+ if (skip_qgroup || !(is_fstree(generic_ref->ref_root) &&
+ (!mod_root || is_fstree(mod_root))))
+ generic_ref->skip_qgroup = true;
+ else
+ generic_ref->skip_qgroup = false;
+
+}
+
+void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset,
+ u64 mod_root, bool skip_qgroup)
+{
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ /* If @real_root not set, use @root as fallback */
+ generic_ref->real_root = mod_root ?: generic_ref->ref_root;
+#endif
+ generic_ref->data_ref.objectid = ino;
+ generic_ref->data_ref.offset = offset;
+ generic_ref->type = BTRFS_REF_DATA;
+ if (skip_qgroup || !(is_fstree(generic_ref->ref_root) &&
+ (!mod_root || is_fstree(mod_root))))
+ generic_ref->skip_qgroup = true;
+ else
+ generic_ref->skip_qgroup = false;
+}
+
+static int add_delayed_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_ref *generic_ref,
+ struct btrfs_delayed_extent_op *extent_op,
+ u64 reserved)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_delayed_tree_ref *ref;
+ struct btrfs_delayed_ref_node *node;
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_qgroup_extent_record *record = NULL;
bool qrecord_inserted;
- bool is_system;
- bool merged;
int action = generic_ref->action;
- int level = generic_ref->tree_ref.level;
- u64 bytenr = generic_ref->bytenr;
- u64 num_bytes = generic_ref->len;
- u64 parent = generic_ref->parent;
- u8 ref_type;
-
- is_system = (generic_ref->tree_ref.ref_root == BTRFS_CHUNK_TREE_OBJECTID);
+ bool merged;
- ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
- ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
- if (!ref)
+ node = kmem_cache_alloc(btrfs_delayed_ref_node_cachep, GFP_NOFS);
+ if (!node)
return -ENOMEM;
head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
if (!head_ref) {
- kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+ kmem_cache_free(btrfs_delayed_ref_node_cachep, node);
return -ENOMEM;
}
if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
- kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+ kmem_cache_free(btrfs_delayed_ref_node_cachep, node);
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
return -ENOMEM;
}
}
- if (parent)
- ref_type = BTRFS_SHARED_BLOCK_REF_KEY;
- else
- ref_type = BTRFS_TREE_BLOCK_REF_KEY;
-
- init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
- generic_ref->tree_ref.ref_root, action,
- ref_type);
- ref->root = generic_ref->tree_ref.ref_root;
- ref->parent = parent;
- ref->level = level;
-
- init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
- generic_ref->tree_ref.ref_root, 0, action,
- false, is_system, generic_ref->owning_root);
+ init_delayed_ref_common(fs_info, node, generic_ref);
+ init_delayed_ref_head(head_ref, generic_ref, record, reserved);
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
@@ -1077,7 +1069,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
head_ref = add_delayed_ref_head(trans, head_ref, record,
action, &qrecord_inserted);
- merged = insert_delayed_ref(trans, head_ref, &ref->node);
+ merged = insert_delayed_ref(trans, head_ref, node);
spin_unlock(&delayed_refs->lock);
/*
@@ -1086,107 +1078,39 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
*/
btrfs_update_delayed_refs_rsv(trans);
- trace_add_delayed_tree_ref(fs_info, &ref->node, ref,
- action == BTRFS_ADD_DELAYED_EXTENT ?
- BTRFS_ADD_DELAYED_REF : action);
+ if (generic_ref->type == BTRFS_REF_DATA)
+ trace_add_delayed_data_ref(trans->fs_info, node);
+ else
+ trace_add_delayed_tree_ref(trans->fs_info, node);
if (merged)
- kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+ kmem_cache_free(btrfs_delayed_ref_node_cachep, node);
if (qrecord_inserted)
- btrfs_qgroup_trace_extent_post(trans, record);
-
+ return btrfs_qgroup_trace_extent_post(trans, record);
return 0;
}
/*
+ * Add a delayed tree ref. This does all of the accounting required to make sure
+ * the delayed ref is eventually processed before this transaction commits.
+ */
+int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_ref *generic_ref,
+ struct btrfs_delayed_extent_op *extent_op)
+{
+ ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
+ return add_delayed_ref(trans, generic_ref, extent_op, 0);
+}
+
+/*
* add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
*/
int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref,
u64 reserved)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_delayed_data_ref *ref;
- struct btrfs_delayed_ref_head *head_ref;
- struct btrfs_delayed_ref_root *delayed_refs;
- struct btrfs_qgroup_extent_record *record = NULL;
- bool qrecord_inserted;
- int action = generic_ref->action;
- bool merged;
- u64 bytenr = generic_ref->bytenr;
- u64 num_bytes = generic_ref->len;
- u64 parent = generic_ref->parent;
- u64 ref_root = generic_ref->data_ref.ref_root;
- u64 owner = generic_ref->data_ref.ino;
- u64 offset = generic_ref->data_ref.offset;
- u8 ref_type;
-
- ASSERT(generic_ref->type == BTRFS_REF_DATA && action);
- ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
- if (!ref)
- return -ENOMEM;
-
- if (parent)
- ref_type = BTRFS_SHARED_DATA_REF_KEY;
- else
- ref_type = BTRFS_EXTENT_DATA_REF_KEY;
- init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
- ref_root, action, ref_type);
- ref->root = ref_root;
- ref->parent = parent;
- ref->objectid = owner;
- ref->offset = offset;
-
-
- head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
- if (!head_ref) {
- kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
- return -ENOMEM;
- }
-
- if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) {
- record = kzalloc(sizeof(*record), GFP_NOFS);
- if (!record) {
- kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
- kmem_cache_free(btrfs_delayed_ref_head_cachep,
- head_ref);
- return -ENOMEM;
- }
- }
-
- init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root,
- reserved, action, true, false, generic_ref->owning_root);
- head_ref->extent_op = NULL;
-
- delayed_refs = &trans->transaction->delayed_refs;
- spin_lock(&delayed_refs->lock);
-
- /*
- * insert both the head node and the new ref without dropping
- * the spin lock
- */
- head_ref = add_delayed_ref_head(trans, head_ref, record,
- action, &qrecord_inserted);
-
- merged = insert_delayed_ref(trans, head_ref, &ref->node);
- spin_unlock(&delayed_refs->lock);
-
- /*
- * Need to update the delayed_refs_rsv with any changes we may have
- * made.
- */
- btrfs_update_delayed_refs_rsv(trans);
-
- trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref,
- action == BTRFS_ADD_DELAYED_EXTENT ?
- BTRFS_ADD_DELAYED_REF : action);
- if (merged)
- kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
-
-
- if (qrecord_inserted)
- return btrfs_qgroup_trace_extent_post(trans, record);
- return 0;
+ ASSERT(generic_ref->type == BTRFS_REF_DATA && generic_ref->action);
+ return add_delayed_ref(trans, generic_ref, NULL, reserved);
}
int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
@@ -1195,13 +1119,18 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
{
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_ref generic_ref = {
+ .type = BTRFS_REF_METADATA,
+ .action = BTRFS_UPDATE_DELAYED_HEAD,
+ .bytenr = bytenr,
+ .num_bytes = num_bytes,
+ };
head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
if (!head_ref)
return -ENOMEM;
- init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0,
- BTRFS_UPDATE_DELAYED_HEAD, false, false, 0);
+ init_delayed_ref_head(head_ref, &generic_ref, NULL, 0);
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
@@ -1220,6 +1149,14 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
return 0;
}
+void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
+{
+ if (refcount_dec_and_test(&ref->refs)) {
+ WARN_ON(!RB_EMPTY_NODE(&ref->ref_node));
+ kmem_cache_free(btrfs_delayed_ref_node_cachep, ref);
+ }
+}
+
/*
* This does a simple search for the head node for a given extent. Returns the
* head node if found, or NULL if not.
@@ -1235,38 +1172,21 @@ btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 byt
void __cold btrfs_delayed_ref_exit(void)
{
kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
- kmem_cache_destroy(btrfs_delayed_tree_ref_cachep);
- kmem_cache_destroy(btrfs_delayed_data_ref_cachep);
+ kmem_cache_destroy(btrfs_delayed_ref_node_cachep);
kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
}
int __init btrfs_delayed_ref_init(void)
{
- btrfs_delayed_ref_head_cachep = kmem_cache_create(
- "btrfs_delayed_ref_head",
- sizeof(struct btrfs_delayed_ref_head), 0,
- SLAB_MEM_SPREAD, NULL);
+ btrfs_delayed_ref_head_cachep = KMEM_CACHE(btrfs_delayed_ref_head, 0);
if (!btrfs_delayed_ref_head_cachep)
goto fail;
- btrfs_delayed_tree_ref_cachep = kmem_cache_create(
- "btrfs_delayed_tree_ref",
- sizeof(struct btrfs_delayed_tree_ref), 0,
- SLAB_MEM_SPREAD, NULL);
- if (!btrfs_delayed_tree_ref_cachep)
- goto fail;
-
- btrfs_delayed_data_ref_cachep = kmem_cache_create(
- "btrfs_delayed_data_ref",
- sizeof(struct btrfs_delayed_data_ref), 0,
- SLAB_MEM_SPREAD, NULL);
- if (!btrfs_delayed_data_ref_cachep)
+ btrfs_delayed_ref_node_cachep = KMEM_CACHE(btrfs_delayed_ref_node, 0);
+ if (!btrfs_delayed_ref_node_cachep)
goto fail;
- btrfs_delayed_extent_op_cachep = kmem_cache_create(
- "btrfs_delayed_extent_op",
- sizeof(struct btrfs_delayed_extent_op), 0,
- SLAB_MEM_SPREAD, NULL);
+ btrfs_delayed_extent_op_cachep = KMEM_CACHE(btrfs_delayed_extent_op, 0);
if (!btrfs_delayed_extent_op_cachep)
goto fail;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 62d679d40f..04b180ebe1 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -6,7 +6,17 @@
#ifndef BTRFS_DELAYED_REF_H
#define BTRFS_DELAYED_REF_H
+#include <linux/types.h>
#include <linux/refcount.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <uapi/linux/btrfs_tree.h>
+
+struct btrfs_trans_handle;
+struct btrfs_fs_info;
/* these are the possible values of struct btrfs_delayed_ref_node->action */
enum btrfs_delayed_ref_action {
@@ -20,6 +30,32 @@ enum btrfs_delayed_ref_action {
BTRFS_UPDATE_DELAYED_HEAD,
} __packed;
+struct btrfs_data_ref {
+ /* For EXTENT_DATA_REF */
+
+ /* Inode which refers to this data extent */
+ u64 objectid;
+
+ /*
+ * file_offset - extent_offset
+ *
+ * file_offset is the key.offset of the EXTENT_DATA key.
+ * extent_offset is btrfs_file_extent_offset() of the EXTENT_DATA data.
+ */
+ u64 offset;
+};
+
+struct btrfs_tree_ref {
+ /*
+ * Level of this tree block.
+ *
+ * Shared for skinny (TREE_BLOCK_REF) and normal tree ref.
+ */
+ int level;
+
+ /* For non-skinny metadata, no special member needed */
+};
+
struct btrfs_delayed_ref_node {
struct rb_node ref_node;
/*
@@ -38,6 +74,15 @@ struct btrfs_delayed_ref_node {
/* seq number to keep track of insertion order */
u64 seq;
+ /* The ref_root for this ref */
+ u64 ref_root;
+
+ /*
+	 * The parent for this ref; if this isn't set, the ref_root is the
+	 * reference owner.
+ */
+ u64 parent;
+
/* ref count on this data structure */
refcount_t refs;
@@ -54,6 +99,11 @@ struct btrfs_delayed_ref_node {
unsigned int action:8;
unsigned int type:8;
+
+ union {
+ struct btrfs_tree_ref tree_ref;
+ struct btrfs_data_ref data_ref;
+ };
};
struct btrfs_delayed_extent_op {
@@ -141,21 +191,6 @@ struct btrfs_delayed_ref_head {
bool processing;
};
-struct btrfs_delayed_tree_ref {
- struct btrfs_delayed_ref_node node;
- u64 root;
- u64 parent;
- int level;
-};
-
-struct btrfs_delayed_data_ref {
- struct btrfs_delayed_ref_node node;
- u64 root;
- u64 parent;
- u64 objectid;
- u64 offset;
-};
-
enum btrfs_delayed_ref_flags {
/* Indicate that we are flushing delayed refs for the commit */
BTRFS_DELAYED_REFS_FLUSHING,
@@ -204,42 +239,6 @@ enum btrfs_ref_type {
BTRFS_REF_LAST,
} __packed;
-struct btrfs_data_ref {
- /* For EXTENT_DATA_REF */
-
- /* Root which owns this data reference. */
- u64 ref_root;
-
- /* Inode which refers to this data extent */
- u64 ino;
-
- /*
- * file_offset - extent_offset
- *
- * file_offset is the key.offset of the EXTENT_DATA key.
- * extent_offset is btrfs_file_extent_offset() of the EXTENT_DATA data.
- */
- u64 offset;
-};
-
-struct btrfs_tree_ref {
- /*
- * Level of this tree block
- *
- * Shared for skinny (TREE_BLOCK_REF) and normal tree ref.
- */
- int level;
-
- /*
- * Root which owns this tree block reference.
- *
- * For TREE_BLOCK_REF (skinny metadata, either inline or keyed)
- */
- u64 ref_root;
-
- /* For non-skinny metadata, no special member needed */
-};
-
struct btrfs_ref {
enum btrfs_ref_type type;
enum btrfs_delayed_ref_action action;
@@ -257,9 +256,15 @@ struct btrfs_ref {
u64 real_root;
#endif
u64 bytenr;
- u64 len;
+ u64 num_bytes;
u64 owning_root;
+ /*
+	 * The root that owns this reference; either this or ->parent will be
+	 * set, depending on what type of reference this is.
+ */
+ u64 ref_root;
+
/* Bytenr of the parent tree block */
u64 parent;
union {
@@ -269,8 +274,7 @@ struct btrfs_ref {
};
extern struct kmem_cache *btrfs_delayed_ref_head_cachep;
-extern struct kmem_cache *btrfs_delayed_tree_ref_cachep;
-extern struct kmem_cache *btrfs_delayed_data_ref_cachep;
+extern struct kmem_cache *btrfs_delayed_ref_node_cachep;
extern struct kmem_cache *btrfs_delayed_extent_op_cachep;
int __init btrfs_delayed_ref_init(void);
@@ -308,53 +312,10 @@ static inline u64 btrfs_calc_delayed_ref_csum_bytes(const struct btrfs_fs_info *
return btrfs_calc_metadata_size(fs_info, num_csum_items);
}
-static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref,
- int action, u64 bytenr, u64 len,
- u64 parent, u64 owning_root)
-{
- generic_ref->action = action;
- generic_ref->bytenr = bytenr;
- generic_ref->len = len;
- generic_ref->parent = parent;
- generic_ref->owning_root = owning_root;
-}
-
-static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level,
- u64 root, u64 mod_root, bool skip_qgroup)
-{
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
- /* If @real_root not set, use @root as fallback */
- generic_ref->real_root = mod_root ?: root;
-#endif
- generic_ref->tree_ref.level = level;
- generic_ref->tree_ref.ref_root = root;
- generic_ref->type = BTRFS_REF_METADATA;
- if (skip_qgroup || !(is_fstree(root) &&
- (!mod_root || is_fstree(mod_root))))
- generic_ref->skip_qgroup = true;
- else
- generic_ref->skip_qgroup = false;
-
-}
-
-static inline void btrfs_init_data_ref(struct btrfs_ref *generic_ref,
- u64 ref_root, u64 ino, u64 offset, u64 mod_root,
- bool skip_qgroup)
-{
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
- /* If @real_root not set, use @root as fallback */
- generic_ref->real_root = mod_root ?: ref_root;
-#endif
- generic_ref->data_ref.ref_root = ref_root;
- generic_ref->data_ref.ino = ino;
- generic_ref->data_ref.offset = offset;
- generic_ref->type = BTRFS_REF_DATA;
- if (skip_qgroup || !(is_fstree(ref_root) &&
- (!mod_root || is_fstree(mod_root))))
- generic_ref->skip_qgroup = true;
- else
- generic_ref->skip_qgroup = false;
-}
+void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root,
+ bool skip_qgroup);
+void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset,
+ u64 mod_root, bool skip_qgroup);
static inline struct btrfs_delayed_extent_op *
btrfs_alloc_delayed_extent_op(void)
@@ -369,24 +330,7 @@ btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op)
kmem_cache_free(btrfs_delayed_extent_op_cachep, op);
}
-static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
-{
- if (refcount_dec_and_test(&ref->refs)) {
- WARN_ON(!RB_EMPTY_NODE(&ref->ref_node));
- switch (ref->type) {
- case BTRFS_TREE_BLOCK_REF_KEY:
- case BTRFS_SHARED_BLOCK_REF_KEY:
- kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
- break;
- case BTRFS_EXTENT_DATA_REF_KEY:
- case BTRFS_SHARED_DATA_REF_KEY:
- kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
- break;
- default:
- BUG();
- }
- }
-}
+void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref);
static inline u64 btrfs_ref_head_to_space_flags(
struct btrfs_delayed_ref_head *head_ref)
@@ -446,19 +390,39 @@ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
u64 num_bytes);
bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);
-/*
- * helper functions to cast a node into its container
- */
-static inline struct btrfs_delayed_tree_ref *
-btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node)
+static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node)
+{
+ if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
+ node->type == BTRFS_SHARED_DATA_REF_KEY)
+ return node->data_ref.objectid;
+ return node->tree_ref.level;
+}
+
+static inline u64 btrfs_delayed_ref_offset(struct btrfs_delayed_ref_node *node)
{
- return container_of(node, struct btrfs_delayed_tree_ref, node);
+ if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
+ node->type == BTRFS_SHARED_DATA_REF_KEY)
+ return node->data_ref.offset;
+ return 0;
}
-static inline struct btrfs_delayed_data_ref *
-btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node)
+static inline u8 btrfs_ref_type(struct btrfs_ref *ref)
{
- return container_of(node, struct btrfs_delayed_data_ref, node);
+ ASSERT(ref->type == BTRFS_REF_DATA || ref->type == BTRFS_REF_METADATA);
+
+ if (ref->type == BTRFS_REF_DATA) {
+ if (ref->parent)
+ return BTRFS_SHARED_DATA_REF_KEY;
+ else
+ return BTRFS_EXTENT_DATA_REF_KEY;
+ } else {
+ if (ref->parent)
+ return BTRFS_SHARED_BLOCK_REF_KEY;
+ else
+ return BTRFS_TREE_BLOCK_REF_KEY;
+ }
+
+ return 0;
}
#endif
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 79c4293ddf..7130040d92 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -11,10 +11,8 @@
#include <linux/math64.h>
#include "misc.h"
#include "ctree.h"
-#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
-#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"
#include "dev-replace.h"
@@ -246,7 +244,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
struct block_device *bdev;
u64 devid = BTRFS_DEV_REPLACE_DEVID;
int ret = 0;
@@ -257,13 +255,13 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
return -EINVAL;
}
- bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE,
+ bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE,
fs_info->bdev_holder, NULL);
- if (IS_ERR(bdev_handle)) {
+ if (IS_ERR(bdev_file)) {
btrfs_err(fs_info, "target device %s is invalid!", device_path);
- return PTR_ERR(bdev_handle);
+ return PTR_ERR(bdev_file);
}
- bdev = bdev_handle->bdev;
+ bdev = file_bdev(bdev_file);
if (!btrfs_check_device_zone_type(fs_info, bdev)) {
btrfs_err(fs_info,
@@ -314,11 +312,11 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
device->commit_bytes_used = device->bytes_used;
device->fs_info = fs_info;
device->bdev = bdev;
- device->bdev_handle = bdev_handle;
+ device->bdev_file = bdev_file;
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
device->dev_stats_valid = 1;
- set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
+ set_blocksize(bdev_file, BTRFS_BDEV_BLOCKSIZE);
device->fs_devices = fs_devices;
ret = btrfs_get_dev_zone_info(device, false);
@@ -335,7 +333,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
return 0;
error:
- bdev_release(bdev_handle);
+ fput(bdev_file);
return ret;
}
@@ -1000,8 +998,7 @@ error:
btrfs_sysfs_remove_device(src_device);
btrfs_sysfs_update_devid(tgt_device);
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state))
- btrfs_scratch_superblocks(fs_info, src_device->bdev,
- src_device->name->str);
+ btrfs_scratch_superblocks(fs_info, src_device);
/* write back the superblocks */
trans = btrfs_start_transaction(root, 0);
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index 675082ccec..23e480efe5 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -6,11 +6,15 @@
#ifndef BTRFS_DEV_REPLACE_H
#define BTRFS_DEV_REPLACE_H
+#include <linux/types.h>
+#include <linux/compiler_types.h>
+
struct btrfs_ioctl_dev_replace_args;
struct btrfs_fs_info;
struct btrfs_trans_handle;
struct btrfs_dev_replace;
struct btrfs_block_group;
+struct btrfs_device;
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
int btrfs_run_dev_replace(struct btrfs_trans_handle *trans);
diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h
index e40a226373..00b3d83d75 100644
--- a/fs/btrfs/dir-item.h
+++ b/fs/btrfs/dir-item.h
@@ -3,9 +3,15 @@
#ifndef BTRFS_DIR_ITEM_H
#define BTRFS_DIR_ITEM_H
+#include <linux/types.h>
#include <linux/crc32c.h>
struct fscrypt_str;
+struct btrfs_fs_info;
+struct btrfs_key;
+struct btrfs_path;
+struct btrfs_root;
+struct btrfs_trans_handle;
int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
const struct fscrypt_str *name);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2567821224..3791813dc7 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -29,7 +29,6 @@
#include "tree-log.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
-#include "rcu-string.h"
#include "dev-replace.h"
#include "raid56.h"
#include "sysfs.h"
@@ -193,7 +192,7 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
struct folio *folio = eb->folios[i];
u64 start = max_t(u64, eb->start, folio_pos(folio));
u64 end = min_t(u64, eb->start + eb->len,
- folio_pos(folio) + folio_size(folio));
+ folio_pos(folio) + eb->folio_size);
u32 len = end - start;
ret = btrfs_repair_io_failure(fs_info, 0, start, len,
@@ -647,7 +646,7 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
u64 objectid)
{
- bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
+ bool dummy = btrfs_is_testing(fs_info);
memset(&root->root_key, 0, sizeof(root->root_key));
memset(&root->root_item, 0, sizeof(root->root_item));
@@ -659,13 +658,12 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->state = 0;
RB_CLEAR_NODE(&root->rb_node);
- root->last_trans = 0;
+ btrfs_set_root_last_trans(root, 0);
root->free_objectid = 0;
root->nr_delalloc_inodes = 0;
root->nr_ordered_extents = 0;
root->inode_tree = RB_ROOT;
- /* GFP flags are compatible with XA_FLAGS_*. */
- xa_init_flags(&root->delayed_nodes, GFP_ATOMIC);
+ xa_init(&root->delayed_nodes);
btrfs_init_root_block_rsv(root);
@@ -777,7 +775,7 @@ int btrfs_global_root_insert(struct btrfs_root *root)
if (tmp) {
ret = -EEXIST;
btrfs_warn(fs_info, "global root %llu %llu already exists",
- root->root_key.objectid, root->root_key.offset);
+ btrfs_root_id(root), root->root_key.offset);
}
return ret;
}
@@ -1012,8 +1010,8 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
return ret;
}
- log_root->last_trans = trans->transid;
- log_root->root_key.offset = root->root_key.objectid;
+ btrfs_set_root_last_trans(log_root, trans->transid);
+ log_root->root_key.offset = btrfs_root_id(root);
inode_item = &log_root->root_item.inode;
btrfs_set_stack_inode_generation(inode_item, 1);
@@ -1077,15 +1075,15 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
* For real fs, and not log/reloc trees, root owner must
* match its root node owner
*/
- if (!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state) &&
- root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
- root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
- root->root_key.objectid != btrfs_header_owner(root->node)) {
+ if (!btrfs_is_testing(fs_info) &&
+ btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID &&
+ btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
+ btrfs_root_id(root) != btrfs_header_owner(root->node)) {
btrfs_crit(fs_info,
"root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu",
- root->root_key.objectid, root->node->start,
+ btrfs_root_id(root), root->node->start,
btrfs_header_owner(root->node),
- root->root_key.objectid);
+ btrfs_root_id(root));
ret = -EUCLEAN;
goto fail;
}
@@ -1122,9 +1120,9 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
btrfs_drew_lock_init(&root->snapshot_lock);
- if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
+ if (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID &&
!btrfs_is_data_reloc_root(root) &&
- is_fstree(root->root_key.objectid)) {
+ is_fstree(btrfs_root_id(root))) {
set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
btrfs_check_and_init_root_item(&root->root_item);
}
@@ -1133,7 +1131,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
* Don't assign anonymous block device to roots that are not exposed to
* userspace, the id pool is limited to 1M
*/
- if (is_fstree(root->root_key.objectid) &&
+ if (is_fstree(btrfs_root_id(root)) &&
btrfs_root_refs(&root->root_item) > 0) {
if (!anon_dev) {
ret = get_anon_bdev(&root->anon_dev);
@@ -1220,7 +1218,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
spin_lock(&fs_info->fs_roots_radix_lock);
ret = radix_tree_insert(&fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid,
+ (unsigned long)btrfs_root_id(root),
root);
if (ret == 0) {
btrfs_grab_root(root);
@@ -1245,6 +1243,7 @@ void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
btrfs_err(fs_info, "leaked root %s refcount %d",
btrfs_root_name(&root->root_key, buf),
refcount_read(&root->refs));
+ WARN_ON_ONCE(1);
while (refcount_read(&root->refs) > 1)
btrfs_put_root(root);
btrfs_put_root(root);
@@ -1266,9 +1265,14 @@ static void free_global_roots(struct btrfs_fs_info *fs_info)
void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
{
+ struct percpu_counter *em_counter = &fs_info->evictable_extent_maps;
+
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
percpu_counter_destroy(&fs_info->delalloc_bytes);
percpu_counter_destroy(&fs_info->ordered_bytes);
+ if (percpu_counter_initialized(em_counter))
+ ASSERT(percpu_counter_sum_positive(em_counter) == 0);
+ percpu_counter_destroy(em_counter);
percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
btrfs_free_csum_hash(fs_info);
btrfs_free_stripe_hash_table(fs_info);
@@ -2240,7 +2244,7 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
struct btrfs_key location;
int ret;
- BUG_ON(!fs_info->tree_root);
+ ASSERT(fs_info->tree_root);
ret = load_global_roots(tree_root);
if (ret)
@@ -2584,7 +2588,7 @@ static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int lev
struct btrfs_tree_parent_check check = {
.level = level,
.transid = gen,
- .owner_root = root->root_key.objectid
+ .owner_root = btrfs_root_id(root)
};
int ret = 0;
@@ -2848,6 +2852,12 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
if (ret)
return ret;
+ ret = percpu_counter_init(&fs_info->evictable_extent_maps, 0, GFP_KERNEL);
+ if (ret)
+ return ret;
+
+ spin_lock_init(&fs_info->extent_map_shrinker_lock);
+
ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
if (ret)
return ret;
@@ -2930,7 +2940,7 @@ static int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
spin_unlock(&fs_info->fs_roots_radix_lock);
break;
}
- root_objectid = gang[ret - 1]->root_key.objectid + 1;
+ root_objectid = btrfs_root_id(gang[ret - 1]) + 1;
for (i = 0; i < ret; i++) {
/* Avoid to grab roots in dead_roots. */
@@ -2946,7 +2956,7 @@ static int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
for (i = 0; i < ret; i++) {
if (!gang[i])
continue;
- root_objectid = gang[i]->root_key.objectid;
+ root_objectid = btrfs_root_id(gang[i]);
err = btrfs_orphan_cleanup(gang[i]);
if (err)
goto out;
@@ -3618,28 +3628,25 @@ ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
static void btrfs_end_super_write(struct bio *bio)
{
struct btrfs_device *device = bio->bi_private;
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
- struct page *page;
-
- bio_for_each_segment_all(bvec, bio, iter_all) {
- page = bvec->bv_page;
+ struct folio_iter fi;
+ bio_for_each_folio_all(fi, bio) {
if (bio->bi_status) {
btrfs_warn_rl_in_rcu(device->fs_info,
- "lost page write due to IO error on %s (%d)",
+ "lost super block write due to IO error on %s (%d)",
btrfs_dev_name(device),
blk_status_to_errno(bio->bi_status));
- ClearPageUptodate(page);
- SetPageError(page);
btrfs_dev_stat_inc_and_print(device,
BTRFS_DEV_STAT_WRITE_ERRS);
- } else {
- SetPageUptodate(page);
+ /* Ensure failure if the primary sb fails. */
+ if (bio->bi_opf & REQ_FUA)
+ atomic_add(BTRFS_SUPER_PRIMARY_WRITE_ERROR,
+ &device->sb_write_errors);
+ else
+ atomic_inc(&device->sb_write_errors);
}
-
- put_page(page);
- unlock_page(page);
+ folio_unlock(fi.folio);
+ folio_put(fi.folio);
}
bio_put(bio);
@@ -3651,7 +3658,7 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
struct btrfs_super_block *super;
struct page *page;
u64 bytenr, bytenr_orig;
- struct address_space *mapping = bdev->bd_inode->i_mapping;
+ struct address_space *mapping = bdev->bd_mapping;
int ret;
bytenr_orig = btrfs_sb_offset(copy_num);
@@ -3726,34 +3733,36 @@ struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
/*
* Write superblock @sb to the @device. Do not wait for completion, all the
- * pages we use for writing are locked.
+ * folios we use for writing are locked.
*
* Write @max_mirrors copies of the superblock, where 0 means default that fit
* the expected device size at commit time. Note that max_mirrors must be
* same for write and wait phases.
*
- * Return number of errors when page is not found or submission fails.
+ * Return number of errors when folio is not found or submission fails.
*/
static int write_dev_supers(struct btrfs_device *device,
struct btrfs_super_block *sb, int max_mirrors)
{
struct btrfs_fs_info *fs_info = device->fs_info;
- struct address_space *mapping = device->bdev->bd_inode->i_mapping;
+ struct address_space *mapping = device->bdev->bd_mapping;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
int i;
- int errors = 0;
int ret;
u64 bytenr, bytenr_orig;
+ atomic_set(&device->sb_write_errors, 0);
+
if (max_mirrors == 0)
max_mirrors = BTRFS_SUPER_MIRROR_MAX;
shash->tfm = fs_info->csum_shash;
for (i = 0; i < max_mirrors; i++) {
- struct page *page;
+ struct folio *folio;
struct bio *bio;
struct btrfs_super_block *disk_super;
+ size_t offset;
bytenr_orig = btrfs_sb_offset(i);
ret = btrfs_sb_log_location(device, i, WRITE, &bytenr);
@@ -3763,7 +3772,7 @@ static int write_dev_supers(struct btrfs_device *device,
btrfs_err(device->fs_info,
"couldn't get super block location for mirror %d",
i);
- errors++;
+ atomic_inc(&device->sb_write_errors);
continue;
}
if (bytenr + BTRFS_SUPER_INFO_SIZE >=
@@ -3776,20 +3785,20 @@ static int write_dev_supers(struct btrfs_device *device,
BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
sb->csum);
- page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT,
- GFP_NOFS);
- if (!page) {
+ folio = __filemap_get_folio(mapping, bytenr >> PAGE_SHIFT,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+ GFP_NOFS);
+ if (IS_ERR(folio)) {
btrfs_err(device->fs_info,
"couldn't get super block page for bytenr %llu",
bytenr);
- errors++;
+ atomic_inc(&device->sb_write_errors);
continue;
}
+ ASSERT(folio_order(folio) == 0);
- /* Bump the refcount for wait_dev_supers() */
- get_page(page);
-
- disk_super = page_address(page);
+ offset = offset_in_folio(folio, bytenr);
+ disk_super = folio_address(folio) + offset;
memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);
/*
@@ -3803,8 +3812,7 @@ static int write_dev_supers(struct btrfs_device *device,
bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
bio->bi_private = device;
bio->bi_end_io = btrfs_end_super_write;
- __bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE,
- offset_in_page(bytenr));
+ bio_add_folio_nofail(bio, folio, BTRFS_SUPER_INFO_SIZE, offset);
/*
* We FUA only the first super block. The others we allow to
@@ -3816,17 +3824,17 @@ static int write_dev_supers(struct btrfs_device *device,
submit_bio(bio);
if (btrfs_advance_sb_log(device, i))
- errors++;
+ atomic_inc(&device->sb_write_errors);
}
- return errors < i ? 0 : -1;
+ return atomic_read(&device->sb_write_errors) < i ? 0 : -1;
}
/*
* Wait for write completion of superblocks done by write_dev_supers,
* @max_mirrors same for write and wait phases.
*
- * Return number of errors when page is not found or not marked up to
- * date.
+ * Return -1 if the primary super block write failed or if no super block
+ * copies were written. Otherwise 0.
*/
static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
{
@@ -3840,7 +3848,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
max_mirrors = BTRFS_SUPER_MIRROR_MAX;
for (i = 0; i < max_mirrors; i++) {
- struct page *page;
+ struct folio *folio;
ret = btrfs_sb_log_location(device, i, READ, &bytenr);
if (ret == -ENOENT) {
@@ -3855,30 +3863,21 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
device->commit_total_bytes)
break;
- page = find_get_page(device->bdev->bd_inode->i_mapping,
- bytenr >> PAGE_SHIFT);
- if (!page) {
- errors++;
- if (i == 0)
- primary_failed = true;
+ folio = filemap_get_folio(device->bdev->bd_mapping,
+ bytenr >> PAGE_SHIFT);
+ /* If the folio has been removed, then we know it completed. */
+ if (IS_ERR(folio))
continue;
- }
- /* Page is submitted locked and unlocked once the IO completes */
- wait_on_page_locked(page);
- if (PageError(page)) {
- errors++;
- if (i == 0)
- primary_failed = true;
- }
-
- /* Drop our reference */
- put_page(page);
+ ASSERT(folio_order(folio) == 0);
- /* Drop the reference from the writing run */
- put_page(page);
+ /* Folio will be unlocked once the write completes. */
+ folio_wait_locked(folio);
+ folio_put(folio);
}
- /* log error, force error return */
+ errors += atomic_read(&device->sb_write_errors);
+ if (errors >= BTRFS_SUPER_PRIMARY_WRITE_ERROR)
+ primary_failed = true;
if (primary_failed) {
btrfs_err(device->fs_info, "error writing primary super block to device %llu",
device->devid);
@@ -4139,7 +4138,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
spin_lock(&fs_info->fs_roots_radix_lock);
radix_tree_delete(&fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid);
+ (unsigned long)btrfs_root_id(root));
if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state))
drop_ref = true;
spin_unlock(&fs_info->fs_roots_radix_lock);
@@ -4182,9 +4181,6 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
struct btrfs_transaction *tmp;
bool found = false;
- if (list_empty(&fs_info->trans_list))
- return;
-
/*
* This function is only called at the very end of close_ctree(),
* thus no other running transaction, no need to take trans_lock.
@@ -4484,7 +4480,7 @@ static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info)
for (i = 0; i < ret; i++) {
if (!gang[i])
continue;
- root_objectid = gang[i]->root_key.objectid;
+ root_objectid = btrfs_root_id(gang[i]);
btrfs_free_log(NULL, gang[i]);
btrfs_put_root(gang[i]);
}
@@ -4544,18 +4540,10 @@ static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
struct btrfs_fs_info *fs_info)
{
struct rb_node *node;
- struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs;
struct btrfs_delayed_ref_node *ref;
- delayed_refs = &trans->delayed_refs;
-
spin_lock(&delayed_refs->lock);
- if (atomic_read(&delayed_refs->num_entries) == 0) {
- spin_unlock(&delayed_refs->lock);
- btrfs_debug(fs_info, "delayed_refs has NO entry");
- return;
- }
-
while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) {
struct btrfs_delayed_ref_head *head;
struct rb_node *n;
@@ -4629,7 +4617,7 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
struct inode *inode = NULL;
btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
delalloc_inodes);
- __btrfs_del_delalloc_inode(root, btrfs_inode);
+ btrfs_del_delalloc_inode(btrfs_inode);
spin_unlock(&root->delalloc_lock);
/*
@@ -4815,7 +4803,7 @@ static void btrfs_free_all_qgroup_pertrans(struct btrfs_fs_info *fs_info)
btrfs_qgroup_free_meta_all_pertrans(root);
radix_tree_tag_clear(&fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid,
+ (unsigned long)btrfs_root_id(root),
BTRFS_ROOT_TRANS_TAG);
}
}
@@ -4844,14 +4832,10 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
cur_trans->state = TRANS_STATE_UNBLOCKED;
wake_up(&fs_info->transaction_wait);
- btrfs_destroy_delayed_inodes(fs_info);
-
btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
EXTENT_DIRTY);
btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);
- btrfs_free_all_qgroup_pertrans(fs_info);
-
cur_trans->state =TRANS_STATE_COMPLETED;
wake_up(&cur_trans->commit_wait);
}
@@ -4904,6 +4888,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
btrfs_assert_delayed_root_empty(fs_info);
btrfs_destroy_all_delalloc_inodes(fs_info);
btrfs_drop_all_logs(fs_info);
+ btrfs_free_all_qgroup_pertrans(fs_info);
mutex_unlock(&fs_info->transaction_kthread_mutex);
return 0;
@@ -4928,7 +4913,14 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root)
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
if (ret < 0)
goto error;
- BUG_ON(ret == 0); /* Corruption */
+ if (ret == 0) {
+ /*
+ * Key with offset -1 found, there would have to exist a root
+ * with such id, but this is out of valid range.
+ */
+ ret = -EUCLEAN;
+ goto error;
+ }
if (path->slots[0] > 0) {
slot = path->slots[0] - 1;
l = path->nodes[0];
@@ -4952,7 +4944,7 @@ int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
btrfs_warn(root->fs_info,
"the objectid of root %llu reaches its highest value",
- root->root_key.objectid);
+ btrfs_root_id(root));
ret = -ENOSPC;
goto out;
}
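The disk-io.c hunks above replace the per-page Uptodate/Error bookkeeping in the super block write path with one per-device atomic counter: a failed REQ_FUA (primary) write adds BTRFS_SUPER_PRIMARY_WRITE_ERROR so wait_dev_supers() can tell a primary failure apart from ordinary mirror failures. A small user-space sketch of that encoding, assuming only that the sentinel is larger than any possible mirror count; the constant value and helper name below are illustrative, not the kernel's:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Assumed sentinel, standing in for BTRFS_SUPER_PRIMARY_WRITE_ERROR. */
#define PRIMARY_WRITE_ERROR	(1 << 16)

static atomic_int sb_write_errors;

/* Hypothetical completion callback: "primary" means the REQ_FUA copy. */
static void record_sb_write_error(bool primary)
{
	atomic_fetch_add(&sb_write_errors, primary ? PRIMARY_WRITE_ERROR : 1);
}

int main(void)
{
	record_sb_write_error(false);	/* a mirror copy failed */
	record_sb_write_error(true);	/* the primary copy failed */

	int errors = atomic_load(&sb_write_errors);

	printf("total errors %d, primary failed: %s\n",
	       errors, errors >= PRIMARY_WRITE_ERROR ? "yes" : "no");
	return 0;
}

Keeping the error state on the device also lets the write and wait phases stay decoupled, since the folios themselves no longer record success or failure.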
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index eb3473d1c1..76eb53fe7a 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -6,6 +6,22 @@
#ifndef BTRFS_DISK_IO_H
#define BTRFS_DISK_IO_H
+#include <linux/sizes.h>
+#include <linux/compiler_types.h>
+#include "ctree.h"
+#include "fs.h"
+
+struct block_device;
+struct super_block;
+struct extent_buffer;
+struct btrfs_device;
+struct btrfs_fs_devices;
+struct btrfs_fs_info;
+struct btrfs_super_block;
+struct btrfs_trans_handle;
+struct btrfs_tree_parent_check;
+struct btrfs_transaction;
+
#define BTRFS_SUPER_MIRROR_MAX 3
#define BTRFS_SUPER_MIRROR_SHIFT 12
@@ -25,10 +41,6 @@ static inline u64 btrfs_sb_offset(int mirror)
return BTRFS_SUPER_INFO_OFFSET;
}
-struct btrfs_device;
-struct btrfs_fs_devices;
-struct btrfs_tree_parent_check;
-
void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info);
void btrfs_init_fs_info(struct btrfs_fs_info *fs_info);
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 1ac00ee795..9e81f89e76 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -5,7 +5,6 @@
#include "ctree.h"
#include "disk-io.h"
#include "btrfs_inode.h"
-#include "print-tree.h"
#include "export.h"
#include "accessors.h"
#include "super.h"
@@ -35,7 +34,7 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
type = FILEID_BTRFS_WITHOUT_PARENT;
fid->objectid = btrfs_ino(BTRFS_I(inode));
- fid->root_objectid = BTRFS_I(inode)->root->root_key.objectid;
+ fid->root_objectid = btrfs_root_id(BTRFS_I(inode)->root);
fid->gen = inode->i_generation;
if (parent) {
@@ -43,7 +42,7 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
fid->parent_objectid = BTRFS_I(parent)->location.objectid;
fid->parent_gen = parent->i_generation;
- parent_root_id = BTRFS_I(parent)->root->root_key.objectid;
+ parent_root_id = btrfs_root_id(BTRFS_I(parent)->root);
if (parent_root_id != fid->root_objectid) {
fid->parent_root_objectid = parent_root_id;
@@ -161,7 +160,7 @@ struct dentry *btrfs_get_parent(struct dentry *child)
return ERR_PTR(-ENOMEM);
if (btrfs_ino(BTRFS_I(dir)) == BTRFS_FIRST_FREE_OBJECTID) {
- key.objectid = root->root_key.objectid;
+ key.objectid = btrfs_root_id(root);
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = (u64)-1;
root = fs_info->tree_root;
@@ -244,7 +243,7 @@ static int btrfs_get_name(struct dentry *parent, char *name,
return -ENOMEM;
if (ino == BTRFS_FIRST_FREE_OBJECTID) {
- key.objectid = BTRFS_I(inode)->root->root_key.objectid;
+ key.objectid = btrfs_root_id(BTRFS_I(inode)->root);
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = (u64)-1;
root = fs_info->tree_root;
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
index eba6bc4f5a..464582273a 100644
--- a/fs/btrfs/export.h
+++ b/fs/btrfs/export.h
@@ -4,6 +4,10 @@
#define BTRFS_EXPORT_H
#include <linux/exportfs.h>
+#include <linux/types.h>
+
+struct dentry;
+struct super_block;
extern const struct export_operations btrfs_export_ops;
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
index e3ee5449cc..ed2cfc3d5d 100644
--- a/fs/btrfs/extent-io-tree.c
+++ b/fs/btrfs/extent-io-tree.c
@@ -6,7 +6,6 @@
#include "ctree.h"
#include "extent-io-tree.h"
#include "btrfs_inode.h"
-#include "misc.h"
static struct kmem_cache *extent_state_cache;
@@ -48,6 +47,7 @@ static inline void btrfs_extent_state_leak_debug_check(void)
extent_state_in_tree(state),
refcount_read(&state->refs));
list_del(&state->leak_list);
+ WARN_ON_ONCE(1);
kmem_cache_free(extent_state_cache, state);
}
}
@@ -1059,7 +1059,7 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state *prealloc = NULL;
struct rb_node **p = NULL;
struct rb_node *parent = NULL;
- int err = 0;
+ int ret = 0;
u64 last_start;
u64 last_end;
u32 exclusive_bits = (bits & EXTENT_LOCKED);
@@ -1122,7 +1122,7 @@ hit_next:
if (state->state & exclusive_bits) {
*failed_start = state->start;
cache_state(state, failed_state);
- err = -EEXIST;
+ ret = -EEXIST;
goto out;
}
@@ -1158,7 +1158,7 @@ hit_next:
if (state->state & exclusive_bits) {
*failed_start = start;
cache_state(state, failed_state);
- err = -EEXIST;
+ ret = -EEXIST;
goto out;
}
@@ -1175,12 +1175,12 @@ hit_next:
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc)
goto search_again;
- err = split_state(tree, state, prealloc, start);
- if (err)
- extent_io_tree_panic(tree, state, "split", err);
+ ret = split_state(tree, state, prealloc, start);
+ if (ret)
+ extent_io_tree_panic(tree, state, "split", ret);
prealloc = NULL;
- if (err)
+ if (ret)
goto out;
if (state->end <= end) {
set_state_bits(tree, state, bits, changeset);
@@ -1224,8 +1224,8 @@ hit_next:
prealloc->end = this_end;
inserted_state = insert_state(tree, prealloc, bits, changeset);
if (IS_ERR(inserted_state)) {
- err = PTR_ERR(inserted_state);
- extent_io_tree_panic(tree, prealloc, "insert", err);
+ ret = PTR_ERR(inserted_state);
+ extent_io_tree_panic(tree, prealloc, "insert", ret);
}
cache_state(inserted_state, cached_state);
@@ -1244,16 +1244,16 @@ hit_next:
if (state->state & exclusive_bits) {
*failed_start = start;
cache_state(state, failed_state);
- err = -EEXIST;
+ ret = -EEXIST;
goto out;
}
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc)
goto search_again;
- err = split_state(tree, state, prealloc, end + 1);
- if (err)
- extent_io_tree_panic(tree, state, "split", err);
+ ret = split_state(tree, state, prealloc, end + 1);
+ if (ret)
+ extent_io_tree_panic(tree, state, "split", ret);
set_state_bits(tree, prealloc, bits, changeset);
cache_state(prealloc, cached_state);
@@ -1275,7 +1275,7 @@ out:
if (prealloc)
free_extent_state(prealloc);
- return err;
+ return ret;
}
@@ -1312,7 +1312,7 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state *prealloc = NULL;
struct rb_node **p = NULL;
struct rb_node *parent = NULL;
- int err = 0;
+ int ret = 0;
u64 last_start;
u64 last_end;
bool first_iteration = true;
@@ -1351,7 +1351,7 @@ again:
if (!state) {
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
prealloc->start = start;
@@ -1402,14 +1402,14 @@ hit_next:
if (state->start < start) {
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
- err = split_state(tree, state, prealloc, start);
- if (err)
- extent_io_tree_panic(tree, state, "split", err);
+ ret = split_state(tree, state, prealloc, start);
+ if (ret)
+ extent_io_tree_panic(tree, state, "split", ret);
prealloc = NULL;
- if (err)
+ if (ret)
goto out;
if (state->end <= end) {
set_state_bits(tree, state, bits, NULL);
@@ -1442,7 +1442,7 @@ hit_next:
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
@@ -1454,8 +1454,8 @@ hit_next:
prealloc->end = this_end;
inserted_state = insert_state(tree, prealloc, bits, NULL);
if (IS_ERR(inserted_state)) {
- err = PTR_ERR(inserted_state);
- extent_io_tree_panic(tree, prealloc, "insert", err);
+ ret = PTR_ERR(inserted_state);
+ extent_io_tree_panic(tree, prealloc, "insert", ret);
}
cache_state(inserted_state, cached_state);
if (inserted_state == prealloc)
@@ -1472,13 +1472,13 @@ hit_next:
if (state->start <= end && state->end > end) {
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
- err = split_state(tree, state, prealloc, end + 1);
- if (err)
- extent_io_tree_panic(tree, state, "split", err);
+ ret = split_state(tree, state, prealloc, end + 1);
+ if (ret)
+ extent_io_tree_panic(tree, state, "split", ret);
set_state_bits(tree, prealloc, bits, NULL);
cache_state(prealloc, cached_state);
@@ -1500,7 +1500,7 @@ out:
if (prealloc)
free_extent_state(prealloc);
- return err;
+ return ret;
}
/*
@@ -1883,8 +1883,8 @@ void __cold extent_state_free_cachep(void)
int __init extent_state_init_cachep(void)
{
extent_state_cache = kmem_cache_create("btrfs_extent_state",
- sizeof(struct extent_state), 0,
- SLAB_MEM_SPREAD, NULL);
+ sizeof(struct extent_state), 0, 0,
+ NULL);
if (!extent_state_cache)
return -ENOMEM;
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index ebe6390d65..9d3a52d8f5 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -3,9 +3,16 @@
#ifndef BTRFS_EXTENT_IO_TREE_H
#define BTRFS_EXTENT_IO_TREE_H
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/refcount.h>
+#include <linux/list.h>
+#include <linux/wait.h>
#include "misc.h"
struct extent_changeset;
+struct btrfs_fs_info;
+struct btrfs_inode;
/* Bits for the extent state */
enum {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8e8cc11112..844b677d05 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -18,7 +18,7 @@
#include <linux/crc32c.h>
#include "ctree.h"
#include "extent-tree.h"
-#include "tree-log.h"
+#include "transaction.h"
#include "disk-io.h"
#include "print-tree.h"
#include "volumes.h"
@@ -26,14 +26,11 @@
#include "locking.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
-#include "sysfs.h"
#include "qgroup.h"
#include "ref-verify.h"
#include "space-info.h"
#include "block-rsv.h"
-#include "delalloc-space.h"
#include "discard.h"
-#include "rcu-string.h"
#include "zoned.h"
#include "dev-replace.h"
#include "fs.h"
@@ -49,9 +46,7 @@
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *href,
- struct btrfs_delayed_ref_node *node, u64 parent,
- u64 root_objectid, u64 owner_objectid,
- u64 owner_offset,
+ struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
struct extent_buffer *leaf,
@@ -109,10 +104,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_path *path;
- struct btrfs_extent_item *ei;
- struct extent_buffer *leaf;
struct btrfs_key key;
- u32 item_size;
u64 num_refs;
u64 extent_flags;
u64 owner = 0;
@@ -162,16 +154,11 @@ search_again:
}
if (ret == 0) {
- leaf = path->nodes[0];
- item_size = btrfs_item_size(leaf, path->slots[0]);
- if (item_size >= sizeof(*ei)) {
- ei = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_extent_item);
- num_refs = btrfs_extent_refs(leaf, ei);
- extent_flags = btrfs_extent_flags(leaf, ei);
- owner = btrfs_get_extent_owner_root(fs_info, leaf,
- path->slots[0]);
- } else {
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_extent_item *ei;
+ const u32 item_size = btrfs_item_size(leaf, path->slots[0]);
+
+ if (unlikely(item_size < sizeof(*ei))) {
ret = -EUCLEAN;
btrfs_err(fs_info,
"unexpected extent item size, has %u expect >= %zu",
@@ -184,6 +171,10 @@ search_again:
goto out_free;
}
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ num_refs = btrfs_extent_refs(leaf, ei);
+ extent_flags = btrfs_extent_flags(leaf, ei);
+ owner = btrfs_get_extent_owner_root(fs_info, leaf, path->slots[0]);
BUG_ON(num_refs == 0);
} else {
num_refs = 0;
@@ -451,9 +442,8 @@ static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_extent_data_ref *ref;
struct extent_buffer *leaf;
u32 nritems;
- int ret;
int recow;
- int err = -ENOENT;
+ int ret;
key.objectid = bytenr;
if (parent) {
@@ -467,26 +457,26 @@ static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
again:
recow = 0;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- if (ret < 0) {
- err = ret;
- goto fail;
- }
+ if (ret < 0)
+ return ret;
if (parent) {
- if (!ret)
- return 0;
- goto fail;
+ if (ret)
+ return -ENOENT;
+ return 0;
}
+ ret = -ENOENT;
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
while (1) {
if (path->slots[0] >= nritems) {
ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- err = ret;
- if (ret)
- goto fail;
+ if (ret) {
+ if (ret > 0)
+ return -ENOENT;
+ return ret;
+ }
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
@@ -507,37 +497,37 @@ again:
btrfs_release_path(path);
goto again;
}
- err = 0;
+ ret = 0;
break;
}
path->slots[0]++;
}
fail:
- return err;
+ return ret;
}
static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
- u64 bytenr, u64 parent,
- u64 root_objectid, u64 owner,
- u64 offset, int refs_to_add)
+ struct btrfs_delayed_ref_node *node,
+ u64 bytenr)
{
struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
struct btrfs_key key;
struct extent_buffer *leaf;
+ u64 owner = btrfs_delayed_ref_owner(node);
+ u64 offset = btrfs_delayed_ref_offset(node);
u32 size;
u32 num_refs;
int ret;
key.objectid = bytenr;
- if (parent) {
+ if (node->parent) {
key.type = BTRFS_SHARED_DATA_REF_KEY;
- key.offset = parent;
+ key.offset = node->parent;
size = sizeof(struct btrfs_shared_data_ref);
} else {
key.type = BTRFS_EXTENT_DATA_REF_KEY;
- key.offset = hash_extent_data_ref(root_objectid,
- owner, offset);
+ key.offset = hash_extent_data_ref(node->ref_root, owner, offset);
size = sizeof(struct btrfs_extent_data_ref);
}
@@ -546,15 +536,15 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
goto fail;
leaf = path->nodes[0];
- if (parent) {
+ if (node->parent) {
struct btrfs_shared_data_ref *ref;
ref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_shared_data_ref);
if (ret == 0) {
- btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
+ btrfs_set_shared_data_ref_count(leaf, ref, node->ref_mod);
} else {
num_refs = btrfs_shared_data_ref_count(leaf, ref);
- num_refs += refs_to_add;
+ num_refs += node->ref_mod;
btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
}
} else {
@@ -562,7 +552,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
while (ret == -EEXIST) {
ref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_extent_data_ref);
- if (match_extent_data_ref(leaf, ref, root_objectid,
+ if (match_extent_data_ref(leaf, ref, node->ref_root,
owner, offset))
break;
btrfs_release_path(path);
@@ -577,14 +567,13 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
ref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_extent_data_ref);
if (ret == 0) {
- btrfs_set_extent_data_ref_root(leaf, ref,
- root_objectid);
+ btrfs_set_extent_data_ref_root(leaf, ref, node->ref_root);
btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
btrfs_set_extent_data_ref_offset(leaf, ref, offset);
- btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
+ btrfs_set_extent_data_ref_count(leaf, ref, node->ref_mod);
} else {
num_refs = btrfs_extent_data_ref_count(leaf, ref);
- num_refs += refs_to_add;
+ num_refs += node->ref_mod;
btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
}
}
@@ -708,20 +697,20 @@ static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
- u64 bytenr, u64 parent,
- u64 root_objectid)
+ struct btrfs_delayed_ref_node *node,
+ u64 bytenr)
{
struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
struct btrfs_key key;
int ret;
key.objectid = bytenr;
- if (parent) {
+ if (node->parent) {
key.type = BTRFS_SHARED_BLOCK_REF_KEY;
- key.offset = parent;
+ key.offset = node->parent;
} else {
key.type = BTRFS_TREE_BLOCK_REF_KEY;
- key.offset = root_objectid;
+ key.offset = node->ref_root;
}
ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
@@ -1442,7 +1431,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
generic_ref->action);
BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
- generic_ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID);
+ generic_ref->ref_root == BTRFS_TREE_LOG_OBJECTID);
if (generic_ref->type == BTRFS_REF_METADATA)
ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
@@ -1465,34 +1454,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
* @node: The delayed ref node used to get the bytenr/length for
* extent whose references are incremented.
*
- * @parent: If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
- * BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
- * bytenr of the parent block. Since new extents are always
- * created with indirect references, this will only be the case
- * when relocating a shared extent. In that case, root_objectid
- * will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
- * be 0
- *
- * @root_objectid: The id of the root where this modification has originated,
- * this can be either one of the well-known metadata trees or
- * the subvolume id which references this extent.
- *
- * @owner: For data extents it is the inode number of the owning file.
- * For metadata extents this parameter holds the level in the
- * tree of the extent.
- *
- * @offset: For metadata extents the offset is ignored and is currently
- * always passed as 0. For data extents it is the fileoffset
- * this extent belongs to.
- *
* @extent_op Pointer to a structure, holding information necessary when
* updating a tree block's flags
*
*/
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node,
- u64 parent, u64 root_objectid,
- u64 owner, u64 offset,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_path *path;
@@ -1501,6 +1468,8 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_key key;
u64 bytenr = node->bytenr;
u64 num_bytes = node->num_bytes;
+ u64 owner = btrfs_delayed_ref_owner(node);
+ u64 offset = btrfs_delayed_ref_offset(node);
u64 refs;
int refs_to_add = node->ref_mod;
int ret;
@@ -1511,7 +1480,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
/* this will setup the path even if it fails to insert the back ref */
ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
- parent, root_objectid, owner,
+ node->parent, node->ref_root, owner,
offset, refs_to_add, extent_op);
if ((ret < 0 && ret != -EAGAIN) || !ret)
goto out;
@@ -1534,12 +1503,9 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
/* now insert the actual backref */
if (owner < BTRFS_FIRST_FREE_OBJECTID)
- ret = insert_tree_block_ref(trans, path, bytenr, parent,
- root_objectid);
+ ret = insert_tree_block_ref(trans, path, node, bytenr);
else
- ret = insert_extent_data_ref(trans, path, bytenr, parent,
- root_objectid, owner, offset,
- refs_to_add);
+ ret = insert_extent_data_ref(trans, path, node, bytenr);
if (ret)
btrfs_abort_transaction(trans, ret);
@@ -1572,15 +1538,13 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
bool insert_reserved)
{
int ret = 0;
- struct btrfs_delayed_data_ref *ref;
u64 parent = 0;
u64 flags = 0;
- ref = btrfs_delayed_node_to_data_ref(node);
- trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
+ trace_run_delayed_data_ref(trans->fs_info, node);
if (node->type == BTRFS_SHARED_DATA_REF_KEY)
- parent = ref->parent;
+ parent = node->parent;
if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
struct btrfs_key key;
@@ -1591,6 +1555,8 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
.is_inc = true,
.generation = trans->transid,
};
+ u64 owner = btrfs_delayed_ref_owner(node);
+ u64 offset = btrfs_delayed_ref_offset(node);
if (extent_op)
flags |= extent_op->flags_to_set;
@@ -1599,21 +1565,17 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = node->num_bytes;
- ret = alloc_reserved_file_extent(trans, parent, ref->root,
- flags, ref->objectid,
- ref->offset, &key,
- node->ref_mod, href->owning_root);
+ ret = alloc_reserved_file_extent(trans, parent, node->ref_root,
+ flags, owner, offset, &key,
+ node->ref_mod,
+ href->owning_root);
free_head_ref_squota_rsv(trans->fs_info, href);
if (!ret)
ret = btrfs_record_squota_delta(trans->fs_info, &delta);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
- ret = __btrfs_inc_extent_ref(trans, node, parent, ref->root,
- ref->objectid, ref->offset,
- extent_op);
+ ret = __btrfs_inc_extent_ref(trans, node, extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
- ret = __btrfs_free_extent(trans, href, node, parent,
- ref->root, ref->objectid,
- ref->offset, extent_op);
+ ret = __btrfs_free_extent(trans, href, node, extent_op);
} else {
BUG();
}
@@ -1735,16 +1697,14 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
{
int ret = 0;
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_delayed_tree_ref *ref;
u64 parent = 0;
u64 ref_root = 0;
- ref = btrfs_delayed_node_to_tree_ref(node);
- trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
+ trace_run_delayed_tree_ref(trans->fs_info, node);
if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
- parent = ref->parent;
- ref_root = ref->root;
+ parent = node->parent;
+ ref_root = node->ref_root;
if (unlikely(node->ref_mod != 1)) {
btrfs_err(trans->fs_info,
@@ -1767,11 +1727,9 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
if (!ret)
btrfs_record_squota_delta(fs_info, &delta);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
- ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
- ref->level, 0, extent_op);
+ ret = __btrfs_inc_extent_ref(trans, node, extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
- ret = __btrfs_free_extent(trans, href, node, parent, ref_root,
- ref->level, 0, extent_op);
+ ret = __btrfs_free_extent(trans, href, node, extent_op);
} else {
BUG();
}
@@ -2295,7 +2253,6 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
{
struct btrfs_delayed_ref_head *head;
struct btrfs_delayed_ref_node *ref;
- struct btrfs_delayed_data_ref *data_ref;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_transaction *cur_trans;
struct rb_node *node;
@@ -2349,6 +2306,9 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
*/
for (node = rb_first_cached(&head->ref_tree); node;
node = rb_next(node)) {
+ u64 ref_owner;
+ u64 ref_offset;
+
ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
/* If it's a shared ref we know a cross reference exists */
if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
@@ -2356,15 +2316,15 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
break;
}
- data_ref = btrfs_delayed_node_to_data_ref(ref);
+ ref_owner = btrfs_delayed_ref_owner(ref);
+ ref_offset = btrfs_delayed_ref_offset(ref);
/*
* If our ref doesn't match the one we're currently looking at
* then we have a cross reference.
*/
- if (data_ref->root != root->root_key.objectid ||
- data_ref->objectid != objectid ||
- data_ref->offset != offset) {
+ if (ref->ref_root != btrfs_root_id(root) ||
+ ref_owner != objectid || ref_offset != offset) {
ret = 1;
break;
}
@@ -2399,7 +2359,14 @@ static noinline int check_committed_ref(struct btrfs_root *root,
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
goto out;
- BUG_ON(ret == 0); /* Corruption */
+ if (ret == 0) {
+ /*
+ * Key with offset -1 found, there would have to exist an extent
+ * item with such offset, but this is out of the valid range.
+ */
+ ret = -EUCLEAN;
+ goto out;
+ }
ret = -ENOENT;
if (path->slots[0] == 0)
@@ -2450,8 +2417,7 @@ static noinline int check_committed_ref(struct btrfs_root *root,
ref = (struct btrfs_extent_data_ref *)(&iref->offset);
if (btrfs_extent_refs(leaf, ei) !=
btrfs_extent_data_ref_count(leaf, ref) ||
- btrfs_extent_data_ref_root(leaf, ref) !=
- root->root_key.objectid ||
+ btrfs_extent_data_ref_root(leaf, ref) != btrfs_root_id(root) ||
btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
btrfs_extent_data_ref_offset(leaf, ref) != offset)
goto out;
@@ -2488,14 +2454,11 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
int full_backref, int inc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- u64 bytenr;
- u64 num_bytes;
u64 parent;
u64 ref_root;
u32 nritems;
struct btrfs_key key;
struct btrfs_file_extent_item *fi;
- struct btrfs_ref generic_ref = { 0 };
bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
int i;
int action;
@@ -2522,6 +2485,12 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
action = BTRFS_DROP_DELAYED_REF;
for (i = 0; i < nritems; i++) {
+ struct btrfs_ref ref = {
+ .action = action,
+ .parent = parent,
+ .ref_root = ref_root,
+ };
+
if (level == 0) {
btrfs_item_key_to_cpu(buf, &key, i);
if (key.type != BTRFS_EXTENT_DATA_KEY)
@@ -2531,35 +2500,33 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
if (btrfs_file_extent_type(buf, fi) ==
BTRFS_FILE_EXTENT_INLINE)
continue;
- bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
- if (bytenr == 0)
+ ref.bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+ if (ref.bytenr == 0)
continue;
- num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
+ ref.num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
+ ref.owning_root = ref_root;
+
key.offset -= btrfs_file_extent_offset(buf, fi);
- btrfs_init_generic_ref(&generic_ref, action, bytenr,
- num_bytes, parent, ref_root);
- btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
- key.offset, root->root_key.objectid,
- for_reloc);
+ btrfs_init_data_ref(&ref, key.objectid, key.offset,
+ btrfs_root_id(root), for_reloc);
if (inc)
- ret = btrfs_inc_extent_ref(trans, &generic_ref);
+ ret = btrfs_inc_extent_ref(trans, &ref);
else
- ret = btrfs_free_extent(trans, &generic_ref);
+ ret = btrfs_free_extent(trans, &ref);
if (ret)
goto fail;
} else {
- bytenr = btrfs_node_blockptr(buf, i);
- num_bytes = fs_info->nodesize;
- /* We don't know the owning_root, use 0. */
- btrfs_init_generic_ref(&generic_ref, action, bytenr,
- num_bytes, parent, 0);
- btrfs_init_tree_ref(&generic_ref, level - 1, ref_root,
- root->root_key.objectid, for_reloc);
+ /* We don't know the owning_root, leave as 0. */
+ ref.bytenr = btrfs_node_blockptr(buf, i);
+ ref.num_bytes = fs_info->nodesize;
+
+ btrfs_init_tree_ref(&ref, level - 1,
+ btrfs_root_id(root), for_reloc);
if (inc)
- ret = btrfs_inc_extent_ref(trans, &generic_ref);
+ ret = btrfs_inc_extent_ref(trans, &ref);
else
- ret = btrfs_free_extent(trans, &generic_ref);
+ ret = btrfs_free_extent(trans, &ref);
if (ret)
goto fail;
}
@@ -2780,6 +2747,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
u64 total_unpinned = 0;
u64 empty_cluster = 0;
bool readonly;
+ int ret = 0;
while (start <= end) {
readonly = false;
@@ -2789,7 +2757,11 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
btrfs_put_block_group(cache);
total_unpinned = 0;
cache = btrfs_lookup_block_group(fs_info, start);
- BUG_ON(!cache); /* Logic error */
+ if (cache == NULL) {
+ /* Logic error, something removed the block group. */
+ ret = -EUCLEAN;
+ goto out;
+ }
cluster = fetch_cluster_info(fs_info,
cache->space_info,
@@ -2830,7 +2802,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
readonly = true;
} else if (btrfs_is_zoned(fs_info)) {
/* Need reset before reusing in a zoned block group */
- space_info->bytes_zone_unusable += len;
+ btrfs_space_info_update_bytes_zone_unusable(fs_info, space_info,
+ len);
readonly = true;
}
spin_unlock(&cache->lock);
@@ -2858,7 +2831,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
if (cache)
btrfs_put_block_group(cache);
- return 0;
+out:
+ return ret;
}
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
@@ -2888,7 +2862,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
end + 1 - start, NULL);
clear_extent_dirty(unpin, start, end, &cached_state);
- unpin_extent_range(fs_info, start, end, true);
+ ret = unpin_extent_range(fs_info, start, end, true);
+ BUG_ON(ret);
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
free_extent_state(cached_state);
cond_resched();
@@ -3088,9 +3063,7 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
*/
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *href,
- struct btrfs_delayed_ref_node *node, u64 parent,
- u64 root_objectid, u64 owner_objectid,
- u64 owner_offset,
+ struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *info = trans->fs_info;
@@ -3110,6 +3083,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
u64 refs;
u64 bytenr = node->bytenr;
u64 num_bytes = node->num_bytes;
+ u64 owner_objectid = btrfs_delayed_ref_owner(node);
+ u64 owner_offset = btrfs_delayed_ref_offset(node);
bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
u64 delayed_ref_root = href->owning_root;
@@ -3135,7 +3110,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
skinny_metadata = false;
ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
- parent, root_objectid, owner_objectid,
+ node->parent, node->ref_root, owner_objectid,
owner_offset);
if (ret == 0) {
/*
@@ -3237,7 +3212,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
} else if (WARN_ON(ret == -ENOENT)) {
abort_and_dump(trans, path,
"unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d",
- bytenr, parent, root_objectid, owner_objectid,
+ bytenr, node->parent, node->ref_root, owner_objectid,
owner_offset, path->slots[0]);
goto out;
} else {
@@ -3441,29 +3416,42 @@ out_delayed_unlock:
return 0;
}
-void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
- u64 root_id,
- struct extent_buffer *buf,
- u64 parent, int last_ref)
+int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+ u64 root_id,
+ struct extent_buffer *buf,
+ u64 parent, int last_ref)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_ref generic_ref = { 0 };
struct btrfs_block_group *bg;
int ret;
- btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
- buf->start, buf->len, parent, btrfs_header_owner(buf));
- btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
- root_id, 0, false);
-
if (root_id != BTRFS_TREE_LOG_OBJECTID) {
+ struct btrfs_ref generic_ref = {
+ .action = BTRFS_DROP_DELAYED_REF,
+ .bytenr = buf->start,
+ .num_bytes = buf->len,
+ .parent = parent,
+ .owning_root = btrfs_header_owner(buf),
+ .ref_root = root_id,
+ };
+
+ /*
+ * Assert that the extent buffer is not cleared due to
+		 * EXTENT_BUFFER_ZONED_ZEROOUT. Please refer to
+		 * btrfs_clear_buffer_dirty() and btree_csum_one_bio() for
+		 * details.
+ */
+ ASSERT(btrfs_header_bytenr(buf) != 0);
+
+ btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), 0, false);
btrfs_ref_tree_mod(fs_info, &generic_ref);
ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret < 0)
+ return ret;
}
if (!last_ref)
- return;
+ return 0;
if (btrfs_header_generation(buf) != trans->transid)
goto out;
@@ -3520,6 +3508,7 @@ out:
* matter anymore.
*/
clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
+ return 0;
}
/* Can return -ENOMEM */
@@ -3535,11 +3524,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
* tree log blocks never actually go into the extent allocation
* tree, just update pinning info and exit early.
*/
- if ((ref->type == BTRFS_REF_METADATA &&
- ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID) ||
- (ref->type == BTRFS_REF_DATA &&
- ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
- btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
+ if (ref->ref_root == BTRFS_TREE_LOG_OBJECTID) {
+ btrfs_pin_extent(trans, ref->bytenr, ref->num_bytes, 1);
ret = 0;
} else if (ref->type == BTRFS_REF_METADATA) {
ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
@@ -3547,10 +3533,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
ret = btrfs_add_delayed_data_ref(trans, ref, 0);
}
- if (!((ref->type == BTRFS_REF_METADATA &&
- ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID) ||
- (ref->type == BTRFS_REF_DATA &&
- ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
+ if (ref->ref_root != BTRFS_TREE_LOG_OBJECTID)
btrfs_ref_tree_mod(fs_info, ref);
return ret;
@@ -4685,7 +4668,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
bool final_tried = num_bytes == min_alloc_size;
u64 flags;
int ret;
- bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
+ bool for_treelog = (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID);
bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data);
flags = get_alloc_profile_by_root(root, is_data);
@@ -4879,16 +4862,16 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_extent_inline_ref *iref;
struct btrfs_path *path;
struct extent_buffer *leaf;
- struct btrfs_delayed_tree_ref *ref;
u32 size = sizeof(*extent_item) + sizeof(*iref);
u64 flags = extent_op->flags_to_set;
+ /* The owner of a tree block is the level. */
+ int level = btrfs_delayed_ref_owner(node);
bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
- ref = btrfs_delayed_node_to_tree_ref(node);
-
extent_key.objectid = node->bytenr;
if (skinny_metadata) {
- extent_key.offset = ref->level;
+ /* The owner of a tree block is the level. */
+ extent_key.offset = level;
extent_key.type = BTRFS_METADATA_ITEM_KEY;
} else {
extent_key.offset = node->num_bytes;
@@ -4921,18 +4904,18 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
} else {
block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
- btrfs_set_tree_block_level(leaf, block_info, ref->level);
+ btrfs_set_tree_block_level(leaf, block_info, level);
iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
}
if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
btrfs_set_extent_inline_ref_type(leaf, iref,
BTRFS_SHARED_BLOCK_REF_KEY);
- btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, node->parent);
} else {
btrfs_set_extent_inline_ref_type(leaf, iref,
BTRFS_TREE_BLOCK_REF_KEY);
- btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, node->ref_root);
}
btrfs_mark_buffer_dirty(trans, leaf);
@@ -4946,19 +4929,20 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
u64 offset, u64 ram_bytes,
struct btrfs_key *ins)
{
- struct btrfs_ref generic_ref = { 0 };
- u64 root_objectid = root->root_key.objectid;
- u64 owning_root = root_objectid;
+ struct btrfs_ref generic_ref = {
+ .action = BTRFS_ADD_DELAYED_EXTENT,
+ .bytenr = ins->objectid,
+ .num_bytes = ins->offset,
+ .owning_root = btrfs_root_id(root),
+ .ref_root = btrfs_root_id(root),
+ };
- BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
+ ASSERT(generic_ref.ref_root != BTRFS_TREE_LOG_OBJECTID);
if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root))
- owning_root = root->relocation_src_root;
+ generic_ref.owning_root = root->relocation_src_root;
- btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
- ins->objectid, ins->offset, 0, owning_root);
- btrfs_init_data_ref(&generic_ref, root_objectid, owner,
- offset, 0, false);
+ btrfs_init_data_ref(&generic_ref, owner, offset, 0, false);
btrfs_ref_tree_mod(root->fs_info, &generic_ref);
return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
@@ -5081,7 +5065,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
*/
btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level);
- __btrfs_tree_lock(buf, nest);
+ btrfs_tree_lock_nested(buf, nest);
btrfs_clear_buffer_dirty(trans, buf);
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
clear_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &buf->bflags);
@@ -5096,7 +5080,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
btrfs_set_header_owner(buf, owner);
write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
- if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+ if (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID) {
buf->log_index = root->log_transid % 2;
/*
* we allow two log transactions at a time, use different
@@ -5137,7 +5121,6 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_block_rsv *block_rsv;
struct extent_buffer *buf;
struct btrfs_delayed_extent_op *extent_op;
- struct btrfs_ref generic_ref = { 0 };
u64 flags = 0;
int ret;
u32 blocksize = fs_info->nodesize;
@@ -5180,6 +5163,14 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
BUG_ON(parent > 0);
if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
+ struct btrfs_ref generic_ref = {
+ .action = BTRFS_ADD_DELAYED_EXTENT,
+ .bytenr = ins.objectid,
+ .num_bytes = ins.offset,
+ .parent = parent,
+ .owning_root = owning_root,
+ .ref_root = root_objectid,
+ };
extent_op = btrfs_alloc_delayed_extent_op();
if (!extent_op) {
ret = -ENOMEM;
@@ -5194,10 +5185,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
extent_op->update_flags = true;
extent_op->level = level;
- btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
- ins.objectid, ins.offset, parent, owning_root);
- btrfs_init_tree_ref(&generic_ref, level, root_objectid,
- root->root_key.objectid, false);
+ btrfs_init_tree_ref(&generic_ref, level, btrfs_root_id(root), false);
btrfs_ref_tree_mod(fs_info, &generic_ref);
ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
if (ret)
@@ -5335,8 +5323,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
int ret;
- if (wc->stage == UPDATE_BACKREF &&
- btrfs_header_owner(eb) != root->root_key.objectid)
+ if (wc->stage == UPDATE_BACKREF && btrfs_header_owner(eb) != btrfs_root_id(root))
return 1;
/*
@@ -5410,7 +5397,7 @@ static int check_ref_exists(struct btrfs_trans_handle *trans,
ret = lookup_extent_backref(trans, path, &iref, bytenr,
root->fs_info->nodesize, parent,
- root->root_key.objectid, level, 0);
+ btrfs_root_id(root), level, 0);
btrfs_free_path(path);
if (ret == -ENOENT)
return 0;
@@ -5440,11 +5427,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
u64 bytenr;
u64 generation;
- u64 parent;
u64 owner_root = 0;
struct btrfs_tree_parent_check check = { 0 };
struct btrfs_key key;
- struct btrfs_ref ref = { 0 };
struct extent_buffer *next;
int level = wc->level;
int reada = 0;
@@ -5468,7 +5453,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
check.level = level - 1;
check.transid = generation;
- check.owner_root = root->root_key.objectid;
+ check.owner_root = btrfs_root_id(root);
check.has_first_key = true;
btrfs_node_key_to_cpu(path->nodes[level], &check.first_key,
path->slots[level]);
@@ -5476,7 +5461,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
next = find_extent_buffer(fs_info, bytenr);
if (!next) {
next = btrfs_find_create_tree_block(fs_info, bytenr,
- root->root_key.objectid, level - 1);
+ btrfs_root_id(root), level - 1);
if (IS_ERR(next))
return PTR_ERR(next);
reada = 1;
@@ -5561,19 +5546,25 @@ skip:
wc->refs[level - 1] = 0;
wc->flags[level - 1] = 0;
if (wc->stage == DROP_REFERENCE) {
+ struct btrfs_ref ref = {
+ .action = BTRFS_DROP_DELAYED_REF,
+ .bytenr = bytenr,
+ .num_bytes = fs_info->nodesize,
+ .owning_root = owner_root,
+ .ref_root = btrfs_root_id(root),
+ };
if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
- parent = path->nodes[level]->start;
+ ref.parent = path->nodes[level]->start;
} else {
- ASSERT(root->root_key.objectid ==
+ ASSERT(btrfs_root_id(root) ==
btrfs_header_owner(path->nodes[level]));
- if (root->root_key.objectid !=
+ if (btrfs_root_id(root) !=
btrfs_header_owner(path->nodes[level])) {
btrfs_err(root->fs_info,
"mismatched block owner");
ret = -EIO;
goto out_unlock;
}
- parent = 0;
}
/*
@@ -5583,7 +5574,7 @@ skip:
* ->restarted flag.
*/
if (wc->restarted) {
- ret = check_ref_exists(trans, root, bytenr, parent,
+ ret = check_ref_exists(trans, root, bytenr, ref.parent,
level - 1);
if (ret < 0)
goto out_unlock;
@@ -5598,8 +5589,7 @@ skip:
* already accounted them at merge time (replace_path),
* thus we could skip expensive subtree trace here.
*/
- if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
- need_account) {
+ if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && need_account) {
ret = btrfs_qgroup_trace_subtree(trans, next,
generation, level - 1);
if (ret) {
@@ -5618,10 +5608,7 @@ skip:
wc->drop_level = level;
find_next_key(path, level, &wc->drop_progress);
- btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
- fs_info->nodesize, parent, owner_root);
- btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid,
- 0, false);
+ btrfs_init_tree_ref(&ref, level - 1, 0, false);
ret = btrfs_free_extent(trans, &ref);
if (ret)
goto out_unlock;
@@ -5655,7 +5642,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
struct walk_control *wc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- int ret;
+ int ret = 0;
int level = wc->level;
struct extent_buffer *eb = path->nodes[level];
u64 parent = 0;
@@ -5712,7 +5699,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
else
ret = btrfs_dec_ref(trans, root, eb, 0);
BUG_ON(ret); /* -ENOMEM */
- if (is_fstree(root->root_key.objectid)) {
+ if (is_fstree(btrfs_root_id(root))) {
ret = btrfs_qgroup_trace_leaf_items(trans, eb);
if (ret) {
btrfs_err_rl(fs_info,
@@ -5732,26 +5719,28 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (eb == root->node) {
if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
parent = eb->start;
- else if (root->root_key.objectid != btrfs_header_owner(eb))
+ else if (btrfs_root_id(root) != btrfs_header_owner(eb))
goto owner_mismatch;
} else {
if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
parent = path->nodes[level + 1]->start;
- else if (root->root_key.objectid !=
+ else if (btrfs_root_id(root) !=
btrfs_header_owner(path->nodes[level + 1]))
goto owner_mismatch;
}
- btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent,
- wc->refs[level] == 1);
+ ret = btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent,
+ wc->refs[level] == 1);
+ if (ret < 0)
+ btrfs_abort_transaction(trans, ret);
out:
wc->refs[level] = 0;
wc->flags[level] = 0;
- return 0;
+ return ret;
owner_mismatch:
btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
- btrfs_header_owner(eb), root->root_key.objectid);
+ btrfs_header_owner(eb), btrfs_root_id(root));
return -EUCLEAN;
}
@@ -5837,8 +5826,7 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
*/
int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
{
- const bool is_reloc_root = (root->root_key.objectid ==
- BTRFS_TREE_RELOC_OBJECTID);
+ const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID);
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct btrfs_trans_handle *trans;
@@ -5852,7 +5840,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
bool root_dropped = false;
bool unfinished_drop = false;
- btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
+ btrfs_debug(fs_info, "Drop subvolume %llu", btrfs_root_id(root));
path = btrfs_alloc_path();
if (!path) {
@@ -6050,8 +6038,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
*
* The most common failure here is just -ENOENT.
*/
- btrfs_del_orphan_item(trans, tree_root,
- root->root_key.objectid);
+ btrfs_del_orphan_item(trans, tree_root, btrfs_root_id(root));
}
}
@@ -6113,9 +6100,8 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
int level;
int parent_level;
int ret = 0;
- int wret;
- BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+ BUG_ON(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID);
path = btrfs_alloc_path();
if (!path)
@@ -6149,17 +6135,16 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
while (1) {
- wret = walk_down_tree(trans, root, path, wc);
- if (wret < 0) {
- ret = wret;
+ ret = walk_down_tree(trans, root, path, wc);
+ if (ret < 0)
break;
- }
- wret = walk_up_tree(trans, root, path, wc, parent_level);
- if (wret < 0)
- ret = wret;
- if (wret != 0)
+ ret = walk_up_tree(trans, root, path, wc, parent_level);
+ if (ret) {
+ if (ret > 0)
+ ret = 0;
break;
+ }
}
kfree(wc);
@@ -6167,10 +6152,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
return ret;
}
-int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
- u64 start, u64 end)
+/*
+ * Unpin the extent range in an error context and don't add the space back.
+ * Errors are not propagated further.
+ */
+void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end)
{
- return unpin_extent_range(fs_info, start, end, false);
+ unpin_extent_range(fs_info, start, end, false);
}
/*
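For illustration only, a minimal sketch (not part of this patch) of the caller pattern enabled by btrfs_free_tree_block() now returning an int, mirroring the walk_up_proc() hunk above; the wrapper function and its arguments are hypothetical:

static int free_cow_copy_example(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct extent_buffer *eb, u64 parent)
{
	int ret;

	/* The freeing helper now reports failures instead of returning void. */
	ret = btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent, 1);
	if (ret < 0)
		btrfs_abort_transaction(trans, ret);
	return ret;
}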
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
index 2e066035cc..2ad51130c0 100644
--- a/fs/btrfs/extent-tree.h
+++ b/fs/btrfs/extent-tree.h
@@ -3,11 +3,21 @@
#ifndef BTRFS_EXTENT_TREE_H
#define BTRFS_EXTENT_TREE_H
+#include <linux/types.h>
#include "misc.h"
#include "block-group.h"
+#include "locking.h"
+struct extent_buffer;
struct btrfs_free_cluster;
+struct btrfs_fs_info;
+struct btrfs_root;
+struct btrfs_path;
+struct btrfs_ref;
+struct btrfs_disk_key;
struct btrfs_delayed_ref_head;
+struct btrfs_delayed_ref_root;
+struct btrfs_extent_inline_ref;
enum btrfs_extent_allocation_policy {
BTRFS_EXTENT_ALLOC_CLUSTERED,
@@ -117,10 +127,10 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
u64 empty_size,
u64 reloc_src_root,
enum btrfs_lock_nesting nest);
-void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
- u64 root_id,
- struct extent_buffer *buf,
- u64 parent, int last_ref);
+int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+ u64 root_id,
+ struct extent_buffer *buf,
+ u64 parent, int last_ref);
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 owner,
u64 offset, u64 ram_bytes,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index b5175b9929..0486b1f911 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -14,7 +14,6 @@
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/fsverity.h>
-#include "misc.h"
#include "extent_io.h"
#include "extent-io-tree.h"
#include "extent_map.h"
@@ -22,7 +21,6 @@
#include "btrfs_inode.h"
#include "bio.h"
#include "locking.h"
-#include "rcu-string.h"
#include "backref.h"
#include "disk-io.h"
#include "subpage.h"
@@ -78,10 +76,11 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
eb = list_first_entry(&fs_info->allocated_ebs,
struct extent_buffer, leak_list);
pr_err(
- "BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n",
+ "BTRFS: buffer leak start %llu len %u refs %d bflags %lu owner %llu\n",
eb->start, eb->len, atomic_read(&eb->refs), eb->bflags,
btrfs_header_owner(eb));
list_del(&eb->leak_list);
+ WARN_ON_ONCE(1);
kmem_cache_free(extent_buffer_cache, eb);
}
spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
@@ -147,8 +146,8 @@ static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret)
int __init extent_buffer_init_cachep(void)
{
extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
- sizeof(struct extent_buffer), 0,
- SLAB_MEM_SPREAD, NULL);
+ sizeof(struct extent_buffer), 0, 0,
+ NULL);
if (!extent_buffer_cache)
return -ENOMEM;
@@ -397,15 +396,14 @@ again:
/* then test to make sure it is all still delalloc */
ret = test_range_bit(tree, delalloc_start, delalloc_end,
EXTENT_DELALLOC, cached_state);
+
+ unlock_extent(tree, delalloc_start, delalloc_end, &cached_state);
if (!ret) {
- unlock_extent(tree, delalloc_start, delalloc_end,
- &cached_state);
__unlock_for_delalloc(inode, locked_page,
delalloc_start, delalloc_end);
cond_resched();
goto again;
}
- free_extent_state(cached_state);
*start = delalloc_start;
*end = delalloc_end;
out_failed:
@@ -414,9 +412,10 @@ out_failed:
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
+ struct extent_state **cached,
u32 clear_bits, unsigned long page_ops)
{
- clear_extent_bit(&inode->io_tree, start, end, clear_bits, NULL);
+ clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached);
__process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
start, end, page_ops);
@@ -462,16 +461,15 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
*/
static void end_bbio_data_write(struct btrfs_bio *bbio)
{
+ struct btrfs_fs_info *fs_info = bbio->fs_info;
struct bio *bio = &bbio->bio;
int error = blk_status_to_errno(bio->bi_status);
struct folio_iter fi;
+ const u32 sectorsize = fs_info->sectorsize;
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_folio_all(fi, bio) {
struct folio *folio = fi.folio;
- struct inode *inode = folio->mapping->host;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- const u32 sectorsize = fs_info->sectorsize;
u64 start = folio_pos(folio) + fi.offset;
u32 len = fi.length;
@@ -593,22 +591,17 @@ static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
*/
static void end_bbio_data_read(struct btrfs_bio *bbio)
{
+ struct btrfs_fs_info *fs_info = bbio->fs_info;
struct bio *bio = &bbio->bio;
struct processed_extent processed = { 0 };
struct folio_iter fi;
- /*
- * The offset to the beginning of a bio, since one bio can never be
- * larger than UINT_MAX, u32 here is enough.
- */
- u32 bio_offset = 0;
+ const u32 sectorsize = fs_info->sectorsize;
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_folio_all(fi, &bbio->bio) {
bool uptodate = !bio->bi_status;
struct folio *folio = fi.folio;
struct inode *inode = folio->mapping->host;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- const u32 sectorsize = fs_info->sectorsize;
u64 start;
u64 end;
u32 len;
@@ -667,10 +660,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
end_page_read(folio_page(folio, 0), uptodate, start, len);
endio_readpage_release_extent(&processed, BTRFS_I(inode),
start, end, uptodate);
-
- ASSERT(bio_offset + len > bio_offset);
- bio_offset += len;
-
}
/* Release the last extent */
endio_readpage_release_extent(&processed, NULL, 0, 0, false);
@@ -678,6 +667,37 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
}
/*
+ * Populate every free slot in a provided array with folios.
+ *
+ * @nr_folios: number of folios to allocate
+ * @folio_array: the array to fill with folios; any existing non-NULL entries in
+ * the array will be skipped
+ * @extra_gfp: the extra GFP flags for the allocation
+ *
+ * Return: 0 if all folios were able to be allocated;
+ * -ENOMEM otherwise; the partially allocated folios are freed and
+ * the array slots zeroed
+ */
+int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array,
+ gfp_t extra_gfp)
+{
+ for (int i = 0; i < nr_folios; i++) {
+ if (folio_array[i])
+ continue;
+ folio_array[i] = folio_alloc(GFP_NOFS | extra_gfp, 0);
+ if (!folio_array[i])
+ goto error;
+ }
+ return 0;
+error:
+ for (int i = 0; i < nr_folios; i++) {
+ if (folio_array[i])
+ folio_put(folio_array[i]);
+ }
+ return -ENOMEM;
+}
+
+/*
* Populate every free slot in a provided array with pages.
*
* @nr_pages: number of pages to allocate
@@ -728,6 +748,8 @@ static int alloc_eb_folio_array(struct extent_buffer *eb, gfp_t extra_gfp)
for (int i = 0; i < num_pages; i++)
eb->folios[i] = page_folio(page_array[i]);
+ eb->folio_size = PAGE_SIZE;
+ eb->folio_shift = PAGE_SHIFT;
return 0;
}
@@ -964,13 +986,14 @@ void clear_page_extent_mapped(struct page *page)
folio_detach_private(folio);
}
-static struct extent_map *
-__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
+static struct extent_map *__get_extent_map(struct inode *inode, struct page *page,
u64 start, u64 len, struct extent_map **em_cached)
{
struct extent_map *em;
- if (em_cached && *em_cached) {
+ ASSERT(em_cached);
+
+ if (*em_cached) {
em = *em_cached;
if (extent_map_in_tree(em) && start >= em->start &&
start < extent_map_end(em)) {
@@ -982,8 +1005,8 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
*em_cached = NULL;
}
- em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len);
- if (em_cached && !IS_ERR(em)) {
+ em = btrfs_get_extent(BTRFS_I(inode), page, start, len);
+ if (!IS_ERR(em)) {
BUG_ON(*em_cached);
refcount_inc(&em->refs);
*em_cached = em;
@@ -1045,8 +1068,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
end_page_read(page, true, cur, iosize);
break;
}
- em = __get_extent_map(inode, page, pg_offset, cur,
- end - cur + 1, em_cached);
+ em = __get_extent_map(inode, page, cur, end - cur + 1, em_cached);
if (IS_ERR(em)) {
unlock_extent(tree, cur, end, NULL);
end_page_read(page, false, cur, end + 1 - cur);
@@ -1155,11 +1177,14 @@ int btrfs_read_folio(struct file *file, struct folio *folio)
u64 start = page_offset(page);
u64 end = start + PAGE_SIZE - 1;
struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ };
+ struct extent_map *em_cached = NULL;
int ret;
btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
- ret = btrfs_do_readpage(page, NULL, &bio_ctrl, NULL);
+ ret = btrfs_do_readpage(page, &em_cached, &bio_ctrl, NULL);
+ free_extent_map(em_cached);
+
/*
* If btrfs_do_readpage() failed we will want to submit the assembled
* bio to do the cleanup.
@@ -1177,6 +1202,8 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages,
struct btrfs_inode *inode = page_to_inode(pages[0]);
int index;
+ ASSERT(em_cached);
+
btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
for (index = 0; index < nr_pages; index++) {
@@ -1365,7 +1392,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
continue;
}
- em = btrfs_get_extent(inode, NULL, 0, cur, len);
+ em = btrfs_get_extent(inode, NULL, cur, len);
if (IS_ERR(em)) {
ret = PTR_ERR_OR_ZERO(em);
goto out_error;
@@ -1575,7 +1602,7 @@ static void set_btree_ioerr(struct extent_buffer *eb)
* can be no longer dirty nor marked anymore for writeback (if a
* subsequent modification to the extent buffer didn't happen before the
* transaction commit), which makes filemap_fdata[write|wait]_range not
- * able to find the pages tagged with SetPageError at transaction
+ * able to find the pages which contain errors at transaction
* commit time. So if this happens we must abort the transaction,
* otherwise we commit a super block with btree roots that point to
* btree nodes/leafs whose content on disk is invalid - either garbage
@@ -1733,10 +1760,10 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
folio_lock(folio);
folio_clear_dirty_for_io(folio);
folio_start_writeback(folio);
- ret = bio_add_folio(&bbio->bio, folio, folio_size(folio), 0);
+ ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0);
ASSERT(ret);
wbc_account_cgroup_owner(wbc, folio_page(folio, 0),
- folio_size(folio));
+ eb->folio_size);
wbc->nr_to_write -= folio_nr_pages(folio);
folio_unlock(folio);
}
@@ -2219,10 +2246,8 @@ void extent_write_locked_range(struct inode *inode, struct page *locked_page,
page = find_get_page(mapping, cur >> PAGE_SHIFT);
ASSERT(PageLocked(page));
- if (pages_dirty && page != locked_page) {
+ if (pages_dirty && page != locked_page)
ASSERT(PageDirty(page));
- clear_page_dirty_for_io(page);
- }
ret = __extent_writepage_io(BTRFS_I(inode), page, &bio_ctrl,
i_size, &nr);
@@ -2250,8 +2275,7 @@ next_page:
submit_write_bio(&bio_ctrl, found_error ? ret : 0);
}
-int extent_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
+int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
int ret = 0;
@@ -2271,7 +2295,7 @@ int extent_writepages(struct address_space *mapping,
return ret;
}
-void extent_readahead(struct readahead_control *rac)
+void btrfs_readahead(struct readahead_control *rac)
{
struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD };
struct page *pagepool[16];
@@ -2329,19 +2353,20 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
* are locked or under IO and drops the related state bits if it is safe
* to drop the page.
*/
-static int try_release_extent_state(struct extent_io_tree *tree,
+static bool try_release_extent_state(struct extent_io_tree *tree,
struct page *page, gfp_t mask)
{
u64 start = page_offset(page);
u64 end = start + PAGE_SIZE - 1;
- int ret = 1;
+ bool ret;
if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) {
- ret = 0;
+ ret = false;
} else {
u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM |
EXTENT_DELALLOC_NEW | EXTENT_CTLBITS |
EXTENT_QGROUP_RESERVED);
+ int ret2;
/*
* At this point we can safely clear everything except the
@@ -2349,15 +2374,15 @@ static int try_release_extent_state(struct extent_io_tree *tree,
* The delalloc new bit will be cleared by ordered extent
* completion.
*/
- ret = __clear_extent_bit(tree, start, end, clear_bits, NULL, NULL);
+ ret2 = __clear_extent_bit(tree, start, end, clear_bits, NULL, NULL);
/* if clear_extent_bit failed for enomem reasons,
* we can't allow the release to continue.
*/
- if (ret < 0)
- ret = 0;
+ if (ret2 < 0)
+ ret = false;
else
- ret = 1;
+ ret = true;
}
return ret;
}
@@ -2367,84 +2392,80 @@ static int try_release_extent_state(struct extent_io_tree *tree,
* in the range corresponding to the page, both state records and extent
* map records are removed
*/
-int try_release_extent_mapping(struct page *page, gfp_t mask)
+bool try_release_extent_mapping(struct page *page, gfp_t mask)
{
- struct extent_map *em;
u64 start = page_offset(page);
u64 end = start + PAGE_SIZE - 1;
- struct btrfs_inode *btrfs_inode = page_to_inode(page);
- struct extent_io_tree *tree = &btrfs_inode->io_tree;
- struct extent_map_tree *map = &btrfs_inode->extent_tree;
-
- if (gfpflags_allow_blocking(mask) &&
- page->mapping->host->i_size > SZ_16M) {
- u64 len;
- while (start <= end) {
- struct btrfs_fs_info *fs_info;
- u64 cur_gen;
-
- len = end - start + 1;
- write_lock(&map->lock);
- em = lookup_extent_mapping(map, start, len);
- if (!em) {
- write_unlock(&map->lock);
- break;
- }
- if ((em->flags & EXTENT_FLAG_PINNED) ||
- em->start != start) {
- write_unlock(&map->lock);
- free_extent_map(em);
- break;
- }
- if (test_range_bit_exists(tree, em->start,
- extent_map_end(em) - 1,
- EXTENT_LOCKED))
- goto next;
- /*
- * If it's not in the list of modified extents, used
- * by a fast fsync, we can remove it. If it's being
- * logged we can safely remove it since fsync took an
- * extra reference on the em.
- */
- if (list_empty(&em->list) ||
- (em->flags & EXTENT_FLAG_LOGGING))
- goto remove_em;
- /*
- * If it's in the list of modified extents, remove it
- * only if its generation is older then the current one,
- * in which case we don't need it for a fast fsync.
- * Otherwise don't remove it, we could be racing with an
- * ongoing fast fsync that could miss the new extent.
- */
- fs_info = btrfs_inode->root->fs_info;
- spin_lock(&fs_info->trans_lock);
- cur_gen = fs_info->generation;
- spin_unlock(&fs_info->trans_lock);
- if (em->generation >= cur_gen)
- goto next;
-remove_em:
- /*
- * We only remove extent maps that are not in the list of
- * modified extents or that are in the list but with a
- * generation lower then the current generation, so there
- * is no need to set the full fsync flag on the inode (it
- * hurts the fsync performance for workloads with a data
- * size that exceeds or is close to the system's memory).
- */
- remove_extent_mapping(map, em);
- /* once for the rb tree */
+ struct btrfs_inode *inode = page_to_inode(page);
+ struct extent_io_tree *io_tree = &inode->io_tree;
+
+ while (start <= end) {
+ const u64 cur_gen = btrfs_get_fs_generation(inode->root->fs_info);
+ const u64 len = end - start + 1;
+ struct extent_map_tree *extent_tree = &inode->extent_tree;
+ struct extent_map *em;
+
+ write_lock(&extent_tree->lock);
+ em = lookup_extent_mapping(extent_tree, start, len);
+ if (!em) {
+ write_unlock(&extent_tree->lock);
+ break;
+ }
+ if ((em->flags & EXTENT_FLAG_PINNED) || em->start != start) {
+ write_unlock(&extent_tree->lock);
free_extent_map(em);
+ break;
+ }
+ if (test_range_bit_exists(io_tree, em->start,
+ extent_map_end(em) - 1, EXTENT_LOCKED))
+ goto next;
+ /*
+ * If it's not in the list of modified extents, used by a fast
+ * fsync, we can remove it. If it's being logged we can safely
+ * remove it since fsync took an extra reference on the em.
+ */
+ if (list_empty(&em->list) || (em->flags & EXTENT_FLAG_LOGGING))
+ goto remove_em;
+ /*
+ * If it's in the list of modified extents, remove it only if
+ * its generation is older than the current one, in which case
+ * we don't need it for a fast fsync. Otherwise don't remove it,
+ * we could be racing with an ongoing fast fsync that could miss
+ * the new extent.
+ */
+ if (em->generation >= cur_gen)
+ goto next;
+remove_em:
+ /*
+ * We only remove extent maps that are not in the list of
+ * modified extents or that are in the list but with a
+ * generation lower than the current generation, so there is no
+ * need to set the full fsync flag on the inode (it hurts the
+ * fsync performance for workloads with a data size that exceeds
+ * or is close to the system's memory).
+ */
+ remove_extent_mapping(inode, em);
+ /* Once for the inode's extent map tree. */
+ free_extent_map(em);
next:
- start = extent_map_end(em);
- write_unlock(&map->lock);
+ start = extent_map_end(em);
+ write_unlock(&extent_tree->lock);
- /* once for us */
- free_extent_map(em);
+ /* Once for us, for the lookup_extent_mapping() reference. */
+ free_extent_map(em);
- cond_resched(); /* Allow large-extent preemption. */
+ if (need_resched()) {
+ /*
+ * If we need to resched but we can't block just exit
+ * and leave any remaining extent maps.
+ */
+ if (!gfpflags_allow_blocking(mask))
+ break;
+
+ cond_resched();
}
}
- return try_release_extent_state(tree, page, mask);
+ return try_release_extent_state(io_tree, page, mask);
}
struct btrfs_fiemap_entry {
@@ -2746,7 +2767,7 @@ static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path)
{
- struct extent_buffer *clone;
+ struct extent_buffer *clone = path->nodes[0];
struct btrfs_key key;
int slot;
int ret;
@@ -2755,29 +2776,51 @@ static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *p
if (path->slots[0] < btrfs_header_nritems(path->nodes[0]))
return 0;
+ /*
+ * Add a temporary extra ref to an already cloned extent buffer to
+ * prevent btrfs_next_leaf() freeing it, as we want to reuse it to avoid
+ * the cost of allocating a new one.
+ */
+ ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags));
+ atomic_inc(&clone->refs);
+
ret = btrfs_next_leaf(inode->root, path);
if (ret != 0)
- return ret;
+ goto out;
/*
* Don't bother with cloning if there are no more file extent items for
* our inode.
*/
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY)
- return 1;
+ if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) {
+ ret = 1;
+ goto out;
+ }
+ /*
+ * Important to preserve the start field, for the optimizations when
+ * checking if extents are shared (see extent_fiemap()).
+ *
+ * We must set ->start before calling copy_extent_buffer_full(). If we
+ * are on sub-pagesize blocksize, we use ->start to determine the offset
+ * into the folio where our eb exists, and if we update ->start after
+ * the fact then any subsequent reads of the eb may read from a
+ * different offset in the folio than where we originally copied into.
+ */
+ clone->start = path->nodes[0]->start;
/* See the comment at fiemap_search_slot() about why we clone. */
- clone = btrfs_clone_extent_buffer(path->nodes[0]);
- if (!clone)
- return -ENOMEM;
+ copy_extent_buffer_full(clone, path->nodes[0]);
slot = path->slots[0];
btrfs_release_path(path);
path->nodes[0] = clone;
path->slots[0] = slot;
+out:
+ if (ret)
+ free_extent_buffer(clone);
- return 0;
+ return ret;
}
/*
@@ -3508,7 +3551,7 @@ err:
for (int i = 0; i < num_folios; i++) {
if (eb->folios[i]) {
detach_extent_buffer_folio(eb, eb->folios[i]);
- __folio_put(eb->folios[i]);
+ folio_put(eb->folios[i]);
}
}
__free_extent_buffer(eb);
@@ -3644,6 +3687,8 @@ static struct extent_buffer *grab_extent_buffer(
struct folio *folio = page_folio(page);
struct extent_buffer *exists;
+ lockdep_assert_held(&page->mapping->i_private_lock);
+
/*
* For subpage case, we completely rely on radix tree to ensure we
* don't try to insert two ebs for the same bytenr. So here we always
@@ -3711,13 +3756,14 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
* The caller needs to free the existing folios and retry using the same order.
*/
static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
+ struct btrfs_subpage *prealloc,
struct extent_buffer **found_eb_ret)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct address_space *mapping = fs_info->btree_inode->i_mapping;
const unsigned long index = eb->start >> PAGE_SHIFT;
- struct folio *existing_folio;
+ struct folio *existing_folio = NULL;
int ret;
ASSERT(found_eb_ret);
@@ -3729,30 +3775,31 @@ retry:
ret = filemap_add_folio(mapping, eb->folios[i], index + i,
GFP_NOFS | __GFP_NOFAIL);
if (!ret)
- return 0;
+ goto finish;
existing_folio = filemap_lock_folio(mapping, index + i);
/* The page cache only exists for a very short time, just retry. */
- if (IS_ERR(existing_folio))
+ if (IS_ERR(existing_folio)) {
+ existing_folio = NULL;
goto retry;
+ }
/* For now, we should only have single-page folios for btree inode. */
ASSERT(folio_nr_pages(existing_folio) == 1);
- if (folio_size(existing_folio) != folio_size(eb->folios[0])) {
+ if (folio_size(existing_folio) != eb->folio_size) {
folio_unlock(existing_folio);
folio_put(existing_folio);
return -EAGAIN;
}
- if (fs_info->nodesize < PAGE_SIZE) {
- /*
- * We're going to reuse the existing page, can drop our page
- * and subpage structure now.
- */
+finish:
+ spin_lock(&mapping->i_private_lock);
+ if (existing_folio && fs_info->nodesize < PAGE_SIZE) {
+ /* We're going to reuse the existing page, can drop our folio now. */
__free_page(folio_page(eb->folios[i], 0));
eb->folios[i] = existing_folio;
- } else {
+ } else if (existing_folio) {
struct extent_buffer *existing_eb;
existing_eb = grab_extent_buffer(fs_info,
@@ -3760,6 +3807,7 @@ retry:
if (existing_eb) {
/* The extent buffer still exists, we can use it directly. */
*found_eb_ret = existing_eb;
+ spin_unlock(&mapping->i_private_lock);
folio_unlock(existing_folio);
folio_put(existing_folio);
return 1;
@@ -3768,6 +3816,22 @@ retry:
__free_page(folio_page(eb->folios[i], 0));
eb->folios[i] = existing_folio;
}
+ eb->folio_size = folio_size(eb->folios[i]);
+ eb->folio_shift = folio_shift(eb->folios[i]);
+ /* Should not fail, as we have preallocated the memory. */
+ ret = attach_extent_buffer_folio(eb, eb->folios[i], prealloc);
+ ASSERT(!ret);
+ /*
+ * To indicate that we have an extra eb under allocation, so that
+ * detach_extent_buffer_page() won't release the folio private when the
+ * eb hasn't been inserted into radix tree yet.
+ *
+ * The ref will be decreased when the eb releases the page, in
+ * detach_extent_buffer_page(). Thus it needs no special handling in the
+ * error path.
+ */
+ btrfs_folio_inc_eb_refs(fs_info, eb->folios[i]);
+ spin_unlock(&mapping->i_private_lock);
return 0;
}
@@ -3779,7 +3843,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
int attached = 0;
struct extent_buffer *eb;
struct extent_buffer *existing_eb = NULL;
- struct address_space *mapping = fs_info->btree_inode->i_mapping;
struct btrfs_subpage *prealloc = NULL;
u64 lockdep_owner = owner_root;
bool page_contig = true;
@@ -3845,7 +3908,7 @@ reallocate:
for (int i = 0; i < num_folios; i++) {
struct folio *folio;
- ret = attach_eb_folio_to_filemap(eb, i, &existing_eb);
+ ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb);
if (ret > 0) {
ASSERT(existing_eb);
goto out;
@@ -3882,22 +3945,6 @@ reallocate:
* and free the allocated page.
*/
folio = eb->folios[i];
- spin_lock(&mapping->i_private_lock);
- /* Should not fail, as we have preallocated the memory */
- ret = attach_extent_buffer_folio(eb, folio, prealloc);
- ASSERT(!ret);
- /*
- * To inform we have extra eb under allocation, so that
- * detach_extent_buffer_page() won't release the folio private
- * when the eb hasn't yet been inserted into radix tree.
- *
- * The ref will be decreased when the eb released the page, in
- * detach_extent_buffer_page().
- * Thus needs no special handling in error path.
- */
- btrfs_folio_inc_eb_refs(fs_info, folio);
- spin_unlock(&mapping->i_private_lock);
-
WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len));
/*
@@ -4169,6 +4216,7 @@ void set_extent_buffer_dirty(struct extent_buffer *eb)
num_folios = num_extent_folios(eb);
WARN_ON(atomic_read(&eb->refs) == 0);
WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
+ WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags));
if (!was_dirty) {
bool subpage = eb->fs_info->nodesize < PAGE_SIZE;
@@ -4246,6 +4294,13 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb)
}
}
+static void clear_extent_buffer_reading(struct extent_buffer *eb)
+{
+ clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
+ smp_mb__after_atomic();
+ wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
+}
+
static void end_bbio_meta_read(struct btrfs_bio *bbio)
{
struct extent_buffer *eb = bbio->private;
@@ -4254,6 +4309,13 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio)
struct folio_iter fi;
u32 bio_offset = 0;
+ /*
+ * If the extent buffer is marked UPTODATE before the read operation
+ * completes, other calls to read_extent_buffer_pages() will return
+ * early without waiting for the read to finish, causing data races.
+ */
+ WARN_ON(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags));
+
eb->read_mirror = bbio->mirror_num;
if (uptodate &&
@@ -4280,9 +4342,7 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio)
bio_offset += len;
}
- clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
- smp_mb__after_atomic();
- wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
+ clear_extent_buffer_reading(eb);
free_extent_buffer(eb);
bio_put(&bbio->bio);
@@ -4316,9 +4376,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
* will now be set, and we shouldn't read it in again.
*/
if (unlikely(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))) {
- clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
- smp_mb__after_atomic();
- wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
+ clear_extent_buffer_reading(eb);
return 0;
}
@@ -4344,7 +4402,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
for (int i = 0; i < num_folios; i++) {
struct folio *folio = eb->folios[i];
- ret = bio_add_folio(&bbio->bio, folio, folio_size(folio), 0);
+ ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0);
ASSERT(ret);
}
}
@@ -4364,7 +4422,7 @@ static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
unsigned long len)
{
btrfs_warn(eb->fs_info,
- "access to eb bytenr %llu len %lu out of range start %lu len %lu",
+ "access to eb bytenr %llu len %u out of range start %lu len %lu",
eb->start, eb->len, start, len);
WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
@@ -4393,7 +4451,7 @@ static inline int check_eb_range(const struct extent_buffer *eb,
void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
unsigned long start, unsigned long len)
{
- const int unit_size = folio_size(eb->folios[0]);
+ const int unit_size = eb->folio_size;
size_t cur;
size_t offset;
char *dst = (char *)dstv;
@@ -4433,7 +4491,7 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
void __user *dstv,
unsigned long start, unsigned long len)
{
- const int unit_size = folio_size(eb->folios[0]);
+ const int unit_size = eb->folio_size;
size_t cur;
size_t offset;
char __user *dst = (char __user *)dstv;
@@ -4473,7 +4531,7 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
unsigned long start, unsigned long len)
{
- const int unit_size = folio_size(eb->folios[0]);
+ const int unit_size = eb->folio_size;
size_t cur;
size_t offset;
char *kaddr;
@@ -4544,7 +4602,7 @@ static void __write_extent_buffer(const struct extent_buffer *eb,
const void *srcv, unsigned long start,
unsigned long len, bool use_memmove)
{
- const int unit_size = folio_size(eb->folios[0]);
+ const int unit_size = eb->folio_size;
size_t cur;
size_t offset;
char *kaddr;
@@ -4593,7 +4651,7 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
static void memset_extent_buffer(const struct extent_buffer *eb, int c,
unsigned long start, unsigned long len)
{
- const int unit_size = folio_size(eb->folios[0]);
+ const int unit_size = eb->folio_size;
unsigned long cur = start;
if (eb->addr) {
@@ -4624,7 +4682,7 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
void copy_extent_buffer_full(const struct extent_buffer *dst,
const struct extent_buffer *src)
{
- const int unit_size = folio_size(src->folios[0]);
+ const int unit_size = src->folio_size;
unsigned long cur = 0;
ASSERT(dst->len == src->len);
@@ -4646,7 +4704,7 @@ void copy_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len)
{
- const int unit_size = folio_size(dst->folios[0]);
+ const int unit_size = dst->folio_size;
u64 dst_len = dst->len;
size_t cur;
size_t offset;
@@ -4702,10 +4760,10 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb,
* the bitmap item in the extent buffer + the offset of the byte in the
* bitmap item.
*/
- offset = start + offset_in_folio(eb->folios[0], eb->start) + byte_offset;
+ offset = start + offset_in_eb_folio(eb, eb->start) + byte_offset;
- *folio_index = offset >> folio_shift(eb->folios[0]);
- *folio_offset = offset_in_folio(eb->folios[0], offset);
+ *folio_index = offset >> eb->folio_shift;
+ *folio_offset = offset_in_eb_folio(eb, offset);
}
/*
@@ -4819,7 +4877,7 @@ void memcpy_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len)
{
- const int unit_size = folio_size(dst->folios[0]);
+ const int unit_size = dst->folio_size;
unsigned long cur_off = 0;
if (check_eb_range(dst, dst_offset, len) ||
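As a usage illustration (not part of this patch), a hedged sketch of calling the new btrfs_alloc_folio_array() helper added above; the caller name is hypothetical and only the allocate/cleanup contract follows the helper's code:

static int fill_folio_batch_example(struct folio **folios, unsigned int nr)
{
	int ret;

	/* Slots that already hold a folio are skipped by the helper. */
	ret = btrfs_alloc_folio_array(nr, folios, 0);
	if (ret)
		return ret;	/* -ENOMEM; the helper put the folios it found on error */

	/* ... use the folios, then folio_put() each of them when done ... */
	return 0;
}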
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 2c9d6570b0..dca6b12769 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -7,11 +7,33 @@
#include <linux/refcount.h>
#include <linux/fiemap.h>
#include <linux/btrfs_tree.h>
+#include <linux/spinlock.h>
+#include <linux/atomic.h>
+#include <linux/rwsem.h>
+#include <linux/list.h>
+#include <linux/slab.h>
#include "compression.h"
+#include "messages.h"
#include "ulist.h"
#include "misc.h"
+struct page;
+struct file;
+struct folio;
+struct inode;
+struct fiemap_extent_info;
+struct readahead_control;
+struct address_space;
+struct writeback_control;
+struct extent_io_tree;
+struct extent_map_tree;
+struct extent_state;
+struct btrfs_block_group;
+struct btrfs_fs_info;
+struct btrfs_inode;
+struct btrfs_root;
struct btrfs_trans_handle;
+struct btrfs_tree_parent_check;
enum {
EXTENT_BUFFER_UPTODATE,
@@ -63,11 +85,6 @@ enum {
#define BITMAP_LAST_BYTE_MASK(nbits) \
(BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1)))
-struct btrfs_root;
-struct btrfs_inode;
-struct btrfs_fs_info;
-struct extent_io_tree;
-struct btrfs_tree_parent_check;
int __init extent_buffer_init_cachep(void);
void __cold extent_buffer_free_cachep(void);
@@ -75,7 +92,8 @@ void __cold extent_buffer_free_cachep(void);
#define INLINE_EXTENT_BUFFER_PAGES (BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE)
struct extent_buffer {
u64 start;
- unsigned long len;
+ u32 len;
+ u32 folio_size;
unsigned long bflags;
struct btrfs_fs_info *fs_info;
@@ -90,6 +108,7 @@ struct extent_buffer {
int read_mirror;
/* >= 0 if eb belongs to a log tree, -1 otherwise */
s8 log_index;
+ u8 folio_shift;
struct rcu_head rcu_head;
struct rw_semaphore lock;
@@ -113,6 +132,13 @@ struct btrfs_eb_write_context {
struct btrfs_block_group *zoned_bg;
};
+static inline unsigned long offset_in_eb_folio(const struct extent_buffer *eb,
+ u64 start)
+{
+ ASSERT(eb->folio_size);
+ return start & (eb->folio_size - 1);
+}
+
/*
* Get the correct offset inside the page of extent buffer.
*
@@ -151,13 +177,13 @@ static inline unsigned long get_eb_folio_index(const struct extent_buffer *eb,
* the folio_shift would be large enough to always make us
* return 0 as index.
* 1.2) Several page sized folios
- * The folio_shift() would be PAGE_SHIFT, giving us the correct
+ * The folio_shift would be PAGE_SHIFT, giving us the correct
* index.
*
* 2) sectorsize < PAGE_SIZE and nodesize < PAGE_SIZE case
* The folio would only be page sized, and always give us 0 as index.
*/
- return offset >> folio_shift(eb->folios[0]);
+ return offset >> eb->folio_shift;
}
/*
@@ -205,20 +231,17 @@ static inline void extent_changeset_free(struct extent_changeset *changeset)
kfree(changeset);
}
-struct extent_map_tree;
-
-int try_release_extent_mapping(struct page *page, gfp_t mask);
+bool try_release_extent_mapping(struct page *page, gfp_t mask);
int try_release_extent_buffer(struct page *page);
int btrfs_read_folio(struct file *file, struct folio *folio);
void extent_write_locked_range(struct inode *inode, struct page *locked_page,
u64 start, u64 end, struct writeback_control *wbc,
bool pages_dirty);
-int extent_writepages(struct address_space *mapping,
- struct writeback_control *wbc);
+int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc);
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc);
-void extent_readahead(struct readahead_control *rac);
+void btrfs_readahead(struct readahead_control *rac);
int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
int set_folio_extent_mapped(struct folio *folio);
@@ -330,6 +353,7 @@ void clear_extent_buffer_uptodate(struct extent_buffer *eb);
void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
+ struct extent_state **cached,
u32 bits_to_clear, unsigned long page_ops);
int extent_invalidate_folio(struct extent_io_tree *tree,
struct folio *folio, size_t offset);
@@ -338,6 +362,8 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
gfp_t extra_gfp);
+int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array,
+ gfp_t extra_gfp);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
bool find_lock_delalloc_range(struct inode *inode,
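A small illustrative sketch (not in the patch) of the offset arithmetic the new eb->folio_size and eb->folio_shift fields allow, following offset_in_eb_folio() and the eb_bitmap_offset() hunk in extent_io.c; the function name is hypothetical:

static void eb_locate_byte_example(const struct extent_buffer *eb,
				   unsigned long byte_nr)
{
	/* Offset from the start of the first folio backing the eb. */
	unsigned long offset = offset_in_eb_folio(eb, eb->start) + byte_nr;
	unsigned long index = offset >> eb->folio_shift;
	unsigned long folio_off = offset & (eb->folio_size - 1);

	pr_debug("eb %llu: byte %lu is in folio %lu at offset %lu\n",
		 eb->start, byte_nr, index, folio_off);
}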
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 54e111b5fa..b4c9a6aa11 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -5,10 +5,10 @@
#include <linux/spinlock.h>
#include "messages.h"
#include "ctree.h"
-#include "volumes.h"
#include "extent_map.h"
#include "compression.h"
#include "btrfs_inode.h"
+#include "disk-io.h"
static struct kmem_cache *extent_map_cache;
@@ -16,8 +16,7 @@ static struct kmem_cache *extent_map_cache;
int __init extent_map_init(void)
{
extent_map_cache = kmem_cache_create("btrfs_extent_map",
- sizeof(struct extent_map), 0,
- SLAB_MEM_SPREAD, NULL);
+ sizeof(struct extent_map), 0, 0, NULL);
if (!extent_map_cache)
return -ENOMEM;
return 0;
@@ -78,6 +77,14 @@ static u64 range_end(u64 start, u64 len)
return start + len;
}
+static void dec_evictable_extent_maps(struct btrfs_inode *inode)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+
+ if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(inode->root)))
+ percpu_counter_dec(&fs_info->evictable_extent_maps);
+}
+
static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
{
struct rb_node **p = &root->rb_root.rb_node;
@@ -225,8 +232,9 @@ static bool mergeable_maps(const struct extent_map *prev, const struct extent_ma
return next->block_start == prev->block_start;
}
-static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
+static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
{
+ struct extent_map_tree *tree = &inode->extent_tree;
struct extent_map *merge = NULL;
struct rb_node *rb;
@@ -254,14 +262,13 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
em->len += merge->len;
em->block_len += merge->block_len;
em->block_start = merge->block_start;
- em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
- em->mod_start = merge->mod_start;
em->generation = max(em->generation, merge->generation);
em->flags |= EXTENT_FLAG_MERGED;
rb_erase_cached(&merge->rb_node, &tree->map);
RB_CLEAR_NODE(&merge->rb_node);
free_extent_map(merge);
+ dec_evictable_extent_maps(inode);
}
}
@@ -273,10 +280,10 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
em->block_len += merge->block_len;
rb_erase_cached(&merge->rb_node, &tree->map);
RB_CLEAR_NODE(&merge->rb_node);
- em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
em->generation = max(em->generation, merge->generation);
em->flags |= EXTENT_FLAG_MERGED;
free_extent_map(merge);
+ dec_evictable_extent_maps(inode);
}
}
@@ -302,7 +309,6 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen)
struct extent_map_tree *tree = &inode->extent_tree;
int ret = 0;
struct extent_map *em;
- bool prealloc = false;
write_lock(&tree->lock);
em = lookup_extent_mapping(tree, start, len);
@@ -327,20 +333,8 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen)
em->generation = gen;
em->flags &= ~EXTENT_FLAG_PINNED;
- em->mod_start = em->start;
- em->mod_len = em->len;
- if (em->flags & EXTENT_FLAG_FILLING) {
- prealloc = true;
- em->flags &= ~EXTENT_FLAG_FILLING;
- }
-
- try_merge_map(tree, em);
-
- if (prealloc) {
- em->mod_start = em->start;
- em->mod_len = em->len;
- }
+ try_merge_map(inode, em);
out:
write_unlock(&tree->lock);
@@ -349,58 +343,62 @@ out:
}
-void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
+void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em)
{
- lockdep_assert_held_write(&tree->lock);
+ lockdep_assert_held_write(&inode->extent_tree.lock);
em->flags &= ~EXTENT_FLAG_LOGGING;
if (extent_map_in_tree(em))
- try_merge_map(tree, em);
+ try_merge_map(inode, em);
}
-static inline void setup_extent_mapping(struct extent_map_tree *tree,
+static inline void setup_extent_mapping(struct btrfs_inode *inode,
struct extent_map *em,
int modified)
{
refcount_inc(&em->refs);
- em->mod_start = em->start;
- em->mod_len = em->len;
ASSERT(list_empty(&em->list));
if (modified)
- list_add(&em->list, &tree->modified_extents);
+ list_add(&em->list, &inode->extent_tree.modified_extents);
else
- try_merge_map(tree, em);
+ try_merge_map(inode, em);
}
/*
- * Add new extent map to the extent tree
+ * Add a new extent map to an inode's extent map tree.
*
- * @tree: tree to insert new map in
+ * @inode: the target inode
* @em: map to insert
* @modified: indicate whether the given @em should be added to the
* modified list, which indicates the extent needs to be logged
*
- * Insert @em into @tree or perform a simple forward/backward merge with
- * existing mappings. The extent_map struct passed in will be inserted
- * into the tree directly, with an additional reference taken, or a
- * reference dropped if the merge attempt was successful.
+ * Insert @em into the @inode's extent map tree or perform a simple
+ * forward/backward merge with existing mappings. The extent_map struct passed
+ * in will be inserted into the tree directly, with an additional reference
+ * taken, or a reference dropped if the merge attempt was successful.
*/
-static int add_extent_mapping(struct extent_map_tree *tree,
+static int add_extent_mapping(struct btrfs_inode *inode,
struct extent_map *em, int modified)
{
- int ret = 0;
+ struct extent_map_tree *tree = &inode->extent_tree;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret;
lockdep_assert_held_write(&tree->lock);
ret = tree_insert(&tree->map, em);
if (ret)
- goto out;
+ return ret;
- setup_extent_mapping(tree, em, modified);
-out:
- return ret;
+ setup_extent_mapping(inode, em, modified);
+
+ if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(root)))
+ percpu_counter_inc(&fs_info->evictable_extent_maps);
+
+ return 0;
}
static struct extent_map *
@@ -466,16 +464,18 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
}
/*
- * Remove an extent_map from the extent tree.
+ * Remove an extent_map from its inode's extent tree.
*
- * @tree: extent tree to remove from
+ * @inode: the inode the extent map belongs to
* @em: extent map being removed
*
- * Remove @em from @tree. No reference counts are dropped, and no checks
- * are done to see if the range is in use.
+ * Remove @em from the extent tree of @inode. No reference counts are dropped,
+ * and no checks are done to see if the range is in use.
*/
-void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
+void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em)
{
+ struct extent_map_tree *tree = &inode->extent_tree;
+
lockdep_assert_held_write(&tree->lock);
WARN_ON(em->flags & EXTENT_FLAG_PINNED);
@@ -483,13 +483,17 @@ void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
if (!(em->flags & EXTENT_FLAG_LOGGING))
list_del_init(&em->list);
RB_CLEAR_NODE(&em->rb_node);
+
+ dec_evictable_extent_maps(inode);
}
-static void replace_extent_mapping(struct extent_map_tree *tree,
+static void replace_extent_mapping(struct btrfs_inode *inode,
struct extent_map *cur,
struct extent_map *new,
int modified)
{
+ struct extent_map_tree *tree = &inode->extent_tree;
+
lockdep_assert_held_write(&tree->lock);
WARN_ON(cur->flags & EXTENT_FLAG_PINNED);
@@ -499,7 +503,7 @@ static void replace_extent_mapping(struct extent_map_tree *tree,
rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map);
RB_CLEAR_NODE(&cur->rb_node);
- setup_extent_mapping(tree, new, modified);
+ setup_extent_mapping(inode, new, modified);
}
static struct extent_map *next_extent_map(const struct extent_map *em)
@@ -528,7 +532,7 @@ static struct extent_map *prev_extent_map(struct extent_map *em)
* and an extent that you want to insert, deal with overlap and insert
* the best fitted new extent into the tree.
*/
-static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
+static noinline int merge_extent_mapping(struct btrfs_inode *inode,
struct extent_map *existing,
struct extent_map *em,
u64 map_start)
@@ -539,7 +543,8 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
u64 end;
u64 start_diff;
- BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
+ if (map_start < em->start || map_start >= extent_map_end(em))
+ return -EINVAL;
if (existing->start > map_start) {
next = existing;
@@ -561,14 +566,13 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
em->block_start += start_diff;
em->block_len = em->len;
}
- return add_extent_mapping(em_tree, em, 0);
+ return add_extent_mapping(inode, em, 0);
}
/*
- * Add extent mapping into em_tree.
+ * Add extent mapping into an inode's extent map tree.
*
- * @fs_info: the filesystem
- * @em_tree: extent tree into which we want to insert the extent mapping
+ * @inode: target inode
* @em_in: extent we are inserting
* @start: start of the logical range btrfs_get_extent() is requesting
* @len: length of the logical range btrfs_get_extent() is requesting
@@ -576,8 +580,8 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
* Note that @em_in's range may be different from [start, start+len),
* but they must be overlapped.
*
- * Insert @em_in into @em_tree. In case there is an overlapping range, handle
- * the -EEXIST by either:
+ * Insert @em_in into the inode's extent map tree. In case there is an
+ * overlapping range, handle the -EEXIST by either:
* a) Returning the existing extent in @em_in if @start is within the
* existing em.
* b) Merge the existing extent with @em_in passed in.
@@ -585,12 +589,12 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
* Return 0 on success, otherwise -EEXIST.
*
*/
-int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
- struct extent_map_tree *em_tree,
+int btrfs_add_extent_mapping(struct btrfs_inode *inode,
struct extent_map **em_in, u64 start, u64 len)
{
int ret;
struct extent_map *em = *em_in;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
/*
* Tree-checker should have rejected any inline extent with non-zero
@@ -599,7 +603,7 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
if (em->block_start == EXTENT_MAP_INLINE)
ASSERT(em->start == 0);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = add_extent_mapping(inode, em, 0);
/* it is possible that someone inserted the extent into the tree
* while we had the lock dropped. It is also possible that
* an overlapping map exists in the tree
@@ -607,7 +611,7 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
if (ret == -EEXIST) {
struct extent_map *existing;
- existing = search_extent_mapping(em_tree, start, len);
+ existing = search_extent_mapping(&inode->extent_tree, start, len);
trace_btrfs_handle_em_exist(fs_info, existing, em, start, len);
@@ -628,15 +632,14 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
* The existing extent map is the one nearest to
* the [start, start + len) range which overlaps
*/
- ret = merge_extent_mapping(em_tree, existing,
- em, start);
- if (ret) {
+ ret = merge_extent_mapping(inode, existing, em, start);
+ if (WARN_ON(ret)) {
free_extent_map(em);
*em_in = NULL;
- WARN_ONCE(ret,
-"unexpected error %d: merge existing(start %llu len %llu) with em(start %llu len %llu)\n",
- ret, existing->start, existing->len,
- orig_start, orig_len);
+ btrfs_warn(fs_info,
+"extent map merge error existing [%llu, %llu) with em [%llu, %llu) start %llu",
+ existing->start, extent_map_end(existing),
+ orig_start, orig_start + orig_len, start);
}
free_extent_map(existing);
}
@@ -651,8 +654,10 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
* if needed. This avoids searching the tree, from the root down to the first
* extent map, before each deletion.
*/
-static void drop_all_extent_maps_fast(struct extent_map_tree *tree)
+static void drop_all_extent_maps_fast(struct btrfs_inode *inode)
{
+ struct extent_map_tree *tree = &inode->extent_tree;
+
write_lock(&tree->lock);
while (!RB_EMPTY_ROOT(&tree->map.rb_root)) {
struct extent_map *em;
@@ -661,7 +666,7 @@ static void drop_all_extent_maps_fast(struct extent_map_tree *tree)
node = rb_first_cached(&tree->map);
em = rb_entry(node, struct extent_map, rb_node);
em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING);
- remove_extent_mapping(tree, em);
+ remove_extent_mapping(inode, em);
free_extent_map(em);
cond_resched_rwlock_write(&tree->lock);
}
@@ -694,7 +699,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
WARN_ON(end < start);
if (end == (u64)-1) {
if (start == 0 && !skip_pinned) {
- drop_all_extent_maps_fast(em_tree);
+ drop_all_extent_maps_fast(inode);
return;
}
len = (u64)-1;
@@ -791,7 +796,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
split->generation = gen;
split->flags = flags;
- replace_extent_mapping(em_tree, em, split, modified);
+ replace_extent_mapping(inode, em, split, modified);
free_extent_map(split);
split = split2;
split2 = NULL;
@@ -832,13 +837,11 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
}
if (extent_map_in_tree(em)) {
- replace_extent_mapping(em_tree, em, split,
- modified);
+ replace_extent_mapping(inode, em, split, modified);
} else {
int ret;
- ret = add_extent_mapping(em_tree, split,
- modified);
+ ret = add_extent_mapping(inode, split, modified);
/* Logic error, shouldn't happen. */
ASSERT(ret == 0);
if (WARN_ON(ret != 0) && modified)
@@ -873,7 +876,7 @@ remove_em:
ASSERT(!split);
btrfs_set_inode_full_sync(inode);
}
- remove_extent_mapping(em_tree, em);
+ remove_extent_mapping(inode, em);
}
/*
@@ -928,7 +931,7 @@ int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
do {
btrfs_drop_extent_map_range(inode, new_em->start, end, false);
write_lock(&tree->lock);
- ret = add_extent_mapping(tree, new_em, modified);
+ ret = add_extent_mapping(inode, new_em, modified);
write_unlock(&tree->lock);
} while (ret == -EEXIST);
@@ -992,7 +995,7 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
split_pre->flags = flags;
split_pre->generation = em->generation;
- replace_extent_mapping(em_tree, em, split_pre, 1);
+ replace_extent_mapping(inode, em, split_pre, 1);
/*
* Now we only have an extent_map at:
@@ -1009,7 +1012,7 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
split_mid->ram_bytes = split_mid->len;
split_mid->flags = flags;
split_mid->generation = em->generation;
- add_extent_mapping(em_tree, split_mid, 1);
+ add_extent_mapping(inode, split_mid, 1);
/* Once for us */
free_extent_map(em);
@@ -1024,3 +1027,240 @@ out_free_pre:
free_extent_map(split_pre);
return ret;
}
+
+struct btrfs_em_shrink_ctx {
+ long nr_to_scan;
+ long scanned;
+ u64 last_ino;
+ u64 last_root;
+};
+
+static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
+{
+ const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
+ struct extent_map_tree *tree = &inode->extent_tree;
+ long nr_dropped = 0;
+ struct rb_node *node;
+
+ /*
+ * Take the mmap lock so that we serialize with the inode logging phase
+ * of fsync because we may need to set the full sync flag on the inode,
+ * in case we have to remove extent maps in the tree's list of modified
+ * extents. If we set the full sync flag in the inode while an fsync is
+ * in progress, we may risk missing new extents because before the flag
+ * is set, fsync decides to only wait for writeback to complete and then
+ * during inode logging it sees the flag set and uses the subvolume tree
+ * to find new extents, which may not be there yet because ordered
+ * extents haven't completed yet.
+ *
+ * We also do a try lock because otherwise we could deadlock. This is
+ * because the shrinker for this filesystem may be invoked while we are
+ * in a path that is holding the mmap lock in write mode. For example in
+ * a reflink operation while COWing an extent buffer, when allocating
+ * pages for a new extent buffer and under memory pressure, the shrinker
+ * may be invoked, and therefore we would deadlock by attempting to read
+ * lock the mmap lock while we are holding already a write lock on it.
+ * lock the mmap lock while we are already holding a write lock on it.
+ if (!down_read_trylock(&inode->i_mmap_lock))
+ return 0;
+
+ /*
+ * We want to be fast because we can be called from any path trying to
+ * allocate memory, so if the lock is busy we don't want to spend time
+ * waiting for it - either some task is about to do IO for the inode or
+ * we may have another task shrinking extent maps, here in this code, so
+ * skip this inode.
+ */
+ if (!write_trylock(&tree->lock)) {
+ up_read(&inode->i_mmap_lock);
+ return 0;
+ }
+
+ node = rb_first_cached(&tree->map);
+ while (node) {
+ struct extent_map *em;
+
+ em = rb_entry(node, struct extent_map, rb_node);
+ node = rb_next(node);
+ ctx->scanned++;
+
+ if (em->flags & EXTENT_FLAG_PINNED)
+ goto next;
+
+ /*
+ * If the extent map is in the list of modified extents (new) and its
+ * generation is the same (or is greater than) the current fs
+ * generation, it means it was not yet persisted so we have to
+ * set the full sync flag so that the next fsync will not miss
+ * it.
+ */
+ if (!list_empty(&em->list) && em->generation >= cur_fs_gen)
+ btrfs_set_inode_full_sync(inode);
+
+ remove_extent_mapping(inode, em);
+ trace_btrfs_extent_map_shrinker_remove_em(inode, em);
+ /* Drop the reference for the tree. */
+ free_extent_map(em);
+ nr_dropped++;
+next:
+ if (ctx->scanned >= ctx->nr_to_scan)
+ break;
+
+ /*
+ * Stop if we need to reschedule or there's contention on the
+ * lock. This is to avoid slowing other tasks trying to take the
+ * lock and because the shrinker might be called during a memory
+ * allocation path and we want to avoid taking a very long time
+ * and slowing down all sorts of tasks.
+ */
+ if (need_resched() || rwlock_needbreak(&tree->lock))
+ break;
+ }
+ write_unlock(&tree->lock);
+ up_read(&inode->i_mmap_lock);
+
+ return nr_dropped;
+}
+
+static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
+{
+ struct btrfs_inode *inode;
+ long nr_dropped = 0;
+ u64 min_ino = ctx->last_ino + 1;
+
+ inode = btrfs_find_first_inode(root, min_ino);
+ while (inode) {
+ nr_dropped += btrfs_scan_inode(inode, ctx);
+
+ min_ino = btrfs_ino(inode) + 1;
+ ctx->last_ino = btrfs_ino(inode);
+ btrfs_add_delayed_iput(inode);
+
+ if (ctx->scanned >= ctx->nr_to_scan)
+ break;
+
+ /*
+ * We may be called from memory allocation paths, so we don't
+ * want to take too much time and slow down tasks.
+ */
+ if (need_resched())
+ break;
+
+ inode = btrfs_find_first_inode(root, min_ino);
+ }
+
+ if (inode) {
+ /*
+ * There are still inodes in this root or we happened to process
+ * the last one and reached the scan limit. In either case set
+ * the current root to this one, so we'll resume from the next
+ * inode if there is one or we will find out this was the last
+ * one and move to the next root.
+ */
+ ctx->last_root = btrfs_root_id(root);
+ } else {
+ /*
+ * No more inodes in this root, set extent_map_shrinker_last_ino to 0 so
+ * that when processing the next root we start from its first inode.
+ */
+ ctx->last_ino = 0;
+ ctx->last_root = btrfs_root_id(root) + 1;
+ }
+
+ return nr_dropped;
+}
+
+long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
+{
+ struct btrfs_em_shrink_ctx ctx;
+ u64 start_root_id;
+ u64 next_root_id;
+ bool cycled = false;
+ long nr_dropped = 0;
+
+ ctx.scanned = 0;
+ ctx.nr_to_scan = nr_to_scan;
+
+ /*
+ * In case we have multiple tasks running this shrinker, make the next
+ * one start from the next inode in case it starts before we finish.
+ */
+ spin_lock(&fs_info->extent_map_shrinker_lock);
+ ctx.last_ino = fs_info->extent_map_shrinker_last_ino;
+ fs_info->extent_map_shrinker_last_ino++;
+ ctx.last_root = fs_info->extent_map_shrinker_last_root;
+ spin_unlock(&fs_info->extent_map_shrinker_lock);
+
+ start_root_id = ctx.last_root;
+ next_root_id = ctx.last_root;
+
+ if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
+ s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
+
+ trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan,
+ nr, ctx.last_root,
+ ctx.last_ino);
+ }
+
+ /*
+ * We may be called from memory allocation paths, so we don't want to
+ * take too much time and slow down tasks, so stop if we need to reschedule.
+ */
+ while (ctx.scanned < ctx.nr_to_scan && !need_resched()) {
+ struct btrfs_root *root;
+ unsigned long count;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ count = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+ (void **)&root,
+ (unsigned long)next_root_id, 1);
+ if (count == 0) {
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ if (start_root_id > 0 && !cycled) {
+ next_root_id = 0;
+ ctx.last_root = 0;
+ ctx.last_ino = 0;
+ cycled = true;
+ continue;
+ }
+ break;
+ }
+ next_root_id = btrfs_root_id(root) + 1;
+ root = btrfs_grab_root(root);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+
+ if (!root)
+ continue;
+
+ if (is_fstree(btrfs_root_id(root)))
+ nr_dropped += btrfs_scan_root(root, &ctx);
+
+ btrfs_put_root(root);
+ }
+
+ /*
+ * In case of multiple tasks running this extent map shrinking code,
+ * this isn't perfect but it's simple and silences things like KCSAN. It's
+ * not possible to know which task made more progress because we can
+ * cycle back to the first root and first inode if it's not the first
+ * time the shrinker ran, see the above logic. Also a task that started
+ * later may finish earlier than another task and have made less
+ * progress. So make this simple and update to the progress of the last
+ * task that finished, with the occasional possibility of having two
+ * consecutive runs of the shrinker process the same inodes.
+ */
+ spin_lock(&fs_info->extent_map_shrinker_lock);
+ fs_info->extent_map_shrinker_last_ino = ctx.last_ino;
+ fs_info->extent_map_shrinker_last_root = ctx.last_root;
+ spin_unlock(&fs_info->extent_map_shrinker_lock);
+
+ if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) {
+ s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
+
+ trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped,
+ nr, ctx.last_root,
+ ctx.last_ino);
+ }
+
+ return nr_dropped;
+}
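The function above only decides how roots and inodes are walked; the value of
nr_to_scan and the call site are provided by whoever wires the shrinker up. A
minimal sketch of one plausible wiring, through the VFS super_operations
cache-shrinking hooks, is shown below. The two hook functions are assumptions
for illustration and are not part of this diff; only btrfs_free_extent_maps(),
btrfs_sb() and the evictable_extent_maps counter come from the btrfs code
itself.

static long btrfs_nr_cached_objects(struct super_block *sb,
				    struct shrink_control *sc)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(sb);

	/* Report how many extent maps could currently be dropped. */
	return percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
}

static long btrfs_free_cached_objects(struct super_block *sb,
				      struct shrink_control *sc)
{
	/* Try to drop up to sc->nr_to_scan extent maps. */
	return btrfs_free_extent_maps(btrfs_sb(sb), sc->nr_to_scan);
}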
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index e380fc08bb..6d587111f7 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -3,10 +3,18 @@
#ifndef BTRFS_EXTENT_MAP_H
#define BTRFS_EXTENT_MAP_H
+#include <linux/compiler_types.h>
+#include <linux/rwlock_types.h>
#include <linux/rbtree.h>
+#include <linux/list.h>
#include <linux/refcount.h>
+#include "misc.h"
+#include "extent_map.h"
#include "compression.h"
+struct btrfs_inode;
+struct btrfs_fs_info;
+
#define EXTENT_MAP_LAST_BYTE ((u64)-4)
#define EXTENT_MAP_HOLE ((u64)-3)
#define EXTENT_MAP_INLINE ((u64)-2)
@@ -22,28 +30,77 @@ enum {
ENUM_BIT(EXTENT_FLAG_PREALLOC),
/* Logging this extent */
ENUM_BIT(EXTENT_FLAG_LOGGING),
- /* Filling in a preallocated extent */
- ENUM_BIT(EXTENT_FLAG_FILLING),
/* This em is merged from two or more physically adjacent ems */
ENUM_BIT(EXTENT_FLAG_MERGED),
};
/*
+ * This structure represents file extents and holes.
+ *
+ * Unlike on-disk file extent items, extent maps can be merged to save memory.
+ * This means members only match file extent items before any merging.
+ *
* Keep this structure as compact as possible, as we can have really large
* amounts of allocated extent maps at any time.
*/
struct extent_map {
struct rb_node rb_node;
- /* all of these are in bytes */
+ /* All of these are in bytes. */
+
+ /* File offset matching the offset of a BTRFS_EXTENT_ITEM_KEY key. */
u64 start;
+
+ /*
+ * Length of the file extent.
+ *
+ * For non-inlined file extents it's btrfs_file_extent_item::num_bytes.
+ * For inline extents it's sectorsize, since inline data starts at
+ * offsetof(struct btrfs_file_extent_item, disk_bytenr), so
+ * btrfs_file_extent_item::num_bytes is not valid.
+ */
u64 len;
- u64 mod_start;
- u64 mod_len;
+
+ /*
+ * The file offset of the original file extent before splitting.
+ *
+ * This is an in-memory only member, matching
+ * extent_map::start - btrfs_file_extent_item::offset for
+ * regular/preallocated extents. EXTENT_MAP_HOLE otherwise.
+ */
u64 orig_start;
+
+ /*
+ * The full on-disk extent length, matching
+ * btrfs_file_extent_item::disk_num_bytes.
+ */
u64 orig_block_len;
+
+ /*
+ * The decompressed size of the whole on-disk extent, matching
+ * btrfs_file_extent_item::ram_bytes.
+ */
u64 ram_bytes;
+
+ /*
+ * The on-disk logical bytenr for the file extent.
+ *
+ * For compressed extents it matches btrfs_file_extent_item::disk_bytenr.
+ * For uncompressed extents it matches
+ * btrfs_file_extent_item::disk_bytenr + btrfs_file_extent_item::offset
+ *
+ * For holes it is EXTENT_MAP_HOLE and for inline extents it is
+ * EXTENT_MAP_INLINE.
+ */
u64 block_start;
+
+ /*
+ * The on-disk length for the file extent.
+ *
+ * For compressed extents it matches btrfs_file_extent_item::disk_num_bytes.
+ * For uncompressed extents it matches extent_map::len.
+ * For holes and inline extents it's -1 and shouldn't be used.
+ */
u64 block_len;
/*
@@ -116,7 +173,7 @@ static inline u64 extent_map_end(const struct extent_map *em)
void extent_map_tree_init(struct extent_map_tree *tree);
struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len);
-void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
+void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em);
int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
u64 new_logical);
@@ -125,11 +182,10 @@ void free_extent_map(struct extent_map *em);
int __init extent_map_init(void);
void __cold extent_map_exit(void);
int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen);
-void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);
+void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em);
struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len);
-int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
- struct extent_map_tree *em_tree,
+int btrfs_add_extent_mapping(struct btrfs_inode *inode,
struct extent_map **em_in, u64 start, u64 len);
void btrfs_drop_extent_map_range(struct btrfs_inode *inode,
u64 start, u64 end,
@@ -137,5 +193,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode,
int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
struct extent_map *new_em,
bool modified);
+long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan);
#endif
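To make the member documentation above concrete, a hypothetical unmerged
extent map for a 128K extent written at file offset 0 and compressed down to
16K on disk would carry roughly the following values (the numbers and the
disk_bytenr variable are made up for illustration, they are not from this
patch):

	em->start          = 0;           /* key.offset of the EXTENT_DATA item     */
	em->len            = SZ_128K;     /* btrfs_file_extent_item::num_bytes      */
	em->orig_start     = 0;           /* start - btrfs_file_extent_item::offset */
	em->orig_block_len = SZ_16K;      /* disk_num_bytes                         */
	em->ram_bytes      = SZ_128K;     /* decompressed size of the whole extent  */
	em->block_start    = disk_bytenr; /* on-disk logical bytenr (compressed)    */
	em->block_len      = SZ_16K;      /* disk_num_bytes for compressed extents  */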
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 81ac1d474b..bce95f8717 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -10,17 +10,14 @@
#include <linux/sched/mm.h>
#include <crypto/hash.h>
#include "messages.h"
-#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "bio.h"
-#include "print-tree.h"
#include "compression.h"
#include "fs.h"
#include "accessors.h"
#include "file-item.h"
-#include "super.h"
#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \
sizeof(struct btrfs_item) * 2) / \
@@ -179,7 +176,6 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
sizeof(*item));
if (ret < 0)
goto out;
- BUG_ON(ret); /* Can't happen */
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
@@ -434,8 +430,7 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
memset(csum_dst, 0, csum_size);
count = 1;
- if (inode->root->root_key.objectid ==
- BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ if (btrfs_root_id(inode->root) == BTRFS_DATA_RELOC_TREE_OBJECTID) {
u64 file_offset = bbio->file_offset + bio_offset;
set_extent_bit(&inode->io_tree, file_offset,
@@ -454,9 +449,22 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
return ret;
}
+/*
+ * Search for checksums for a given logical range.
+ *
+ * @root: The root where to look for checksums.
+ * @start: Logical address of target checksum range.
+ * @end: End offset (inclusive) of the target checksum range.
+ * @list: List for adding each checksum that was found.
+ * Can be NULL in case the caller only wants to check if
+ * there are any checksums for the range.
+ * @nowait: Indicate if the search must be non-blocking or not.
+ *
+ * Return < 0 on error, 0 if no checksums were found, or 1 if checksums were
+ * found.
+ */
int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
- struct list_head *list, int search_commit,
- bool nowait)
+ struct list_head *list, bool nowait)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
@@ -464,8 +472,8 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
struct extent_buffer *leaf;
struct btrfs_ordered_sum *sums;
struct btrfs_csum_item *item;
- LIST_HEAD(tmplist);
int ret;
+ bool found_csums = false;
ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
IS_ALIGNED(end + 1, fs_info->sectorsize));
@@ -475,11 +483,6 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
return -ENOMEM;
path->nowait = nowait;
- if (search_commit) {
- path->skip_locking = 1;
- path->reada = READA_FORWARD;
- path->search_commit_root = 1;
- }
key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
key.offset = start;
@@ -487,7 +490,7 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
- goto fail;
+ goto out;
if (ret > 0 && path->slots[0] > 0) {
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
@@ -522,7 +525,7 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
- goto fail;
+ goto out;
if (ret > 0)
break;
leaf = path->nodes[0];
@@ -544,6 +547,10 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
continue;
}
+ found_csums = true;
+ if (!list)
+ goto out;
+
csum_end = min(csum_end, end + 1);
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_csum_item);
@@ -557,7 +564,7 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
GFP_NOFS);
if (!sums) {
ret = -ENOMEM;
- goto fail;
+ goto out;
}
sums->logical = start;
@@ -571,21 +578,24 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
bytes_to_csum_size(fs_info, size));
start += size;
- list_add_tail(&sums->list, &tmplist);
+ list_add_tail(&sums->list, list);
}
path->slots[0]++;
}
- ret = 0;
-fail:
- while (ret < 0 && !list_empty(&tmplist)) {
- sums = list_entry(tmplist.next, struct btrfs_ordered_sum, list);
- list_del(&sums->list);
- kfree(sums);
+out:
+ btrfs_free_path(path);
+ if (ret < 0) {
+ if (list) {
+ struct btrfs_ordered_sum *tmp_sums;
+
+ list_for_each_entry_safe(sums, tmp_sums, list, list)
+ kfree(sums);
+ }
+
+ return ret;
}
- list_splice_tail(&tmplist, list);
- btrfs_free_path(path);
- return ret;
+ return found_csums ? 1 : 0;
}
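With the search_commit parameter gone and the new tri-state return value, a
caller that only needs to know whether any checksum exists in a range could
look roughly like the sketch below (not taken from this series; csum_root,
start and end are placeholders):

	int ret;

	/* Pass a NULL list: we only care whether any checksum exists. */
	ret = btrfs_lookup_csums_list(csum_root, start, end, NULL, false);
	if (ret < 0)
		return ret;	/* Lookup failed. */
	if (ret == 0)
		return 0;	/* No checksums in [start, end]. */
	/* ret == 1: at least one checksum covers part of the range. */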
/*
@@ -874,8 +884,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
const u32 csum_size = fs_info->csum_size;
u32 blocksize_bits = fs_info->sectorsize_bits;
- ASSERT(root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID ||
- root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
+ ASSERT(btrfs_root_id(root) == BTRFS_CSUM_TREE_OBJECTID ||
+ btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID);
path = btrfs_alloc_path();
if (!path)
@@ -1175,7 +1185,7 @@ extend_csum:
* search, etc, because log trees are temporary anyway and it
* would only save a few bytes of leaf space.
*/
- if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+ if (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID) {
if (path->slots[0] + 1 >=
btrfs_header_nritems(path->nodes[0])) {
ret = find_next_csum_offset(root, path, &next_offset);
@@ -1229,8 +1239,6 @@ insert:
ins_size);
if (ret < 0)
goto out;
- if (WARN_ON(ret != 0))
- goto out;
leaf = path->nodes[0];
csum:
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
@@ -1271,20 +1279,19 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
struct extent_buffer *leaf = path->nodes[0];
const int slot = path->slots[0];
struct btrfs_key key;
- u64 extent_start, extent_end;
+ u64 extent_start;
u64 bytenr;
u8 type = btrfs_file_extent_type(leaf, fi);
int compress_type = btrfs_file_extent_compression(leaf, fi);
btrfs_item_key_to_cpu(leaf, &key, slot);
extent_start = key.offset;
- extent_end = btrfs_file_extent_end(path);
em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
em->generation = btrfs_file_extent_generation(leaf, fi);
if (type == BTRFS_FILE_EXTENT_REG ||
type == BTRFS_FILE_EXTENT_PREALLOC) {
em->start = extent_start;
- em->len = extent_end - extent_start;
+ em->len = btrfs_file_extent_end(path) - extent_start;
em->orig_start = extent_start -
btrfs_file_extent_offset(leaf, fi);
em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
@@ -1305,9 +1312,12 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
em->flags |= EXTENT_FLAG_PREALLOC;
}
} else if (type == BTRFS_FILE_EXTENT_INLINE) {
+ /* Tree-checker has ensured this. */
+ ASSERT(extent_start == 0);
+
em->block_start = EXTENT_MAP_INLINE;
- em->start = extent_start;
- em->len = extent_end - extent_start;
+ em->start = 0;
+ em->len = fs_info->sectorsize;
/*
* Initialize orig_start and block_len with the same values
* as in inode.c:btrfs_get_extent().
@@ -1319,7 +1329,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
btrfs_err(fs_info,
"unknown file extent item type %d, inode %llu, offset %llu, "
"root %llu", type, btrfs_ino(inode), extent_start,
- root->root_key.objectid);
+ btrfs_root_id(root));
}
}
@@ -1340,12 +1350,10 @@ u64 btrfs_file_extent_end(const struct btrfs_path *path)
ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) {
- end = btrfs_file_extent_ram_bytes(leaf, fi);
- end = ALIGN(key.offset + end, leaf->fs_info->sectorsize);
- } else {
+ if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE)
+ end = leaf->fs_info->sectorsize;
+ else
end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
- }
return end;
}
diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h
index 04bd2d34ef..557dc43d71 100644
--- a/fs/btrfs/file-item.h
+++ b/fs/btrfs/file-item.h
@@ -3,8 +3,21 @@
#ifndef BTRFS_FILE_ITEM_H
#define BTRFS_FILE_ITEM_H
+#include <linux/list.h>
+#include <uapi/linux/btrfs_tree.h>
#include "accessors.h"
+struct extent_map;
+struct btrfs_file_extent_item;
+struct btrfs_fs_info;
+struct btrfs_path;
+struct btrfs_bio;
+struct btrfs_trans_handle;
+struct btrfs_root;
+struct btrfs_ordered_sum;
+struct btrfs_path;
+struct btrfs_inode;
+
#define BTRFS_FILE_EXTENT_INLINE_DATA_START \
(offsetof(struct btrfs_file_extent_item, disk_bytenr))
@@ -55,8 +68,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit,
bool nowait);
int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
- struct list_head *list, int search_commit,
- bool nowait);
+ struct list_head *list, bool nowait);
int btrfs_lookup_csums_bitmap(struct btrfs_root *root, struct btrfs_path *path,
u64 start, u64 end, u8 *csum_buf,
unsigned long *csum_bitmap);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 616790d6e5..ca434f0cd2 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -22,10 +22,8 @@
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
-#include "print-tree.h"
#include "tree-log.h"
#include "locking.h"
-#include "volumes.h"
#include "qgroup.h"
#include "compression.h"
#include "delalloc-space.h"
@@ -130,7 +128,7 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
struct extent_state **cached, bool noreserve)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- int err = 0;
+ int ret = 0;
int i;
u64 num_bytes;
u64 start_pos;
@@ -160,10 +158,10 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
cached);
- err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
+ ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
extra_bits, cached);
- if (err)
- return err;
+ if (ret)
+ return ret;
for (i = 0; i < num_pages; i++) {
struct page *p = pages[i];
@@ -208,7 +206,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf;
struct btrfs_file_extent_item *fi;
- struct btrfs_ref ref = { 0 };
struct btrfs_key key;
struct btrfs_key new_key;
u64 ino = btrfs_ino(inode);
@@ -248,7 +245,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
if (args->start >= inode->disk_i_size && !args->replace_extent)
modify_tree = 0;
- update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
+ update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
while (1) {
recow = 0;
ret = btrfs_lookup_file_extent(trans, root, path, ino,
@@ -375,15 +372,17 @@ next_slot:
btrfs_mark_buffer_dirty(trans, leaf);
if (update_refs && disk_bytenr > 0) {
- btrfs_init_generic_ref(&ref,
- BTRFS_ADD_DELAYED_REF,
- disk_bytenr, num_bytes, 0,
- root->root_key.objectid);
- btrfs_init_data_ref(&ref,
- root->root_key.objectid,
- new_key.objectid,
- args->start - extent_offset,
- 0, false);
+ struct btrfs_ref ref = {
+ .action = BTRFS_ADD_DELAYED_REF,
+ .bytenr = disk_bytenr,
+ .num_bytes = num_bytes,
+ .parent = 0,
+ .owning_root = btrfs_root_id(root),
+ .ref_root = btrfs_root_id(root),
+ };
+ btrfs_init_data_ref(&ref, new_key.objectid,
+ args->start - extent_offset,
+ 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -466,15 +465,17 @@ delete_extent_item:
extent_end = ALIGN(extent_end,
fs_info->sectorsize);
} else if (update_refs && disk_bytenr > 0) {
- btrfs_init_generic_ref(&ref,
- BTRFS_DROP_DELAYED_REF,
- disk_bytenr, num_bytes, 0,
- root->root_key.objectid);
- btrfs_init_data_ref(&ref,
- root->root_key.objectid,
- key.objectid,
- key.offset - extent_offset, 0,
- false);
+ struct btrfs_ref ref = {
+ .action = BTRFS_DROP_DELAYED_REF,
+ .bytenr = disk_bytenr,
+ .num_bytes = num_bytes,
+ .parent = 0,
+ .owning_root = btrfs_root_id(root),
+ .ref_root = btrfs_root_id(root),
+ };
+ btrfs_init_data_ref(&ref, key.objectid,
+ key.offset - extent_offset,
+ 0, false);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -750,10 +751,13 @@ again:
extent_end - split);
btrfs_mark_buffer_dirty(trans, leaf);
- btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
- num_bytes, 0, root->root_key.objectid);
- btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
- orig_offset, 0, false);
+ ref.action = BTRFS_ADD_DELAYED_REF;
+ ref.bytenr = bytenr;
+ ref.num_bytes = num_bytes;
+ ref.parent = 0;
+ ref.owning_root = btrfs_root_id(root);
+ ref.ref_root = btrfs_root_id(root);
+ btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -776,10 +780,14 @@ again:
other_start = end;
other_end = 0;
- btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
- num_bytes, 0, root->root_key.objectid);
- btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset,
- 0, false);
+
+ ref.action = BTRFS_DROP_DELAYED_REF;
+ ref.bytenr = bytenr;
+ ref.num_bytes = num_bytes;
+ ref.parent = 0;
+ ref.owning_root = btrfs_root_id(root);
+ ref.ref_root = btrfs_root_id(root);
+ btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
if (extent_mergeable(leaf, path->slots[0] + 1,
ino, bytenr, orig_offset,
&other_start, &other_end)) {
@@ -917,7 +925,7 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages,
unsigned long index = pos >> PAGE_SHIFT;
gfp_t mask = get_prepare_gfp_flags(inode, nowait);
fgf_t fgp_flags = get_prepare_fgp_flags(nowait);
- int err = 0;
+ int ret = 0;
int faili;
for (i = 0; i < num_pages; i++) {
@@ -927,28 +935,28 @@ again:
if (!pages[i]) {
faili = i - 1;
if (nowait)
- err = -EAGAIN;
+ ret = -EAGAIN;
else
- err = -ENOMEM;
+ ret = -ENOMEM;
goto fail;
}
- err = set_page_extent_mapped(pages[i]);
- if (err < 0) {
+ ret = set_page_extent_mapped(pages[i]);
+ if (ret < 0) {
faili = i;
goto fail;
}
if (i == 0)
- err = prepare_uptodate_page(inode, pages[i], pos,
+ ret = prepare_uptodate_page(inode, pages[i], pos,
force_uptodate);
- if (!err && i == num_pages - 1)
- err = prepare_uptodate_page(inode, pages[i],
+ if (!ret && i == num_pages - 1)
+ ret = prepare_uptodate_page(inode, pages[i],
pos + write_bytes, false);
- if (err) {
+ if (ret) {
put_page(pages[i]);
- if (!nowait && err == -EAGAIN) {
- err = 0;
+ if (!nowait && ret == -EAGAIN) {
+ ret = 0;
goto again;
}
faili = i - 1;
@@ -964,7 +972,7 @@ fail:
put_page(pages[faili]);
faili--;
}
- return err;
+ return ret;
}
@@ -1467,7 +1475,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
ssize_t written_buffered;
size_t prev_left = 0;
loff_t endbyte;
- ssize_t err;
+ ssize_t ret;
unsigned int ilock_flags = 0;
struct iomap_dio *dio;
@@ -1484,9 +1492,9 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
ilock_flags |= BTRFS_ILOCK_SHARED;
relock:
- err = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
- if (err < 0)
- return err;
+ ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
+ if (ret < 0)
+ return ret;
/* Shared lock cannot be used with security bits set. */
if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
@@ -1495,14 +1503,14 @@ relock:
goto relock;
}
- err = generic_write_checks(iocb, from);
- if (err <= 0) {
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0) {
btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
- return err;
+ return ret;
}
- err = btrfs_write_check(iocb, from, err);
- if (err < 0) {
+ ret = btrfs_write_check(iocb, from, ret);
+ if (ret < 0) {
btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
goto out;
}
@@ -1542,27 +1550,43 @@ relock:
* So here we disable page faults in the iov_iter and then retry if we
* got -EFAULT, faulting in the pages before the retry.
*/
+again:
from->nofault = true;
dio = btrfs_dio_write(iocb, from, written);
from->nofault = false;
- /*
- * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync
- * iocb, and that needs to lock the inode. So unlock it before calling
- * iomap_dio_complete() to avoid a deadlock.
- */
- btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+ if (IS_ERR_OR_NULL(dio)) {
+ ret = PTR_ERR_OR_ZERO(dio);
+ } else {
+ struct btrfs_file_private stack_private = { 0 };
+ struct btrfs_file_private *private;
+ const bool have_private = (file->private_data != NULL);
- if (IS_ERR_OR_NULL(dio))
- err = PTR_ERR_OR_ZERO(dio);
- else
- err = iomap_dio_complete(dio);
+ if (!have_private)
+ file->private_data = &stack_private;
+
+ /*
+ * If we have a synchronous write, we must make sure the fsync
+ * triggered by the iomap_dio_complete() call below doesn't
+ * deadlock on the inode lock - we are already holding it and we
+ * can't call it after unlocking because we may need to complete
+ * partial writes due to the input buffer (or parts of it) not
+ * being already faulted in.
+ */
+ private = file->private_data;
+ private->fsync_skip_inode_lock = true;
+ ret = iomap_dio_complete(dio);
+ private->fsync_skip_inode_lock = false;
+
+ if (!have_private)
+ file->private_data = NULL;
+ }
/* No increment (+=) because iomap returns a cumulative value. */
- if (err > 0)
- written = err;
+ if (ret > 0)
+ written = ret;
- if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) {
+ if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {
const size_t left = iov_iter_count(from);
/*
* We have more data left to write. Try to fault in as many as
@@ -1579,19 +1603,21 @@ relock:
* to buffered IO in case we haven't made any progress.
*/
if (left == prev_left) {
- err = -ENOTBLK;
+ ret = -ENOTBLK;
} else {
fault_in_iov_iter_readable(from, left);
prev_left = left;
- goto relock;
+ goto again;
}
}
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+
/*
- * If 'err' is -ENOTBLK or we have not written all data, then it means
+ * If 'ret' is -ENOTBLK or we have not written all data, then it means
* we must fallback to buffered IO.
*/
- if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from))
+ if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))
goto out;
buffered:
@@ -1602,14 +1628,14 @@ buffered:
* below, we will block when flushing and waiting for the IO.
*/
if (iocb->ki_flags & IOCB_NOWAIT) {
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
pos = iocb->ki_pos;
written_buffered = btrfs_buffered_write(iocb, from);
if (written_buffered < 0) {
- err = written_buffered;
+ ret = written_buffered;
goto out;
}
/*
@@ -1617,18 +1643,18 @@ buffered:
* able to read what was just written.
*/
endbyte = pos + written_buffered - 1;
- err = btrfs_fdatawrite_range(inode, pos, endbyte);
- if (err)
+ ret = btrfs_fdatawrite_range(inode, pos, endbyte);
+ if (ret)
goto out;
- err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
- if (err)
+ ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
+ if (ret)
goto out;
written += written_buffered;
iocb->ki_pos = pos + written_buffered;
invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
endbyte >> PAGE_SHIFT);
out:
- return err < 0 ? err : written;
+ return ret < 0 ? ret : written;
}
static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
@@ -1785,6 +1811,7 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
*/
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
+ struct btrfs_file_private *private = file->private_data;
struct dentry *dentry = file_dentry(file);
struct inode *inode = d_inode(dentry);
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
@@ -1794,6 +1821,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
int ret = 0, err;
u64 len;
bool full_sync;
+ const bool skip_ilock = (private ? private->fsync_skip_inode_lock : false);
trace_btrfs_sync_file(file, datasync);
@@ -1821,7 +1849,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (ret)
goto out;
- btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
+ if (skip_ilock)
+ down_write(&BTRFS_I(inode)->i_mmap_lock);
+ else
+ btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
atomic_inc(&root->log_batch);
@@ -1845,7 +1876,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
ret = start_ordered_ops(inode, start, end);
if (ret) {
- btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
+ if (skip_ilock)
+ up_write(&BTRFS_I(inode)->i_mmap_lock);
+ else
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
goto out;
}
@@ -1877,6 +1911,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
if (full_sync || btrfs_is_zoned(fs_info)) {
ret = btrfs_wait_ordered_range(inode, start, len);
+ clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &BTRFS_I(inode)->runtime_flags);
} else {
/*
* Get our ordered extents as soon as possible to avoid doing
@@ -1886,6 +1921,21 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
&ctx.ordered_extents);
ret = filemap_fdatawait_range(inode->i_mapping, start, end);
+ if (ret)
+ goto out_release_extents;
+
+ /*
+ * Check and clear the BTRFS_INODE_COW_WRITE_ERROR now after
+ * starting and waiting for writeback, because for buffered IO
+ * it may have been set during the end IO callback
+ * (end_bbio_data_write() -> btrfs_finish_ordered_extent()) in
+ * case an error happened and we need to wait for ordered
+ * extents to complete so that any extent maps that point to
+ * unwritten locations are dropped and we don't log them.
+ */
+ if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR,
+ &BTRFS_I(inode)->runtime_flags))
+ ret = btrfs_wait_ordered_range(inode, start, len);
}
if (ret)
@@ -1912,6 +1962,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
goto out_release_extents;
}
+ btrfs_init_log_ctx_scratch_eb(&ctx);
+
/*
* We use start here because we will need to wait on the IO to complete
* in btrfs_sync_log, which could require joining a transaction (for
@@ -1931,6 +1983,15 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trans->in_fsync = true;
ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
+ /*
+ * Scratch eb no longer needed, release before syncing log or commit
+ * transaction, to avoid holding unnecessary memory during such long
+ * operations.
+ */
+ if (ctx.scratch_eb) {
+ free_extent_buffer(ctx.scratch_eb);
+ ctx.scratch_eb = NULL;
+ }
btrfs_release_log_ctx_extents(&ctx);
if (ret < 0) {
/* Fallthrough and commit/free transaction. */
@@ -1947,7 +2008,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* file again, but that will end up using the synchronization
* inside btrfs_sync_log to keep things safe.
*/
- btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
+ if (skip_ilock)
+ up_write(&BTRFS_I(inode)->i_mmap_lock);
+ else
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
if (ret == BTRFS_NO_LOG_SYNC) {
ret = btrfs_end_transaction(trans);
@@ -2006,6 +2070,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
ret = btrfs_commit_transaction(trans);
out:
+ free_extent_buffer(ctx.scratch_eb);
ASSERT(list_empty(&ctx.list));
ASSERT(list_empty(&ctx.conflict_inodes));
err = file_check_and_advance_wb_err(file);
@@ -2015,10 +2080,179 @@ out:
out_release_extents:
btrfs_release_log_ctx_extents(&ctx);
- btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
+ if (skip_ilock)
+ up_write(&BTRFS_I(inode)->i_mmap_lock);
+ else
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
goto out;
}
+/*
+ * btrfs_page_mkwrite() is not allowed to change the file size as it gets
+ * called from a page fault handler when a page is first dirtied. Hence we must
+ * be careful to check for EOF conditions here. We set the page up correctly
+ * for a written page which means we get ENOSPC checking when writing into
+ * holes and correct delalloc and unwritten extent mapping on filesystems that
+ * support these features.
+ *
+ * We are not allowed to take the i_mutex here so we have to play games to
+ * protect against truncate races as the page could now be beyond EOF. Because
+ * truncate_setsize() writes the inode size before removing pages, once we have
+ * the page lock we can determine safely if the page is beyond EOF. If it is not
+ * beyond EOF, then the page is guaranteed safe against truncation until we
+ * unlock the page.
+ */
+static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
+{
+ struct page *page = vmf->page;
+ struct folio *folio = page_folio(page);
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_ordered_extent *ordered;
+ struct extent_state *cached_state = NULL;
+ struct extent_changeset *data_reserved = NULL;
+ unsigned long zero_start;
+ loff_t size;
+ vm_fault_t ret;
+ int ret2;
+ int reserved = 0;
+ u64 reserved_space;
+ u64 page_start;
+ u64 page_end;
+ u64 end;
+
+ ASSERT(folio_order(folio) == 0);
+
+ reserved_space = PAGE_SIZE;
+
+ sb_start_pagefault(inode->i_sb);
+ page_start = page_offset(page);
+ page_end = page_start + PAGE_SIZE - 1;
+ end = page_end;
+
+ /*
+ * Reserving delalloc space after obtaining the page lock can lead to
+ * deadlock. For example, if a dirty page is locked by this function
+ * and the call to btrfs_delalloc_reserve_space() ends up triggering
+ * dirty page write out, then the btrfs_writepages() function could
+ * end up waiting indefinitely to get a lock on the page currently
+ * being processed by the btrfs_page_mkwrite() function.
+ */
+ ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
+ page_start, reserved_space);
+ if (!ret2) {
+ ret2 = file_update_time(vmf->vma->vm_file);
+ reserved = 1;
+ }
+ if (ret2) {
+ ret = vmf_error(ret2);
+ if (reserved)
+ goto out;
+ goto out_noreserve;
+ }
+
+ /* Make the VM retry the fault. */
+ ret = VM_FAULT_NOPAGE;
+again:
+ down_read(&BTRFS_I(inode)->i_mmap_lock);
+ lock_page(page);
+ size = i_size_read(inode);
+
+ if ((page->mapping != inode->i_mapping) ||
+ (page_start >= size)) {
+ /* Page got truncated out from underneath us. */
+ goto out_unlock;
+ }
+ wait_on_page_writeback(page);
+
+ lock_extent(io_tree, page_start, page_end, &cached_state);
+ ret2 = set_page_extent_mapped(page);
+ if (ret2 < 0) {
+ ret = vmf_error(ret2);
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
+ goto out_unlock;
+ }
+
+ /*
+ * We can't set the delalloc bits if there are pending ordered
+ * extents. Drop our locks and wait for them to finish.
+ */
+ ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE);
+ if (ordered) {
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
+ unlock_page(page);
+ up_read(&BTRFS_I(inode)->i_mmap_lock);
+ btrfs_start_ordered_extent(ordered);
+ btrfs_put_ordered_extent(ordered);
+ goto again;
+ }
+
+ if (page->index == ((size - 1) >> PAGE_SHIFT)) {
+ reserved_space = round_up(size - page_start, fs_info->sectorsize);
+ if (reserved_space < PAGE_SIZE) {
+ end = page_start + reserved_space - 1;
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ data_reserved, page_start,
+ PAGE_SIZE - reserved_space, true);
+ }
+ }
+
+ /*
+ * page_mkwrite gets called when the page is first dirtied after it's
+ * faulted in, but write(2) could also dirty a page and set delalloc
+ * bits. In that case, for space accounting reasons, we still need to
+ * clear any delalloc bits within this page range, since we had to
+ * reserve data and metadata space before lock_page() (see above comments).
+ */
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, &cached_state);
+
+ ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
+ &cached_state);
+ if (ret2) {
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
+ ret = VM_FAULT_SIGBUS;
+ goto out_unlock;
+ }
+
+ /* Page is wholly or partially inside EOF. */
+ if (page_start + PAGE_SIZE > size)
+ zero_start = offset_in_page(size);
+ else
+ zero_start = PAGE_SIZE;
+
+ if (zero_start != PAGE_SIZE)
+ memzero_page(page, zero_start, PAGE_SIZE - zero_start);
+
+ btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
+ btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
+ btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start);
+
+ btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
+
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
+ up_read(&BTRFS_I(inode)->i_mmap_lock);
+
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ sb_end_pagefault(inode->i_sb);
+ extent_changeset_free(data_reserved);
+ return VM_FAULT_LOCKED;
+
+out_unlock:
+ unlock_page(page);
+ up_read(&BTRFS_I(inode)->i_mmap_lock);
+out:
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
+ reserved_space, (ret != 0));
+out_noreserve:
+ sb_end_pagefault(inode->i_sb);
+ extent_changeset_free(data_reserved);
+ return ret;
+}
+
static const struct vm_operations_struct btrfs_file_vm_ops = {
.fault = filemap_fault,
.map_pages = filemap_map_pages,
@@ -2176,7 +2410,7 @@ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
struct extent_map *em;
int ret = 0;
- em = btrfs_get_extent(inode, NULL, 0,
+ em = btrfs_get_extent(inode, NULL,
round_down(*start, fs_info->sectorsize),
round_up(*len, fs_info->sectorsize));
if (IS_ERR(em))
@@ -2248,7 +2482,6 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_key key;
int slot;
- struct btrfs_ref ref = { 0 };
int ret;
if (replace_len == 0)
@@ -2304,15 +2537,17 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
extent_info->qgroup_reserved,
&key);
} else {
+ struct btrfs_ref ref = {
+ .action = BTRFS_ADD_DELAYED_REF,
+ .bytenr = extent_info->disk_offset,
+ .num_bytes = extent_info->disk_len,
+ .owning_root = btrfs_root_id(root),
+ .ref_root = btrfs_root_id(root),
+ };
u64 ref_offset;
- btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
- extent_info->disk_offset,
- extent_info->disk_len, 0,
- root->root_key.objectid);
ref_offset = extent_info->file_offset - extent_info->data_offset;
- btrfs_init_data_ref(&ref, root->root_key.objectid,
- btrfs_ino(inode), ref_offset, 0, false);
+ btrfs_init_data_ref(&ref, btrfs_ino(inode), ref_offset, 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
}
@@ -2835,7 +3070,7 @@ static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
int ret;
offset = round_down(offset, sectorsize);
- em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(inode, NULL, offset, sectorsize);
if (IS_ERR(em))
return PTR_ERR(em);
@@ -2866,7 +3101,7 @@ static int btrfs_zero_range(struct inode *inode,
u64 bytes_to_reserve = 0;
bool space_reserved = false;
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start,
alloc_end - alloc_start);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
@@ -2909,8 +3144,7 @@ static int btrfs_zero_range(struct inode *inode,
if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
- sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, sectorsize);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out;
@@ -3005,7 +3239,7 @@ reserve_space:
}
ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
alloc_end - alloc_start,
- i_blocksize(inode),
+ fs_info->sectorsize,
offset + len, &alloc_hint);
unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
@@ -3126,7 +3360,7 @@ static long btrfs_fallocate(struct file *file, int mode,
/* First, check if we exceed the qgroup limit */
while (cur_offset < alloc_end) {
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, cur_offset,
alloc_end - cur_offset);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
@@ -3177,7 +3411,7 @@ static long btrfs_fallocate(struct file *file, int mode,
if (!ret) {
ret = btrfs_prealloc_file_range(inode, mode,
range->start,
- range->len, i_blocksize(inode),
+ range->len, blocksize,
offset + len, &alloc_hint);
/*
* btrfs_prealloc_file_range() releases space even
@@ -3710,8 +3944,7 @@ static int btrfs_file_open(struct inode *inode, struct file *filp)
{
int ret;
- filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
- FMODE_CAN_ODIRECT;
+ filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
ret = fsverity_file_open(inode, filp);
if (ret)
@@ -3841,6 +4074,7 @@ const struct file_operations btrfs_file_operations = {
.compat_ioctl = btrfs_compat_ioctl,
#endif
.remap_file_range = btrfs_remap_file_range,
+ .fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
};
int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h
index 82b34fbb29..77aaca208c 100644
--- a/fs/btrfs/file.h
+++ b/fs/btrfs/file.h
@@ -3,6 +3,21 @@
#ifndef BTRFS_FILE_H
#define BTRFS_FILE_H
+#include <linux/types.h>
+
+struct file;
+struct extent_state;
+struct kiocb;
+struct iov_iter;
+struct page;
+struct btrfs_ioctl_encoded_io_args;
+struct btrfs_drop_extents_args;
+struct btrfs_inode;
+struct btrfs_root;
+struct btrfs_path;
+struct btrfs_replace_extent_info;
+struct btrfs_trans_handle;
+
extern const struct file_operations btrfs_file_operations;
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 815bb146b1..62c3dea957 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -19,9 +19,7 @@
#include "transaction.h"
#include "disk-io.h"
#include "extent_io.h"
-#include "volumes.h"
#include "space-info.h"
-#include "delalloc-space.h"
#include "block-group.h"
#include "discard.h"
#include "subpage.h"
@@ -860,6 +858,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
spin_unlock(&ctl->tree_lock);
btrfs_err(fs_info,
"Duplicate entries in free space cache, dumping");
+ kmem_cache_free(btrfs_free_space_bitmap_cachep, e->bitmap);
kmem_cache_free(btrfs_free_space_cachep, e);
goto free_cache;
}
@@ -1913,9 +1912,9 @@ static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
ctl->free_space -= bytes;
}
-static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
- struct btrfs_free_space *info, u64 offset,
- u64 bytes)
+static void btrfs_bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info, u64 offset,
+ u64 bytes)
{
unsigned long start, count, end;
int extent_delta = 1;
@@ -2251,7 +2250,7 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
bytes_to_set = min(end - offset, bytes);
- bitmap_set_bits(ctl, info, offset, bytes_to_set);
+ btrfs_bitmap_set_bits(ctl, info, offset, bytes_to_set);
return bytes_to_set;
@@ -2621,7 +2620,7 @@ static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
}
}
-int __btrfs_add_free_space(struct btrfs_block_group *block_group,
+static int __btrfs_add_free_space(struct btrfs_block_group *block_group,
u64 offset, u64 bytes,
enum btrfs_trim_state trim_state)
{
@@ -2699,7 +2698,7 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
u64 offset = bytenr - block_group->start;
u64 to_free, to_unusable;
int bg_reclaim_threshold = 0;
- bool initial = (size == block_group->length);
+ bool initial = ((size == block_group->length) && (block_group->alloc_offset == 0));
u64 reclaimable_unusable;
WARN_ON(!initial && offset + size > block_group->zone_capacity);
@@ -2725,8 +2724,10 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
* If the block group is read-only, we should account freed space into
* bytes_readonly.
*/
- if (!block_group->ro)
+ if (!block_group->ro) {
block_group->zone_unusable += to_unusable;
+ WARN_ON(block_group->zone_unusable > block_group->length);
+ }
spin_unlock(&ctl->tree_lock);
if (!used) {
spin_lock(&block_group->lock);
@@ -4156,15 +4157,13 @@ out:
int __init btrfs_free_space_init(void)
{
- btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
- sizeof(struct btrfs_free_space), 0,
- SLAB_MEM_SPREAD, NULL);
+ btrfs_free_space_cachep = KMEM_CACHE(btrfs_free_space, 0);
if (!btrfs_free_space_cachep)
return -ENOMEM;
btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
PAGE_SIZE, PAGE_SIZE,
- SLAB_MEM_SPREAD, NULL);
+ 0, NULL);
if (!btrfs_free_space_bitmap_cachep) {
kmem_cache_destroy(btrfs_free_space_cachep);
return -ENOMEM;
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 33b4da3271..83774bfd7b 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -6,6 +6,19 @@
#ifndef BTRFS_FREE_SPACE_CACHE_H
#define BTRFS_FREE_SPACE_CACHE_H
+#include <linux/rbtree.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include "fs.h"
+
+struct inode;
+struct page;
+struct btrfs_fs_info;
+struct btrfs_path;
+struct btrfs_trans_handle;
+struct btrfs_trim_block_group;
+
/*
* This is the trim state of an extent or bitmap.
*
@@ -114,8 +127,6 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
struct btrfs_free_space_ctl *ctl);
-int __btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr,
- u64 size, enum btrfs_trim_state trim_state);
int btrfs_add_free_space(struct btrfs_block_group *block_group,
u64 bytenr, u64 size);
int btrfs_add_free_space_unused(struct btrfs_block_group *block_group,
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 7b598b0707..7ba50e1339 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -1176,12 +1176,16 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
BTRFS_FREE_SPACE_TREE_OBJECTID);
if (IS_ERR(free_space_root)) {
ret = PTR_ERR(free_space_root);
- goto abort;
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ goto out_clear;
}
ret = btrfs_global_root_insert(free_space_root);
if (ret) {
btrfs_put_root(free_space_root);
- goto abort;
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ goto out_clear;
}
node = rb_first_cached(&fs_info->block_group_cache_tree);
@@ -1189,8 +1193,11 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
block_group = rb_entry(node, struct btrfs_block_group,
cache_node);
ret = populate_free_space_tree(trans, block_group);
- if (ret)
- goto abort;
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ goto out_clear;
+ }
node = rb_next(node);
}
@@ -1206,11 +1213,9 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
return ret;
-abort:
+out_clear:
clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
return ret;
}
@@ -1273,12 +1278,18 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info)
btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID);
ret = clear_free_space_tree(trans, free_space_root);
- if (ret)
- goto abort;
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ return ret;
+ }
ret = btrfs_del_root(trans, &free_space_root->root_key);
- if (ret)
- goto abort;
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ return ret;
+ }
btrfs_global_root_delete(free_space_root);
@@ -1289,17 +1300,16 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info)
btrfs_tree_lock(free_space_root->node);
btrfs_clear_buffer_dirty(trans, free_space_root->node);
btrfs_tree_unlock(free_space_root->node);
- btrfs_free_tree_block(trans, btrfs_root_id(free_space_root),
- free_space_root->node, 0, 1);
-
+ ret = btrfs_free_tree_block(trans, btrfs_root_id(free_space_root),
+ free_space_root->node, 0, 1);
btrfs_put_root(free_space_root);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ return ret;
+ }
return btrfs_commit_transaction(trans);
-
-abort:
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
- return ret;
}
int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info)
@@ -1322,8 +1332,11 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info)
set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
ret = clear_free_space_tree(trans, free_space_root);
- if (ret)
- goto abort;
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ return ret;
+ }
node = rb_first_cached(&fs_info->block_group_cache_tree);
while (node) {
@@ -1332,8 +1345,11 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info)
block_group = rb_entry(node, struct btrfs_block_group,
cache_node);
ret = populate_free_space_tree(trans, block_group);
- if (ret)
- goto abort;
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ return ret;
+ }
node = rb_next(node);
}
@@ -1344,10 +1360,6 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info)
ret = btrfs_commit_transaction(trans);
clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
return ret;
-abort:
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
- return ret;
}
static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h
index 6d5551d0ce..e6c6d6f4f2 100644
--- a/fs/btrfs/free-space-tree.h
+++ b/fs/btrfs/free-space-tree.h
@@ -6,7 +6,13 @@
#ifndef BTRFS_FREE_SPACE_TREE_H
#define BTRFS_FREE_SPACE_TREE_H
+#include <linux/bits.h>
+
struct btrfs_caching_control;
+struct btrfs_fs_info;
+struct btrfs_path;
+struct btrfs_block_group;
+struct btrfs_trans_handle;
/*
* The default size for new free space bitmap items. The last bitmap in a block
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index bd59cf0aae..833dc3fe0a 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -4,13 +4,49 @@
#define BTRFS_FS_H
#include <linux/blkdev.h>
-#include <linux/fs.h>
-#include <linux/btrfs_tree.h>
#include <linux/sizes.h>
+#include <linux/time64.h>
+#include <linux/compiler.h>
+#include <linux/math.h>
+#include <linux/atomic.h>
+#include <linux/percpu_counter.h>
+#include <linux/completion.h>
+#include <linux/lockdep.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/rwlock_types.h>
+#include <linux/rwsem.h>
+#include <linux/semaphore.h>
+#include <linux/list.h>
+#include <linux/radix-tree.h>
+#include <linux/workqueue.h>
+#include <linux/wait.h>
+#include <linux/wait_bit.h>
+#include <linux/sched.h>
+#include <linux/rbtree.h>
+#include <uapi/linux/btrfs.h>
+#include <uapi/linux/btrfs_tree.h>
#include "extent-io-tree.h"
-#include "extent_map.h"
#include "async-thread.h"
#include "block-rsv.h"
+#include "fs.h"
+
+struct inode;
+struct super_block;
+struct kobject;
+struct reloc_control;
+struct crypto_shash;
+struct ulist;
+struct btrfs_device;
+struct btrfs_block_group;
+struct btrfs_root;
+struct btrfs_fs_devices;
+struct btrfs_transaction;
+struct btrfs_delayed_root;
+struct btrfs_balance_control;
+struct btrfs_subpage_info;
+struct btrfs_stripe_hash_table;
+struct btrfs_space_info;
#define BTRFS_MAX_EXTENT_SIZE SZ_128M
@@ -593,6 +629,11 @@ struct btrfs_fs_info {
s32 dirty_metadata_batch;
s32 delalloc_batch;
+ struct percpu_counter evictable_extent_maps;
+ spinlock_t extent_map_shrinker_lock;
+ u64 extent_map_shrinker_last_root;
+ u64 extent_map_shrinker_last_ino;
+
/* Protected by 'trans_lock'. */
struct list_head dirty_cowonly_roots;
@@ -732,10 +773,13 @@ struct btrfs_fs_info {
/* Reclaim partially filled block groups in the background */
struct work_struct reclaim_bgs_work;
+ /* Protected by unused_bgs_lock. */
struct list_head reclaim_bgs;
int bg_reclaim_threshold;
+ /* Protects the lists unused_bgs and reclaim_bgs. */
spinlock_t unused_bgs_lock;
+ /* Protected by unused_bgs_lock. */
struct list_head unused_bgs;
struct mutex unused_bg_unpin_mutex;
/* Protect block groups that are going to be deleted */
@@ -933,6 +977,8 @@ void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
enum btrfs_exclusive_operation op);
+int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args);
+
/* Compatibility and incompatibility defines */
void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
const char *name);
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 7d734830e5..84a94d19b2 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -9,7 +9,6 @@
#include "inode-item.h"
#include "disk-io.h"
#include "transaction.h"
-#include "print-tree.h"
#include "space-info.h"
#include "accessors.h"
#include "extent-tree.h"
@@ -671,16 +670,18 @@ delete:
}
if (del_item && extent_start != 0 && !control->skip_ref_updates) {
- struct btrfs_ref ref = { 0 };
+ struct btrfs_ref ref = {
+ .action = BTRFS_DROP_DELAYED_REF,
+ .bytenr = extent_start,
+ .num_bytes = extent_num_bytes,
+ .owning_root = btrfs_root_id(root),
+ .ref_root = btrfs_header_owner(leaf),
+ };
bytes_deleted += extent_num_bytes;
- btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
- extent_start, extent_num_bytes, 0,
- root->root_key.objectid);
- btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
- control->ino, extent_offset,
- root->root_key.objectid, false);
+ btrfs_init_data_ref(&ref, control->ino, extent_offset,
+ btrfs_root_id(root), false);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h
index 4337bb26f4..c4aded8270 100644
--- a/fs/btrfs/inode-item.h
+++ b/fs/btrfs/inode-item.h
@@ -6,14 +6,15 @@
#include <linux/types.h>
#include <linux/crc32c.h>
+struct fscrypt_str;
+struct extent_buffer;
struct btrfs_trans_handle;
struct btrfs_root;
struct btrfs_path;
struct btrfs_key;
struct btrfs_inode_extref;
struct btrfs_inode;
-struct extent_buffer;
-struct fscrypt_str;
+struct btrfs_truncate_control;
/*
* Return this if we need to call truncate_block for the last bit of the
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d3c534c1bf..39d22693e4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -39,14 +39,12 @@
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
-#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "bio.h"
#include "compression.h"
#include "locking.h"
-#include "free-space-cache.h"
#include "props.h"
#include "qgroup.h"
#include "delalloc-space.h"
@@ -256,7 +254,7 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off
btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation");
btrfs_warn_rl(fs_info,
"csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
- inode->root->root_key.objectid, btrfs_ino(inode), file_off,
+ btrfs_root_id(inode->root), btrfs_ino(inode), file_off,
CSUM_FMT_VALUE(csum_size, csum),
CSUM_FMT_VALUE(csum_size, csum_expected),
mirror_num);
@@ -266,7 +264,7 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off
logical += file_off;
btrfs_warn_rl(fs_info,
"csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
- inode->root->root_key.objectid,
+ btrfs_root_id(inode->root),
btrfs_ino(inode), file_off, logical,
CSUM_FMT_VALUE(csum_size, csum),
CSUM_FMT_VALUE(csum_size, csum_expected),
@@ -333,15 +331,15 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
const u32 csum_size = root->fs_info->csum_size;
/* For data reloc tree, it's better to do a backref lookup instead. */
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (btrfs_root_id(root) == BTRFS_DATA_RELOC_TREE_OBJECTID)
return print_data_reloc_error(inode, logical_start, csum,
csum_expected, mirror_num);
/* Output without objectid, which is more meaningful */
- if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) {
+ if (btrfs_root_id(root) >= BTRFS_LAST_FREE_OBJECTID) {
btrfs_warn_rl(root->fs_info,
"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
- root->root_key.objectid, btrfs_ino(inode),
+ btrfs_root_id(root), btrfs_ino(inode),
logical_start,
CSUM_FMT_VALUE(csum_size, csum),
CSUM_FMT_VALUE(csum_size, csum_expected),
@@ -349,7 +347,7 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
} else {
btrfs_warn_rl(root->fs_info,
"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
- root->root_key.objectid, btrfs_ino(inode),
+ btrfs_root_id(root), btrfs_ino(inode),
logical_start,
CSUM_FMT_VALUE(csum_size, csum),
CSUM_FMT_VALUE(csum_size, csum_expected),
@@ -514,12 +512,13 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, bool extent_inserted,
size_t size, size_t compressed_size,
int compress_type,
- struct page **compressed_pages,
+ struct folio *compressed_folio,
bool update_i_size)
{
struct btrfs_root *root = inode->root;
struct extent_buffer *leaf;
struct page *page = NULL;
+ const u32 sectorsize = trans->fs_info->sectorsize;
char *kaddr;
unsigned long ptr;
struct btrfs_file_extent_item *ei;
@@ -527,10 +526,23 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
size_t cur_size = size;
u64 i_size;
- ASSERT((compressed_size > 0 && compressed_pages) ||
- (compressed_size == 0 && !compressed_pages));
+ /*
+ * The decompressed size must still be no larger than a sector. Under a
+ * heavy race, we can have size == 0 passed in, but that shouldn't be a
+ * big deal and we can continue the insertion.
+ */
+ ASSERT(size <= sectorsize);
- if (compressed_size && compressed_pages)
+ /*
+ * The compressed size also needs to be no larger than a sector.
+ * That's also why we only need one page as the parameter.
+ */
+ if (compressed_folio)
+ ASSERT(compressed_size <= sectorsize);
+ else
+ ASSERT(compressed_size == 0);
+
+ if (compressed_size && compressed_folio)
cur_size = compressed_size;
if (!extent_inserted) {
@@ -558,21 +570,10 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
ptr = btrfs_file_extent_inline_start(ei);
if (compress_type != BTRFS_COMPRESS_NONE) {
- struct page *cpage;
- int i = 0;
- while (compressed_size > 0) {
- cpage = compressed_pages[i];
- cur_size = min_t(unsigned long, compressed_size,
- PAGE_SIZE);
-
- kaddr = kmap_local_page(cpage);
- write_extent_buffer(leaf, kaddr, ptr, cur_size);
- kunmap_local(kaddr);
+ kaddr = kmap_local_folio(compressed_folio, 0);
+ write_extent_buffer(leaf, kaddr, ptr, compressed_size);
+ kunmap_local(kaddr);
- i++;
- ptr += cur_size;
- compressed_size -= cur_size;
- }
btrfs_set_file_extent_compression(leaf, ei,
compress_type);
} else {
@@ -613,17 +614,62 @@ fail:
return ret;
}
+static bool can_cow_file_range_inline(struct btrfs_inode *inode,
+ u64 offset, u64 size,
+ size_t compressed_size)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ u64 data_len = (compressed_size ?: size);
+
+ /* Inline extents must start at offset 0. */
+ if (offset != 0)
+ return false;
+
+ /*
+ * Due to the page size limit, for subpage we can only trigger
+ * writeback for the dirty sectors of a page, which means data
+ * writeback is doing more writeback than what we want.
+ *
+ * This is especially unexpected for some call sites like fallocate,
+ * where we only increase i_size after everything is done.
+ * This means we can trigger inline extent creation even if we didn't want to.
+ * So here we skip inline extent creation completely.
+ */
+ if (fs_info->sectorsize != PAGE_SIZE)
+ return false;
+
+ /* Inline extents are limited to sectorsize. */
+ if (size > fs_info->sectorsize)
+ return false;
+
+ /* We cannot exceed the maximum inline data size. */
+ if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
+ return false;
+
+ /* We cannot exceed the user specified max_inline size. */
+ if (data_len > fs_info->max_inline)
+ return false;
+
+ /* Inline extents must be the entirety of the file. */
+ if (size < i_size_read(&inode->vfs_inode))
+ return false;
+
+ return true;
+}
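The helper above gathers every condition that gates inline extent creation in
one place. As a minimal stand-alone sketch (plain user-space C, with
hypothetical constants standing in for the fs_info derived limits), the same
decision reads:

	#include <stdbool.h>
	#include <stdint.h>

	#define SECTORSIZE       4096u	/* assumes sectorsize == PAGE_SIZE */
	#define MAX_INLINE_LEAF  2048u	/* stand-in for BTRFS_MAX_INLINE_DATA_SIZE() */
	#define MAX_INLINE_MOUNT 2048u	/* stand-in for fs_info->max_inline */

	static bool can_inline(uint64_t offset, uint64_t size,
			       uint64_t compressed_size, uint64_t i_size)
	{
		uint64_t data_len = compressed_size ? compressed_size : size;

		if (offset != 0)			/* must start at file offset 0 */
			return false;
		if (size > SECTORSIZE)			/* uncompressed data limited to one sector */
			return false;
		if (data_len > MAX_INLINE_LEAF)		/* must fit in a leaf */
			return false;
		if (data_len > MAX_INLINE_MOUNT)	/* must respect the mount option */
			return false;
		if (size < i_size)			/* must cover the whole file */
			return false;
		return true;
	}

So a small file written from offset 0 qualifies, while a write starting at a
non-zero offset, spanning more than a sector, or not covering the whole i_size
falls back to a regular extent.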
/*
* conditionally insert an inline extent into the file. This
* does the checks required to make sure the data is small enough
* to fit as an inline extent.
+ *
+ * If called directly, the caller must have already checked that we're allowed
+ * to cow the range, i.e. that can_cow_file_range_inline() returned true.
*/
-static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
- size_t compressed_size,
- int compress_type,
- struct page **compressed_pages,
- bool update_i_size)
+static noinline int __cow_file_range_inline(struct btrfs_inode *inode, u64 offset,
+ u64 size, size_t compressed_size,
+ int compress_type,
+ struct folio *compressed_folio,
+ bool update_i_size)
{
struct btrfs_drop_extents_args drop_args = { 0 };
struct btrfs_root *root = inode->root;
@@ -633,18 +679,6 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
int ret;
struct btrfs_path *path;
- /*
- * We can create an inline extent if it ends at or beyond the current
- * i_size, is no larger than a sector (decompressed), and the (possibly
- * compressed) data fits in a leaf and the configured maximum inline
- * size.
- */
- if (size < i_size_read(&inode->vfs_inode) ||
- size > fs_info->sectorsize ||
- data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
- data_len > fs_info->max_inline)
- return 1;
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -670,7 +704,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
size, compressed_size, compress_type,
- compressed_pages, update_i_size);
+ compressed_folio, update_i_size);
if (ret && ret != -ENOSPC) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -703,12 +737,48 @@ out:
return ret;
}
+static noinline int cow_file_range_inline(struct btrfs_inode *inode,
+ struct page *locked_page,
+ u64 offset, u64 end,
+ size_t compressed_size,
+ int compress_type,
+ struct folio *compressed_folio,
+ bool update_i_size)
+{
+ struct extent_state *cached = NULL;
+ unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING | EXTENT_LOCKED;
+ u64 size = min_t(u64, i_size_read(&inode->vfs_inode), end + 1);
+ int ret;
+
+ if (!can_cow_file_range_inline(inode, offset, size, compressed_size))
+ return 1;
+
+ lock_extent(&inode->io_tree, offset, end, &cached);
+ ret = __cow_file_range_inline(inode, offset, size, compressed_size,
+ compress_type, compressed_folio,
+ update_i_size);
+ if (ret > 0) {
+ unlock_extent(&inode->io_tree, offset, end, &cached);
+ return ret;
+ }
+
+ if (ret == 0)
+ locked_page = NULL;
+
+ extent_clear_unlock_delalloc(inode, offset, end, locked_page, &cached,
+ clear_flags,
+ PAGE_UNLOCK | PAGE_START_WRITEBACK |
+ PAGE_END_WRITEBACK);
+ return ret;
+}
+
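The wrapper above now takes care of locking the range, calling the raw helper
and clearing the delalloc state on completion or error, while keeping the
historical return convention. A hedged sketch of how a caller is expected to
interpret the result (the real call sites follow later in this diff):

	ret = cow_file_range_inline(inode, locked_page, start, end, 0,
				    BTRFS_COMPRESS_NONE, NULL, false);
	if (ret > 0) {
		/* Range did not qualify, fall through to the regular COW path. */
	} else if (ret == 0) {
		/* Inline extent created, pages unlocked and delalloc cleared. */
	} else {
		/* Error, cleanup already done by the helper, propagate ret. */
	}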
struct async_extent {
u64 start;
u64 ram_size;
u64 compressed_size;
- struct page **pages;
- unsigned long nr_pages;
+ struct folio **folios;
+ unsigned long nr_folios;
int compress_type;
struct list_head list;
};
@@ -733,19 +803,20 @@ struct async_cow {
static noinline int add_async_extent(struct async_chunk *cow,
u64 start, u64 ram_size,
u64 compressed_size,
- struct page **pages,
- unsigned long nr_pages,
+ struct folio **folios,
+ unsigned long nr_folios,
int compress_type)
{
struct async_extent *async_extent;
async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
- BUG_ON(!async_extent); /* -ENOMEM */
+ if (!async_extent)
+ return -ENOMEM;
async_extent->start = start;
async_extent->ram_size = ram_size;
async_extent->compressed_size = compressed_size;
- async_extent->pages = pages;
- async_extent->nr_pages = nr_pages;
+ async_extent->folios = folios;
+ async_extent->nr_folios = nr_folios;
async_extent->compress_type = compress_type;
list_add_tail(&async_extent->list, &cow->extents);
return 0;
@@ -849,8 +920,8 @@ static void compress_file_range(struct btrfs_work *work)
u64 actual_end;
u64 i_size;
int ret = 0;
- struct page **pages;
- unsigned long nr_pages;
+ struct folio **folios;
+ unsigned long nr_folios;
unsigned long total_compressed = 0;
unsigned long total_in = 0;
unsigned int poff;
@@ -880,9 +951,9 @@ static void compress_file_range(struct btrfs_work *work)
barrier();
actual_end = min_t(u64, i_size, end + 1);
again:
- pages = NULL;
- nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
- nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES);
+ folios = NULL;
+ nr_folios = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
+ nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED_PAGES);
/*
* we don't want to send crud past the end of i_size through
@@ -931,8 +1002,8 @@ again:
if (!inode_need_compress(inode, start, end))
goto cleanup_and_bail_uncompressed;
- pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
- if (!pages) {
+ folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS);
+ if (!folios) {
/*
* Memory allocation failure is not a fatal error, we can fall
* back to uncompressed code.
@@ -946,9 +1017,9 @@ again:
compress_type = inode->prop_compress;
/* Compression level is applied here. */
- ret = btrfs_compress_pages(compress_type | (fs_info->compress_level << 4),
- mapping, start, pages, &nr_pages, &total_in,
- &total_compressed);
+ ret = btrfs_compress_folios(compress_type | (fs_info->compress_level << 4),
+ mapping, start, folios, &nr_folios, &total_in,
+ &total_compressed);
if (ret)
goto mark_incompressible;
@@ -958,7 +1029,7 @@ again:
*/
poff = offset_in_page(total_compressed);
if (poff)
- memzero_page(pages[nr_pages - 1], poff, PAGE_SIZE - poff);
+ folio_zero_range(folios[nr_folios - 1], poff, PAGE_SIZE - poff);
/*
* Try to create an inline extent.
@@ -969,43 +1040,16 @@ again:
* Check cow_file_range() for why we don't even try to create inline
* extent for the subpage case.
*/
- if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
- if (total_in < actual_end) {
- ret = cow_file_range_inline(inode, actual_end, 0,
- BTRFS_COMPRESS_NONE, NULL,
- false);
- } else {
- ret = cow_file_range_inline(inode, actual_end,
- total_compressed,
- compress_type, pages,
- false);
- }
- if (ret <= 0) {
- unsigned long clear_flags = EXTENT_DELALLOC |
- EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
- EXTENT_DO_ACCOUNTING;
-
- if (ret < 0)
- mapping_set_error(mapping, -EIO);
-
- /*
- * inline extent creation worked or returned error,
- * we don't need to create any more async work items.
- * Unlock and free up our temp pages.
- *
- * We use DO_ACCOUNTING here because we need the
- * delalloc_release_metadata to be done _after_ we drop
- * our outstanding extent for clearing delalloc for this
- * range.
- */
- extent_clear_unlock_delalloc(inode, start, end,
- NULL,
- clear_flags,
- PAGE_UNLOCK |
- PAGE_START_WRITEBACK |
- PAGE_END_WRITEBACK);
- goto free_pages;
- }
+ if (total_in < actual_end)
+ ret = cow_file_range_inline(inode, NULL, start, end, 0,
+ BTRFS_COMPRESS_NONE, NULL, false);
+ else
+ ret = cow_file_range_inline(inode, NULL, start, end, total_compressed,
+ compress_type, folios[0], false);
+ if (ret <= 0) {
+ if (ret < 0)
+ mapping_set_error(mapping, -EIO);
+ goto free_pages;
}
/*
@@ -1027,8 +1071,9 @@ again:
* The async work queues will take care of doing actual allocation on
* disk for these compressed pages, and will submit the bios.
*/
- add_async_extent(async_chunk, start, total_in, total_compressed, pages,
- nr_pages, compress_type);
+ ret = add_async_extent(async_chunk, start, total_in, total_compressed, folios,
+ nr_folios, compress_type);
+ BUG_ON(ret);
if (start + total_in < end) {
start += total_in;
cond_resched();
@@ -1040,15 +1085,16 @@ mark_incompressible:
if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress)
inode->flags |= BTRFS_INODE_NOCOMPRESS;
cleanup_and_bail_uncompressed:
- add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
- BTRFS_COMPRESS_NONE);
+ ret = add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
+ BTRFS_COMPRESS_NONE);
+ BUG_ON(ret);
free_pages:
- if (pages) {
- for (i = 0; i < nr_pages; i++) {
- WARN_ON(pages[i]->mapping);
- btrfs_free_compr_page(pages[i]);
+ if (folios) {
+ for (i = 0; i < nr_folios; i++) {
+ WARN_ON(folios[i]->mapping);
+ btrfs_free_compr_folio(folios[i]);
}
- kfree(pages);
+ kfree(folios);
}
}
@@ -1056,16 +1102,16 @@ static void free_async_extent_pages(struct async_extent *async_extent)
{
int i;
- if (!async_extent->pages)
+ if (!async_extent->folios)
return;
- for (i = 0; i < async_extent->nr_pages; i++) {
- WARN_ON(async_extent->pages[i]->mapping);
- btrfs_free_compr_page(async_extent->pages[i]);
+ for (i = 0; i < async_extent->nr_folios; i++) {
+ WARN_ON(async_extent->folios[i]->mapping);
+ btrfs_free_compr_folio(async_extent->folios[i]);
}
- kfree(async_extent->pages);
- async_extent->nr_pages = 0;
- async_extent->pages = NULL;
+ kfree(async_extent->folios);
+ async_extent->nr_folios = 0;
+ async_extent->folios = NULL;
}
static void submit_uncompressed_range(struct btrfs_inode *inode,
@@ -1112,6 +1158,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
struct btrfs_ordered_extent *ordered;
struct btrfs_key ins;
struct page *locked_page = NULL;
+ struct extent_state *cached = NULL;
struct extent_map *em;
int ret = 0;
u64 start = async_extent->start;
@@ -1131,7 +1178,6 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
if (!(start >= locked_page_end || end <= locked_page_start))
locked_page = async_chunk->locked_page;
}
- lock_extent(io_tree, start, end, NULL);
if (async_extent->compress_type == BTRFS_COMPRESS_NONE) {
submit_uncompressed_range(inode, async_extent, locked_page);
@@ -1153,6 +1199,8 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
goto done;
}
+ lock_extent(io_tree, start, end, &cached);
+
/* Here we're doing allocation and writeback of the compressed pages */
em = create_io_em(inode, start,
async_extent->ram_size, /* len */
@@ -1186,11 +1234,11 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
/* Clear dirty, set writeback and unlock the pages. */
extent_clear_unlock_delalloc(inode, start, end,
- NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
+ NULL, &cached, EXTENT_LOCKED | EXTENT_DELALLOC,
PAGE_UNLOCK | PAGE_START_WRITEBACK);
btrfs_submit_compressed_write(ordered,
- async_extent->pages, /* compressed_pages */
- async_extent->nr_pages,
+ async_extent->folios, /* compressed_folios */
+ async_extent->nr_folios,
async_chunk->write_flags, true);
*alloc_hint = ins.objectid + ins.offset;
done:
@@ -1204,7 +1252,8 @@ out_free_reserve:
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
mapping_set_error(inode->vfs_inode.i_mapping, -EIO);
extent_clear_unlock_delalloc(inode, start, end,
- NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
+ NULL, &cached,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
PAGE_UNLOCK | PAGE_START_WRITEBACK |
@@ -1214,7 +1263,7 @@ out_free_reserve:
kthread_associate_blkcg(NULL);
btrfs_debug(fs_info,
"async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
- root->root_key.objectid, btrfs_ino(inode), start,
+ btrfs_root_id(root), btrfs_ino(inode), start,
async_extent->ram_size, ret);
kfree(async_extent);
}
@@ -1286,6 +1335,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_state *cached = NULL;
u64 alloc_hint = 0;
u64 orig_start = start;
u64 num_bytes;
@@ -1311,53 +1361,21 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
- /*
- * Due to the page size limit, for subpage we can only trigger the
- * writeback for the dirty sectors of page, that means data writeback
- * is doing more writeback than what we want.
- *
- * This is especially unexpected for some call sites like fallocate,
- * where we only increase i_size after everything is done.
- * This means we can trigger inline extent even if we didn't want to.
- * So here we skip inline extent creation completely.
- */
- if (start == 0 && fs_info->sectorsize == PAGE_SIZE && !no_inline) {
- u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode),
- end + 1);
-
+ if (!no_inline) {
/* lets try to make an inline extent */
- ret = cow_file_range_inline(inode, actual_end, 0,
+ ret = cow_file_range_inline(inode, locked_page, start, end, 0,
BTRFS_COMPRESS_NONE, NULL, false);
- if (ret == 0) {
- /*
- * We use DO_ACCOUNTING here because we need the
- * delalloc_release_metadata to be run _after_ we drop
- * our outstanding extent for clearing delalloc for this
- * range.
- */
- extent_clear_unlock_delalloc(inode, start, end,
- locked_page,
- EXTENT_LOCKED | EXTENT_DELALLOC |
- EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
- EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
- PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
+ if (ret <= 0) {
/*
- * locked_page is locked by the caller of
- * writepage_delalloc(), not locked by
- * __process_pages_contig().
- *
- * We can't let __process_pages_contig() to unlock it,
- * as it doesn't have any subpage::writers recorded.
+ * We succeeded, return 1 so the caller knows we're done
+ * with this page and already handled the IO.
*
- * Here we manually unlock the page, since the caller
- * can't determine if it's an inline extent or a
- * compressed extent.
+ * If there was an error then cow_file_range_inline() has
+ * already done the cleanup.
*/
- unlock_page(locked_page);
- ret = 1;
+ if (ret == 0)
+ ret = 1;
goto done;
- } else if (ret < 0) {
- goto out_unlock;
}
}
@@ -1417,6 +1435,10 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
extent_reserved = true;
ram_size = ins.offset;
+
+ lock_extent(&inode->io_tree, start, start + ram_size - 1,
+ &cached);
+
em = create_io_em(inode, start, ins.offset, /* len */
start, /* orig_start */
ins.objectid, /* block_start */
@@ -1426,6 +1448,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
BTRFS_COMPRESS_NONE, /* compress_type */
BTRFS_ORDERED_REGULAR /* type */);
if (IS_ERR(em)) {
+ unlock_extent(&inode->io_tree, start,
+ start + ram_size - 1, &cached);
ret = PTR_ERR(em);
goto out_reserve;
}
@@ -1436,6 +1460,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
0, 1 << BTRFS_ORDERED_REGULAR,
BTRFS_COMPRESS_NONE);
if (IS_ERR(ordered)) {
+ unlock_extent(&inode->io_tree, start,
+ start + ram_size - 1, &cached);
ret = PTR_ERR(ordered);
goto out_drop_extent_cache;
}
@@ -1475,7 +1501,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
page_ops |= PAGE_SET_ORDERED;
extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
- locked_page,
+ locked_page, &cached,
EXTENT_LOCKED | EXTENT_DELALLOC,
page_ops);
if (num_bytes < cur_alloc_size)
@@ -1534,10 +1560,17 @@ out_unlock:
if (!locked_page)
mapping_set_error(inode->vfs_inode.i_mapping, ret);
extent_clear_unlock_delalloc(inode, orig_start, start - 1,
- locked_page, 0, page_ops);
+ locked_page, NULL, 0, page_ops);
}
/*
+ * At this point we're unlocked, and we want to make sure we're only
+ * clearing these flags under the extent lock, so lock the rest of the
+ * range and clear everything up.
+ */
+ lock_extent(&inode->io_tree, start, end, NULL);
+
+ /*
* For the range (2). If we reserved an extent for our delalloc range
* (or a subrange) and failed to create the respective ordered extent,
* then it means that when we reserved the extent we decremented the
@@ -1550,7 +1583,7 @@ out_unlock:
if (extent_reserved) {
extent_clear_unlock_delalloc(inode, start,
start + cur_alloc_size - 1,
- locked_page,
+ locked_page, &cached,
clear_bits,
page_ops);
start += cur_alloc_size;
@@ -1565,7 +1598,7 @@ out_unlock:
if (start < end) {
clear_bits |= EXTENT_CLEAR_DATA_RESV;
extent_clear_unlock_delalloc(inode, start, end, locked_page,
- clear_bits, page_ops);
+ &cached, clear_bits, page_ops);
}
return ret;
}
@@ -1638,7 +1671,6 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode,
if (!ctx)
return false;
- unlock_extent(&inode->io_tree, start, end, NULL);
set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
async_chunk = ctx->chunks;
@@ -1732,29 +1764,6 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode,
return 1;
}
-static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
- u64 bytenr, u64 num_bytes, bool nowait)
-{
- struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr);
- struct btrfs_ordered_sum *sums;
- int ret;
- LIST_HEAD(list);
-
- ret = btrfs_lookup_csums_list(csum_root, bytenr, bytenr + num_bytes - 1,
- &list, 0, nowait);
- if (ret == 0 && list_empty(&list))
- return 0;
-
- while (!list_empty(&list)) {
- sums = list_entry(list.next, struct btrfs_ordered_sum, list);
- list_del(&sums->list);
- kfree(sums);
- }
- if (ret < 0)
- return ret;
- return 1;
-}
-
static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
const u64 start, const u64 end)
{
@@ -1762,6 +1771,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
const u64 range_bytes = end + 1 - start;
struct extent_io_tree *io_tree = &inode->io_tree;
+ struct extent_state *cached_state = NULL;
u64 range_start = start;
u64 count;
int ret;
@@ -1798,6 +1808,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
* group that contains that extent to RO mode and therefore force COW
* when starting writeback.
*/
+ lock_extent(io_tree, start, end, &cached_state);
count = count_range_bits(io_tree, &range_start, end, range_bytes,
EXTENT_NORESERVE, 0, NULL);
if (count > 0 || is_space_ino || is_reloc_ino) {
@@ -1816,6 +1827,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
NULL);
}
+ unlock_extent(io_tree, start, end, &cached_state);
/*
* Don't try to create inline extents, as a mix of inline extent that
@@ -1869,6 +1881,7 @@ static int can_nocow_file_extent(struct btrfs_path *path,
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_root *root = inode->root;
struct btrfs_file_extent_item *fi;
+ struct btrfs_root *csum_root;
u64 extent_end;
u8 extent_type;
int can_nocow = 0;
@@ -1929,7 +1942,7 @@ static int can_nocow_file_extent(struct btrfs_path *path,
if (args->free_path) {
/*
* We don't need the path anymore, plus through the
- * csum_exist_in_range() call below we will end up allocating
+ * btrfs_lookup_csums_list() call below we will end up allocating
* another path. So free the path to avoid unnecessary extra
* memory usage.
*/
@@ -1950,8 +1963,11 @@ static int can_nocow_file_extent(struct btrfs_path *path,
* Force COW if csums exist in the range. This ensures that csums for a
* given extent are either valid or do not exist.
*/
- ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes,
- nowait);
+
+ csum_root = btrfs_csum_root(root->fs_info, args->disk_bytenr);
+ ret = btrfs_lookup_csums_list(csum_root, args->disk_bytenr,
+ args->disk_bytenr + args->num_bytes - 1,
+ NULL, nowait);
WARN_ON_ONCE(ret > 0 && is_freespace_inode);
if (ret != 0)
goto out;
@@ -2001,12 +2017,13 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
nocow_args.end = end;
nocow_args.writeback_path = true;
- while (1) {
+ while (cur_offset <= end) {
struct btrfs_block_group *nocow_bg = NULL;
struct btrfs_ordered_extent *ordered;
struct btrfs_key found_key;
struct btrfs_file_extent_item *fi;
struct extent_buffer *leaf;
+ struct extent_state *cached_state = NULL;
u64 extent_end;
u64 ram_bytes;
u64 nocow_end;
@@ -2144,6 +2161,8 @@ must_cow:
}
nocow_end = cur_offset + nocow_args.num_bytes - 1;
+ lock_extent(&inode->io_tree, cur_offset, nocow_end, &cached_state);
+
is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC;
if (is_prealloc) {
u64 orig_start = found_key.offset - nocow_args.extent_offset;
@@ -2157,6 +2176,8 @@ must_cow:
ram_bytes, BTRFS_COMPRESS_NONE,
BTRFS_ORDERED_PREALLOC);
if (IS_ERR(em)) {
+ unlock_extent(&inode->io_tree, cur_offset,
+ nocow_end, &cached_state);
btrfs_dec_nocow_writers(nocow_bg);
ret = PTR_ERR(em);
goto error;
@@ -2177,6 +2198,8 @@ must_cow:
btrfs_drop_extent_map_range(inode, cur_offset,
nocow_end, false);
}
+ unlock_extent(&inode->io_tree, cur_offset,
+ nocow_end, &cached_state);
ret = PTR_ERR(ordered);
goto error;
}
@@ -2191,8 +2214,8 @@ must_cow:
btrfs_put_ordered_extent(ordered);
extent_clear_unlock_delalloc(inode, cur_offset, nocow_end,
- locked_page, EXTENT_LOCKED |
- EXTENT_DELALLOC |
+ locked_page, &cached_state,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_CLEAR_DATA_RESV,
PAGE_UNLOCK | PAGE_SET_ORDERED);
@@ -2205,8 +2228,6 @@ must_cow:
*/
if (ret)
goto error;
- if (cur_offset > end)
- break;
}
btrfs_release_path(path);
@@ -2232,13 +2253,23 @@ error:
*/
if (cow_start != (u64)-1)
cur_offset = cow_start;
- if (cur_offset < end)
+
+ /*
+ * We need to lock the extent here because we're clearing DELALLOC and
+ * we're not locked at this point.
+ */
+ if (cur_offset < end) {
+ struct extent_state *cached = NULL;
+
+ lock_extent(&inode->io_tree, cur_offset, end, &cached);
extent_clear_unlock_delalloc(inode, cur_offset, end,
- locked_page, EXTENT_LOCKED |
- EXTENT_DELALLOC | EXTENT_DEFRAG |
+ locked_page, &cached,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
PAGE_START_WRITEBACK |
PAGE_END_WRITEBACK);
+ }
btrfs_free_path(path);
return ret;
}
@@ -2301,6 +2332,8 @@ void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 size;
+ lockdep_assert_held(&inode->io_tree.lock);
+
/* not delalloc, ignore it */
if (!(orig->state & EXTENT_DELALLOC))
return;
@@ -2339,6 +2372,8 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state
u64 new_size, old_size;
u32 num_extents;
+ lockdep_assert_held(&inode->io_tree.lock);
+
/* not delalloc, ignore it */
if (!(other->state & EXTENT_DELALLOC))
return;
@@ -2386,55 +2421,50 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state
spin_unlock(&inode->lock);
}
-static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
- struct btrfs_inode *inode)
+static void btrfs_add_delalloc_inode(struct btrfs_inode *inode)
{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
spin_lock(&root->delalloc_lock);
- if (list_empty(&inode->delalloc_inodes)) {
- list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
- set_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags);
- root->nr_delalloc_inodes++;
- if (root->nr_delalloc_inodes == 1) {
- spin_lock(&fs_info->delalloc_root_lock);
- BUG_ON(!list_empty(&root->delalloc_root));
- list_add_tail(&root->delalloc_root,
- &fs_info->delalloc_roots);
- spin_unlock(&fs_info->delalloc_root_lock);
- }
+ ASSERT(list_empty(&inode->delalloc_inodes));
+ list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
+ root->nr_delalloc_inodes++;
+ if (root->nr_delalloc_inodes == 1) {
+ spin_lock(&fs_info->delalloc_root_lock);
+ ASSERT(list_empty(&root->delalloc_root));
+ list_add_tail(&root->delalloc_root, &fs_info->delalloc_roots);
+ spin_unlock(&fs_info->delalloc_root_lock);
}
spin_unlock(&root->delalloc_lock);
}
-void __btrfs_del_delalloc_inode(struct btrfs_root *root,
- struct btrfs_inode *inode)
+void btrfs_del_delalloc_inode(struct btrfs_inode *inode)
{
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
+ lockdep_assert_held(&root->delalloc_lock);
+
+ /*
+ * We may be called after the inode was already deleted from the list,
+ * namely in the transaction abort path btrfs_destroy_delalloc_inodes(),
+ * and then later through btrfs_clear_delalloc_extent() while the inode
+ * still has ->delalloc_bytes > 0.
+ */
if (!list_empty(&inode->delalloc_inodes)) {
list_del_init(&inode->delalloc_inodes);
- clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
- &inode->runtime_flags);
root->nr_delalloc_inodes--;
if (!root->nr_delalloc_inodes) {
ASSERT(list_empty(&root->delalloc_inodes));
spin_lock(&fs_info->delalloc_root_lock);
- BUG_ON(list_empty(&root->delalloc_root));
+ ASSERT(!list_empty(&root->delalloc_root));
list_del_init(&root->delalloc_root);
spin_unlock(&fs_info->delalloc_root_lock);
}
}
}
-static void btrfs_del_delalloc_inode(struct btrfs_root *root,
- struct btrfs_inode *inode)
-{
- spin_lock(&root->delalloc_lock);
- __btrfs_del_delalloc_inode(root, inode);
- spin_unlock(&root->delalloc_lock);
-}
-
/*
* Properly track delayed allocation bytes in the inode and to maintain the
* list of inodes that have pending delalloc work to be done.
@@ -2444,6 +2474,8 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ lockdep_assert_held(&inode->io_tree.lock);
+
if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
WARN_ON(1);
/*
@@ -2452,10 +2484,9 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s
* bit, which is only set or cleared with irqs on
*/
if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
- struct btrfs_root *root = inode->root;
u64 len = state->end + 1 - state->start;
+ u64 prev_delalloc_bytes;
u32 num_extents = count_max_extents(fs_info, len);
- bool do_list = !btrfs_is_free_space_inode(inode);
spin_lock(&inode->lock);
btrfs_mod_outstanding_extents(inode, num_extents);
@@ -2468,13 +2499,20 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s
percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
fs_info->delalloc_batch);
spin_lock(&inode->lock);
+ prev_delalloc_bytes = inode->delalloc_bytes;
inode->delalloc_bytes += len;
if (bits & EXTENT_DEFRAG)
inode->defrag_bytes += len;
- if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
- &inode->runtime_flags))
- btrfs_add_delalloc_inodes(root, inode);
spin_unlock(&inode->lock);
+
+ /*
+ * We don't need to be under the protection of the inode's lock,
+ * because we are called while holding the inode's io_tree lock
+ * and are therefore protected against concurrent calls of this
+ * function and btrfs_clear_delalloc_extent().
+ */
+ if (!btrfs_is_free_space_inode(inode) && prev_delalloc_bytes == 0)
+ btrfs_add_delalloc_inode(inode);
}
if (!(state->state & EXTENT_DELALLOC_NEW) &&
@@ -2496,6 +2534,8 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
u64 len = state->end + 1 - state->start;
u32 num_extents = count_max_extents(fs_info, len);
+ lockdep_assert_held(&inode->io_tree.lock);
+
if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) {
spin_lock(&inode->lock);
inode->defrag_bytes -= len;
@@ -2509,7 +2549,7 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
*/
if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = inode->root;
- bool do_list = !btrfs_is_free_space_inode(inode);
+ u64 new_delalloc_bytes;
spin_lock(&inode->lock);
btrfs_mod_outstanding_extents(inode, -num_extents);
@@ -2529,7 +2569,8 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
return;
if (!btrfs_is_data_reloc_root(root) &&
- do_list && !(state->state & EXTENT_NORESERVE) &&
+ !btrfs_is_free_space_inode(inode) &&
+ !(state->state & EXTENT_NORESERVE) &&
(bits & EXTENT_CLEAR_DATA_RESV))
btrfs_free_reserved_data_space_noquota(fs_info, len);
@@ -2537,11 +2578,20 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
fs_info->delalloc_batch);
spin_lock(&inode->lock);
inode->delalloc_bytes -= len;
- if (do_list && inode->delalloc_bytes == 0 &&
- test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
- &inode->runtime_flags))
- btrfs_del_delalloc_inode(root, inode);
+ new_delalloc_bytes = inode->delalloc_bytes;
spin_unlock(&inode->lock);
+
+ /*
+ * We don't need to be under the protection of the inode's lock,
+ * because we are called while holding the inode's io_tree lock
+ * and are therefore protected against concurrent calls of this
+ * function and btrfs_set_delalloc_extent().
+ */
+ if (!btrfs_is_free_space_inode(inode) && new_delalloc_bytes == 0) {
+ spin_lock(&root->delalloc_lock);
+ btrfs_del_delalloc_inode(inode);
+ spin_unlock(&root->delalloc_lock);
+ }
}
if ((state->state & EXTENT_DELALLOC_NEW) &&
@@ -2631,7 +2681,7 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
u64 em_len;
int ret = 0;
- em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
+ em = btrfs_get_extent(inode, NULL, search_start, search_len);
if (IS_ERR(em))
return PTR_ERR(em);
@@ -3161,7 +3211,6 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
btrfs_abort_transaction(trans, ret);
goto out;
}
- ret = 0;
out:
clear_extent_bit(&inode->io_tree, start, end, clear_bits,
&cached_state);
@@ -3180,9 +3229,8 @@ out:
* set the mapping error, so we need to set it if we're the ones
* marking this ordered extent as failed.
*/
- if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR,
- &ordered_extent->flags))
- mapping_set_error(ordered_extent->inode->i_mapping, -EIO);
+ if (ret)
+ btrfs_mark_ordered_extent_error(ordered_extent);
if (truncated)
unwritten_start += logical_len;
@@ -3236,7 +3284,7 @@ out:
* Actually free the qgroup rsv which was released when
* the ordered extent was created.
*/
- btrfs_qgroup_free_refroot(fs_info, inode->root->root_key.objectid,
+ btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(inode->root),
ordered_extent->qgroup_rsv,
BTRFS_QGROUP_RSV_DATA);
}
@@ -3903,7 +3951,7 @@ cache_acl:
btrfs_err(fs_info,
"error loading props for ino %llu (root %llu): %d",
btrfs_ino(BTRFS_I(inode)),
- root->root_key.objectid, ret);
+ btrfs_root_id(root), ret);
}
if (path != in_path)
btrfs_free_path(path);
@@ -4262,7 +4310,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
/* This needs to handle no-key deletions later on */
if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
- objectid = inode->root->root_key.objectid;
+ objectid = btrfs_root_id(inode->root);
} else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
objectid = inode->location.objectid;
} else {
@@ -4320,7 +4368,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
} else {
ret = btrfs_del_root_ref(trans, objectid,
- root->root_key.objectid, dir_ino,
+ btrfs_root_id(root), dir_ino,
&index, &fname.disk_name);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -4370,7 +4418,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
dir_id, &name, 0);
if (di && !IS_ERR(di)) {
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
- if (key.objectid == root->root_key.objectid) {
+ if (key.objectid == btrfs_root_id(root)) {
ret = -EPERM;
btrfs_err(fs_info,
"deleting default subvolume %llu is not allowed",
@@ -4380,21 +4428,27 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
btrfs_release_path(path);
}
- key.objectid = root->root_key.objectid;
+ key.objectid = btrfs_root_id(root);
key.type = BTRFS_ROOT_REF_KEY;
key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
if (ret < 0)
goto out;
- BUG_ON(ret == 0);
+ if (ret == 0) {
+ /*
+ * Key with offset -1 found: there would have to exist a root
+ * with such an id, but that is outside the valid range.
+ */
+ ret = -EUCLEAN;
+ goto out;
+ }
ret = 0;
if (path->slots[0] > 0) {
path->slots[0]--;
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.objectid == root->root_key.objectid &&
- key.type == BTRFS_ROOT_REF_KEY)
+ if (key.objectid == btrfs_root_id(root) && key.type == BTRFS_ROOT_REF_KEY)
ret = -ENOTEMPTY;
}
out:
@@ -4406,64 +4460,26 @@ out:
static void btrfs_prune_dentries(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct rb_node *node;
- struct rb_node *prev;
- struct btrfs_inode *entry;
- struct inode *inode;
- u64 objectid = 0;
+ struct btrfs_inode *inode;
+ u64 min_ino = 0;
if (!BTRFS_FS_ERROR(fs_info))
WARN_ON(btrfs_root_refs(&root->root_item) != 0);
- spin_lock(&root->inode_lock);
-again:
- node = root->inode_tree.rb_node;
- prev = NULL;
- while (node) {
- prev = node;
- entry = rb_entry(node, struct btrfs_inode, rb_node);
-
- if (objectid < btrfs_ino(entry))
- node = node->rb_left;
- else if (objectid > btrfs_ino(entry))
- node = node->rb_right;
- else
- break;
- }
- if (!node) {
- while (prev) {
- entry = rb_entry(prev, struct btrfs_inode, rb_node);
- if (objectid <= btrfs_ino(entry)) {
- node = prev;
- break;
- }
- prev = rb_next(prev);
- }
- }
- while (node) {
- entry = rb_entry(node, struct btrfs_inode, rb_node);
- objectid = btrfs_ino(entry) + 1;
- inode = igrab(&entry->vfs_inode);
- if (inode) {
- spin_unlock(&root->inode_lock);
- if (atomic_read(&inode->i_count) > 1)
- d_prune_aliases(inode);
- /*
- * btrfs_drop_inode will have it removed from the inode
- * cache when its usage count hits zero.
- */
- iput(inode);
- cond_resched();
- spin_lock(&root->inode_lock);
- goto again;
- }
-
- if (cond_resched_lock(&root->inode_lock))
- goto again;
+ inode = btrfs_find_first_inode(root, min_ino);
+ while (inode) {
+ if (atomic_read(&inode->vfs_inode.i_count) > 1)
+ d_prune_aliases(&inode->vfs_inode);
- node = rb_next(node);
+ min_ino = btrfs_ino(inode) + 1;
+ /*
+ * btrfs_drop_inode() will have it removed from the inode
+ * cache when its usage count hits zero.
+ */
+ iput(&inode->vfs_inode);
+ cond_resched();
+ inode = btrfs_find_first_inode(root, min_ino);
}
- spin_unlock(&root->inode_lock);
}
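The open coded rb-tree walk is replaced by iteration with
btrfs_find_first_inode(), which (assuming it returns a referenced inode with
an inode number greater than or equal to the given minimum, as its use here
suggests) reduces the pruning loop to this idiom:

	struct btrfs_inode *inode = btrfs_find_first_inode(root, 0);

	while (inode) {
		u64 next_ino = btrfs_ino(inode) + 1;

		/* ... per-inode work, e.g. d_prune_aliases() ... */
		iput(&inode->vfs_inode);
		inode = btrfs_find_first_inode(root, next_ino);
	}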
int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
@@ -4490,7 +4506,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
spin_unlock(&dest->root_item_lock);
btrfs_warn(fs_info,
"attempt to delete subvolume %llu during send",
- dest->root_key.objectid);
+ btrfs_root_id(dest));
ret = -EPERM;
goto out_up_write;
}
@@ -4498,7 +4514,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
spin_unlock(&dest->root_item_lock);
btrfs_warn(fs_info,
"attempt to delete subvolume %llu with active swapfile",
- root->root_key.objectid);
+ btrfs_root_id(root));
ret = -EPERM;
goto out_up_write;
}
@@ -4559,7 +4575,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
ret = btrfs_insert_orphan_item(trans,
fs_info->tree_root,
- dest->root_key.objectid);
+ btrfs_root_id(dest));
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
@@ -4567,8 +4583,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
}
ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
- BTRFS_UUID_KEY_SUBVOL,
- dest->root_key.objectid);
+ BTRFS_UUID_KEY_SUBVOL, btrfs_root_id(dest));
if (ret && ret != -ENOENT) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
@@ -4577,7 +4592,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
ret = btrfs_uuid_tree_remove(trans,
dest->root_item.received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
- dest->root_key.objectid);
+ btrfs_root_id(dest));
if (ret && ret != -ENOENT) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
@@ -4618,7 +4633,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
- int err = 0;
+ int ret = 0;
struct btrfs_trans_handle *trans;
u64 last_unlink_trans;
struct fscrypt_name fname;
@@ -4634,33 +4649,33 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
return btrfs_delete_subvolume(BTRFS_I(dir), dentry);
}
- err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
- if (err)
- return err;
+ ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
+ if (ret)
+ return ret;
/* This needs to handle no-key deletions later on */
trans = __unlink_start_trans(BTRFS_I(dir));
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
goto out_notrans;
}
if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
- err = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
+ ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
goto out;
}
- err = btrfs_orphan_add(trans, BTRFS_I(inode));
- if (err)
+ ret = btrfs_orphan_add(trans, BTRFS_I(inode));
+ if (ret)
goto out;
last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
/* now the directory is empty */
- err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
+ ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
&fname.disk_name);
- if (!err) {
+ if (!ret) {
btrfs_i_size_write(BTRFS_I(inode), 0);
/*
* Propagate the last_unlink_trans value of the deleted dir to
@@ -4682,7 +4697,7 @@ out_notrans:
btrfs_btree_balance_dirty(fs_info);
fscrypt_free_filename(&fname);
- return err;
+ return ret;
}
/*
@@ -4710,7 +4725,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
u32 blocksize = fs_info->sectorsize;
pgoff_t index = from >> PAGE_SHIFT;
unsigned offset = from & (blocksize - 1);
- struct page *page;
+ struct folio *folio;
gfp_t mask = btrfs_alloc_write_mask(mapping);
size_t write_bytes = blocksize;
int ret = 0;
@@ -4742,8 +4757,9 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
goto out;
}
again:
- page = find_or_create_page(mapping, index, mask);
- if (!page) {
+ folio = __filemap_get_folio(mapping, index,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
+ if (IS_ERR(folio)) {
btrfs_delalloc_release_space(inode, data_reserved, block_start,
blocksize, true);
btrfs_delalloc_release_extents(inode, blocksize);
@@ -4751,15 +4767,15 @@ again:
goto out;
}
- if (!PageUptodate(page)) {
- ret = btrfs_read_folio(NULL, page_folio(page));
- lock_page(page);
- if (page->mapping != mapping) {
- unlock_page(page);
- put_page(page);
+ if (!folio_test_uptodate(folio)) {
+ ret = btrfs_read_folio(NULL, folio);
+ folio_lock(folio);
+ if (folio->mapping != mapping) {
+ folio_unlock(folio);
+ folio_put(folio);
goto again;
}
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
ret = -EIO;
goto out_unlock;
}
@@ -4771,19 +4787,19 @@ again:
* folio private, but left the page in the mapping. Set the page mapped
* here to make sure it's properly set for the subpage stuff.
*/
- ret = set_page_extent_mapped(page);
+ ret = set_folio_extent_mapped(folio);
if (ret < 0)
goto out_unlock;
- wait_on_page_writeback(page);
+ folio_wait_writeback(folio);
lock_extent(io_tree, block_start, block_end, &cached_state);
ordered = btrfs_lookup_ordered_extent(inode, block_start);
if (ordered) {
unlock_extent(io_tree, block_start, block_end, &cached_state);
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
goto again;
@@ -4804,15 +4820,16 @@ again:
if (!len)
len = blocksize - offset;
if (front)
- memzero_page(page, (block_start - page_offset(page)),
- offset);
+ folio_zero_range(folio, block_start - folio_pos(folio),
+ offset);
else
- memzero_page(page, (block_start - page_offset(page)) + offset,
- len);
+ folio_zero_range(folio,
+ (block_start - folio_pos(folio)) + offset,
+ len);
}
- btrfs_folio_clear_checked(fs_info, page_folio(page), block_start,
+ btrfs_folio_clear_checked(fs_info, folio, block_start,
block_end + 1 - block_start);
- btrfs_folio_set_dirty(fs_info, page_folio(page), block_start,
+ btrfs_folio_set_dirty(fs_info, folio, block_start,
block_end + 1 - block_start);
unlock_extent(io_tree, block_start, block_end, &cached_state);
@@ -4829,8 +4846,8 @@ out_unlock:
block_start, blocksize, true);
}
btrfs_delalloc_release_extents(inode, blocksize);
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
out:
if (only_release_metadata)
btrfs_check_nocow_unlock(inode);
@@ -4904,16 +4921,16 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
u64 last_byte;
u64 cur_offset;
u64 hole_size;
- int err = 0;
+ int ret = 0;
/*
* If our size started in the middle of a block we need to zero out the
* rest of the block before we expand the i_size, otherwise we could
* expose stale data.
*/
- err = btrfs_truncate_block(inode, oldsize, 0, 0);
- if (err)
- return err;
+ ret = btrfs_truncate_block(inode, oldsize, 0, 0);
+ if (ret)
+ return ret;
if (size <= hole_start)
return 0;
@@ -4922,10 +4939,9 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
&cached_state);
cur_offset = hole_start;
while (1) {
- em = btrfs_get_extent(inode, NULL, 0, cur_offset,
- block_end - cur_offset);
+ em = btrfs_get_extent(inode, NULL, cur_offset, block_end - cur_offset);
if (IS_ERR(em)) {
- err = PTR_ERR(em);
+ ret = PTR_ERR(em);
em = NULL;
break;
}
@@ -4936,13 +4952,13 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
if (!(em->flags & EXTENT_FLAG_PREALLOC)) {
struct extent_map *hole_em;
- err = maybe_insert_hole(inode, cur_offset, hole_size);
- if (err)
+ ret = maybe_insert_hole(inode, cur_offset, hole_size);
+ if (ret)
break;
- err = btrfs_inode_set_file_extent_range(inode,
+ ret = btrfs_inode_set_file_extent_range(inode,
cur_offset, hole_size);
- if (err)
+ if (ret)
break;
hole_em = alloc_extent_map();
@@ -4963,12 +4979,12 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
hole_em->ram_bytes = hole_size;
hole_em->generation = btrfs_get_fs_generation(fs_info);
- err = btrfs_replace_extent_map_range(inode, hole_em, true);
+ ret = btrfs_replace_extent_map_range(inode, hole_em, true);
free_extent_map(hole_em);
} else {
- err = btrfs_inode_set_file_extent_range(inode,
+ ret = btrfs_inode_set_file_extent_range(inode,
cur_offset, hole_size);
- if (err)
+ if (ret)
break;
}
next:
@@ -4980,7 +4996,7 @@ next:
}
free_extent_map(em);
unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
- return err;
+ return ret;
}
static int btrfs_setsize(struct inode *inode, struct iattr *attr)
@@ -5256,7 +5272,7 @@ void btrfs_evict_inode(struct inode *inode)
if (inode->i_nlink &&
((btrfs_root_refs(&root->root_item) != 0 &&
- root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
+ btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID) ||
btrfs_is_free_space_inode(BTRFS_I(inode))))
goto out;
@@ -5268,7 +5284,7 @@ void btrfs_evict_inode(struct inode *inode)
if (inode->i_nlink > 0) {
BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
- root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
+ btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID);
goto out;
}
@@ -5440,7 +5456,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
}
err = -ENOENT;
- key.objectid = dir->root->root_key.objectid;
+ key.objectid = btrfs_root_id(dir->root);
key.type = BTRFS_ROOT_REF_KEY;
key.offset = location->objectid;
@@ -5549,7 +5565,6 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
BTRFS_I(inode)->location.offset = 0;
BTRFS_I(inode)->root = btrfs_grab_root(args->root);
- BUG_ON(args->root && !BTRFS_I(inode)->root);
if (args->root && args->root == args->root->fs_info->tree_root &&
args->ino != BTRFS_BTREE_INODE_OBJECTID)
@@ -6400,8 +6415,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
if (ret) {
btrfs_err(fs_info,
"error inheriting props for ino %llu (root %llu): %d",
- btrfs_ino(BTRFS_I(inode)), root->root_key.objectid,
- ret);
+ btrfs_ino(BTRFS_I(inode)), btrfs_root_id(root), ret);
}
/*
@@ -6474,7 +6488,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_add_root_ref(trans, key.objectid,
- root->root_key.objectid, parent_ino,
+ btrfs_root_id(root), parent_ino,
index, name);
} else if (add_backref) {
ret = btrfs_insert_inode_ref(trans, root, name,
@@ -6517,7 +6531,7 @@ fail_dir_item:
u64 local_index;
int err;
err = btrfs_del_root_ref(trans, key.objectid,
- root->root_key.objectid, parent_ino,
+ btrfs_root_id(root), parent_ino,
&local_index, name);
if (err)
btrfs_abort_transaction(trans, err);
@@ -6615,7 +6629,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
int drop_inode = 0;
/* do not allow sys_link's with other subvols of the same device */
- if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid)
+ if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root))
return -EXDEV;
if (inode->i_nlink >= BTRFS_LINK_MAX)
@@ -6772,7 +6786,6 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path
*
* @inode: file to search in
* @page: page to read extent data into if the extent is inline
- * @pg_offset: offset into @page to copy to
* @start: file offset
* @len: length of range starting at @start
*
@@ -6786,8 +6799,7 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path
* Return: ERR_PTR on error, non-NULL extent_map on success.
*/
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset,
- u64 start, u64 len)
+ struct page *page, u64 start, u64 len)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
int ret = 0;
@@ -6930,7 +6942,6 @@ next:
* ensured by tree-checker and inline extent creation path.
* Thus all members representing file offsets should be zero.
*/
- ASSERT(pg_offset == 0);
ASSERT(extent_start == 0);
ASSERT(em->start == 0);
@@ -6965,7 +6976,7 @@ insert:
}
write_lock(&em_tree->lock);
- ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
+ ret = btrfs_add_extent_mapping(inode, &em, start, len);
write_unlock(&em_tree->lock);
out:
btrfs_free_path(path);
@@ -7292,11 +7303,49 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
struct extent_map *em;
int ret;
+ /*
+ * Note the missing NOCOW type.
+ *
+ * For pure NOCOW writes, we should not create an io extent map, but
+ * just reuse the existing one.
+ * Only PREALLOC writes (NOCOW write into preallocated range) can
+ * create an io extent map.
+ */
ASSERT(type == BTRFS_ORDERED_PREALLOC ||
type == BTRFS_ORDERED_COMPRESSED ||
- type == BTRFS_ORDERED_NOCOW ||
type == BTRFS_ORDERED_REGULAR);
+ switch (type) {
+ case BTRFS_ORDERED_PREALLOC:
+ /* Uncompressed extents. */
+ ASSERT(block_len == len);
+
+ /* We're only referring to part of a larger preallocated extent. */
+ ASSERT(block_len <= ram_bytes);
+ break;
+ case BTRFS_ORDERED_REGULAR:
+ /* Uncompressed extents. */
+ ASSERT(block_len == len);
+
+ /* COW results in a new extent matching our file extent size. */
+ ASSERT(orig_block_len == len);
+ ASSERT(ram_bytes == len);
+
+ /* Since it's a new extent, we should not have any offset. */
+ ASSERT(orig_start == start);
+ break;
+ case BTRFS_ORDERED_COMPRESSED:
+ /* Must be compressed. */
+ ASSERT(compress_type != BTRFS_COMPRESS_NONE);
+
+ /*
+ * Encoded writes can make us refer to part of the
+ * uncompressed extent.
+ */
+ ASSERT(len <= ram_bytes);
+ break;
+ }
+
em = alloc_extent_map();
if (!em)
return ERR_PTR(-ENOMEM);
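Taken together, the new switch pins down the following relationships between
the ordered extent type and the extent map fields (a restatement of the
asserts above):

	PREALLOC:    block_len == len, block_len <= ram_bytes
	REGULAR:     block_len == len, orig_block_len == len, ram_bytes == len, orig_start == start
	COMPRESSED:  compress_type != BTRFS_COMPRESS_NONE, len <= ram_bytes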
@@ -7310,9 +7359,7 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
em->ram_bytes = ram_bytes;
em->generation = -1;
em->flags |= EXTENT_FLAG_PINNED;
- if (type == BTRFS_ORDERED_PREALLOC)
- em->flags |= EXTENT_FLAG_FILLING;
- else if (type == BTRFS_ORDERED_COMPRESSED)
+ if (type == BTRFS_ORDERED_COMPRESSED)
extent_map_set_compression(em, compress_type);
ret = btrfs_replace_extent_map_range(inode, em, true);
@@ -7571,7 +7618,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
if (ret < 0)
goto err;
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto unlock_err;
@@ -7899,17 +7946,6 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return ret;
}
-static int btrfs_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- return extent_writepages(mapping, wbc);
-}
-
-static void btrfs_readahead(struct readahead_control *rac)
-{
- extent_readahead(rac);
-}
-
/*
* For release_folio() and invalidate_folio() we have a race window where
* folio_end_writeback() is called but the subpage spinlock is not yet released.
@@ -7946,13 +7982,12 @@ static void wait_subpage_spinlock(struct page *page)
static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
{
- int ret = try_release_extent_mapping(&folio->page, gfp_flags);
-
- if (ret == 1) {
+ if (try_release_extent_mapping(&folio->page, gfp_flags)) {
wait_subpage_spinlock(&folio->page);
clear_page_extent_mapped(&folio->page);
+ return true;
}
- return ret;
+ return false;
}
static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
@@ -8150,173 +8185,6 @@ next:
clear_page_extent_mapped(&folio->page);
}
-/*
- * btrfs_page_mkwrite() is not allowed to change the file size as it gets
- * called from a page fault handler when a page is first dirtied. Hence we must
- * be careful to check for EOF conditions here. We set the page up correctly
- * for a written page which means we get ENOSPC checking when writing into
- * holes and correct delalloc and unwritten extent mapping on filesystems that
- * support these features.
- *
- * We are not allowed to take the i_mutex here so we have to play games to
- * protect against truncate races as the page could now be beyond EOF. Because
- * truncate_setsize() writes the inode size before removing pages, once we have
- * the page lock we can determine safely if the page is beyond EOF. If it is not
- * beyond EOF, then the page is guaranteed safe against truncation until we
- * unlock the page.
- */
-vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
-{
- struct page *page = vmf->page;
- struct folio *folio = page_folio(page);
- struct inode *inode = file_inode(vmf->vma->vm_file);
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct btrfs_ordered_extent *ordered;
- struct extent_state *cached_state = NULL;
- struct extent_changeset *data_reserved = NULL;
- unsigned long zero_start;
- loff_t size;
- vm_fault_t ret;
- int ret2;
- int reserved = 0;
- u64 reserved_space;
- u64 page_start;
- u64 page_end;
- u64 end;
-
- ASSERT(folio_order(folio) == 0);
-
- reserved_space = PAGE_SIZE;
-
- sb_start_pagefault(inode->i_sb);
- page_start = page_offset(page);
- page_end = page_start + PAGE_SIZE - 1;
- end = page_end;
-
- /*
- * Reserving delalloc space after obtaining the page lock can lead to
- * deadlock. For example, if a dirty page is locked by this function
- * and the call to btrfs_delalloc_reserve_space() ends up triggering
- * dirty page write out, then the btrfs_writepages() function could
- * end up waiting indefinitely to get a lock on the page currently
- * being processed by btrfs_page_mkwrite() function.
- */
- ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
- page_start, reserved_space);
- if (!ret2) {
- ret2 = file_update_time(vmf->vma->vm_file);
- reserved = 1;
- }
- if (ret2) {
- ret = vmf_error(ret2);
- if (reserved)
- goto out;
- goto out_noreserve;
- }
-
- ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
-again:
- down_read(&BTRFS_I(inode)->i_mmap_lock);
- lock_page(page);
- size = i_size_read(inode);
-
- if ((page->mapping != inode->i_mapping) ||
- (page_start >= size)) {
- /* page got truncated out from underneath us */
- goto out_unlock;
- }
- wait_on_page_writeback(page);
-
- lock_extent(io_tree, page_start, page_end, &cached_state);
- ret2 = set_page_extent_mapped(page);
- if (ret2 < 0) {
- ret = vmf_error(ret2);
- unlock_extent(io_tree, page_start, page_end, &cached_state);
- goto out_unlock;
- }
-
- /*
- * we can't set the delalloc bits if there are pending ordered
- * extents. Drop our locks and wait for them to finish
- */
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
- PAGE_SIZE);
- if (ordered) {
- unlock_extent(io_tree, page_start, page_end, &cached_state);
- unlock_page(page);
- up_read(&BTRFS_I(inode)->i_mmap_lock);
- btrfs_start_ordered_extent(ordered);
- btrfs_put_ordered_extent(ordered);
- goto again;
- }
-
- if (page->index == ((size - 1) >> PAGE_SHIFT)) {
- reserved_space = round_up(size - page_start,
- fs_info->sectorsize);
- if (reserved_space < PAGE_SIZE) {
- end = page_start + reserved_space - 1;
- btrfs_delalloc_release_space(BTRFS_I(inode),
- data_reserved, page_start,
- PAGE_SIZE - reserved_space, true);
- }
- }
-
- /*
- * page_mkwrite gets called when the page is firstly dirtied after it's
- * faulted in, but write(2) could also dirty a page and set delalloc
- * bits, thus in this case for space account reason, we still need to
- * clear any delalloc bits within this page range since we have to
- * reserve data&meta space before lock_page() (see above comments).
- */
- clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, &cached_state);
-
- ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
- &cached_state);
- if (ret2) {
- unlock_extent(io_tree, page_start, page_end, &cached_state);
- ret = VM_FAULT_SIGBUS;
- goto out_unlock;
- }
-
- /* page is wholly or partially inside EOF */
- if (page_start + PAGE_SIZE > size)
- zero_start = offset_in_page(size);
- else
- zero_start = PAGE_SIZE;
-
- if (zero_start != PAGE_SIZE)
- memzero_page(page, zero_start, PAGE_SIZE - zero_start);
-
- btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
- btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
- btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start);
-
- btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
-
- unlock_extent(io_tree, page_start, page_end, &cached_state);
- up_read(&BTRFS_I(inode)->i_mmap_lock);
-
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
- sb_end_pagefault(inode->i_sb);
- extent_changeset_free(data_reserved);
- return VM_FAULT_LOCKED;
-
-out_unlock:
- unlock_page(page);
- up_read(&BTRFS_I(inode)->i_mmap_lock);
-out:
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
- reserved_space, (ret != 0));
-out_noreserve:
- sb_end_pagefault(inode->i_sb);
- extent_changeset_free(data_reserved);
- return ret;
-}
-
static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
{
struct btrfs_truncate_control control = {
@@ -8716,7 +8584,7 @@ int __init btrfs_init_cachep(void)
{
btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
sizeof(struct btrfs_inode), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
+ SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
init_once);
if (!btrfs_inode_cachep)
goto fail;
@@ -8765,6 +8633,9 @@ static int btrfs_getattr(struct mnt_idmap *idmap,
generic_fillattr(idmap, request_mask, inode, stat);
stat->dev = BTRFS_I(inode)->root->anon_dev;
+ stat->subvol = BTRFS_I(inode)->root->root_key.objectid;
+ stat->result_mask |= STATX_SUBVOL;
+
spin_lock(&BTRFS_I(inode)->lock);
delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
inode_bytes = inode_get_bytes(inode);
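The two lines added in btrfs_getattr() above expose the subvolume id to user
space via statx(). A minimal user-space sketch for reading it (assuming kernel
and libc headers recent enough to provide STATX_SUBVOL and stx_subvol):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/stat.h>

	int main(int argc, char **argv)
	{
		struct statx stx;

		if (statx(AT_FDCWD, argc > 1 ? argv[1] : ".",
			  AT_STATX_SYNC_AS_STAT, STATX_SUBVOL, &stx) != 0) {
			perror("statx");
			return 1;
		}
		if (stx.stx_mask & STATX_SUBVOL)
			printf("subvolume id: %llu\n",
			       (unsigned long long)stx.stx_subvol);
		else
			printf("STATX_SUBVOL not reported for this file\n");
		return 0;
	}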
@@ -9644,7 +9515,7 @@ free_qgroup:
* or we leak qgroup data reservation.
*/
btrfs_qgroup_free_refroot(inode->root->fs_info,
- inode->root->root_key.objectid, qgroup_released,
+ btrfs_root_id(inode->root), qgroup_released,
BTRFS_QGROUP_RSV_DATA);
return ERR_PTR(ret);
}
@@ -10180,7 +10051,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
cond_resched();
}
- em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1);
+ em = btrfs_get_extent(inode, NULL, start, lockend - start + 1);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out_unlock_extent;
@@ -10292,8 +10163,8 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
size_t orig_count;
u64 start, end;
u64 num_bytes, ram_bytes, disk_num_bytes;
- unsigned long nr_pages, i;
- struct page **pages;
+ unsigned long nr_folios, i;
+ struct folio **folios;
struct btrfs_key ins;
bool extent_reserved = false;
struct extent_map *em;
@@ -10382,24 +10253,24 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
* isn't.
*/
disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
- nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
- pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
- if (!pages)
+ nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
+	folios = kvcalloc(nr_folios, sizeof(struct folio *), GFP_KERNEL_ACCOUNT);
+ if (!folios)
return -ENOMEM;
- for (i = 0; i < nr_pages; i++) {
+ for (i = 0; i < nr_folios; i++) {
size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
char *kaddr;
- pages[i] = alloc_page(GFP_KERNEL_ACCOUNT);
- if (!pages[i]) {
+ folios[i] = folio_alloc(GFP_KERNEL_ACCOUNT, 0);
+ if (!folios[i]) {
ret = -ENOMEM;
- goto out_pages;
+ goto out_folios;
}
- kaddr = kmap_local_page(pages[i]);
+ kaddr = kmap_local_folio(folios[i], 0);
if (copy_from_iter(kaddr, bytes, from) != bytes) {
kunmap_local(kaddr);
ret = -EFAULT;
- goto out_pages;
+ goto out_folios;
}
if (bytes < PAGE_SIZE)
memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
@@ -10411,12 +10282,12 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes);
if (ret)
- goto out_pages;
+ goto out_folios;
ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
start >> PAGE_SHIFT,
end >> PAGE_SHIFT);
if (ret)
- goto out_pages;
+ goto out_folios;
lock_extent(io_tree, start, end, &cached_state);
ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
if (!ordered &&
@@ -10444,10 +10315,12 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
goto out_qgroup_free_data;
/* Try an inline extent first. */
- if (start == 0 && encoded->unencoded_len == encoded->len &&
- encoded->unencoded_offset == 0) {
- ret = cow_file_range_inline(inode, encoded->len, orig_count,
- compression, pages, true);
+ if (encoded->unencoded_len == encoded->len &&
+ encoded->unencoded_offset == 0 &&
+ can_cow_file_range_inline(inode, start, encoded->len, orig_count)) {
+ ret = __cow_file_range_inline(inode, start, encoded->len,
+ orig_count, compression, folios[0],
+ true);
if (ret <= 0) {
if (ret == 0)
ret = orig_count;
@@ -10491,7 +10364,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
btrfs_delalloc_release_extents(inode, num_bytes);
- btrfs_submit_compressed_write(ordered, pages, nr_pages, 0, false);
+ btrfs_submit_compressed_write(ordered, folios, nr_folios, 0, false);
ret = orig_count;
goto out;
@@ -10513,12 +10386,12 @@ out_free_data_space:
btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
out_unlock:
unlock_extent(io_tree, start, end, &cached_state);
-out_pages:
- for (i = 0; i < nr_pages; i++) {
- if (pages[i])
- __free_page(pages[i]);
+out_folios:
+ for (i = 0; i < nr_folios; i++) {
+ if (folios[i])
+ folio_put(folios[i]);
}
- kvfree(pages);
+ kvfree(folios);
out:
if (ret >= 0)
iocb->ki_pos += encoded->len;
@@ -10745,7 +10618,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
btrfs_exclop_finish(fs_info);
btrfs_warn(fs_info,
"cannot activate swapfile because subvolume %llu is being deleted",
- root->root_key.objectid);
+ btrfs_root_id(root));
return -EPERM;
}
atomic_inc(&root->nr_swapfiles);
@@ -10760,7 +10633,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
struct btrfs_block_group *bg;
u64 len = isize - start;
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out;
@@ -10971,7 +10844,7 @@ void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 en
if (ordered) {
btrfs_err(root->fs_info,
"found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])",
- start, end, btrfs_ino(inode), root->root_key.objectid,
+ start, end, btrfs_ino(inode), btrfs_root_id(root),
ordered->file_offset,
ordered->file_offset + ordered->num_bytes - 1);
btrfs_put_ordered_extent(ordered);
@@ -10980,6 +10853,65 @@ void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 en
ASSERT(ordered == NULL);
}
+/*
+ * Find the first inode with a minimum number.
+ *
+ * @root:    The root to search in.
+ * @min_ino: The minimum inode number.
+ *
+ * Find the first inode in the @root with a number >= @min_ino and return it.
+ * Returns NULL if no such inode is found.
+ */
+struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino)
+{
+ struct rb_node *node;
+ struct rb_node *prev;
+ struct btrfs_inode *inode;
+
+ spin_lock(&root->inode_lock);
+again:
+ node = root->inode_tree.rb_node;
+ prev = NULL;
+ while (node) {
+ prev = node;
+ inode = rb_entry(node, struct btrfs_inode, rb_node);
+ if (min_ino < btrfs_ino(inode))
+ node = node->rb_left;
+ else if (min_ino > btrfs_ino(inode))
+ node = node->rb_right;
+ else
+ break;
+ }
+
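+	/* No exact match, so find the first inode with a number >= min_ino. */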
+ if (!node) {
+ while (prev) {
+ inode = rb_entry(prev, struct btrfs_inode, rb_node);
+ if (min_ino <= btrfs_ino(inode)) {
+ node = prev;
+ break;
+ }
+ prev = rb_next(prev);
+ }
+ }
+
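+	/* Grab the first inode we can, skipping any that are being evicted. */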
+ while (node) {
+		inode = rb_entry(node, struct btrfs_inode, rb_node);
+ if (igrab(&inode->vfs_inode)) {
+ spin_unlock(&root->inode_lock);
+ return inode;
+ }
+
+ min_ino = btrfs_ino(inode) + 1;
+ if (cond_resched_lock(&root->inode_lock))
+ goto again;
+
+ node = rb_next(node);
+ }
+ spin_unlock(&root->inode_lock);
+
+ return NULL;
+}
+
static const struct inode_operations btrfs_dir_inode_operations = {
.getattr = btrfs_getattr,
.lookup = btrfs_lookup,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8851ba7a1e..c1b0556e40 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -34,11 +34,9 @@
#include "export.h"
#include "transaction.h"
#include "btrfs_inode.h"
-#include "print-tree.h"
#include "volumes.h"
#include "locking.h"
#include "backref.h"
-#include "rcu-string.h"
#include "send.h"
#include "dev-replace.h"
#include "props.h"
@@ -47,9 +45,7 @@
#include "tree-log.h"
#include "compression.h"
#include "space-info.h"
-#include "delalloc-space.h"
#include "block-group.h"
-#include "subpage.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
@@ -231,6 +227,20 @@ static int check_fsflags_compatible(struct btrfs_fs_info *fs_info,
return 0;
}
+int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args)
+{
+ if (memchr(vol_args->name, 0, sizeof(vol_args->name)) == NULL)
+ return -ENAMETOOLONG;
+ return 0;
+}
+
+static int btrfs_check_ioctl_vol_args2_subvol_name(const struct btrfs_ioctl_vol_args_v2 *vol_args2)
+{
+ if (memchr(vol_args2->name, 0, sizeof(vol_args2->name)) == NULL)
+ return -ENAMETOOLONG;
+ return 0;
+}
+
/*
* Set flags/xflags from the internal inode flags. The remaining items of
* fsxattr are zeroed.
@@ -658,7 +668,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
/* Tree log can't currently deal with an inode which is a new root. */
btrfs_set_log_full_commit(trans);
- ret = btrfs_qgroup_inherit(trans, 0, objectid, root->root_key.objectid, inherit);
+ ret = btrfs_qgroup_inherit(trans, 0, objectid, btrfs_root_id(root), inherit);
if (ret)
goto out;
@@ -709,6 +719,8 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
ret = btrfs_insert_root(trans, fs_info->tree_root, &key,
root_item);
if (ret) {
+ int ret2;
+
/*
* Since we don't abort the transaction in this case, free the
* tree block so that we don't leak space and leave the
@@ -719,7 +731,9 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
btrfs_tree_lock(leaf);
btrfs_clear_buffer_dirty(trans, leaf);
btrfs_tree_unlock(leaf);
- btrfs_free_tree_block(trans, objectid, leaf, 0, 1);
+ ret2 = btrfs_free_tree_block(trans, objectid, leaf, 0, 1);
+ if (ret2 < 0)
+ btrfs_abort_transaction(trans, ret2);
free_extent_buffer(leaf);
goto out;
}
@@ -929,7 +943,9 @@ static int btrfs_may_delete(struct mnt_idmap *idmap,
if (d_really_is_negative(victim))
return -ENOENT;
- BUG_ON(d_inode(victim->d_parent) != dir);
+ /* The @victim is not inside @dir. */
+ if (d_inode(victim->d_parent) != dir)
+ return -EINVAL;
audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
@@ -1147,7 +1163,10 @@ static noinline int btrfs_ioctl_resize(struct file *file,
ret = PTR_ERR(vol_args);
goto out_drop;
}
- vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ ret = btrfs_check_ioctl_vol_args_path(vol_args);
+ if (ret < 0)
+ goto out_free;
+
sizestr = vol_args->name;
cancel = (strcmp("cancel", sizestr) == 0);
ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_RESIZE, cancel);
@@ -1347,12 +1366,15 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args))
return PTR_ERR(vol_args);
- vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ ret = btrfs_check_ioctl_vol_args_path(vol_args);
+ if (ret < 0)
+ goto out;
ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file),
vol_args->name, vol_args->fd, subvol,
false, NULL);
+out:
kfree(vol_args);
return ret;
}
@@ -1371,7 +1393,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args))
return PTR_ERR(vol_args);
- vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
+ ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args);
+ if (ret < 0)
+ goto free_args;
if (vol_args->flags & ~BTRFS_SUBVOL_CREATE_ARGS_MASK) {
ret = -EOPNOTSUPP;
@@ -1490,7 +1514,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
spin_unlock(&root->root_item_lock);
btrfs_warn(fs_info,
"Attempt to set subvolume %llu read-write during send",
- root->root_key.objectid);
+ btrfs_root_id(root));
ret = -EPERM;
goto out_drop_sem;
}
@@ -1899,7 +1923,7 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
struct super_block *sb = inode->i_sb;
struct btrfs_key upper_limit = BTRFS_I(inode)->location;
- u64 treeid = BTRFS_I(inode)->root->root_key.objectid;
+ u64 treeid = btrfs_root_id(BTRFS_I(inode)->root);
u64 dirid = args->dirid;
unsigned long item_off;
unsigned long item_len;
@@ -2071,7 +2095,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct btrfs_root *root,
* path is reset so it's consistent with btrfs_search_path_in_tree.
*/
if (args->treeid == 0)
- args->treeid = root->root_key.objectid;
+ args->treeid = btrfs_root_id(root);
if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) {
args->name[0] = 0;
@@ -2167,7 +2191,7 @@ static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp)
fs_info = BTRFS_I(inode)->root->fs_info;
/* Get root_item of inode's subvolume */
- key.objectid = BTRFS_I(inode)->root->root_key.objectid;
+ key.objectid = btrfs_root_id(BTRFS_I(inode)->root);
root = btrfs_get_fs_root(fs_info, key.objectid, true);
if (IS_ERR(root)) {
ret = PTR_ERR(root);
@@ -2282,7 +2306,7 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root,
return PTR_ERR(rootrefs);
}
- objectid = root->root_key.objectid;
+ objectid = btrfs_root_id(root);
key.objectid = objectid;
key.type = BTRFS_ROOT_REF_KEY;
key.offset = rootrefs->min_treeid;
@@ -2366,7 +2390,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
struct mnt_idmap *idmap = file_mnt_idmap(file);
char *subvol_name, *subvol_name_ptr = NULL;
int subvol_namelen;
- int err = 0;
+ int ret = 0;
bool destroy_parent = false;
/* We don't support snapshots with extent tree v2 yet. */
@@ -2382,7 +2406,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
return PTR_ERR(vol_args2);
if (vol_args2->flags & ~BTRFS_SUBVOL_DELETE_ARGS_MASK) {
- err = -EOPNOTSUPP;
+ ret = -EOPNOTSUPP;
goto out;
}
@@ -2391,29 +2415,31 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
* name, same as v1 currently does.
*/
if (!(vol_args2->flags & BTRFS_SUBVOL_SPEC_BY_ID)) {
- vol_args2->name[BTRFS_SUBVOL_NAME_MAX] = 0;
+ ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args2);
+ if (ret < 0)
+ goto out;
subvol_name = vol_args2->name;
- err = mnt_want_write_file(file);
- if (err)
+ ret = mnt_want_write_file(file);
+ if (ret)
goto out;
} else {
struct inode *old_dir;
if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) {
- err = -EINVAL;
+ ret = -EINVAL;
goto out;
}
- err = mnt_want_write_file(file);
- if (err)
+ ret = mnt_want_write_file(file);
+ if (ret)
goto out;
dentry = btrfs_get_dentry(fs_info->sb,
BTRFS_FIRST_FREE_OBJECTID,
vol_args2->subvolid, 0);
if (IS_ERR(dentry)) {
- err = PTR_ERR(dentry);
+ ret = PTR_ERR(dentry);
goto out_drop_write;
}
@@ -2433,7 +2459,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
*/
dput(dentry);
if (IS_ERR(parent)) {
- err = PTR_ERR(parent);
+ ret = PTR_ERR(parent);
goto out_drop_write;
}
old_dir = dir;
@@ -2457,14 +2483,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
* to delete without an idmapped mount.
*/
if (old_dir != dir && idmap != &nop_mnt_idmap) {
- err = -EOPNOTSUPP;
+ ret = -EOPNOTSUPP;
goto free_parent;
}
subvol_name_ptr = btrfs_get_subvol_name_from_objectid(
fs_info, vol_args2->subvolid);
if (IS_ERR(subvol_name_ptr)) {
- err = PTR_ERR(subvol_name_ptr);
+ ret = PTR_ERR(subvol_name_ptr);
goto free_parent;
}
/* subvol_name_ptr is already nul terminated */
@@ -2475,11 +2501,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
if (IS_ERR(vol_args))
return PTR_ERR(vol_args);
- vol_args->name[BTRFS_PATH_NAME_MAX] = 0;
+ ret = btrfs_check_ioctl_vol_args_path(vol_args);
+ if (ret < 0)
+ goto out;
+
subvol_name = vol_args->name;
- err = mnt_want_write_file(file);
- if (err)
+ ret = mnt_want_write_file(file);
+ if (ret)
goto out;
}
@@ -2487,26 +2516,26 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
if (strchr(subvol_name, '/') ||
strncmp(subvol_name, "..", subvol_namelen) == 0) {
- err = -EINVAL;
+ ret = -EINVAL;
goto free_subvol_name;
}
if (!S_ISDIR(dir->i_mode)) {
- err = -ENOTDIR;
+ ret = -ENOTDIR;
goto free_subvol_name;
}
- err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
- if (err == -EINTR)
+ ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
+ if (ret == -EINTR)
goto free_subvol_name;
dentry = lookup_one(idmap, subvol_name, parent, subvol_namelen);
if (IS_ERR(dentry)) {
- err = PTR_ERR(dentry);
+ ret = PTR_ERR(dentry);
goto out_unlock_dir;
}
if (d_really_is_negative(dentry)) {
- err = -ENOENT;
+ ret = -ENOENT;
goto out_dput;
}
@@ -2526,7 +2555,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
* Users who want to delete empty subvols should try
* rmdir(2).
*/
- err = -EPERM;
+ ret = -EPERM;
if (!btrfs_test_opt(fs_info, USER_SUBVOL_RM_ALLOWED))
goto out_dput;
@@ -2537,29 +2566,29 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
* of the subvol, not a random directory contained
* within it.
*/
- err = -EINVAL;
+ ret = -EINVAL;
if (root == dest)
goto out_dput;
- err = inode_permission(idmap, inode, MAY_WRITE | MAY_EXEC);
- if (err)
+ ret = inode_permission(idmap, inode, MAY_WRITE | MAY_EXEC);
+ if (ret)
goto out_dput;
}
/* check if subvolume may be deleted by a user */
- err = btrfs_may_delete(idmap, dir, dentry, 1);
- if (err)
+ ret = btrfs_may_delete(idmap, dir, dentry, 1);
+ if (ret)
goto out_dput;
if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
- err = -EINVAL;
+ ret = -EINVAL;
goto out_dput;
}
btrfs_inode_lock(BTRFS_I(inode), 0);
- err = btrfs_delete_subvolume(BTRFS_I(dir), dentry);
+ ret = btrfs_delete_subvolume(BTRFS_I(dir), dentry);
btrfs_inode_unlock(BTRFS_I(inode), 0);
- if (!err)
+ if (!ret)
d_delete_notify(dir, dentry);
out_dput:
@@ -2576,7 +2605,7 @@ out_drop_write:
out:
kfree(vol_args2);
kfree(vol_args);
- return err;
+ return ret;
}
static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
@@ -2686,12 +2715,16 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
goto out;
}
- vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ ret = btrfs_check_ioctl_vol_args_path(vol_args);
+ if (ret < 0)
+ goto out_free;
+
ret = btrfs_init_new_device(fs_info, vol_args->name);
if (!ret)
btrfs_info(fs_info, "disk added %s", vol_args->name);
+out_free:
kfree(vol_args);
out:
if (restore_op)
@@ -2707,7 +2740,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_ioctl_vol_args_v2 *vol_args;
- struct bdev_handle *bdev_handle = NULL;
+ struct file *bdev_file = NULL;
int ret;
bool cancel = false;
@@ -2723,7 +2756,10 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
goto out;
}
- vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
+ ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args);
+ if (ret < 0)
+ goto out;
+
if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
args.devid = vol_args->devid;
} else if (!strcmp("cancel", vol_args->name)) {
@@ -2744,7 +2780,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
goto err_drop;
/* Exclusive operation is now claimed */
- ret = btrfs_rm_device(fs_info, &args, &bdev_handle);
+ ret = btrfs_rm_device(fs_info, &args, &bdev_file);
btrfs_exclop_finish(fs_info);
@@ -2758,8 +2794,8 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
}
err_drop:
mnt_drop_write_file(file);
- if (bdev_handle)
- bdev_release(bdev_handle);
+ if (bdev_file)
+ fput(bdev_file);
out:
btrfs_put_dev_args_from_path(&args);
kfree(vol_args);
@@ -2772,7 +2808,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_ioctl_vol_args *vol_args;
- struct bdev_handle *bdev_handle = NULL;
+ struct file *bdev_file = NULL;
int ret;
bool cancel = false;
@@ -2783,7 +2819,10 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
if (IS_ERR(vol_args))
return PTR_ERR(vol_args);
- vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ ret = btrfs_check_ioctl_vol_args_path(vol_args);
+ if (ret < 0)
+ goto out_free;
+
if (!strcmp("cancel", vol_args->name)) {
cancel = true;
} else {
@@ -2799,17 +2838,18 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
cancel);
if (ret == 0) {
- ret = btrfs_rm_device(fs_info, &args, &bdev_handle);
+ ret = btrfs_rm_device(fs_info, &args, &bdev_file);
if (!ret)
btrfs_info(fs_info, "disk deleted %s", vol_args->name);
btrfs_exclop_finish(fs_info);
}
mnt_drop_write_file(file);
- if (bdev_handle)
- bdev_release(bdev_handle);
+ if (bdev_file)
+ fput(bdev_file);
out:
btrfs_put_dev_args_from_path(&args);
+out_free:
kfree(vol_args);
return ret;
}
@@ -2945,7 +2985,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
ret = PTR_ERR(new_root);
goto out;
}
- if (!is_fstree(new_root->root_key.objectid)) {
+ if (!is_fstree(btrfs_root_id(new_root))) {
ret = -ENOENT;
goto out_free;
}
@@ -3911,7 +3951,7 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
qgroupid = sa->qgroupid;
if (!qgroupid) {
/* take the current subvol as qgroup */
- qgroupid = root->root_key.objectid;
+ qgroupid = btrfs_root_id(root);
}
ret = btrfs_limit_qgroup(trans, qgroupid, &sa->lim);
@@ -4042,7 +4082,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
!btrfs_is_empty_uuid(root_item->received_uuid)) {
ret = btrfs_uuid_tree_remove(trans, root_item->received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
- root->root_key.objectid);
+ btrfs_root_id(root));
if (ret && ret != -ENOENT) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
@@ -4066,7 +4106,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) {
ret = btrfs_uuid_tree_add(trans, sa->uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
- root->root_key.objectid);
+ btrfs_root_id(root));
if (ret < 0 && ret != -EEXIST) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index d51b9a2f2f..2c5dc25ec6 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -3,6 +3,15 @@
#ifndef BTRFS_IOCTL_H
#define BTRFS_IOCTL_H
+#include <linux/types.h>
+
+struct file;
+struct dentry;
+struct mnt_idmap;
+struct fileattr;
+struct btrfs_fs_info;
+struct btrfs_ioctl_balance_args;
+
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 74d8e2003f..6a0b7abb5b 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -13,7 +13,6 @@
#include "ctree.h"
#include "extent_io.h"
#include "locking.h"
-#include "accessors.h"
/*
* Lockdep class keys for extent_buffer->lock's in this root. For a given
@@ -85,7 +84,7 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int
{
struct btrfs_lockdep_keyset *ks;
- BUG_ON(level >= ARRAY_SIZE(ks->keys));
+ ASSERT(level < ARRAY_SIZE(ks->keys));
/* Find the matching keyset, id 0 is the default entry */
for (ks = btrfs_lockdep_keysets; ks->id; ks++)
@@ -98,7 +97,7 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int
void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buffer *eb)
{
if (test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
- btrfs_set_buffer_lockdep_class(root->root_key.objectid,
+ btrfs_set_buffer_lockdep_class(btrfs_root_id(root),
eb, btrfs_header_level(eb));
}
@@ -130,14 +129,14 @@ static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner) { }
*/
/*
- * __btrfs_tree_read_lock - lock extent buffer for read
+ * btrfs_tree_read_lock_nested - lock extent buffer for read
* @eb: the eb to be locked
* @nest: the nesting level to be used for lockdep
*
* This takes the read lock on the extent buffer, using the specified nesting
* level for lockdep purposes.
*/
-void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
+void btrfs_tree_read_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
{
u64 start_ns = 0;
@@ -148,11 +147,6 @@ void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting ne
trace_btrfs_tree_read_lock(eb, start_ns);
}
-void btrfs_tree_read_lock(struct extent_buffer *eb)
-{
- __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL);
-}
-
/*
* Try-lock for read.
*
@@ -199,7 +193,7 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb)
*
* Returns with the eb->lock write locked.
*/
-void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
+void btrfs_tree_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
__acquires(&eb->lock)
{
u64 start_ns = 0;
@@ -212,11 +206,6 @@ void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
trace_btrfs_tree_lock(eb, start_ns);
}
-void btrfs_tree_lock(struct extent_buffer *eb)
-{
- __btrfs_tree_lock(eb, BTRFS_NESTING_NORMAL);
-}
-
/*
* Release the write lock.
*/
@@ -375,8 +364,12 @@ void btrfs_drew_write_lock(struct btrfs_drew_lock *lock)
void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock)
{
- atomic_dec(&lock->writers);
- cond_wake_up(&lock->pending_readers);
+ /*
+ * atomic_dec_and_test() implies a full barrier, so woken up readers are
+ * guaranteed to see the decrement.
+ */
+ if (atomic_dec_and_test(&lock->writers))
+ wake_up(&lock->pending_readers);
}
void btrfs_drew_read_lock(struct btrfs_drew_lock *lock)
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 7d6ee1e609..1bc8e67388 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -8,8 +8,14 @@
#include <linux/atomic.h>
#include <linux/wait.h>
+#include <linux/lockdep.h>
#include <linux/percpu_counter.h>
#include "extent_io.h"
+#include "locking.h"
+
+struct extent_buffer;
+struct btrfs_path;
+struct btrfs_root;
#define BTRFS_WRITE_LOCK 1
#define BTRFS_READ_LOCK 2
@@ -157,14 +163,22 @@ enum btrfs_lockdep_trans_states {
static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES,
"too many lock subclasses defined");
-struct btrfs_path;
+void btrfs_tree_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesting nest);
+
+static inline void btrfs_tree_lock(struct extent_buffer *eb)
+{
+ btrfs_tree_lock_nested(eb, BTRFS_NESTING_NORMAL);
+}
-void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest);
-void btrfs_tree_lock(struct extent_buffer *eb);
void btrfs_tree_unlock(struct extent_buffer *eb);
-void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest);
-void btrfs_tree_read_lock(struct extent_buffer *eb);
+void btrfs_tree_read_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesting nest);
+
+static inline void btrfs_tree_read_lock(struct extent_buffer *eb)
+{
+ btrfs_tree_read_lock_nested(eb, BTRFS_NESTING_NORMAL);
+}
+
void btrfs_tree_read_unlock(struct extent_buffer *eb);
int btrfs_try_tree_read_lock(struct extent_buffer *eb);
int btrfs_try_tree_write_lock(struct extent_buffer *eb);
diff --git a/fs/btrfs/lru_cache.h b/fs/btrfs/lru_cache.h
index 00328c856b..e32906ab6f 100644
--- a/fs/btrfs/lru_cache.h
+++ b/fs/btrfs/lru_cache.h
@@ -3,8 +3,10 @@
#ifndef BTRFS_LRU_CACHE_H
#define BTRFS_LRU_CACHE_H
+#include <linux/types.h>
#include <linux/maple_tree.h>
#include <linux/list.h>
+#include "lru_cache.h"
/*
* A cache entry. This is meant to be embedded in a structure of a user of
@@ -50,11 +52,6 @@ struct btrfs_lru_cache {
#define btrfs_lru_cache_for_each_entry_safe(cache, entry, tmp) \
list_for_each_entry_safe_reverse((entry), (tmp), &(cache)->lru_list, lru_list)
-static inline unsigned int btrfs_lru_cache_size(const struct btrfs_lru_cache *cache)
-{
- return cache->size;
-}
-
static inline struct btrfs_lru_cache_entry *btrfs_lru_cache_lru_entry(
struct btrfs_lru_cache *cache)
{
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 3e5d3b7028..1c396ac167 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -130,17 +130,17 @@ static inline size_t read_compress_length(const char *buf)
*/
static int copy_compressed_data_to_page(char *compressed_data,
size_t compressed_size,
- struct page **out_pages,
- unsigned long max_nr_page,
+ struct folio **out_folios,
+ unsigned long max_nr_folio,
u32 *cur_out,
const u32 sectorsize)
{
u32 sector_bytes_left;
u32 orig_out;
- struct page *cur_page;
+ struct folio *cur_folio;
char *kaddr;
- if ((*cur_out / PAGE_SIZE) >= max_nr_page)
+ if ((*cur_out / PAGE_SIZE) >= max_nr_folio)
return -E2BIG;
/*
@@ -149,16 +149,16 @@ static int copy_compressed_data_to_page(char *compressed_data,
*/
ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize);
- cur_page = out_pages[*cur_out / PAGE_SIZE];
+ cur_folio = out_folios[*cur_out / PAGE_SIZE];
/* Allocate a new page */
- if (!cur_page) {
- cur_page = btrfs_alloc_compr_page();
- if (!cur_page)
+ if (!cur_folio) {
+ cur_folio = btrfs_alloc_compr_folio();
+ if (!cur_folio)
return -ENOMEM;
- out_pages[*cur_out / PAGE_SIZE] = cur_page;
+ out_folios[*cur_out / PAGE_SIZE] = cur_folio;
}
- kaddr = kmap_local_page(cur_page);
+ kaddr = kmap_local_folio(cur_folio, 0);
write_compress_length(kaddr + offset_in_page(*cur_out),
compressed_size);
*cur_out += LZO_LEN;
@@ -172,18 +172,18 @@ static int copy_compressed_data_to_page(char *compressed_data,
kunmap_local(kaddr);
- if ((*cur_out / PAGE_SIZE) >= max_nr_page)
+ if ((*cur_out / PAGE_SIZE) >= max_nr_folio)
return -E2BIG;
- cur_page = out_pages[*cur_out / PAGE_SIZE];
+ cur_folio = out_folios[*cur_out / PAGE_SIZE];
/* Allocate a new page */
- if (!cur_page) {
- cur_page = btrfs_alloc_compr_page();
- if (!cur_page)
+ if (!cur_folio) {
+ cur_folio = btrfs_alloc_compr_folio();
+ if (!cur_folio)
return -ENOMEM;
- out_pages[*cur_out / PAGE_SIZE] = cur_page;
+ out_folios[*cur_out / PAGE_SIZE] = cur_folio;
}
- kaddr = kmap_local_page(cur_page);
+ kaddr = kmap_local_folio(cur_folio, 0);
memcpy(kaddr + offset_in_page(*cur_out),
compressed_data + *cur_out - orig_out, copy_len);
@@ -209,15 +209,15 @@ out:
return 0;
}
-int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
- unsigned long *total_in, unsigned long *total_out)
+int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct folio **folios, unsigned long *out_folios,
+ unsigned long *total_in, unsigned long *total_out)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
const u32 sectorsize = inode_to_fs_info(mapping->host)->sectorsize;
- struct page *page_in = NULL;
+ struct folio *folio_in = NULL;
char *sizes_ptr;
- const unsigned long max_nr_page = *out_pages;
+ const unsigned long max_nr_folio = *out_folios;
int ret = 0;
/* Points to the file offset of input data */
u64 cur_in = start;
@@ -225,8 +225,8 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
u32 cur_out = 0;
u32 len = *total_out;
- ASSERT(max_nr_page > 0);
- *out_pages = 0;
+ ASSERT(max_nr_folio > 0);
+ *out_folios = 0;
*total_out = 0;
*total_in = 0;
@@ -243,15 +243,16 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
size_t out_len;
/* Get the input page first */
- if (!page_in) {
- page_in = find_get_page(mapping, cur_in >> PAGE_SHIFT);
- ASSERT(page_in);
+ if (!folio_in) {
+ ret = btrfs_compress_filemap_get_folio(mapping, cur_in, &folio_in);
+ if (ret < 0)
+ goto out;
}
/* Compress at most one sector of data each time */
in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off);
ASSERT(in_len);
- data_in = kmap_local_page(page_in);
+ data_in = kmap_local_folio(folio_in, 0);
ret = lzo1x_1_compress(data_in +
offset_in_page(cur_in), in_len,
workspace->cbuf, &out_len,
@@ -264,7 +265,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
}
ret = copy_compressed_data_to_page(workspace->cbuf, out_len,
- pages, max_nr_page,
+ folios, max_nr_folio,
&cur_out, sectorsize);
if (ret < 0)
goto out;
@@ -282,13 +283,13 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
/* Check if we have reached page boundary */
if (PAGE_ALIGNED(cur_in)) {
- put_page(page_in);
- page_in = NULL;
+ folio_put(folio_in);
+ folio_in = NULL;
}
}
/* Store the size of all chunks of compressed data */
- sizes_ptr = kmap_local_page(pages[0]);
+ sizes_ptr = kmap_local_folio(folios[0], 0);
write_compress_length(sizes_ptr, cur_out);
kunmap_local(sizes_ptr);
@@ -296,9 +297,9 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
*total_out = cur_out;
*total_in = cur_in - start;
out:
- if (page_in)
- put_page(page_in);
- *out_pages = DIV_ROUND_UP(cur_out, PAGE_SIZE);
+ if (folio_in)
+ folio_put(folio_in);
+ *out_folios = DIV_ROUND_UP(cur_out, PAGE_SIZE);
return ret;
}
@@ -313,15 +314,15 @@ static void copy_compressed_segment(struct compressed_bio *cb,
u32 orig_in = *cur_in;
while (*cur_in < orig_in + len) {
- struct page *cur_page;
+ struct folio *cur_folio;
u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in),
orig_in + len - *cur_in);
ASSERT(copy_len);
- cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE];
+ cur_folio = cb->compressed_folios[*cur_in / PAGE_SIZE];
- memcpy_from_page(dest + *cur_in - orig_in, cur_page,
- offset_in_page(*cur_in), copy_len);
+ memcpy_from_folio(dest + *cur_in - orig_in, cur_folio,
+ offset_in_folio(cur_folio, *cur_in), copy_len);
*cur_in += copy_len;
}
@@ -341,7 +342,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
/* Bytes decompressed so far */
u32 cur_out = 0;
- kaddr = kmap_local_page(cb->compressed_pages[0]);
+ kaddr = kmap_local_folio(cb->compressed_folios[0], 0);
len_in = read_compress_length(kaddr);
kunmap_local(kaddr);
cur_in += LZO_LEN;
@@ -363,7 +364,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
/* Go through each lzo segment */
while (cur_in < len_in) {
- struct page *cur_page;
+ struct folio *cur_folio;
/* Length of the compressed segment */
u32 seg_len;
u32 sector_bytes_left;
@@ -375,9 +376,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
*/
ASSERT(cur_in / sectorsize ==
(cur_in + LZO_LEN - 1) / sectorsize);
- cur_page = cb->compressed_pages[cur_in / PAGE_SIZE];
- ASSERT(cur_page);
- kaddr = kmap_local_page(cur_page);
+ cur_folio = cb->compressed_folios[cur_in / PAGE_SIZE];
+ ASSERT(cur_folio);
+ kaddr = kmap_local_folio(cur_folio, 0);
seg_len = read_compress_length(kaddr + offset_in_page(cur_in));
kunmap_local(kaddr);
cur_in += LZO_LEN;
diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c
index cdada48658..210d9c82e2 100644
--- a/fs/btrfs/messages.c
+++ b/fs/btrfs/messages.c
@@ -3,13 +3,11 @@
#include "fs.h"
#include "messages.h"
#include "discard.h"
-#include "transaction.h"
-#include "space-info.h"
#include "super.h"
#ifdef CONFIG_PRINTK
-#define STATE_STRING_PREFACE ": state "
+#define STATE_STRING_PREFACE " state "
#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT + 1)
/*
diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h
index 40f2d9f1a1..dde4904aea 100644
--- a/fs/btrfs/misc.h
+++ b/fs/btrfs/misc.h
@@ -3,6 +3,8 @@
#ifndef BTRFS_MISC_H
#define BTRFS_MISC_H
+#include <linux/types.h>
+#include <linux/bitmap.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/math64.h>
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 81f67ebf74..35a413ce93 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -19,7 +19,6 @@
#include "qgroup.h"
#include "subpage.h"
#include "file.h"
-#include "super.h"
static struct kmem_cache *btrfs_ordered_extent_cache;
@@ -295,6 +294,12 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
spin_unlock_irq(&inode->ordered_tree_lock);
}
+void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered)
+{
+ if (!test_and_set_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
+ mapping_set_error(ordered->inode->i_mapping, -EIO);
+}
+
static void finish_ordered_fn(struct btrfs_work *work)
{
struct btrfs_ordered_extent *ordered_extent;
@@ -333,7 +338,7 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
if (WARN_ON_ONCE(len > ordered->bytes_left)) {
btrfs_crit(fs_info,
"bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%llu left=%llu",
- inode->root->root_key.objectid, btrfs_ino(inode),
+ btrfs_root_id(inode->root), btrfs_ino(inode),
ordered->file_offset, ordered->num_bytes,
len, ordered->bytes_left);
ordered->bytes_left = 0;
@@ -383,6 +388,37 @@ bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate);
spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
+ /*
+ * If this is a COW write it means we created new extent maps for the
+ * range and they point to unwritten locations if we got an error either
+ * before submitting a bio or during IO.
+ *
+ * We have marked the ordered extent with BTRFS_ORDERED_IOERR, and we
+ * are queuing its completion below. During completion, at
+ * btrfs_finish_one_ordered(), we will drop the extent maps for the
+ * unwritten extents.
+ *
+ * However because completion runs in a work queue we can end up having
+ * a fast fsync running before that. In the case of direct IO, once we
+ * unlock the inode the fsync might start, and we queue the completion
+ * before unlocking the inode. In the case of buffered IO when writeback
+ * finishes (end_bbio_data_write()) we queue the completion, so if the
+ * writeback was triggered by a fast fsync, the fsync might start
+ * logging before ordered extent completion runs in the work queue.
+ *
+ * The fast fsync will log file extent items based on the extent maps it
+ * finds, so if by the time it collects extent maps the ordered extent
+ * completion didn't happen yet, it will log file extent items that
+ * point to unwritten extents, resulting in a corruption if a crash
+ * happens and the log tree is replayed. Note that a fast fsync does not
+ * wait for completion of ordered extents in order to reduce latency.
+ *
+ * Set a flag in the inode so that the next fast fsync will wait for
+ * ordered extents to complete before starting to log.
+ */
+ if (!uptodate && !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
+ set_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags);
+
if (ret)
btrfs_queue_ordered_fn(ordered);
return ret;
@@ -1237,10 +1273,7 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
int __init ordered_data_init(void)
{
- btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
- sizeof(struct btrfs_ordered_extent), 0,
- SLAB_MEM_SPREAD,
- NULL);
+ btrfs_ordered_extent_cache = KMEM_CACHE(btrfs_ordered_extent, 0);
if (!btrfs_ordered_extent_cache)
return -ENOMEM;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 127ef8bf0f..b6f6c6b917 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -6,6 +6,21 @@
#ifndef BTRFS_ORDERED_DATA_H
#define BTRFS_ORDERED_DATA_H
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/refcount.h>
+#include <linux/completion.h>
+#include <linux/rbtree.h>
+#include <linux/wait.h>
+#include "async-thread.h"
+
+struct inode;
+struct page;
+struct extent_state;
+struct btrfs_inode;
+struct btrfs_root;
+struct btrfs_fs_info;
+
struct btrfs_ordered_sum {
/*
 	 * Logical start address and length of the blocks covered by
@@ -188,6 +203,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
struct extent_state **cached_state);
struct btrfs_ordered_extent *btrfs_split_ordered_extent(
struct btrfs_ordered_extent *ordered, u64 len);
+void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered);
int __init ordered_data_init(void);
void __cold ordered_data_exit(void);
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index 7a1b021b56..6195a2215b 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -4,7 +4,6 @@
*/
#include "ctree.h"
-#include "disk-io.h"
#include "orphan.h"
int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/orphan.h b/fs/btrfs/orphan.h
index 3faab5cbb5..aa54a88a60 100644
--- a/fs/btrfs/orphan.h
+++ b/fs/btrfs/orphan.h
@@ -3,6 +3,11 @@
#ifndef BTRFS_ORPHAN_H
#define BTRFS_ORPHAN_H
+#include <linux/types.h>
+
+struct btrfs_trans_handle;
+struct btrfs_root;
+
int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 offset);
int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 7e46aa8a04..b876156fcd 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -14,7 +14,7 @@
struct root_name_map {
u64 id;
- char name[16];
+ const char *name;
};
static const struct root_name_map root_map[] = {
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
index c42bc666d5..8504bf1702 100644
--- a/fs/btrfs/print-tree.h
+++ b/fs/btrfs/print-tree.h
@@ -9,6 +9,9 @@
/* Buffer size to contain tree name and possibly additional data (offset) */
#define BTRFS_ROOT_NAME_BUF_LEN 48
+struct extent_buffer;
+struct btrfs_key;
+
void btrfs_print_leaf(const struct extent_buffer *l);
void btrfs_print_tree(const struct extent_buffer *c, bool follow);
const char *btrfs_root_name(const struct btrfs_key *key, char *buf);
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index ac4a0af2b5..155570e20f 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -4,6 +4,7 @@
*/
#include <linux/hashtable.h>
+#include <linux/xattr.h>
#include "messages.h"
#include "props.h"
#include "btrfs_inode.h"
@@ -267,7 +268,7 @@ static void inode_prop_iterator(void *ctx,
btrfs_warn(root->fs_info,
"error applying prop %s to ino %llu (root %llu): %d",
handler->xattr_name, btrfs_ino(BTRFS_I(inode)),
- root->root_key.objectid, ret);
+ btrfs_root_id(root), ret);
else
set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags);
}
diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h
index 6e283196e3..f60cd89feb 100644
--- a/fs/btrfs/props.h
+++ b/fs/btrfs/props.h
@@ -6,7 +6,12 @@
#ifndef BTRFS_PROPS_H
#define BTRFS_PROPS_H
-#include "ctree.h"
+#include <linux/compiler_types.h>
+
+struct inode;
+struct btrfs_inode;
+struct btrfs_path;
+struct btrfs_trans_handle;
int __init btrfs_props_init(void);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index cacc12d0ff..29d6ca3b87 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -468,6 +468,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
}
if (!qgroup) {
struct btrfs_qgroup *prealloc;
+ struct btrfs_root *tree_root = fs_info->tree_root;
prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL);
if (!prealloc) {
@@ -475,6 +476,25 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
goto out;
}
qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset);
+ /*
+ * If a qgroup exists for a subvolume ID, it is possible
+ * that subvolume has been deleted, in which case
+ * re-using that ID would lead to incorrect accounting.
+ *
+ * Ensure that we skip any such subvol ids.
+ *
+ * We don't need to lock because this is only called
+ * during mount before we start doing things like creating
+ * subvolumes.
+ */
+ if (is_fstree(qgroup->qgroupid) &&
+ qgroup->qgroupid > tree_root->free_objectid)
+ /*
+ * Don't need to check against BTRFS_LAST_FREE_OBJECTID,
+ * as it will get checked on the next call to
+ * btrfs_get_free_objectid.
+ */
+ tree_root->free_objectid = qgroup->qgroupid + 1;
}
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
if (ret < 0)
@@ -1324,14 +1344,14 @@ static int flush_reservations(struct btrfs_fs_info *fs_info)
trans = btrfs_join_transaction(fs_info->tree_root);
if (IS_ERR(trans))
return PTR_ERR(trans);
- btrfs_commit_transaction(trans);
+ ret = btrfs_commit_transaction(trans);
return ret;
}
int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
{
- struct btrfs_root *quota_root;
+ struct btrfs_root *quota_root = NULL;
struct btrfs_trans_handle *trans = NULL;
int ret = 0;
@@ -1426,12 +1446,14 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
btrfs_tree_lock(quota_root->node);
btrfs_clear_buffer_dirty(trans, quota_root->node);
btrfs_tree_unlock(quota_root->node);
- btrfs_free_tree_block(trans, btrfs_root_id(quota_root),
- quota_root->node, 0, 1);
+ ret = btrfs_free_tree_block(trans, btrfs_root_id(quota_root),
+ quota_root->node, 0, 1);
- btrfs_put_root(quota_root);
+ if (ret < 0)
+ btrfs_abort_transaction(trans, ret);
out:
+ btrfs_put_root(quota_root);
mutex_unlock(&fs_info->qgroup_ioctl_lock);
if (ret && trans)
btrfs_end_transaction(trans);
@@ -1536,18 +1558,15 @@ static int quick_update_accounting(struct btrfs_fs_info *fs_info,
{
struct btrfs_qgroup *qgroup;
int ret = 1;
- int err = 0;
qgroup = find_qgroup_rb(fs_info, src);
if (!qgroup)
goto out;
if (qgroup->excl == qgroup->rfer) {
- ret = 0;
- err = __qgroup_excl_accounting(fs_info, dst, qgroup, sign);
- if (err < 0) {
- ret = err;
+ ret = __qgroup_excl_accounting(fs_info, dst, qgroup, sign);
+ if (ret < 0)
goto out;
- }
+ ret = 0;
}
out:
if (ret)
@@ -2500,8 +2519,8 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *eb = root_eb;
struct btrfs_path *path = NULL;
- BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL);
- BUG_ON(root_eb == NULL);
+ ASSERT(0 <= root_level && root_level < BTRFS_MAX_LEVEL);
+ ASSERT(root_eb != NULL);
if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
@@ -2856,8 +2875,6 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
if (nr_old_roots == 0 && nr_new_roots == 0)
goto out_free;
- BUG_ON(!fs_info->quota_root);
-
trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr,
num_bytes, nr_old_roots, nr_new_roots);
@@ -3047,8 +3064,6 @@ int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup_inherit *inherit,
size_t size)
{
- if (!btrfs_qgroup_enabled(fs_info))
- return 0;
if (inherit->flags & ~BTRFS_QGROUP_INHERIT_FLAGS_SUPP)
return -EOPNOTSUPP;
if (size < sizeof(*inherit) || size > PAGE_SIZE)
@@ -3066,13 +3081,18 @@ int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info,
if (inherit->num_ref_copies > 0 || inherit->num_excl_copies > 0)
return -EINVAL;
- if (inherit->num_qgroups > PAGE_SIZE)
- return -EINVAL;
-
if (size != struct_size(inherit, qgroups, inherit->num_qgroups))
return -EINVAL;
/*
+ * Skip the inherit source qgroups check if qgroup is not enabled.
+ * Qgroup can still be later enabled causing problems, but in that case
+ * btrfs_qgroup_inherit() would just ignore those invalid ones.
+ */
+ if (!btrfs_qgroup_enabled(fs_info))
+ return 0;
+
+ /*
* Now check all the remaining qgroups, they should all:
*
* - Exist
@@ -3131,13 +3151,69 @@ static int qgroup_auto_inherit(struct btrfs_fs_info *fs_info,
qgids = res->qgroups;
list_for_each_entry(qg_list, &inode_qg->groups, next_group)
- qgids[i] = qg_list->group->qgroupid;
+ qgids[i++] = qg_list->group->qgroupid;
*inherit = res;
return 0;
}
/*
+ * Check if we can skip rescan when inheriting qgroups. If @src has a single
+ * @parent, and that @parent is owning all its bytes exclusively, we can skip
+ * the full rescan, by just adding nodesize to the @parent's excl/rfer.
+ *
+ * Return <0 for fatal errors (like srcid/parentid has no qgroup).
+ * Return 0 if a quick inherit is done.
+ * Return >0 if a quick inherit is not possible, and a full rescan is needed.
+ */
+static int qgroup_snapshot_quick_inherit(struct btrfs_fs_info *fs_info,
+ u64 srcid, u64 parentid)
+{
+ struct btrfs_qgroup *src;
+ struct btrfs_qgroup *parent;
+ struct btrfs_qgroup_list *list;
+ int nr_parents = 0;
+
+ src = find_qgroup_rb(fs_info, srcid);
+ if (!src)
+ return -ENOENT;
+ parent = find_qgroup_rb(fs_info, parentid);
+ if (!parent)
+ return -ENOENT;
+
+ /*
+ * Source has no parent qgroup, but our new qgroup would have one.
+ * Qgroup numbers would become inconsistent.
+ */
+ if (list_empty(&src->groups))
+ return 1;
+
+ list_for_each_entry(list, &src->groups, next_group) {
+ /* The parent is not the same, quick update is not possible. */
+ if (list->group->qgroupid != parentid)
+ return 1;
+ nr_parents++;
+ /*
+ * More than one parent qgroup, we can't be sure about accounting
+ * consistency.
+ */
+ if (nr_parents > 1)
+ return 1;
+ }
+
+ /*
+ * The parent is not exclusively owning all its bytes. We're not sure
+ * if the source has any bytes not fully owned by the parent.
+ */
+ if (parent->excl != parent->rfer)
+ return 1;
+
+ parent->excl += fs_info->nodesize;
+ parent->rfer += fs_info->nodesize;
+ return 0;
+}
+
+/*
* Copy the accounting information between qgroups. This is necessary
* when a snapshot or a subvolume is created. Throwing an error will
* cause a transaction abort so we take extra care here to only error
@@ -3305,6 +3381,13 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
qgroup_dirty(fs_info, dstgroup);
qgroup_dirty(fs_info, srcgroup);
+
+ /*
+ * If the source qgroup has parent but the new one doesn't,
+ * we need a full rescan.
+ */
+ if (!inherit && !list_empty(&srcgroup->groups))
+ need_rescan = true;
}
if (!inherit)
@@ -3319,14 +3402,16 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
if (ret)
goto unlock;
}
+ if (srcid) {
+ /* Check if we can do a quick inherit. */
+ ret = qgroup_snapshot_quick_inherit(fs_info, srcid, *i_qgroups);
+ if (ret < 0)
+ goto unlock;
+ if (ret > 0)
+ need_rescan = true;
+ ret = 0;
+ }
++i_qgroups;
-
- /*
- * If we're doing a snapshot, and adding the snapshot to a new
- * qgroup, the numbers are guaranteed to be incorrect.
- */
- if (srcid)
- need_rescan = true;
}
for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) {
@@ -3408,7 +3493,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
{
struct btrfs_qgroup *qgroup;
struct btrfs_fs_info *fs_info = root->fs_info;
- u64 ref_root = root->root_key.objectid;
+ u64 ref_root = btrfs_root_id(root);
int ret = 0;
LIST_HEAD(qgroup_list);
@@ -3643,7 +3728,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
qgroup_rescan_work);
struct btrfs_path *path;
struct btrfs_trans_handle *trans = NULL;
- int err = -ENOMEM;
int ret = 0;
bool stopped = false;
bool did_leaf_rescans = false;
@@ -3652,8 +3736,10 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
return;
path = btrfs_alloc_path();
- if (!path)
+ if (!path) {
+ ret = -ENOMEM;
goto out;
+ }
/*
* Rescan should only search for commit root, and any later difference
* should be recorded by qgroup
@@ -3661,18 +3747,17 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
path->search_commit_root = 1;
path->skip_locking = 1;
- err = 0;
- while (!err && !(stopped = rescan_should_stop(fs_info))) {
+ while (!ret && !(stopped = rescan_should_stop(fs_info))) {
trans = btrfs_start_transaction(fs_info->fs_root, 0);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
break;
}
- err = qgroup_rescan_leaf(trans, path);
+ ret = qgroup_rescan_leaf(trans, path);
did_leaf_rescans = true;
- if (err > 0)
+ if (ret > 0)
btrfs_commit_transaction(trans);
else
btrfs_end_transaction(trans);
@@ -3682,10 +3767,10 @@ out:
btrfs_free_path(path);
mutex_lock(&fs_info->qgroup_rescan_lock);
- if (err > 0 &&
+ if (ret > 0 &&
fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
- } else if (err < 0 || stopped) {
+ } else if (ret < 0 || stopped) {
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
}
mutex_unlock(&fs_info->qgroup_rescan_lock);
@@ -3700,11 +3785,11 @@ out:
if (did_leaf_rescans) {
trans = btrfs_start_transaction(fs_info->quota_root, 1);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
trans = NULL;
btrfs_err(fs_info,
"fail to start transaction for status update: %d",
- err);
+ ret);
}
} else {
trans = NULL;
@@ -3715,11 +3800,11 @@ out:
fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN)
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
if (trans) {
- ret = update_qgroup_status_item(trans);
- if (ret < 0) {
- err = ret;
- btrfs_err(fs_info, "fail to update qgroup status: %d",
- err);
+ int ret2 = update_qgroup_status_item(trans);
+
+ if (ret2 < 0) {
+ ret = ret2;
+ btrfs_err(fs_info, "fail to update qgroup status: %d", ret);
}
}
fs_info->qgroup_rescan_running = false;
@@ -3736,11 +3821,11 @@ out:
btrfs_info(fs_info, "qgroup scan paused");
} else if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) {
btrfs_info(fs_info, "qgroup scan cancelled");
- } else if (err >= 0) {
+ } else if (ret >= 0) {
btrfs_info(fs_info, "qgroup scan completed%s",
- err > 0 ? " (inconsistency flag cleared)" : "");
+ ret > 0 ? " (inconsistency flag cleared)" : "");
} else {
- btrfs_err(fs_info, "qgroup scan failed with %d", err);
+ btrfs_err(fs_info, "qgroup scan failed with %d", ret);
}
}
@@ -3763,14 +3848,14 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
/* we're resuming qgroup rescan at mount time */
if (!(fs_info->qgroup_flags &
BTRFS_QGROUP_STATUS_FLAG_RESCAN)) {
- btrfs_warn(fs_info,
+ btrfs_debug(fs_info,
"qgroup rescan init failed, qgroup rescan is not queued");
ret = -EINVAL;
} else if (!(fs_info->qgroup_flags &
BTRFS_QGROUP_STATUS_FLAG_ON)) {
- btrfs_warn(fs_info,
+ btrfs_debug(fs_info,
"qgroup rescan init failed, qgroup is not enabled");
- ret = -EINVAL;
+ ret = -ENOTCONN;
}
if (ret)
@@ -3781,14 +3866,12 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
if (init_flags) {
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
- btrfs_warn(fs_info,
- "qgroup rescan is already in progress");
ret = -EINPROGRESS;
} else if (!(fs_info->qgroup_flags &
BTRFS_QGROUP_STATUS_FLAG_ON)) {
- btrfs_warn(fs_info,
+ btrfs_debug(fs_info,
"qgroup rescan init failed, qgroup is not enabled");
- ret = -EINVAL;
+ ret = -ENOTCONN;
} else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) {
/* Quota disable is in progress */
ret = -EBUSY;
@@ -4049,7 +4132,7 @@ static int qgroup_reserve_data(struct btrfs_inode *inode,
int ret;
if (btrfs_qgroup_mode(root->fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
- !is_fstree(root->root_key.objectid) || len == 0)
+ !is_fstree(btrfs_root_id(root)) || len == 0)
return 0;
/* @reserved parameter is mandatory for qgroup */
@@ -4165,7 +4248,7 @@ static int qgroup_free_reserved_data(struct btrfs_inode *inode,
goto out;
freed += changeset.bytes_changed;
}
- btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed,
+ btrfs_qgroup_free_refroot(root->fs_info, btrfs_root_id(root), freed,
BTRFS_QGROUP_RSV_DATA);
if (freed_ret)
*freed_ret = freed;
@@ -4206,7 +4289,7 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
changeset.bytes_changed, trace_op);
if (free)
btrfs_qgroup_free_refroot(inode->root->fs_info,
- inode->root->root_key.objectid,
+ btrfs_root_id(inode->root),
changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
if (released)
*released = changeset.bytes_changed;
@@ -4301,7 +4384,7 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
int ret;
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
- !is_fstree(root->root_key.objectid) || num_bytes == 0)
+ !is_fstree(btrfs_root_id(root)) || num_bytes == 0)
return 0;
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
@@ -4346,13 +4429,13 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
struct btrfs_fs_info *fs_info = root->fs_info;
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
- !is_fstree(root->root_key.objectid))
+ !is_fstree(btrfs_root_id(root)))
return;
/* TODO: Update trace point to handle such free */
trace_qgroup_meta_free_all_pertrans(root);
/* Special value -1 means to free all reserved space */
- btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1,
+ btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), (u64)-1,
BTRFS_QGROUP_RSV_META_PERTRANS);
}
@@ -4362,7 +4445,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
struct btrfs_fs_info *fs_info = root->fs_info;
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
- !is_fstree(root->root_key.objectid))
+ !is_fstree(btrfs_root_id(root)))
return;
/*
@@ -4373,8 +4456,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
num_bytes = sub_root_meta_rsv(root, num_bytes, type);
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
trace_qgroup_meta_reserve(root, -(s64)num_bytes, type);
- btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid,
- num_bytes, type);
+ btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), num_bytes, type);
}
static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
@@ -4422,13 +4504,13 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
struct btrfs_fs_info *fs_info = root->fs_info;
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
- !is_fstree(root->root_key.objectid))
+ !is_fstree(btrfs_root_id(root)))
return;
/* Same as btrfs_qgroup_free_meta_prealloc() */
num_bytes = sub_root_meta_rsv(root, num_bytes,
BTRFS_QGROUP_RSV_META_PREALLOC);
trace_qgroup_meta_convert(root, num_bytes);
- qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes);
+ qgroup_convert_meta(fs_info, btrfs_root_id(root), num_bytes);
if (!sb_rdonly(fs_info->sb))
add_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS);
}
@@ -4457,7 +4539,7 @@ void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
btrfs_ino(inode), unode->val, unode->aux);
}
btrfs_qgroup_free_refroot(inode->root->fs_info,
- inode->root->root_key.objectid,
+ btrfs_root_id(inode->root),
changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
}
@@ -4643,7 +4725,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
- if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
+ if (!is_fstree(btrfs_root_id(root)) || !root->reloc_root)
return 0;
spin_lock(&blocks->lock);
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 45a7d8920d..706640be0e 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -6,12 +6,22 @@
#ifndef BTRFS_QGROUP_H
#define BTRFS_QGROUP_H
+#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/rbtree.h>
#include <linux/kobject.h>
-#include "ulist.h"
-#include "delayed-ref.h"
-#include "misc.h"
+#include <linux/list.h>
+#include <uapi/linux/btrfs_tree.h>
+
+struct extent_buffer;
+struct extent_changeset;
+struct btrfs_delayed_extent_op;
+struct btrfs_fs_info;
+struct btrfs_root;
+struct btrfs_ioctl_quota_ctl_args;
+struct btrfs_trans_handle;
+struct btrfs_delayed_ref_root;
+struct btrfs_inode;
/*
* Btrfs qgroup overview
@@ -321,7 +331,6 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
struct btrfs_qgroup_limit *limit);
int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
-struct btrfs_delayed_extent_op;
int btrfs_qgroup_trace_extent_nolock(
struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index 9589362acf..6af6b4b9a3 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -11,7 +11,6 @@
#include "disk-io.h"
#include "raid-stripe-tree.h"
#include "volumes.h"
-#include "misc.h"
#include "print-tree.h"
int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length)
diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
index cdb58b38fc..c9c258f849 100644
--- a/fs/btrfs/raid-stripe-tree.h
+++ b/fs/btrfs/raid-stripe-tree.h
@@ -6,6 +6,10 @@
#ifndef BTRFS_RAID_STRIPE_TREE_H
#define BTRFS_RAID_STRIPE_TREE_H
+#include <linux/types.h>
+#include <uapi/linux/btrfs_tree.h>
+#include "fs.h"
+
#define BTRFS_RST_SUPP_BLOCK_GROUP_MASK (BTRFS_BLOCK_GROUP_DUP | \
BTRFS_BLOCK_GROUP_RAID1_MASK | \
BTRFS_BLOCK_GROUP_RAID0 | \
@@ -13,6 +17,7 @@
struct btrfs_io_context;
struct btrfs_io_stripe;
+struct btrfs_fs_info;
struct btrfs_ordered_extent;
struct btrfs_trans_handle;
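The header hunks in this series all follow the same cleanup pattern: include only the generic kernel headers the file itself needs and forward-declare btrfs structs that are used purely through pointers, instead of pulling in whole btrfs headers. A hypothetical header trimmed the same way, as an illustration only:

	/* example.h -- hypothetical, shows the include/forward-declare pattern */
	#ifndef BTRFS_EXAMPLE_H
	#define BTRFS_EXAMPLE_H

	#include <linux/types.h>	/* u64 below */

	struct btrfs_fs_info;		/* pointer use only, no include needed */
	struct btrfs_trans_handle;

	int btrfs_example_run(struct btrfs_fs_info *fs_info,
			      struct btrfs_trans_handle *trans, u64 bytenr);

	#endif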
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 792c8e17c3..831fac45e7 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -14,7 +14,6 @@
#include <linux/raid/xor.h>
#include <linux/mm.h>
#include "messages.h"
-#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "volumes.h"
@@ -332,12 +331,11 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
static void merge_rbio(struct btrfs_raid_bio *dest,
struct btrfs_raid_bio *victim)
{
- bio_list_merge(&dest->bio_list, &victim->bio_list);
+ bio_list_merge_init(&dest->bio_list, &victim->bio_list);
dest->bio_list_bytes += victim->bio_list_bytes;
/* Also inherit the bitmaps from @victim. */
bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
dest->stripe_nsectors);
- bio_list_init(&victim->bio_list);
}
/*
@@ -918,6 +916,13 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
*/
ASSERT(stripe_nsectors <= BITS_PER_LONG);
+ /*
+ * Real stripes must be between 2 (2 disks RAID5, aka RAID1) and 256
+ * (limited by u8).
+ */
+ ASSERT(real_stripes >= 2);
+ ASSERT(real_stripes <= U8_MAX);
+
rbio = kzalloc(sizeof(*rbio), GFP_NOFS);
if (!rbio)
return ERR_PTR(-ENOMEM);
@@ -955,6 +960,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);
+ ASSERT(rbio->nr_data > 0);
return rbio;
}
@@ -1181,6 +1187,26 @@ static inline void bio_list_put(struct bio_list *bio_list)
bio_put(bio);
}
+static void assert_rbio(struct btrfs_raid_bio *rbio)
+{
+ if (!IS_ENABLED(CONFIG_BTRFS_DEBUG) ||
+ !IS_ENABLED(CONFIG_BTRFS_ASSERT))
+ return;
+
+ /*
+ * At least two stripes (2 disks RAID5), and since real_stripes is U8,
+ * we won't go beyond 256 disks anyway.
+ */
+ ASSERT(rbio->real_stripes >= 2);
+ ASSERT(rbio->nr_data > 0);
+
+ /*
+	 * This is another check to make sure the number of data stripes is
+	 * smaller than the total number of stripes.
+ */
+ ASSERT(rbio->nr_data < rbio->real_stripes);
+}
+
/* Generate PQ for one vertical stripe. */
static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
{
@@ -1212,6 +1238,7 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
pointers[stripe++] = kmap_local_page(sector->page) +
sector->pgoff;
+ assert_rbio(rbio);
raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
pointers);
} else {
@@ -2473,6 +2500,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
}
if (has_qstripe) {
+ assert_rbio(rbio);
/* RAID6, call the library function to fill in our P/Q */
raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
pointers);
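In merge_rbio() above, the bio_list_merge() + bio_list_init() pair on the victim is collapsed into a single bio_list_merge_init() call. The helper is assumed to behave roughly like this sketch, so the victim's list is always left empty rather than pointing at bios it no longer owns:

	/* Sketch of the assumed helper semantics (block layer bio_list API). */
	static inline void bio_list_merge_init(struct bio_list *dst,
					       struct bio_list *src)
	{
		bio_list_merge(dst, src);	/* move every bio from src to dst */
		bio_list_init(src);		/* reset src so it is safely empty */
	}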
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 4702136888..0d7b4c2fb6 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -7,9 +7,18 @@
#ifndef BTRFS_RAID56_H
#define BTRFS_RAID56_H
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/bio.h>
+#include <linux/refcount.h>
#include <linux/workqueue.h>
#include "volumes.h"
+struct page;
+struct sector_ptr;
+struct btrfs_fs_info;
+
enum btrfs_rbio_ops {
BTRFS_RBIO_WRITE,
BTRFS_RBIO_READ_REBUILD,
diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h
index 5c2b66d155..1c2d7cb1fe 100644
--- a/fs/btrfs/rcu-string.h
+++ b/fs/btrfs/rcu-string.h
@@ -6,6 +6,12 @@
#ifndef BTRFS_RCU_STRING_H
#define BTRFS_RCU_STRING_H
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/rcupdate.h>
+#include <linux/printk.h>
+
struct rcu_string {
struct rcu_head rcu;
char str[];
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 8c4fc98ca9..9522a8b79d 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -441,7 +441,8 @@ static int process_extent_item(struct btrfs_fs_info *fs_info,
u32 item_size = btrfs_item_size(leaf, slot);
unsigned long end, ptr;
u64 offset, flags, count;
- int type, ret;
+ int type;
+ int ret = 0;
ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
flags = btrfs_extent_flags(leaf, ei);
@@ -486,7 +487,11 @@ static int process_extent_item(struct btrfs_fs_info *fs_info,
key->objectid, key->offset);
break;
case BTRFS_EXTENT_OWNER_REF_KEY:
- WARN_ON(!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
+ if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)) {
+ btrfs_err(fs_info,
+ "found extent owner ref without simple quotas enabled");
+ ret = -EINVAL;
+ }
break;
default:
btrfs_err(fs_info, "invalid key type in iref");
@@ -673,7 +678,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
int ret = 0;
bool metadata;
u64 bytenr = generic_ref->bytenr;
- u64 num_bytes = generic_ref->len;
+ u64 num_bytes = generic_ref->num_bytes;
u64 parent = generic_ref->parent;
u64 ref_root = 0;
u64 owner = 0;
@@ -684,11 +689,11 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
if (generic_ref->type == BTRFS_REF_METADATA) {
if (!parent)
- ref_root = generic_ref->tree_ref.ref_root;
+ ref_root = generic_ref->ref_root;
owner = generic_ref->tree_ref.level;
} else if (!parent) {
- ref_root = generic_ref->data_ref.ref_root;
- owner = generic_ref->data_ref.ino;
+ ref_root = generic_ref->ref_root;
+ owner = generic_ref->data_ref.objectid;
offset = generic_ref->data_ref.offset;
}
metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h
index 855de37719..3511e1a5c9 100644
--- a/fs/btrfs/ref-verify.h
+++ b/fs/btrfs/ref-verify.h
@@ -6,7 +6,16 @@
#ifndef BTRFS_REF_VERIFY_H
#define BTRFS_REF_VERIFY_H
+#include <linux/types.h>
+#include <linux/rbtree_types.h>
+
+struct btrfs_fs_info;
+struct btrfs_ref;
+
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+
+#include <linux/spinlock.h>
+
int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info);
void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info);
int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 08d0fb46ce..d0a3fcecc4 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -616,35 +616,6 @@ out:
return ret;
}
-static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
- struct inode *inode2, u64 loff2, u64 len)
-{
- unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1, NULL);
- unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1, NULL);
-}
-
-static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
- struct inode *inode2, u64 loff2, u64 len)
-{
- u64 range1_end = loff1 + len - 1;
- u64 range2_end = loff2 + len - 1;
-
- if (inode1 < inode2) {
- swap(inode1, inode2);
- swap(loff1, loff2);
- swap(range1_end, range2_end);
- } else if (inode1 == inode2 && loff2 < loff1) {
- swap(loff1, loff2);
- swap(range1_end, range2_end);
- }
-
- lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end, NULL);
- lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end, NULL);
-
- btrfs_assert_inode_range_clean(BTRFS_I(inode1), loff1, range1_end);
- btrfs_assert_inode_range_clean(BTRFS_I(inode2), loff2, range2_end);
-}
-
static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2)
{
if (inode1 < inode2)
@@ -662,17 +633,21 @@ static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2)
static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
struct inode *dst, u64 dst_loff)
{
+ const u64 end = dst_loff + len - 1;
+ struct extent_state *cached_state = NULL;
struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info;
const u64 bs = fs_info->sectorsize;
int ret;
/*
- * Lock destination range to serialize with concurrent readahead() and
- * source range to serialize with relocation.
+ * Lock destination range to serialize with concurrent readahead(), and
+ * we are safe from concurrency with relocation of source extents
+ * because we have already locked the inode's i_mmap_lock in exclusive
+ * mode.
*/
- btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
+ lock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state);
ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
- btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
+ unlock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state);
btrfs_btree_balance_dirty(fs_info);
@@ -690,7 +665,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
if (root_dst->send_in_progress) {
btrfs_warn_rl(root_dst->fs_info,
"cannot deduplicate to root %llu while send operations are using it (%d in progress)",
- root_dst->root_key.objectid,
+ btrfs_root_id(root_dst),
root_dst->send_in_progress);
spin_unlock(&root_dst->root_item_lock);
return -EAGAIN;
@@ -724,6 +699,7 @@ out:
static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
u64 off, u64 olen, u64 destoff)
{
+ struct extent_state *cached_state = NULL;
struct inode *inode = file_inode(file);
struct inode *src = file_inode(file_src);
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
@@ -731,6 +707,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
int wb_ret;
u64 len = olen;
u64 bs = fs_info->sectorsize;
+ u64 end;
/*
* VFS's generic_remap_file_range_prep() protects us from cloning the
@@ -763,12 +740,15 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
}
/*
- * Lock destination range to serialize with concurrent readahead() and
- * source range to serialize with relocation.
+ * Lock destination range to serialize with concurrent readahead(), and
+ * we are safe from concurrency with relocation of source extents
+ * because we have already locked the inode's i_mmap_lock in exclusive
+ * mode.
*/
- btrfs_double_extent_lock(src, off, inode, destoff, len);
+ end = destoff + len - 1;
+ lock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state);
ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
- btrfs_double_extent_unlock(src, off, inode, destoff, len);
+ unlock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state);
/*
* We may have copied an inline extent into a page of the destination
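Both the dedupe and clone paths above now lock only the destination extent range; serialization against relocation of the source extents comes from the i_mmap_lock that the remap entry point already holds in exclusive mode (taken via btrfs_double_mmap_lock() earlier in this file). A reduced sketch of the resulting locking shape, with hypothetical local names and error handling omitted:

	btrfs_double_mmap_lock(src_inode, dst_inode);	/* excludes relocation */
	end = destoff + len - 1;
	lock_extent(&BTRFS_I(dst_inode)->io_tree, destoff, end, &cached_state);
	ret = btrfs_clone(src_inode, dst_inode, off, olen, len, destoff, 0);
	unlock_extent(&BTRFS_I(dst_inode)->io_tree, destoff, end, &cached_state);
	btrfs_double_mmap_unlock(src_inode, dst_inode);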
diff --git a/fs/btrfs/reflink.h b/fs/btrfs/reflink.h
index ecb309b4da..1e291f7d85 100644
--- a/fs/btrfs/reflink.h
+++ b/fs/btrfs/reflink.h
@@ -3,7 +3,9 @@
#ifndef BTRFS_REFLINK_H
#define BTRFS_REFLINK_H
-#include <linux/fs.h>
+#include <linux/types.h>
+
+struct file;
loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 2fca67f2b3..f2935252b9 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -473,20 +473,19 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
struct btrfs_backref_node *node = NULL;
struct btrfs_backref_edge *edge;
int ret;
- int err = 0;
iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info);
if (!iter)
return ERR_PTR(-ENOMEM);
path = btrfs_alloc_path();
if (!path) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
node = btrfs_backref_alloc_node(cache, bytenr, level);
if (!node) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
@@ -497,10 +496,9 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
do {
ret = btrfs_backref_add_tree_node(trans, cache, path, iter,
node_key, cur);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out;
- }
+
edge = list_first_entry_or_null(&cache->pending_edge,
struct btrfs_backref_edge, list[UPPER]);
/*
@@ -515,19 +513,18 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
/* Finish the upper linkage of newly added edges/nodes */
ret = btrfs_backref_finish_upper_links(cache, node);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out;
- }
if (handle_useless_nodes(rc, node))
node = NULL;
out:
- btrfs_backref_iter_free(iter);
+ btrfs_free_path(iter->path);
+ kfree(iter);
btrfs_free_path(path);
- if (err) {
+ if (ret) {
btrfs_backref_error_cleanup(cache, node);
- return ERR_PTR(err);
+ return ERR_PTR(ret);
}
ASSERT(!node || !node->detached);
ASSERT(list_empty(&cache->useless_node) &&
@@ -753,7 +750,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
root_key.type = BTRFS_ROOT_ITEM_KEY;
root_key.offset = objectid;
- if (root->root_key.objectid == objectid) {
+ if (btrfs_root_id(root) == objectid) {
u64 commit_root_gen;
/* called by btrfs_init_reloc_root */
@@ -797,7 +794,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
btrfs_set_root_level(root_item, btrfs_header_level(eb));
btrfs_set_root_generation(root_item, trans->transid);
- if (root->root_key.objectid == objectid) {
+ if (btrfs_root_id(root) == objectid) {
btrfs_set_root_refs(root_item, 0);
memset(&root_item->drop_progress, 0,
sizeof(struct btrfs_disk_key));
@@ -820,7 +817,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
goto abort;
}
set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
- reloc_root->last_trans = trans->transid;
+ btrfs_set_root_last_trans(reloc_root, trans->transid);
return reloc_root;
fail:
kfree(root_item);
@@ -867,7 +864,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
*/
if (root->reloc_root) {
reloc_root = root->reloc_root;
- reloc_root->last_trans = trans->transid;
+ btrfs_set_root_last_trans(reloc_root, trans->transid);
return 0;
}
@@ -875,8 +872,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
* We are merging reloc roots, we do not need new reloc trees. Also
* reloc trees never need their own reloc tree.
*/
- if (!rc->create_reloc_tree ||
- root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+ if (!rc->create_reloc_tree || btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
return 0;
if (!trans->reloc_reserved) {
@@ -884,7 +880,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
trans->block_rsv = rc->block_rsv;
clear_rsv = 1;
}
- reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
+ reloc_root = create_reloc_root(trans, root, btrfs_root_id(root));
if (clear_rsv)
trans->block_rsv = rsv;
if (IS_ERR(reloc_root))
@@ -951,60 +947,6 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
}
/*
- * helper to find first cached inode with inode number >= objectid
- * in a subvolume
- */
-static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
-{
- struct rb_node *node;
- struct rb_node *prev;
- struct btrfs_inode *entry;
- struct inode *inode;
-
- spin_lock(&root->inode_lock);
-again:
- node = root->inode_tree.rb_node;
- prev = NULL;
- while (node) {
- prev = node;
- entry = rb_entry(node, struct btrfs_inode, rb_node);
-
- if (objectid < btrfs_ino(entry))
- node = node->rb_left;
- else if (objectid > btrfs_ino(entry))
- node = node->rb_right;
- else
- break;
- }
- if (!node) {
- while (prev) {
- entry = rb_entry(prev, struct btrfs_inode, rb_node);
- if (objectid <= btrfs_ino(entry)) {
- node = prev;
- break;
- }
- prev = rb_next(prev);
- }
- }
- while (node) {
- entry = rb_entry(node, struct btrfs_inode, rb_node);
- inode = igrab(&entry->vfs_inode);
- if (inode) {
- spin_unlock(&root->inode_lock);
- return inode;
- }
-
- objectid = btrfs_ino(entry) + 1;
- if (cond_resched_lock(&root->inode_lock))
- goto again;
-
- node = rb_next(node);
- }
- spin_unlock(&root->inode_lock);
- return NULL;
-}
-
-/*
* get new location of data
*/
static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
@@ -1064,7 +1006,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
struct btrfs_file_extent_item *fi;
- struct inode *inode = NULL;
+ struct btrfs_inode *inode = NULL;
u64 parent;
u64 bytenr;
u64 new_bytenr = 0;
@@ -1080,7 +1022,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
return 0;
/* reloc trees always use full backref */
- if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
parent = leaf->start;
else
parent = 0;
@@ -1109,15 +1051,15 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
* if we are modifying block in fs tree, wait for read_folio
* to complete and drop the extent cache
*/
- if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
+ if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) {
if (first) {
- inode = find_next_inode(root, key.objectid);
+ inode = btrfs_find_first_inode(root, key.objectid);
first = 0;
- } else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) {
- btrfs_add_delayed_iput(BTRFS_I(inode));
- inode = find_next_inode(root, key.objectid);
+ } else if (inode && btrfs_ino(inode) < key.objectid) {
+ btrfs_add_delayed_iput(inode);
+ inode = btrfs_find_first_inode(root, key.objectid);
}
- if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) {
+ if (inode && btrfs_ino(inode) == key.objectid) {
struct extent_state *cached_state = NULL;
end = key.offset +
@@ -1126,16 +1068,20 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
fs_info->sectorsize));
WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
end--;
- ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
- key.offset, end,
- &cached_state);
- if (!ret)
+ /* Take mmap lock to serialize with reflinks. */
+ if (!down_read_trylock(&inode->i_mmap_lock))
+ continue;
+ ret = try_lock_extent(&inode->io_tree, key.offset,
+ end, &cached_state);
+ if (!ret) {
+ up_read(&inode->i_mmap_lock);
continue;
+ }
- btrfs_drop_extent_map_range(BTRFS_I(inode),
- key.offset, end, true);
- unlock_extent(&BTRFS_I(inode)->io_tree,
- key.offset, end, &cached_state);
+ btrfs_drop_extent_map_range(inode, key.offset, end, true);
+ unlock_extent(&inode->io_tree, key.offset, end,
+ &cached_state);
+ up_read(&inode->i_mmap_lock);
}
}
@@ -1153,22 +1099,28 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
dirty = 1;
key.offset -= btrfs_file_extent_offset(leaf, fi);
- btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
- num_bytes, parent, root->root_key.objectid);
- btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
- key.objectid, key.offset,
- root->root_key.objectid, false);
+ ref.action = BTRFS_ADD_DELAYED_REF;
+ ref.bytenr = new_bytenr;
+ ref.num_bytes = num_bytes;
+ ref.parent = parent;
+ ref.owning_root = btrfs_root_id(root);
+ ref.ref_root = btrfs_header_owner(leaf);
+ btrfs_init_data_ref(&ref, key.objectid, key.offset,
+ btrfs_root_id(root), false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
- btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
- num_bytes, parent, root->root_key.objectid);
- btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
- key.objectid, key.offset,
- root->root_key.objectid, false);
+ ref.action = BTRFS_DROP_DELAYED_REF;
+ ref.bytenr = bytenr;
+ ref.num_bytes = num_bytes;
+ ref.parent = parent;
+ ref.owning_root = btrfs_root_id(root);
+ ref.ref_root = btrfs_header_owner(leaf);
+ btrfs_init_data_ref(&ref, key.objectid, key.offset,
+ btrfs_root_id(root), false);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1178,7 +1130,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
if (dirty)
btrfs_mark_buffer_dirty(trans, leaf);
if (inode)
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ btrfs_add_delayed_iput(inode);
return ret;
}
@@ -1224,8 +1176,8 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
int ret;
int slot;
- ASSERT(src->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
- ASSERT(dest->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+ ASSERT(btrfs_root_id(src) == BTRFS_TREE_RELOC_OBJECTID);
+ ASSERT(btrfs_root_id(dest) != BTRFS_TREE_RELOC_OBJECTID);
last_snapshot = btrfs_root_last_snapshot(&src->root_item);
again:
@@ -1377,20 +1329,26 @@ again:
path->slots[level], old_ptr_gen);
btrfs_mark_buffer_dirty(trans, path->nodes[level]);
- btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
- blocksize, path->nodes[level]->start,
- src->root_key.objectid);
- btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
- 0, true);
+ ref.action = BTRFS_ADD_DELAYED_REF;
+ ref.bytenr = old_bytenr;
+ ref.num_bytes = blocksize;
+ ref.parent = path->nodes[level]->start;
+ ref.owning_root = btrfs_root_id(src);
+ ref.ref_root = btrfs_root_id(src);
+ btrfs_init_tree_ref(&ref, level - 1, 0, true);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
- btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
- blocksize, 0, dest->root_key.objectid);
- btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0,
- true);
+
+ ref.action = BTRFS_ADD_DELAYED_REF;
+ ref.bytenr = new_bytenr;
+ ref.num_bytes = blocksize;
+ ref.parent = 0;
+ ref.owning_root = btrfs_root_id(dest);
+ ref.ref_root = btrfs_root_id(dest);
+ btrfs_init_tree_ref(&ref, level - 1, 0, true);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1398,10 +1356,13 @@ again:
}
/* We don't know the real owning_root, use 0. */
- btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
- blocksize, path->nodes[level]->start, 0);
- btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
- 0, true);
+ ref.action = BTRFS_DROP_DELAYED_REF;
+ ref.bytenr = new_bytenr;
+ ref.num_bytes = blocksize;
+ ref.parent = path->nodes[level]->start;
+ ref.owning_root = 0;
+ ref.ref_root = btrfs_root_id(src);
+ btrfs_init_tree_ref(&ref, level - 1, 0, true);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1409,10 +1370,13 @@ again:
}
/* We don't know the real owning_root, use 0. */
- btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
- blocksize, 0, 0);
- btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid,
- 0, true);
+ ref.action = BTRFS_DROP_DELAYED_REF;
+ ref.bytenr = old_bytenr;
+ ref.num_bytes = blocksize;
+ ref.parent = 0;
+ ref.owning_root = 0;
+ ref.ref_root = btrfs_root_id(dest);
+ btrfs_init_tree_ref(&ref, level - 1, 0, true);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1520,7 +1484,7 @@ static int invalidate_extent_cache(struct btrfs_root *root,
const struct btrfs_key *max_key)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct inode *inode = NULL;
+ struct btrfs_inode *inode = NULL;
u64 objectid;
u64 start, end;
u64 ino;
@@ -1530,23 +1494,24 @@ static int invalidate_extent_cache(struct btrfs_root *root,
struct extent_state *cached_state = NULL;
cond_resched();
- iput(inode);
+ if (inode)
+ iput(&inode->vfs_inode);
if (objectid > max_key->objectid)
break;
- inode = find_next_inode(root, objectid);
+ inode = btrfs_find_first_inode(root, objectid);
if (!inode)
break;
- ino = btrfs_ino(BTRFS_I(inode));
+ ino = btrfs_ino(inode);
if (ino > max_key->objectid) {
- iput(inode);
+ iput(&inode->vfs_inode);
break;
}
objectid = ino + 1;
- if (!S_ISREG(inode->i_mode))
+ if (!S_ISREG(inode->vfs_inode.i_mode))
continue;
if (unlikely(min_key->objectid == ino)) {
@@ -1579,9 +1544,9 @@ static int invalidate_extent_cache(struct btrfs_root *root,
}
/* the lock_extent waits for read_folio to complete */
- lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
- btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, true);
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
+ lock_extent(&inode->io_tree, start, end, &cached_state);
+ btrfs_drop_extent_map_range(inode, start, end, true);
+ unlock_extent(&inode->io_tree, start, end, &cached_state);
}
return 0;
}
@@ -1616,7 +1581,7 @@ static int insert_dirty_subvol(struct btrfs_trans_handle *trans,
int ret;
/* @root must be a subvolume tree root with a valid reloc tree */
- ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+ ASSERT(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID);
ASSERT(reloc_root);
reloc_root_item = &reloc_root->root_item;
@@ -1645,7 +1610,7 @@ static int clean_dirty_subvols(struct reloc_control *rc)
list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
reloc_dirty_list) {
- if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
+ if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) {
/* Merged subvolume, cleanup its reloc root */
struct btrfs_root *reloc_root = root->reloc_root;
@@ -1774,7 +1739,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
* btrfs_update_reloc_root() and update our root item
* appropriately.
*/
- reloc_root->last_trans = trans->transid;
+ btrfs_set_root_last_trans(reloc_root, trans->transid);
trans->block_rsv = rc->block_rsv;
replaced = 0;
@@ -1920,13 +1885,13 @@ again:
if (root->reloc_root) {
btrfs_err(fs_info,
"reloc tree mismatch, root %lld has reloc root key (%lld %u %llu) gen %llu, expect reloc root key (%lld %u %llu) gen %llu",
- root->root_key.objectid,
- root->reloc_root->root_key.objectid,
+ btrfs_root_id(root),
+ btrfs_root_id(root->reloc_root),
root->reloc_root->root_key.type,
root->reloc_root->root_key.offset,
btrfs_root_generation(
&root->reloc_root->root_item),
- reloc_root->root_key.objectid,
+ btrfs_root_id(reloc_root),
reloc_root->root_key.type,
reloc_root->root_key.offset,
btrfs_root_generation(
@@ -1934,8 +1899,8 @@ again:
} else {
btrfs_err(fs_info,
"reloc tree mismatch, root %lld has no reloc root, expect reloc root key (%lld %u %llu) gen %llu",
- root->root_key.objectid,
- reloc_root->root_key.objectid,
+ btrfs_root_id(root),
+ btrfs_root_id(reloc_root),
reloc_root->root_key.type,
reloc_root->root_key.offset,
btrfs_root_generation(
@@ -2117,7 +2082,7 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root;
int ret;
- if (reloc_root->last_trans == trans->transid)
+ if (btrfs_get_root_last_trans(reloc_root) == trans->transid)
return 0;
root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false);
@@ -2192,7 +2157,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
return ERR_PTR(-EUCLEAN);
}
- if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
ret = record_reloc_root_in_trans(trans, root);
if (ret)
return ERR_PTR(ret);
@@ -2299,7 +2264,7 @@ struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
return root;
- if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
+ if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID)
fs_root = root;
if (next != node)
@@ -2315,9 +2280,8 @@ struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
return fs_root;
}
-static noinline_for_stack
-u64 calcu_metadata_size(struct reloc_control *rc,
- struct btrfs_backref_node *node, int reserve)
+static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc,
+ struct btrfs_backref_node *node)
{
struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
struct btrfs_backref_node *next = node;
@@ -2326,12 +2290,12 @@ u64 calcu_metadata_size(struct reloc_control *rc,
u64 num_bytes = 0;
int index = 0;
- BUG_ON(reserve && node->processed);
+ BUG_ON(node->processed);
while (next) {
cond_resched();
while (1) {
- if (next->processed && (reserve || next != node))
+ if (next->processed)
break;
num_bytes += fs_info->nodesize;
@@ -2359,7 +2323,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
int ret;
u64 tmp;
- num_bytes = calcu_metadata_size(rc, node, 1) * 2;
+ num_bytes = calcu_metadata_size(rc, node) * 2;
trans->block_rsv = rc->block_rsv;
rc->reserved_bytes += num_bytes;
@@ -2422,8 +2386,6 @@ static int do_relocation(struct btrfs_trans_handle *trans,
path->lowest_level = node->level + 1;
rc->backref_cache.path[node->level] = node;
list_for_each_entry(edge, &node->upper, list[LOWER]) {
- struct btrfs_ref ref = { 0 };
-
cond_resched();
upper = edge->node[UPPER];
@@ -2511,19 +2473,23 @@ static int do_relocation(struct btrfs_trans_handle *trans,
*/
ASSERT(node->eb == eb);
} else {
+ struct btrfs_ref ref = {
+ .action = BTRFS_ADD_DELAYED_REF,
+ .bytenr = node->eb->start,
+ .num_bytes = blocksize,
+ .parent = upper->eb->start,
+ .owning_root = btrfs_header_owner(upper->eb),
+ .ref_root = btrfs_header_owner(upper->eb),
+ };
+
btrfs_set_node_blockptr(upper->eb, slot,
node->eb->start);
btrfs_set_node_ptr_generation(upper->eb, slot,
trans->transid);
btrfs_mark_buffer_dirty(trans, upper->eb);
- btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
- node->eb->start, blocksize,
- upper->eb->start,
- btrfs_header_owner(upper->eb));
btrfs_init_tree_ref(&ref, node->level,
- btrfs_header_owner(upper->eb),
- root->root_key.objectid, false);
+ btrfs_root_id(root), false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (!ret)
ret = btrfs_drop_subtree(trans, root, eb,
@@ -2775,12 +2741,11 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
struct tree_block *block;
struct tree_block *next;
- int ret;
- int err = 0;
+ int ret = 0;
path = btrfs_alloc_path();
if (!path) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out_free_blocks;
}
@@ -2795,8 +2760,8 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
/* Get first keys */
rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
if (!block->key_ready) {
- err = get_tree_block_key(fs_info, block);
- if (err)
+ ret = get_tree_block_key(fs_info, block);
+ if (ret)
goto out_free_path;
}
}
@@ -2806,25 +2771,23 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
node = build_backref_tree(trans, rc, &block->key,
block->level, block->bytenr);
if (IS_ERR(node)) {
- err = PTR_ERR(node);
+ ret = PTR_ERR(node);
goto out;
}
ret = relocate_tree_block(trans, rc, node, &block->key,
path);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
break;
- }
}
out:
- err = finish_pending_nodes(trans, rc, path, err);
+ ret = finish_pending_nodes(trans, rc, path, ret);
out_free_path:
btrfs_free_path(path);
out_free_blocks:
free_block_list(blocks);
- return err;
+ return ret;
}
static noinline_for_stack int prealloc_file_extent_cluster(
@@ -2849,7 +2812,7 @@ static noinline_for_stack int prealloc_file_extent_cluster(
* btrfs_do_readpage() call of previously relocated file cluster.
*
* If the current cluster starts in the above range, btrfs_do_readpage()
- * will skip the read, and relocate_one_page() will later writeback
+	 * will skip the read, and relocate_one_folio() will later write back
* the padding zeros as new data, causing data corruption.
*
* Here we have to manually invalidate the range (i_size, PAGE_END + 1).
@@ -2858,7 +2821,7 @@ static noinline_for_stack int prealloc_file_extent_cluster(
struct address_space *mapping = inode->vfs_inode.i_mapping;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
const u32 sectorsize = fs_info->sectorsize;
- struct page *page;
+ struct folio *folio;
ASSERT(sectorsize < PAGE_SIZE);
ASSERT(IS_ALIGNED(i_size, sectorsize));
@@ -2889,16 +2852,16 @@ static noinline_for_stack int prealloc_file_extent_cluster(
clear_extent_bits(&inode->io_tree, i_size,
round_up(i_size, PAGE_SIZE) - 1,
EXTENT_UPTODATE);
- page = find_lock_page(mapping, i_size >> PAGE_SHIFT);
+ folio = filemap_lock_folio(mapping, i_size >> PAGE_SHIFT);
/*
* If page is freed we don't need to do anything then, as we
* will re-read the whole page anyway.
*/
- if (page) {
- btrfs_subpage_clear_uptodate(fs_info, page_folio(page), i_size,
+ if (!IS_ERR(folio)) {
+ btrfs_subpage_clear_uptodate(fs_info, folio, i_size,
round_up(i_size, PAGE_SIZE) - i_size);
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
}
}
@@ -2983,68 +2946,71 @@ static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster,
return cluster->boundary[cluster_nr + 1] - 1;
}
-static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
- const struct file_extent_cluster *cluster,
- int *cluster_nr, unsigned long page_index)
+static int relocate_one_folio(struct inode *inode, struct file_ra_state *ra,
+ const struct file_extent_cluster *cluster,
+ int *cluster_nr, unsigned long index)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
u64 offset = BTRFS_I(inode)->index_cnt;
const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT;
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
- struct page *page;
- u64 page_start;
- u64 page_end;
+ struct folio *folio;
+ u64 folio_start;
+ u64 folio_end;
u64 cur;
int ret;
- ASSERT(page_index <= last_index);
- page = find_lock_page(inode->i_mapping, page_index);
- if (!page) {
+ ASSERT(index <= last_index);
+ folio = filemap_lock_folio(inode->i_mapping, index);
+ if (IS_ERR(folio)) {
page_cache_sync_readahead(inode->i_mapping, ra, NULL,
- page_index, last_index + 1 - page_index);
- page = find_or_create_page(inode->i_mapping, page_index, mask);
- if (!page)
- return -ENOMEM;
+ index, last_index + 1 - index);
+ folio = __filemap_get_folio(inode->i_mapping, index,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
}
- if (PageReadahead(page))
+ WARN_ON(folio_order(folio));
+
+ if (folio_test_readahead(folio))
page_cache_async_readahead(inode->i_mapping, ra, NULL,
- page_folio(page), page_index,
- last_index + 1 - page_index);
+ folio, index,
+ last_index + 1 - index);
- if (!PageUptodate(page)) {
- btrfs_read_folio(NULL, page_folio(page));
- lock_page(page);
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
+ btrfs_read_folio(NULL, folio);
+ folio_lock(folio);
+ if (!folio_test_uptodate(folio)) {
ret = -EIO;
- goto release_page;
+ goto release_folio;
}
}
/*
- * We could have lost page private when we dropped the lock to read the
- * page above, make sure we set_page_extent_mapped here so we have any
+ * We could have lost folio private when we dropped the lock to read the
+	 * folio above, make sure we set_folio_extent_mapped here so we have any
* of the subpage blocksize stuff we need in place.
*/
- ret = set_page_extent_mapped(page);
+ ret = set_folio_extent_mapped(folio);
if (ret < 0)
- goto release_page;
+ goto release_folio;
- page_start = page_offset(page);
- page_end = page_start + PAGE_SIZE - 1;
+ folio_start = folio_pos(folio);
+ folio_end = folio_start + PAGE_SIZE - 1;
/*
* Start from the cluster, as for subpage case, the cluster can start
- * inside the page.
+ * inside the folio.
*/
- cur = max(page_start, cluster->boundary[*cluster_nr] - offset);
- while (cur <= page_end) {
+ cur = max(folio_start, cluster->boundary[*cluster_nr] - offset);
+ while (cur <= folio_end) {
struct extent_state *cached_state = NULL;
u64 extent_start = cluster->boundary[*cluster_nr] - offset;
u64 extent_end = get_cluster_boundary_end(cluster,
*cluster_nr) - offset;
- u64 clamped_start = max(page_start, extent_start);
- u64 clamped_end = min(page_end, extent_end);
+ u64 clamped_start = max(folio_start, extent_start);
+ u64 clamped_end = min(folio_end, extent_end);
u32 clamped_len = clamped_end + 1 - clamped_start;
/* Reserve metadata for this range */
@@ -3052,7 +3018,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
clamped_len, clamped_len,
false);
if (ret)
- goto release_page;
+ goto release_folio;
/* Mark the range delalloc and dirty for later writeback */
lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end,
@@ -3068,20 +3034,18 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
clamped_len, true);
btrfs_delalloc_release_extents(BTRFS_I(inode),
clamped_len);
- goto release_page;
+ goto release_folio;
}
- btrfs_folio_set_dirty(fs_info, page_folio(page),
- clamped_start, clamped_len);
+ btrfs_folio_set_dirty(fs_info, folio, clamped_start, clamped_len);
/*
- * Set the boundary if it's inside the page.
+ * Set the boundary if it's inside the folio.
* Data relocation requires the destination extents to have the
* same size as the source.
* EXTENT_BOUNDARY bit prevents current extent from being merged
* with previous extent.
*/
- if (in_range(cluster->boundary[*cluster_nr] - offset,
- page_start, PAGE_SIZE)) {
+ if (in_range(cluster->boundary[*cluster_nr] - offset, folio_start, PAGE_SIZE)) {
u64 boundary_start = cluster->boundary[*cluster_nr] -
offset;
u64 boundary_end = boundary_start +
@@ -3104,8 +3068,8 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
break;
}
}
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
balance_dirty_pages_ratelimited(inode->i_mapping);
btrfs_throttle(fs_info);
@@ -3113,9 +3077,9 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
ret = -ECANCELED;
return ret;
-release_page:
- unlock_page(page);
- put_page(page);
+release_folio:
+ folio_unlock(folio);
+ folio_put(folio);
return ret;
}
@@ -3150,7 +3114,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
last_index = (cluster->end - offset) >> PAGE_SHIFT;
for (index = (cluster->start - offset) >> PAGE_SHIFT;
index <= last_index && !ret; index++)
- ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index);
+ ret = relocate_one_folio(inode, ra, cluster, &cluster_nr, index);
if (ret == 0)
WARN_ON(cluster_nr != cluster->nr);
out:
@@ -3927,7 +3891,7 @@ static noinline_for_stack struct inode *create_reloc_inode(
struct btrfs_trans_handle *trans;
struct btrfs_root *root;
u64 objectid;
- int err = 0;
+ int ret = 0;
root = btrfs_grab_root(fs_info->data_reloc_root);
trans = btrfs_start_transaction(root, 6);
@@ -3936,31 +3900,31 @@ static noinline_for_stack struct inode *create_reloc_inode(
return ERR_CAST(trans);
}
- err = btrfs_get_free_objectid(root, &objectid);
- if (err)
+ ret = btrfs_get_free_objectid(root, &objectid);
+ if (ret)
goto out;
- err = __insert_orphan_inode(trans, root, objectid);
- if (err)
+ ret = __insert_orphan_inode(trans, root, objectid);
+ if (ret)
goto out;
inode = btrfs_iget(fs_info->sb, objectid, root);
if (IS_ERR(inode)) {
delete_orphan_inode(trans, root, objectid);
- err = PTR_ERR(inode);
+ ret = PTR_ERR(inode);
inode = NULL;
goto out;
}
BTRFS_I(inode)->index_cnt = group->start;
- err = btrfs_orphan_add(trans, BTRFS_I(inode));
+ ret = btrfs_orphan_add(trans, BTRFS_I(inode));
out:
btrfs_put_root(root);
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
- if (err) {
+ if (ret) {
iput(inode);
- inode = ERR_PTR(err);
+ inode = ERR_PTR(ret);
}
return inode;
}
@@ -4438,9 +4402,11 @@ int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered)
ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
disk_bytenr + ordered->num_bytes - 1,
- &list, 0, false);
- if (ret)
+ &list, false);
+ if (ret < 0) {
+ btrfs_mark_ordered_extent_error(ordered);
return ret;
+ }
while (!list_empty(&list)) {
struct btrfs_ordered_sum *sums =
@@ -4490,8 +4456,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
btrfs_root_last_snapshot(&root->root_item))
first_cow = 1;
- if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
- rc->create_reloc_tree) {
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID && rc->create_reloc_tree) {
WARN_ON(!first_cow && level == 0);
node = rc->backref_cache.path[level];
@@ -4584,8 +4549,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
}
new_root = pending->snap;
- reloc_root = create_reloc_root(trans, root->reloc_root,
- new_root->root_key.objectid);
+ reloc_root = create_reloc_root(trans, root->reloc_root, btrfs_root_id(new_root));
if (IS_ERR(reloc_root))
return PTR_ERR(reloc_root);
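The delayed-ref bookkeeping in replace_file_extents(), replace_path() and do_relocation() no longer goes through btrfs_init_generic_ref(): the generic part of struct btrfs_ref is filled in directly, and only the tree- or data-specific part still goes through btrfs_init_tree_ref()/btrfs_init_data_ref(). A condensed sketch of the new pattern, with placeholder values standing in for whatever the call site reads from the leaf:

	/* Placeholder values; real call sites derive these from the leaf. */
	struct btrfs_ref ref = {
		.action      = BTRFS_ADD_DELAYED_REF,
		.bytenr      = new_bytenr,
		.num_bytes   = num_bytes,
		.parent      = parent,
		.owning_root = btrfs_root_id(root),
		.ref_root    = btrfs_header_owner(leaf),
	};

	btrfs_init_data_ref(&ref, key.objectid, key.offset, btrfs_root_id(root), false);
	ret = btrfs_inc_extent_ref(trans, &ref);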
diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h
index 5fb60f2deb..788c86d863 100644
--- a/fs/btrfs/relocation.h
+++ b/fs/btrfs/relocation.h
@@ -3,6 +3,15 @@
#ifndef BTRFS_RELOCATION_H
#define BTRFS_RELOCATION_H
+#include <linux/types.h>
+
+struct extent_buffer;
+struct btrfs_fs_info;
+struct btrfs_root;
+struct btrfs_trans_handle;
+struct btrfs_ordered_extent;
+struct btrfs_pending_snapshot;
+
int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start);
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root);
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 5260677ad5..33962671a9 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -10,7 +10,6 @@
#include "messages.h"
#include "transaction.h"
#include "disk-io.h"
-#include "print-tree.h"
#include "qgroup.h"
#include "space-info.h"
#include "accessors.h"
@@ -82,7 +81,14 @@ int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key,
if (ret > 0)
goto out;
} else {
- BUG_ON(ret == 0); /* Logical error */
+ /*
+ * Key with offset -1 found, there would have to exist a root
+ * with such id, but this is out of the valid range.
+ */
+ if (ret == 0) {
+ ret = -EUCLEAN;
+ goto out;
+ }
if (path->slots[0] == 0)
goto out;
path->slots[0]--;
@@ -142,8 +148,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
if (ret > 0) {
btrfs_crit(fs_info,
"unable to find root key (%llu %u %llu) in tree %llu",
- key->objectid, key->type, key->offset,
- root->root_key.objectid);
+ key->objectid, key->type, key->offset, btrfs_root_id(root));
ret = -EUCLEAN;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -323,8 +328,11 @@ int btrfs_del_root(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, key, path, -1, 1);
if (ret < 0)
goto out;
-
- BUG_ON(ret != 0);
+ if (ret != 0) {
+ /* The root must exist but we did not find it by the key. */
+ ret = -EUCLEAN;
+ goto out;
+ }
ret = btrfs_del_item(trans, root, path);
out:
diff --git a/fs/btrfs/root-tree.h b/fs/btrfs/root-tree.h
index de0175cbdb..8f5739e732 100644
--- a/fs/btrfs/root-tree.h
+++ b/fs/btrfs/root-tree.h
@@ -3,7 +3,17 @@
#ifndef BTRFS_ROOT_TREE_H
#define BTRFS_ROOT_TREE_H
+#include <linux/types.h>
+
struct fscrypt_str;
+struct extent_buffer;
+struct btrfs_key;
+struct btrfs_root;
+struct btrfs_root_item;
+struct btrfs_path;
+struct btrfs_fs_info;
+struct btrfs_block_rsv;
+struct btrfs_trans_handle;
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv,
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index d89d5c0a8c..d7caa3732f 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1390,8 +1390,15 @@ static int find_first_extent_item(struct btrfs_root *extent_root,
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
return ret;
+ if (ret == 0) {
+ /*
+ * Key with offset -1 found, there would have to exist an extent
+ * item with such offset, but this is out of the valid range.
+ */
+ btrfs_release_path(path);
+ return -EUCLEAN;
+ }
- ASSERT(ret > 0);
/*
* Here we intentionally pass 0 as @min_objectid, as there could be
* an extent item starting before @search_start.
@@ -1681,20 +1688,24 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
(i << fs_info->sectorsize_bits);
int err;
- bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ,
- fs_info, scrub_read_endio, stripe);
- bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
-
io_stripe.is_scrub = true;
+ stripe_len = (nr_sectors - i) << fs_info->sectorsize_bits;
+ /*
+ * For RST cases, we need to manually split the bbio to
+ * follow the RST boundary.
+ */
err = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
- &stripe_len, &bioc, &io_stripe,
- &mirror);
+ &stripe_len, &bioc, &io_stripe, &mirror);
btrfs_put_bioc(bioc);
- if (err) {
- btrfs_bio_end_io(bbio,
- errno_to_blk_status(err));
- return;
+ if (err < 0) {
+ set_bit(i, &stripe->io_error_bitmap);
+ set_bit(i, &stripe->error_bitmap);
+ continue;
}
+
+ bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ,
+ fs_info, scrub_read_endio, stripe);
+ bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
}
__bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
@@ -2093,7 +2104,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
struct btrfs_fs_info *fs_info = sctx->fs_info;
const u64 logical_end = logical_start + logical_length;
u64 cur_logical = logical_start;
- int ret;
+ int ret = 0;
/* The range must be inside the bg */
ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
diff --git a/fs/btrfs/scrub.h b/fs/btrfs/scrub.h
index 7639103ebf..f0df597b75 100644
--- a/fs/btrfs/scrub.h
+++ b/fs/btrfs/scrub.h
@@ -3,6 +3,12 @@
#ifndef BTRFS_SCRUB_H
#define BTRFS_SCRUB_H
+#include <linux/types.h>
+
+struct btrfs_fs_info;
+struct btrfs_device;
+struct btrfs_scrub_progress;
+
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
u64 end, struct btrfs_scrub_progress *progress,
int readonly, int is_dev_replace);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index e818766915..3dd4a48479 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -25,7 +25,6 @@
#include "btrfs_inode.h"
#include "transaction.h"
#include "compression.h"
-#include "xattr.h"
#include "print-tree.h"
#include "accessors.h"
#include "dir-item.h"
@@ -393,9 +392,8 @@ static void inconsistent_snapshot_error(struct send_ctx *sctx,
btrfs_err(sctx->send_root->fs_info,
"Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu",
result_string, what, sctx->cmp_key->objectid,
- sctx->send_root->root_key.objectid,
- (sctx->parent_root ?
- sctx->parent_root->root_key.objectid : 0));
+ btrfs_root_id(sctx->send_root),
+ (sctx->parent_root ? btrfs_root_id(sctx->parent_root) : 0));
}
__maybe_unused
@@ -777,7 +775,12 @@ static int begin_cmd(struct send_ctx *sctx, int cmd)
if (WARN_ON(!sctx->send_buf))
return -EINVAL;
- BUG_ON(sctx->send_size);
+ if (unlikely(sctx->send_size != 0)) {
+ btrfs_err(sctx->send_root->fs_info,
+ "send: command header buffer not empty cmd %d offset %llu",
+ cmd, sctx->send_off);
+ return -EINVAL;
+ }
sctx->send_size += sizeof(*hdr);
hdr = (struct btrfs_cmd_header *)sctx->send_buf;
@@ -1312,9 +1315,9 @@ static int __clone_root_cmp_bsearch(const void *key, const void *elt)
u64 root = (u64)(uintptr_t)key;
const struct clone_root *cr = elt;
- if (root < cr->root->root_key.objectid)
+ if (root < btrfs_root_id(cr->root))
return -1;
- if (root > cr->root->root_key.objectid)
+ if (root > btrfs_root_id(cr->root))
return 1;
return 0;
}
@@ -1324,9 +1327,9 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2)
const struct clone_root *cr1 = e1;
const struct clone_root *cr2 = e2;
- if (cr1->root->root_key.objectid < cr2->root->root_key.objectid)
+ if (btrfs_root_id(cr1->root) < btrfs_root_id(cr2->root))
return -1;
- if (cr1->root->root_key.objectid > cr2->root->root_key.objectid)
+ if (btrfs_root_id(cr1->root) > btrfs_root_id(cr2->root))
return 1;
return 0;
}
@@ -1414,7 +1417,7 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx,
struct btrfs_lru_cache_entry *raw_entry;
struct backref_cache_entry *entry;
- if (btrfs_lru_cache_size(&sctx->backref_cache) == 0)
+ if (sctx->backref_cache.size == 0)
return false;
/*
@@ -1512,7 +1515,7 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids,
* transaction handle or holding fs_info->commit_root_sem, so no need
* to take any lock here.
*/
- if (btrfs_lru_cache_size(&sctx->backref_cache) == 1)
+ if (sctx->backref_cache.size == 1)
sctx->backref_cache_last_reloc_trans = fs_info->last_reloc_trans;
}
@@ -1774,7 +1777,7 @@ static int read_symlink(struct btrfs_root *root,
*/
btrfs_err(root->fs_info,
"Found empty symlink inode %llu at root %llu",
- ino, root->root_key.objectid);
+ ino, btrfs_root_id(root));
ret = -EIO;
goto out;
}
@@ -2528,7 +2531,7 @@ static int send_subvol_begin(struct send_ctx *sctx)
return -ENOMEM;
}
- key.objectid = send_root->root_key.objectid;
+ key.objectid = btrfs_root_id(send_root);
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = 0;
@@ -2544,7 +2547,7 @@ static int send_subvol_begin(struct send_ctx *sctx)
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.type != BTRFS_ROOT_BACKREF_KEY ||
- key.objectid != send_root->root_key.objectid) {
+ key.objectid != btrfs_root_id(send_root)) {
ret = -ENOENT;
goto out;
}
@@ -2817,8 +2820,7 @@ static int cache_dir_utimes(struct send_ctx *sctx, u64 dir, u64 gen)
static int trim_dir_utimes_cache(struct send_ctx *sctx)
{
- while (btrfs_lru_cache_size(&sctx->dir_utimes_cache) >
- SEND_MAX_DIR_UTIMES_CACHE_SIZE) {
+ while (sctx->dir_utimes_cache.size > SEND_MAX_DIR_UTIMES_CACHE_SIZE) {
struct btrfs_lru_cache_entry *lru;
int ret;
@@ -4190,7 +4192,13 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
* This should never happen as the root dir always has the same ref
* which is always '..'
*/
- BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
+ if (unlikely(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID)) {
+ btrfs_err(fs_info,
+ "send: unexpected inode %llu in process_recorded_refs()",
+ sctx->cur_ino);
+ ret = -EINVAL;
+ goto out;
+ }
valid_path = fs_path_alloc();
if (!valid_path) {
@@ -5265,10 +5273,11 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
{
struct btrfs_root *root = sctx->send_root;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct page *page;
+ struct folio *folio;
pgoff_t index = offset >> PAGE_SHIFT;
pgoff_t last_index;
unsigned pg_offset = offset_in_page(offset);
+ struct address_space *mapping = sctx->cur_inode->i_mapping;
int ret;
ret = put_data_header(sctx, len);
@@ -5281,44 +5290,44 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
unsigned cur_len = min_t(unsigned, len,
PAGE_SIZE - pg_offset);
- page = find_lock_page(sctx->cur_inode->i_mapping, index);
- if (!page) {
- page_cache_sync_readahead(sctx->cur_inode->i_mapping,
+ folio = filemap_lock_folio(mapping, index);
+ if (IS_ERR(folio)) {
+ page_cache_sync_readahead(mapping,
&sctx->ra, NULL, index,
last_index + 1 - index);
- page = find_or_create_page(sctx->cur_inode->i_mapping,
- index, GFP_KERNEL);
- if (!page) {
- ret = -ENOMEM;
+ folio = filemap_grab_folio(mapping, index);
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
break;
}
}
- if (PageReadahead(page))
- page_cache_async_readahead(sctx->cur_inode->i_mapping,
- &sctx->ra, NULL, page_folio(page),
+ WARN_ON(folio_order(folio));
+
+ if (folio_test_readahead(folio))
+ page_cache_async_readahead(mapping, &sctx->ra, NULL, folio,
index, last_index + 1 - index);
- if (!PageUptodate(page)) {
- btrfs_read_folio(NULL, page_folio(page));
- lock_page(page);
- if (!PageUptodate(page)) {
- unlock_page(page);
+ if (!folio_test_uptodate(folio)) {
+ btrfs_read_folio(NULL, folio);
+ folio_lock(folio);
+ if (!folio_test_uptodate(folio)) {
+ folio_unlock(folio);
btrfs_err(fs_info,
"send: IO error at offset %llu for inode %llu root %llu",
- page_offset(page), sctx->cur_ino,
- sctx->send_root->root_key.objectid);
- put_page(page);
+ folio_pos(folio), sctx->cur_ino,
+ btrfs_root_id(sctx->send_root));
+ folio_put(folio);
ret = -EIO;
break;
}
}
- memcpy_from_page(sctx->send_buf + sctx->send_size, page,
- pg_offset, cur_len);
- unlock_page(page);
- put_page(page);
+ memcpy_from_folio(sctx->send_buf + sctx->send_size, folio,
+ pg_offset, cur_len);
+ folio_unlock(folio);
+ folio_put(folio);
index++;
pg_offset = 0;
len -= cur_len;
@@ -5379,7 +5388,7 @@ static int send_clone(struct send_ctx *sctx,
btrfs_debug(sctx->send_root->fs_info,
"send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu",
- offset, len, clone_root->root->root_key.objectid,
+ offset, len, btrfs_root_id(clone_root->root),
clone_root->ino, clone_root->offset);
p = fs_path_alloc();
@@ -6466,21 +6475,18 @@ static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
if (sctx->cur_ino != key->objectid || !need_send_hole(sctx))
return 0;
- if (sctx->cur_inode_last_extent == (u64)-1) {
- ret = get_last_extent(sctx, key->offset - 1);
- if (ret)
- return ret;
- }
-
- if (path->slots[0] == 0 &&
- sctx->cur_inode_last_extent < key->offset) {
- /*
- * We might have skipped entire leafs that contained only
- * file extent items for our current inode. These leafs have
- * a generation number smaller (older) than the one in the
- * current leaf and the leaf our last extent came from, and
- * are located between these 2 leafs.
- */
+ /*
+ * Get last extent's end offset (exclusive) if we haven't determined it
+ * yet (we're processing the first file extent item that is new), or if
+ * we're at the first slot of a leaf and the last extent's end is less
+ * than the current extent's offset, because we might have skipped
+ * entire leaves that contained only file extent items for our current
+ * inode. These leaves have a generation number smaller (older) than the
+ * one in the current leaf and the leaf our last extent came from, and
+ * are located between these 2 leaves.
+ */
+ if ((sctx->cur_inode_last_extent == (u64)-1) ||
+ (path->slots[0] == 0 && sctx->cur_inode_last_extent < key->offset)) {
ret = get_last_extent(sctx, key->offset - 1);
if (ret)
return ret;
@@ -7331,7 +7337,7 @@ static int search_key_again(const struct send_ctx *sctx,
"send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d",
key->objectid, key->type, key->offset,
(root == sctx->parent_root ? "parent" : "send"),
- root->root_key.objectid, path->lowest_level,
+ btrfs_root_id(root), path->lowest_level,
path->slots[path->lowest_level]);
return -EUCLEAN;
}
@@ -7437,8 +7443,8 @@ static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen
u64 reada_done = 0;
lockdep_assert_held_read(&parent->fs_info->commit_root_sem);
+ ASSERT(*level != 0);
- BUG_ON(*level == 0);
eb = btrfs_read_node_slot(parent, slot);
if (IS_ERR(eb))
return PTR_ERR(eb);
@@ -8065,7 +8071,7 @@ static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
if (root->send_in_progress < 0)
btrfs_err(root->fs_info,
"send_in_progress unbalanced %d root %llu",
- root->send_in_progress, root->root_key.objectid);
+ root->send_in_progress, btrfs_root_id(root));
spin_unlock(&root->root_item_lock);
}
@@ -8073,7 +8079,7 @@ static void dedupe_in_progress_warn(const struct btrfs_root *root)
{
btrfs_warn_rl(root->fs_info,
"cannot use root %llu for send while deduplications on it are in progress (%d in progress)",
- root->root_key.objectid, root->dedupe_in_progress);
+ btrfs_root_id(root), root->dedupe_in_progress);
}
long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 4f5509cb18..dd1c9f02b0 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -8,6 +8,11 @@
#define BTRFS_SEND_H
#include <linux/types.h>
+#include <linux/sizes.h>
+#include <linux/align.h>
+
+struct inode;
+struct btrfs_ioctl_send_args;
#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
/* Conditional support for the upcoming protocol version. */
@@ -25,9 +30,6 @@
#define BTRFS_SEND_BUF_SIZE_V1 SZ_64K
#define BTRFS_SEND_BUF_SIZE_V2 ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE)
-struct inode;
-struct btrfs_ioctl_send_args;
-
enum btrfs_tlv_type {
BTRFS_TLV_U8,
BTRFS_TLV_U16,
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 3b54eb5834..8f194eefd3 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -9,7 +9,6 @@
#include "ordered-data.h"
#include "transaction.h"
#include "block-group.h"
-#include "zoned.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
@@ -312,7 +311,7 @@ void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
found->bytes_used += block_group->used;
found->disk_used += block_group->used * factor;
found->bytes_readonly += block_group->bytes_super;
- found->bytes_zone_unusable += block_group->zone_unusable;
+ btrfs_space_info_update_bytes_zone_unusable(info, found, block_group->zone_unusable);
if (block_group->length > 0)
found->full = 0;
btrfs_try_granting_tickets(info, found);
@@ -374,11 +373,18 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
* "optimal" chunk size based on the fs size. However when we actually
* allocate the chunk we will strip this down further, making it no more
* than 10% of the disk or 1G, whichever is smaller.
+ *
+ * On the zoned mode, we need to use zone_size (=
+ * data_sinfo->chunk_size) as it is.
*/
data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
- data_chunk_size = min(data_sinfo->chunk_size,
- mult_perc(fs_info->fs_devices->total_rw_bytes, 10));
- data_chunk_size = min_t(u64, data_chunk_size, SZ_1G);
+ if (!btrfs_is_zoned(fs_info)) {
+ data_chunk_size = min(data_sinfo->chunk_size,
+ mult_perc(fs_info->fs_devices->total_rw_bytes, 10));
+ data_chunk_size = min_t(u64, data_chunk_size, SZ_1G);
+ } else {
+ data_chunk_size = data_sinfo->chunk_size;
+ }
/*
* Since data allocations immediately use block groups as part of the
@@ -406,6 +412,17 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
avail >>= 3;
else
avail >>= 1;
+
+ /*
+ * On the zoned mode, we always allocate one zone as one chunk.
+ * Returning non-zone size aligned bytes here will result in
+ * less pressure for the async metadata reclaim process, and it
+ * will over-commit too much leading to ENOSPC. Align down to the
+ * zone size to avoid that.
+ */
+ if (btrfs_is_zoned(fs_info))
+ avail = ALIGN_DOWN(avail, fs_info->zone_size);
+
return avail;
}
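The hunk above concerns zoned over-commit: every chunk is exactly one zone, so an estimate that is not a multiple of the zone size overstates what can really be allocated. A minimal sketch of the rounding with made-up sizes (ALIGN_DOWN() and the SZ_* constants are existing kernel macros; the numbers are hypothetical):

#include <linux/align.h>
#include <linux/sizes.h>
#include <linux/types.h>

/* Hypothetical numbers, not taken from the patch. */
static u64 example_zoned_avail(void)
{
	const u64 zone_size = SZ_256M;		/* assumed zone size */
	u64 avail = SZ_1G + SZ_128M;		/* raw over-commit estimate: 1152M */

	/*
	 * A chunk is always a whole zone, so the partial zone at the end of
	 * the estimate can never be allocated: round down, 1152M -> 1024M.
	 */
	avail = ALIGN_DOWN(avail, zone_size);
	return avail;
}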
@@ -556,8 +573,7 @@ again:
spin_lock(&cache->lock);
avail = cache->length - cache->used - cache->pinned -
- cache->reserved - cache->delalloc_bytes -
- cache->bytes_super - cache->zone_unusable;
+ cache->reserved - cache->bytes_super - cache->zone_unusable;
btrfs_info(fs_info,
"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu delalloc %llu super %llu zone_unusable (%llu bytes available) %s",
cache->start, cache->length, cache->used, cache->pinned,
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index 92c595fed1..3e304300fc 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -4,8 +4,17 @@
#define BTRFS_SPACE_INFO_H
#include <trace/events/btrfs.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/kobject.h>
+#include <linux/lockdep.h>
+#include <linux/wait.h>
+#include <linux/rwsem.h>
#include "volumes.h"
+struct btrfs_fs_info;
+struct btrfs_block_group;
+
/*
* Different levels for to flush space when doing space reservations.
*
@@ -198,6 +207,7 @@ btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info, \
DECLARE_SPACE_INFO_UPDATE(bytes_may_use, "space_info");
DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned");
+DECLARE_SPACE_INFO_UPDATE(bytes_zone_unusable, "zone_unusable");
int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 0e49dab8da..54736f6238 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -111,6 +111,9 @@ void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sector
subpage_info->checked_offset = cur;
cur += nr_bits;
+ subpage_info->locked_offset = cur;
+ cur += nr_bits;
+
subpage_info->total_nr_bits = cur;
}
@@ -237,28 +240,58 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
start + len <= folio_pos(folio) + PAGE_SIZE);
}
+#define subpage_calc_start_bit(fs_info, folio, name, start, len) \
+({ \
+ unsigned int start_bit; \
+ \
+ btrfs_subpage_assert(fs_info, folio, start, len); \
+ start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \
+ start_bit += fs_info->subpage_info->name##_offset; \
+ start_bit; \
+})
+
void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
struct btrfs_subpage *subpage = folio_get_private(folio);
+ const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
const int nbits = len >> fs_info->sectorsize_bits;
+ unsigned long flags;
+
btrfs_subpage_assert(fs_info, folio, start, len);
+ spin_lock_irqsave(&subpage->lock, flags);
+ /*
+ * Even though it's just for reading the page, no one should have
+ * locked the subpage range.
+ */
+ ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
+ bitmap_set(subpage->bitmaps, start_bit, nbits);
atomic_add(nbits, &subpage->readers);
+ spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
struct btrfs_subpage *subpage = folio_get_private(folio);
+ const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
const int nbits = len >> fs_info->sectorsize_bits;
+ unsigned long flags;
bool is_data;
bool last;
btrfs_subpage_assert(fs_info, folio, start, len);
is_data = is_data_inode(folio->mapping->host);
+
+ spin_lock_irqsave(&subpage->lock, flags);
+
+ /* The range should have already been locked. */
+ ASSERT(bitmap_test_range_all_set(subpage->bitmaps, start_bit, nbits));
ASSERT(atomic_read(&subpage->readers) >= nbits);
+
+ bitmap_clear(subpage->bitmaps, start_bit, nbits);
last = atomic_sub_and_test(nbits, &subpage->readers);
/*
@@ -270,6 +303,7 @@ void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
*/
if (is_data && last)
folio_unlock(folio);
+ spin_unlock_irqrestore(&subpage->lock, flags);
}
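To make the bit arithmetic of subpage_calc_start_bit() above concrete, here is a standalone sketch with assumed numbers (4K sectors, a 64K page, and a locked bitmap starting at bit 64); none of these values come from the patch:

#include <linux/sizes.h>
#include <linux/types.h>

static unsigned int example_locked_start_bit(void)
{
	const u64 page_size = SZ_64K;		/* assumed page size */
	const u32 sectorsize_bits = 12;		/* assumed 4K sectors */
	const u32 locked_offset = 64;		/* assumed start of the locked bitmap */
	const u64 file_pos = SZ_64K + SZ_8K + SZ_4K;	/* byte 12K into its page */

	/* Sector index inside the page, then shifted into the locked bitmap. */
	return (u32)((file_pos & (page_size - 1)) >> sectorsize_bits) +
	       locked_offset;		/* -> bit 67 with these numbers */
}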
static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len)
@@ -290,28 +324,38 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len)
orig_start + orig_len) - *start;
}
-void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len)
+static void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len)
{
struct btrfs_subpage *subpage = folio_get_private(folio);
+ const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
const int nbits = (len >> fs_info->sectorsize_bits);
+ unsigned long flags;
int ret;
btrfs_subpage_assert(fs_info, folio, start, len);
+ spin_lock_irqsave(&subpage->lock, flags);
ASSERT(atomic_read(&subpage->readers) == 0);
+ ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
+ bitmap_set(subpage->bitmaps, start_bit, nbits);
ret = atomic_add_return(nbits, &subpage->writers);
ASSERT(ret == nbits);
+ spin_unlock_irqrestore(&subpage->lock, flags);
}
-bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len)
+static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len)
{
struct btrfs_subpage *subpage = folio_get_private(folio);
+ const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
const int nbits = (len >> fs_info->sectorsize_bits);
+ unsigned long flags;
+ bool last;
btrfs_subpage_assert(fs_info, folio, start, len);
+ spin_lock_irqsave(&subpage->lock, flags);
/*
* We have call sites passing @lock_page into
* extent_clear_unlock_delalloc() for compression path.
@@ -319,11 +363,18 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
* This @locked_page is locked by plain lock_page(), thus its
* subpage::writers is 0. Handle them in a special way.
*/
- if (atomic_read(&subpage->writers) == 0)
+ if (atomic_read(&subpage->writers) == 0) {
+ spin_unlock_irqrestore(&subpage->lock, flags);
return true;
+ }
ASSERT(atomic_read(&subpage->writers) >= nbits);
- return atomic_sub_and_test(nbits, &subpage->writers);
+ /* The target range should have been locked. */
+ ASSERT(bitmap_test_range_all_set(subpage->bitmaps, start_bit, nbits));
+ bitmap_clear(subpage->bitmaps, start_bit, nbits);
+ last = atomic_sub_and_test(nbits, &subpage->writers);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+ return last;
}
/*
@@ -365,16 +416,6 @@ void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info,
folio_unlock(folio);
}
-#define subpage_calc_start_bit(fs_info, folio, name, start, len) \
-({ \
- unsigned int start_bit; \
- \
- btrfs_subpage_assert(fs_info, folio, start, len); \
- start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \
- start_bit += fs_info->subpage_info->name##_offset; \
- start_bit; \
-})
-
#define subpage_test_bitmap_all_set(fs_info, subpage, name) \
bitmap_test_range_all_set(subpage->bitmaps, \
fs_info->subpage_info->name##_offset, \
@@ -751,6 +792,7 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
GET_SUBPAGE_BITMAP(subpage, subpage_info, writeback, &writeback_bitmap);
GET_SUBPAGE_BITMAP(subpage, subpage_info, ordered, &ordered_bitmap);
GET_SUBPAGE_BITMAP(subpage, subpage_info, checked, &checked_bitmap);
+ GET_SUBPAGE_BITMAP(subpage, subpage_info, locked, &checked_bitmap);
spin_unlock_irqrestore(&subpage->lock, flags);
dump_page(folio_page(folio, 0), "btrfs subpage dump");
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index 793c2b314a..b6dc013b0f 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -4,6 +4,11 @@
#define BTRFS_SUBPAGE_H
#include <linux/spinlock.h>
+#include <linux/atomic.h>
+
+struct address_space;
+struct folio;
+struct btrfs_fs_info;
/*
 * Extra info for subpage bitmap.
@@ -28,7 +33,7 @@ struct btrfs_subpage_info {
unsigned int total_nr_bits;
/*
- * *_start indicates where the bitmap starts, the length is always
+ * *_offset indicates where the bitmap starts, the length is always
* @bitmap_size, which is calculated from PAGE_SIZE / sectorsize.
*/
unsigned int uptodate_offset;
@@ -36,6 +41,16 @@ struct btrfs_subpage_info {
unsigned int writeback_offset;
unsigned int ordered_offset;
unsigned int checked_offset;
+
+ /*
+ * For locked bitmaps, normally it's the subpage representation of the
+ * folio Locked flag, but metadata is different:
+ *
+ * - Metadata doesn't really lock the folio
+ *   It's just there to prevent page::private from being cleared before
+ *   the last end_page_read().
+ */
+ unsigned int locked_offset;
};
/*
@@ -93,10 +108,6 @@ void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len);
-void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len);
-bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len);
int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len);
void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index c45fdaf24c..f05cce7c8b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -34,13 +34,11 @@
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
-#include "print-tree.h"
#include "props.h"
#include "xattr.h"
#include "bio.h"
#include "export.h"
#include "compression.h"
-#include "rcu-string.h"
#include "dev-replace.h"
#include "free-space-cache.h"
#include "backref.h"
@@ -121,6 +119,7 @@ enum {
Opt_thread_pool,
Opt_treelog,
Opt_user_subvol_rm_allowed,
+ Opt_norecovery,
/* Rescue options */
Opt_rescue,
@@ -247,6 +246,8 @@ static const struct fs_parameter_spec btrfs_fs_parameters[] = {
__fsparam(NULL, "nologreplay", Opt_nologreplay, fs_param_deprecated, NULL),
/* Deprecated, with alias rescue=usebackuproot */
__fsparam(NULL, "usebackuproot", Opt_usebackuproot, fs_param_deprecated, NULL),
+ /* For compatibility only, alias for "rescue=nologreplay". */
+ fsparam_flag("norecovery", Opt_norecovery),
/* Debugging options. */
fsparam_flag_no("enospc_debug", Opt_enospc_debug),
@@ -440,6 +441,11 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
"'nologreplay' is deprecated, use 'rescue=nologreplay' instead");
btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY);
break;
+ case Opt_norecovery:
+ btrfs_info(NULL,
+"'norecovery' is for compatibility only, recommended to use 'rescue=nologreplay'");
+ btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY);
+ break;
case Opt_flushoncommit:
if (result.negated)
btrfs_clear_opt(ctx->mount_opt, FLUSHONCOMMIT);
@@ -1099,10 +1105,9 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
#endif
if (btrfs_test_opt(info, REF_VERIFY))
seq_puts(seq, ",ref_verify");
- seq_printf(seq, ",subvolid=%llu",
- BTRFS_I(d_inode(dentry))->root->root_key.objectid);
+ seq_printf(seq, ",subvolid=%llu", btrfs_root_id(BTRFS_I(d_inode(dentry))->root));
subvol_name = btrfs_get_subvol_name_from_objectid(info,
- BTRFS_I(d_inode(dentry))->root->root_key.objectid);
+ btrfs_root_id(BTRFS_I(d_inode(dentry))->root));
if (!IS_ERR(subvol_name)) {
seq_puts(seq, ",subvol=");
seq_escape(seq, subvol_name, " \t\n\\");
@@ -1154,7 +1159,7 @@ static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
struct super_block *s = root->d_sb;
struct btrfs_fs_info *fs_info = btrfs_sb(s);
struct inode *root_inode = d_inode(root);
- u64 root_objectid = BTRFS_I(root_inode)->root->root_key.objectid;
+ u64 root_objectid = btrfs_root_id(BTRFS_I(root_inode)->root);
ret = 0;
if (!is_subvolume_inode(root_inode)) {
@@ -1776,10 +1781,8 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
/* Mask in the root object ID too, to disambiguate subvols */
- buf->f_fsid.val[0] ^=
- BTRFS_I(d_inode(dentry))->root->root_key.objectid >> 32;
- buf->f_fsid.val[1] ^=
- BTRFS_I(d_inode(dentry))->root->root_key.objectid;
+ buf->f_fsid.val[0] ^= btrfs_root_id(BTRFS_I(d_inode(dentry))->root) >> 32;
+ buf->f_fsid.val[1] ^= btrfs_root_id(BTRFS_I(d_inode(dentry))->root);
return 0;
}
@@ -2203,7 +2206,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
vol = memdup_user((void __user *)arg, sizeof(*vol));
if (IS_ERR(vol))
return PTR_ERR(vol);
- vol->name[BTRFS_PATH_NAME_MAX] = '\0';
+ ret = btrfs_check_ioctl_vol_args_path(vol);
+ if (ret < 0)
+ goto out;
switch (cmd) {
case BTRFS_IOC_SCAN_DEV:
@@ -2245,6 +2250,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
break;
}
+out:
kfree(vol);
return ret;
}
@@ -2373,6 +2379,24 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
return 0;
}
+static long btrfs_nr_cached_objects(struct super_block *sb, struct shrink_control *sc)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ const s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
+
+ trace_btrfs_extent_map_shrinker_count(fs_info, nr);
+
+ return nr;
+}
+
+static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_control *sc)
+{
+ const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan);
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+
+ return btrfs_free_extent_maps(fs_info, nr_to_scan);
+}
+
static const struct super_operations btrfs_super_ops = {
.drop_inode = btrfs_drop_inode,
.evict_inode = btrfs_evict_inode,
@@ -2386,6 +2410,8 @@ static const struct super_operations btrfs_super_ops = {
.statfs = btrfs_statfs,
.freeze_fs = btrfs_freeze,
.unfreeze_fs = btrfs_unfreeze,
+ .nr_cached_objects = btrfs_nr_cached_objects,
+ .free_cached_objects = btrfs_free_cached_objects,
};
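A rough, hypothetical sketch (not btrfs code) of the contract the two new callbacks follow: nr_cached_objects reports how many reclaimable objects the filesystem holds, and free_cached_objects evicts up to sc->nr_to_scan of them and returns the number actually freed to the superblock shrinker. struct super_block, struct shrink_control and min() are real kernel APIs; every example_* name is made up.

#include <linux/fs.h>
#include <linux/minmax.h>
#include <linux/shrinker.h>

static unsigned long example_cached;	/* hypothetical object counter */

static long example_nr_cached_objects(struct super_block *sb,
				      struct shrink_control *sc)
{
	return READ_ONCE(example_cached);
}

static long example_free_cached_objects(struct super_block *sb,
					struct shrink_control *sc)
{
	unsigned long freed = min(sc->nr_to_scan, READ_ONCE(example_cached));

	/* Evict 'freed' objects here, then report how many went away. */
	WRITE_ONCE(example_cached, example_cached - freed);
	return freed;
}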
static const struct file_operations btrfs_ctl_fops = {
diff --git a/fs/btrfs/super.h b/fs/btrfs/super.h
index f18253ca28..cbcab434b5 100644
--- a/fs/btrfs/super.h
+++ b/fs/btrfs/super.h
@@ -3,6 +3,13 @@
#ifndef BTRFS_SUPER_H
#define BTRFS_SUPER_H
+#include <linux/types.h>
+#include <linux/fs.h>
+#include "fs.h"
+
+struct super_block;
+struct btrfs_fs_info;
+
bool btrfs_check_options(struct btrfs_fs_info *info, unsigned long *mount_opt,
unsigned long flags);
int btrfs_sync_fs(struct super_block *sb, int wait);
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 84c05246ff..af545b6b11 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -421,7 +421,7 @@ BTRFS_ATTR(static_feature, supported_sectorsizes,
static ssize_t acl_show(struct kobject *kobj, struct kobj_attribute *a, char *buf)
{
- return sysfs_emit(buf, "%d\n", !!IS_ENABLED(CONFIG_BTRFS_FS_POSIX_ACL));
+ return sysfs_emit(buf, "%d\n", IS_ENABLED(CONFIG_BTRFS_FS_POSIX_ACL));
}
BTRFS_ATTR(static_feature, acl, acl_show);
@@ -1228,11 +1228,12 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
{
struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
+ const enum btrfs_read_policy policy = READ_ONCE(fs_devices->read_policy);
ssize_t ret = 0;
int i;
for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
- if (fs_devices->read_policy == i)
+ if (policy == i)
ret += sysfs_emit_at(buf, ret, "%s[%s]",
(ret == 0 ? "" : " "),
btrfs_read_policy_name[i]);
@@ -1256,8 +1257,8 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj,
for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
if (sysfs_streq(buf, btrfs_read_policy_name[i])) {
- if (i != fs_devices->read_policy) {
- fs_devices->read_policy = i;
+ if (i != READ_ONCE(fs_devices->read_policy)) {
+ WRITE_ONCE(fs_devices->read_policy, i);
btrfs_info(fs_devices->fs_info,
"read policy set to '%s'",
btrfs_read_policy_name[i]);
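The read policy is now snapshotted with READ_ONCE() and updated with WRITE_ONCE(), so concurrent sysfs readers and writers can race without a lock while still seeing a consistent value. A minimal sketch of that pattern with hypothetical names:

#include <linux/compiler.h>

static int example_policy;	/* hypothetical lockless setting */

static int example_policy_show(void)
{
	/* One snapshot, so every later comparison uses the same value. */
	return READ_ONCE(example_policy);
}

static void example_policy_store(int new_policy)
{
	if (new_policy != READ_ONCE(example_policy))
		WRITE_ONCE(example_policy, new_policy);
}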
@@ -1306,6 +1307,47 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj,
BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show,
btrfs_bg_reclaim_threshold_store);
+#ifdef CONFIG_BTRFS_DEBUG
+static ssize_t btrfs_offload_csum_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
+
+ switch (READ_ONCE(fs_devices->offload_csum_mode)) {
+ case BTRFS_OFFLOAD_CSUM_AUTO:
+ return sysfs_emit(buf, "auto\n");
+ case BTRFS_OFFLOAD_CSUM_FORCE_ON:
+ return sysfs_emit(buf, "1\n");
+ case BTRFS_OFFLOAD_CSUM_FORCE_OFF:
+ return sysfs_emit(buf, "0\n");
+ default:
+ WARN_ON(1);
+ return -EINVAL;
+ }
+}
+
+static ssize_t btrfs_offload_csum_store(struct kobject *kobj,
+ struct kobj_attribute *a, const char *buf,
+ size_t len)
+{
+ struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
+ int ret;
+ bool val;
+
+ ret = kstrtobool(buf, &val);
+ if (ret == 0)
+ WRITE_ONCE(fs_devices->offload_csum_mode,
+ val ? BTRFS_OFFLOAD_CSUM_FORCE_ON : BTRFS_OFFLOAD_CSUM_FORCE_OFF);
+ else if (ret == -EINVAL && sysfs_streq(buf, "auto"))
+ WRITE_ONCE(fs_devices->offload_csum_mode, BTRFS_OFFLOAD_CSUM_AUTO);
+ else
+ return -EINVAL;
+
+ return len;
+}
+BTRFS_ATTR_RW(, offload_csum, btrfs_offload_csum_show, btrfs_offload_csum_store);
+#endif
+
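The store handler above accepts the usual boolean spellings via kstrtobool() and falls back to the literal string "auto". The same parse pattern in isolation, as a sketch rather than the btrfs code; the example_* names are hypothetical, kstrtobool() and sysfs_streq() are real kernel helpers:

#include <linux/errno.h>
#include <linux/kstrtox.h>
#include <linux/string.h>

enum example_mode { EXAMPLE_AUTO, EXAMPLE_ON, EXAMPLE_OFF };

static int example_parse_mode(const char *buf, enum example_mode *mode)
{
	bool val;
	int ret = kstrtobool(buf, &val);	/* accepts 0/1, y/n, on/off */

	if (ret == 0)
		*mode = val ? EXAMPLE_ON : EXAMPLE_OFF;
	else if (ret == -EINVAL && sysfs_streq(buf, "auto"))
		*mode = EXAMPLE_AUTO;
	else
		return -EINVAL;
	return 0;
}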
/*
* Per-filesystem information and stats.
*
@@ -1325,6 +1367,9 @@ static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, bg_reclaim_threshold),
BTRFS_ATTR_PTR(, commit_stats),
BTRFS_ATTR_PTR(, temp_fsid),
+#ifdef CONFIG_BTRFS_DEBUG
+ BTRFS_ATTR_PTR(, offload_csum),
+#endif
NULL,
};
@@ -2294,7 +2339,7 @@ int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info,
struct kobject *qgroups_kobj = fs_info->qgroups_kobj;
int ret;
- if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state))
+ if (btrfs_is_testing(fs_info))
return 0;
if (qgroup->kobj.state_initialized)
return 0;
@@ -2315,7 +2360,7 @@ void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info)
struct btrfs_qgroup *qgroup;
struct btrfs_qgroup *next;
- if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state))
+ if (btrfs_is_testing(fs_info))
return;
rbtree_postorder_for_each_entry_safe(qgroup, next,
@@ -2336,7 +2381,7 @@ int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info)
struct btrfs_qgroup *next;
int ret = 0;
- if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state))
+ if (btrfs_is_testing(fs_info))
return 0;
ASSERT(fsid_kobj);
@@ -2368,7 +2413,7 @@ out:
void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *qgroup)
{
- if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state))
+ if (btrfs_is_testing(fs_info))
return;
if (qgroup->kobj.state_initialized) {
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 86c7eef128..e6a284c598 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -3,8 +3,17 @@
#ifndef BTRFS_SYSFS_H
#define BTRFS_SYSFS_H
+#include <linux/types.h>
+#include <linux/compiler_types.h>
#include <linux/kobject.h>
+struct btrfs_fs_info;
+struct btrfs_device;
+struct btrfs_fs_devices;
+struct btrfs_block_group;
+struct btrfs_space_info;
+struct btrfs_qgroup;
+
enum btrfs_feature_set {
FEAT_COMPAT,
FEAT_COMPAT_RO,
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 709c6cc970..dce0387ef1 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -160,8 +160,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
if (!fs_info)
return;
- if (WARN_ON(!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO,
- &fs_info->fs_state)))
+ if (WARN_ON(!btrfs_is_testing(fs_info)))
return;
test_mnt->mnt_sb->s_fs_info = NULL;
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 47b5d30103..ba36794ba2 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -11,19 +11,22 @@
#include "../disk-io.h"
#include "../block-group.h"
-static void free_extent_map_tree(struct extent_map_tree *em_tree)
+static int free_extent_map_tree(struct btrfs_inode *inode)
{
+ struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
struct rb_node *node;
+ int ret = 0;
write_lock(&em_tree->lock);
while (!RB_EMPTY_ROOT(&em_tree->map.rb_root)) {
node = rb_first_cached(&em_tree->map);
em = rb_entry(node, struct extent_map, rb_node);
- remove_extent_mapping(em_tree, em);
+ remove_extent_mapping(inode, em);
#ifdef CONFIG_BTRFS_DEBUG
if (refcount_read(&em->refs) != 1) {
+ ret = -EINVAL;
test_err(
"em leak: em (start %llu len %llu block_start %llu block_len %llu) refs %d",
em->start, em->len, em->block_start,
@@ -35,6 +38,8 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree)
free_extent_map(em);
}
write_unlock(&em_tree->lock);
+
+ return ret;
}
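The tests now propagate a cleanup failure without losing the original error. A minimal sketch of that ret/ret2 pattern; both helpers here are hypothetical placeholders:

/* Hypothetical placeholders standing in for a test body and its cleanup. */
static int example_do_work(void) { return 0; }
static int example_cleanup(void) { return 0; }

static int example_run(void)
{
	int ret = example_do_work();
	int ret2 = example_cleanup();	/* cleanup runs even if the work failed */

	/* Keep the first error; report the cleanup error only if work succeeded. */
	if (ret == 0)
		ret = ret2;
	return ret;
}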
/*
@@ -53,13 +58,14 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree)
* ->add_extent_mapping(0, 16K)
* -> #handle -EEXIST
*/
-static int test_case_1(struct btrfs_fs_info *fs_info,
- struct extent_map_tree *em_tree)
+static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
{
+ struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
u64 start = 0;
u64 len = SZ_8K;
int ret;
+ int ret2;
em = alloc_extent_map();
if (!em) {
@@ -73,7 +79,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info,
em->block_start = 0;
em->block_len = SZ_16K;
write_lock(&em_tree->lock);
- ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [0, 16K)");
@@ -94,7 +100,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info,
em->block_start = SZ_32K; /* avoid merging */
em->block_len = SZ_4K;
write_lock(&em_tree->lock);
- ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [16K, 20K)");
@@ -115,7 +121,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info,
em->block_start = start;
em->block_len = len;
write_lock(&em_tree->lock);
- ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret) {
test_err("case1 [%llu %llu]: ret %d", start, start + len, ret);
@@ -137,7 +143,9 @@ static int test_case_1(struct btrfs_fs_info *fs_info,
}
free_extent_map(em);
out:
- free_extent_map_tree(em_tree);
+ ret2 = free_extent_map_tree(inode);
+ if (ret == 0)
+ ret = ret2;
return ret;
}
@@ -148,11 +156,12 @@ out:
* Reading the inline ending up with EEXIST, ie. read an inline
* extent and discard page cache and read it again.
*/
-static int test_case_2(struct btrfs_fs_info *fs_info,
- struct extent_map_tree *em_tree)
+static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
{
+ struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
int ret;
+ int ret2;
em = alloc_extent_map();
if (!em) {
@@ -166,7 +175,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info,
em->block_start = EXTENT_MAP_INLINE;
em->block_len = (u64)-1;
write_lock(&em_tree->lock);
- ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [0, 1K)");
@@ -187,7 +196,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info,
em->block_start = SZ_4K;
em->block_len = SZ_4K;
write_lock(&em_tree->lock);
- ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [4K, 8K)");
@@ -208,7 +217,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info,
em->block_start = EXTENT_MAP_INLINE;
em->block_len = (u64)-1;
write_lock(&em_tree->lock);
- ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret) {
test_err("case2 [0 1K]: ret %d", ret);
@@ -229,17 +238,21 @@ static int test_case_2(struct btrfs_fs_info *fs_info,
}
free_extent_map(em);
out:
- free_extent_map_tree(em_tree);
+ ret2 = free_extent_map_tree(inode);
+ if (ret == 0)
+ ret = ret2;
return ret;
}
static int __test_case_3(struct btrfs_fs_info *fs_info,
- struct extent_map_tree *em_tree, u64 start)
+ struct btrfs_inode *inode, u64 start)
{
+ struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
u64 len = SZ_4K;
int ret;
+ int ret2;
em = alloc_extent_map();
if (!em) {
@@ -253,7 +266,7 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
em->block_start = SZ_4K;
em->block_len = SZ_4K;
write_lock(&em_tree->lock);
- ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [4K, 8K)");
@@ -274,7 +287,7 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
em->block_start = 0;
em->block_len = SZ_16K;
write_lock(&em_tree->lock);
- ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
+ ret = btrfs_add_extent_mapping(inode, &em, start, len);
write_unlock(&em_tree->lock);
if (ret) {
test_err("case3 [%llu %llu): ret %d",
@@ -301,7 +314,9 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
}
free_extent_map(em);
out:
- free_extent_map_tree(em_tree);
+ ret2 = free_extent_map_tree(inode);
+ if (ret == 0)
+ ret = ret2;
return ret;
}
@@ -322,28 +337,29 @@ out:
* -> add_extent_mapping()
* -> add_extent_mapping()
*/
-static int test_case_3(struct btrfs_fs_info *fs_info,
- struct extent_map_tree *em_tree)
+static int test_case_3(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
{
int ret;
- ret = __test_case_3(fs_info, em_tree, 0);
+ ret = __test_case_3(fs_info, inode, 0);
if (ret)
return ret;
- ret = __test_case_3(fs_info, em_tree, SZ_8K);
+ ret = __test_case_3(fs_info, inode, SZ_8K);
if (ret)
return ret;
- ret = __test_case_3(fs_info, em_tree, (12 * SZ_1K));
+ ret = __test_case_3(fs_info, inode, (12 * SZ_1K));
return ret;
}
static int __test_case_4(struct btrfs_fs_info *fs_info,
- struct extent_map_tree *em_tree, u64 start)
+ struct btrfs_inode *inode, u64 start)
{
+ struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
u64 len = SZ_4K;
int ret;
+ int ret2;
em = alloc_extent_map();
if (!em) {
@@ -357,7 +373,7 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
em->block_start = 0;
em->block_len = SZ_8K;
write_lock(&em_tree->lock);
- ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [0, 8K)");
@@ -378,7 +394,7 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
em->block_start = SZ_16K; /* avoid merging */
em->block_len = 24 * SZ_1K;
write_lock(&em_tree->lock);
- ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [8K, 32K)");
@@ -398,7 +414,7 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
em->block_start = 0;
em->block_len = SZ_32K;
write_lock(&em_tree->lock);
- ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
+ ret = btrfs_add_extent_mapping(inode, &em, start, len);
write_unlock(&em_tree->lock);
if (ret) {
test_err("case4 [%llu %llu): ret %d",
@@ -420,7 +436,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
}
free_extent_map(em);
out:
- free_extent_map_tree(em_tree);
+ ret2 = free_extent_map_tree(inode);
+ if (ret == 0)
+ ret = ret2;
return ret;
}
@@ -450,23 +468,22 @@ out:
* # handle -EEXIST when adding
* # [0, 32K)
*/
-static int test_case_4(struct btrfs_fs_info *fs_info,
- struct extent_map_tree *em_tree)
+static int test_case_4(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
{
int ret;
- ret = __test_case_4(fs_info, em_tree, 0);
+ ret = __test_case_4(fs_info, inode, 0);
if (ret)
return ret;
- ret = __test_case_4(fs_info, em_tree, SZ_4K);
+ ret = __test_case_4(fs_info, inode, SZ_4K);
return ret;
}
-static int add_compressed_extent(struct btrfs_fs_info *fs_info,
- struct extent_map_tree *em_tree,
+static int add_compressed_extent(struct btrfs_inode *inode,
u64 start, u64 len, u64 block_start)
{
+ struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
int ret;
@@ -482,7 +499,7 @@ static int add_compressed_extent(struct btrfs_fs_info *fs_info,
em->block_len = SZ_4K;
em->flags |= EXTENT_FLAG_COMPRESS_ZLIB;
write_lock(&em_tree->lock);
- ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
free_extent_map(em);
if (ret < 0) {
@@ -588,53 +605,44 @@ static int validate_range(struct extent_map_tree *em_tree, int index)
* They'll have the EXTENT_FLAG_COMPRESSED flag set to keep the em tree from
* merging the em's.
*/
-static int test_case_5(struct btrfs_fs_info *fs_info)
+static int test_case_5(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
{
- struct extent_map_tree *em_tree;
- struct inode *inode;
u64 start, end;
int ret;
+ int ret2;
test_msg("Running btrfs_drop_extent_map_range tests");
- inode = btrfs_new_test_inode();
- if (!inode) {
- test_std_err(TEST_ALLOC_INODE);
- return -ENOMEM;
- }
-
- em_tree = &BTRFS_I(inode)->extent_tree;
-
/* [0, 12k) */
- ret = add_compressed_extent(fs_info, em_tree, 0, SZ_4K * 3, 0);
+ ret = add_compressed_extent(inode, 0, SZ_4K * 3, 0);
if (ret) {
test_err("cannot add extent range [0, 12K)");
goto out;
}
/* [12k, 24k) */
- ret = add_compressed_extent(fs_info, em_tree, SZ_4K * 3, SZ_4K * 3, SZ_4K);
+ ret = add_compressed_extent(inode, SZ_4K * 3, SZ_4K * 3, SZ_4K);
if (ret) {
test_err("cannot add extent range [12k, 24k)");
goto out;
}
/* [24k, 36k) */
- ret = add_compressed_extent(fs_info, em_tree, SZ_4K * 6, SZ_4K * 3, SZ_8K);
+ ret = add_compressed_extent(inode, SZ_4K * 6, SZ_4K * 3, SZ_8K);
if (ret) {
 test_err("cannot add extent range [24k, 36k)");
goto out;
}
/* [36k, 40k) */
- ret = add_compressed_extent(fs_info, em_tree, SZ_32K + SZ_4K, SZ_4K, SZ_4K * 3);
+ ret = add_compressed_extent(inode, SZ_32K + SZ_4K, SZ_4K, SZ_4K * 3);
if (ret) {
 test_err("cannot add extent range [36k, 40k)");
goto out;
}
/* [40k, 64k) */
- ret = add_compressed_extent(fs_info, em_tree, SZ_4K * 10, SZ_4K * 6, SZ_16K);
+ ret = add_compressed_extent(inode, SZ_4K * 10, SZ_4K * 6, SZ_16K);
if (ret) {
 test_err("cannot add extent range [40k, 64k)");
goto out;
@@ -643,36 +651,39 @@ static int test_case_5(struct btrfs_fs_info *fs_info)
/* Drop [8k, 12k) */
start = SZ_8K;
end = (3 * SZ_4K) - 1;
- btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false);
- ret = validate_range(&BTRFS_I(inode)->extent_tree, 0);
+ btrfs_drop_extent_map_range(inode, start, end, false);
+ ret = validate_range(&inode->extent_tree, 0);
if (ret)
goto out;
/* Drop [12k, 20k) */
start = SZ_4K * 3;
end = SZ_16K + SZ_4K - 1;
- btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false);
- ret = validate_range(&BTRFS_I(inode)->extent_tree, 1);
+ btrfs_drop_extent_map_range(inode, start, end, false);
+ ret = validate_range(&inode->extent_tree, 1);
if (ret)
goto out;
/* Drop [28k, 32k) */
start = SZ_32K - SZ_4K;
end = SZ_32K - 1;
- btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false);
- ret = validate_range(&BTRFS_I(inode)->extent_tree, 2);
+ btrfs_drop_extent_map_range(inode, start, end, false);
+ ret = validate_range(&inode->extent_tree, 2);
if (ret)
goto out;
/* Drop [32k, 64k) */
start = SZ_32K;
end = SZ_64K - 1;
- btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false);
- ret = validate_range(&BTRFS_I(inode)->extent_tree, 3);
+ btrfs_drop_extent_map_range(inode, start, end, false);
+ ret = validate_range(&inode->extent_tree, 3);
if (ret)
goto out;
out:
- iput(inode);
+ ret2 = free_extent_map_tree(inode);
+ if (ret == 0)
+ ret = ret2;
+
return ret;
}
@@ -681,23 +692,26 @@ out:
* for areas between two existing ems. Validate it doesn't do this when there
* are two unmerged em's side by side.
*/
-static int test_case_6(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree)
+static int test_case_6(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
{
+ struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em = NULL;
int ret;
+ int ret2;
- ret = add_compressed_extent(fs_info, em_tree, 0, SZ_4K, 0);
+ ret = add_compressed_extent(inode, 0, SZ_4K, 0);
if (ret)
goto out;
- ret = add_compressed_extent(fs_info, em_tree, SZ_4K, SZ_4K, 0);
+ ret = add_compressed_extent(inode, SZ_4K, SZ_4K, 0);
if (ret)
goto out;
em = alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
em->start = SZ_4K;
@@ -705,7 +719,7 @@ static int test_case_6(struct btrfs_fs_info *fs_info, struct extent_map_tree *em
em->block_start = SZ_16K;
em->block_len = SZ_16K;
write_lock(&em_tree->lock);
- ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, 0, SZ_8K);
+ ret = btrfs_add_extent_mapping(inode, &em, 0, SZ_8K);
write_unlock(&em_tree->lock);
if (ret != 0) {
@@ -725,7 +739,10 @@ static int test_case_6(struct btrfs_fs_info *fs_info, struct extent_map_tree *em
ret = 0;
out:
free_extent_map(em);
- free_extent_map_tree(em_tree);
+ ret2 = free_extent_map_tree(inode);
+ if (ret == 0)
+ ret = ret2;
+
return ret;
}
@@ -734,28 +751,19 @@ out:
* true would mess up the start/end calculations and subsequent splits would be
* incorrect.
*/
-static int test_case_7(struct btrfs_fs_info *fs_info)
+static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
{
- struct extent_map_tree *em_tree;
+ struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
- struct inode *inode;
int ret;
+ int ret2;
test_msg("Running btrfs_drop_extent_cache with pinned");
- inode = btrfs_new_test_inode();
- if (!inode) {
- test_std_err(TEST_ALLOC_INODE);
- return -ENOMEM;
- }
-
- em_tree = &BTRFS_I(inode)->extent_tree;
-
em = alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
- ret = -ENOMEM;
- goto out;
+ return -ENOMEM;
}
/* [0, 16K), pinned */
@@ -765,7 +773,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info)
em->block_len = SZ_4K;
em->flags |= EXTENT_FLAG_PINNED;
write_lock(&em_tree->lock);
- ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("couldn't add extent map");
@@ -786,7 +794,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info)
em->block_start = SZ_32K;
em->block_len = SZ_16K;
write_lock(&em_tree->lock);
- ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
+ ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("couldn't add extent map");
@@ -798,7 +806,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info)
* Drop [0, 36K) This should skip the [0, 4K) extent and then split the
* [32K, 48K) extent.
*/
- btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (36 * SZ_1K) - 1, true);
+ btrfs_drop_extent_map_range(inode, 0, (36 * SZ_1K) - 1, true);
/* Make sure our extent maps look sane. */
ret = -EINVAL;
@@ -865,7 +873,14 @@ static int test_case_7(struct btrfs_fs_info *fs_info)
ret = 0;
out:
free_extent_map(em);
- iput(inode);
+ /* Unpin our extent to prevent warning when removing it below. */
+ ret2 = unpin_extent_cache(inode, 0, SZ_16K, 0);
+ if (ret == 0)
+ ret = ret2;
+ ret2 = free_extent_map_tree(inode);
+ if (ret == 0)
+ ret = ret2;
+
return ret;
}
@@ -959,7 +974,8 @@ out_free:
int btrfs_test_extent_map(void)
{
struct btrfs_fs_info *fs_info = NULL;
- struct extent_map_tree *em_tree;
+ struct inode *inode;
+ struct btrfs_root *root = NULL;
int ret = 0, i;
struct rmap_test_vector rmap_tests[] = {
{
@@ -1008,33 +1024,42 @@ int btrfs_test_extent_map(void)
return -ENOMEM;
}
- em_tree = kzalloc(sizeof(*em_tree), GFP_KERNEL);
- if (!em_tree) {
+ inode = btrfs_new_test_inode();
+ if (!inode) {
+ test_std_err(TEST_ALLOC_INODE);
ret = -ENOMEM;
goto out;
}
- extent_map_tree_init(em_tree);
+ root = btrfs_alloc_dummy_root(fs_info);
+ if (IS_ERR(root)) {
+ test_std_err(TEST_ALLOC_ROOT);
+ ret = PTR_ERR(root);
+ root = NULL;
+ goto out;
+ }
- ret = test_case_1(fs_info, em_tree);
+ BTRFS_I(inode)->root = root;
+
+ ret = test_case_1(fs_info, BTRFS_I(inode));
if (ret)
goto out;
- ret = test_case_2(fs_info, em_tree);
+ ret = test_case_2(fs_info, BTRFS_I(inode));
if (ret)
goto out;
- ret = test_case_3(fs_info, em_tree);
+ ret = test_case_3(fs_info, BTRFS_I(inode));
if (ret)
goto out;
- ret = test_case_4(fs_info, em_tree);
+ ret = test_case_4(fs_info, BTRFS_I(inode));
if (ret)
goto out;
- ret = test_case_5(fs_info);
+ ret = test_case_5(fs_info, BTRFS_I(inode));
if (ret)
goto out;
- ret = test_case_6(fs_info, em_tree);
+ ret = test_case_6(fs_info, BTRFS_I(inode));
if (ret)
goto out;
- ret = test_case_7(fs_info);
+ ret = test_case_7(fs_info, BTRFS_I(inode));
if (ret)
goto out;
@@ -1046,7 +1071,8 @@ int btrfs_test_extent_map(void)
}
out:
- kfree(em_tree);
+ iput(inode);
+ btrfs_free_dummy_root(root);
btrfs_free_dummy_fs_info(fs_info);
return ret;
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 9957de9f78..99da9d34b7 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -258,7 +258,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
/* First with no extents */
BTRFS_I(inode)->root = root;
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize);
if (IS_ERR(em)) {
em = NULL;
test_err("got an error when we shouldn't have");
@@ -278,7 +278,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
*/
setup_file_extents(root, sectorsize);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, (u64)-1);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -316,7 +316,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -339,7 +339,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
free_extent_map(em);
/* Regular extent */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -367,7 +367,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
free_extent_map(em);
/* The next 3 are split extents */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -396,7 +396,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -418,7 +418,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -452,7 +452,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
free_extent_map(em);
/* Prealloc extent */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -481,7 +481,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
free_extent_map(em);
/* The next 3 are a half written prealloc extent */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -511,7 +511,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -544,7 +544,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -579,7 +579,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
free_extent_map(em);
/* Now for the compressed extent */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -613,7 +613,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
free_extent_map(em);
/* Split compressed extent */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -648,7 +648,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -675,7 +675,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -710,7 +710,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
free_extent_map(em);
/* A hole between regular extents but no hole extent */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset + 6, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -737,7 +737,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, SZ_4M);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -770,7 +770,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -850,7 +850,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
insert_inode_item_key(root);
insert_extent(root, sectorsize, sectorsize, sectorsize, 0, sectorsize,
sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 2 * sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -872,7 +872,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
}
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize, 2 * sectorsize);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, sectorsize, 2 * sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b93cdf5f17..76117bb2c7 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -23,12 +23,10 @@
#include "qgroup.h"
#include "block-group.h"
#include "space-info.h"
-#include "zoned.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
-#include "defrag.h"
#include "dir-item.h"
#include "uuid-tree.h"
#include "ioctl.h"
@@ -407,7 +405,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
int ret = 0;
if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
- root->last_trans < trans->transid) || force) {
+ btrfs_get_root_last_trans(root) < trans->transid) || force) {
WARN_ON(!force && root->commit_root != root->node);
/*
@@ -423,15 +421,15 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
smp_wmb();
spin_lock(&fs_info->fs_roots_radix_lock);
- if (root->last_trans == trans->transid && !force) {
+ if (btrfs_get_root_last_trans(root) == trans->transid && !force) {
spin_unlock(&fs_info->fs_roots_radix_lock);
return 0;
}
radix_tree_tag_set(&fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid,
+ (unsigned long)btrfs_root_id(root),
BTRFS_ROOT_TRANS_TAG);
spin_unlock(&fs_info->fs_roots_radix_lock);
- root->last_trans = trans->transid;
+ btrfs_set_root_last_trans(root, trans->transid);
/* this is pretty tricky. We don't want to
* take the relocation lock in btrfs_record_root_in_trans
@@ -474,7 +472,7 @@ void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
/* Make sure we don't try to update the root at commit time */
spin_lock(&fs_info->fs_roots_radix_lock);
radix_tree_tag_clear(&fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid,
+ (unsigned long)btrfs_root_id(root),
BTRFS_ROOT_TRANS_TAG);
spin_unlock(&fs_info->fs_roots_radix_lock);
}
@@ -493,7 +491,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
* and barriers
*/
smp_rmb();
- if (root->last_trans == trans->transid &&
+ if (btrfs_get_root_last_trans(root) == trans->transid &&
!test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state))
return 0;
@@ -552,7 +550,7 @@ static inline bool need_reserve_reloc_root(struct btrfs_root *root)
if (!fs_info->reloc_ctl ||
!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
- root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
+ btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID ||
root->reloc_root)
return false;
@@ -1054,7 +1052,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_transaction *cur_trans = trans->transaction;
- int err = 0;
+ int ret = 0;
if (refcount_read(&trans->use_count) > 1) {
refcount_dec(&trans->use_count);
@@ -1093,13 +1091,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (TRANS_ABORTED(trans) || BTRFS_FS_ERROR(info)) {
wake_up_process(info->transaction_kthread);
if (TRANS_ABORTED(trans))
- err = trans->aborted;
+ ret = trans->aborted;
else
- err = -EROFS;
+ ret = -EROFS;
}
kmem_cache_free(btrfs_trans_handle_cachep, trans);
- return err;
+ return ret;
}
int btrfs_end_transaction(struct btrfs_trans_handle *trans)
@@ -1120,8 +1118,7 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans)
int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
struct extent_io_tree *dirty_pages, int mark)
{
- int err = 0;
- int werr = 0;
+ int ret = 0;
struct address_space *mapping = fs_info->btree_inode->i_mapping;
struct extent_state *cached_state = NULL;
u64 start = 0;
@@ -1131,7 +1128,7 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
mark, &cached_state)) {
bool wait_writeback = false;
- err = convert_extent_bit(dirty_pages, start, end,
+ ret = convert_extent_bit(dirty_pages, start, end,
EXTENT_NEED_WAIT,
mark, &cached_state);
/*
@@ -1147,22 +1144,22 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
* We cleanup any entries left in the io tree when committing
* the transaction (through extent_io_tree_release()).
*/
- if (err == -ENOMEM) {
- err = 0;
+ if (ret == -ENOMEM) {
+ ret = 0;
wait_writeback = true;
}
- if (!err)
- err = filemap_fdatawrite_range(mapping, start, end);
- if (err)
- werr = err;
- else if (wait_writeback)
- werr = filemap_fdatawait_range(mapping, start, end);
+ if (!ret)
+ ret = filemap_fdatawrite_range(mapping, start, end);
+ if (!ret && wait_writeback)
+ ret = filemap_fdatawait_range(mapping, start, end);
free_extent_state(cached_state);
+ if (ret)
+ break;
cached_state = NULL;
cond_resched();
start = end + 1;
}
- return werr;
+ return ret;
}
/*
@@ -1174,12 +1171,11 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
struct extent_io_tree *dirty_pages)
{
- int err = 0;
- int werr = 0;
struct address_space *mapping = fs_info->btree_inode->i_mapping;
struct extent_state *cached_state = NULL;
u64 start = 0;
u64 end;
+ int ret = 0;
while (find_first_extent_bit(dirty_pages, start, &start, &end,
EXTENT_NEED_WAIT, &cached_state)) {
@@ -1191,22 +1187,20 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
* concurrently - we do it only at transaction commit time when
* it's safe to do it (through extent_io_tree_release()).
*/
- err = clear_extent_bit(dirty_pages, start, end,
+ ret = clear_extent_bit(dirty_pages, start, end,
EXTENT_NEED_WAIT, &cached_state);
- if (err == -ENOMEM)
- err = 0;
- if (!err)
- err = filemap_fdatawait_range(mapping, start, end);
- if (err)
- werr = err;
+ if (ret == -ENOMEM)
+ ret = 0;
+ if (!ret)
+ ret = filemap_fdatawait_range(mapping, start, end);
free_extent_state(cached_state);
+ if (ret)
+ break;
cached_state = NULL;
cond_resched();
start = end + 1;
}
- if (err)
- werr = err;
- return werr;
+ return ret;
}
static int btrfs_wait_extents(struct btrfs_fs_info *fs_info,
@@ -1231,7 +1225,7 @@ int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark)
bool errors = false;
int err;
- ASSERT(log_root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
+ ASSERT(btrfs_root_id(log_root) == BTRFS_TREE_LOG_OBJECTID);
err = __btrfs_wait_marked_extents(fs_info, dirty_pages);
if ((mark & EXTENT_DIRTY) &&
@@ -1494,7 +1488,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
ASSERT(atomic_read(&root->log_commit[1]) == 0);
radix_tree_tag_clear(&fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid,
+ (unsigned long)btrfs_root_id(root),
BTRFS_ROOT_TRANS_TAG);
btrfs_qgroup_free_meta_all_pertrans(root);
spin_unlock(&fs_info->fs_roots_radix_lock);
@@ -1585,8 +1579,8 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
goto out;
/* Now qgroup are all updated, we can inherit it to new qgroups */
- ret = btrfs_qgroup_inherit(trans, src->root_key.objectid, dst_objectid,
- parent->root_key.objectid, inherit);
+ ret = btrfs_qgroup_inherit(trans, btrfs_root_id(src), dst_objectid,
+ btrfs_root_id(parent), inherit);
if (ret < 0)
goto out;
@@ -1824,7 +1818,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
* insert root back/forward references
*/
ret = btrfs_add_root_ref(trans, objectid,
- parent_root->root_key.objectid,
+ btrfs_root_id(parent_root),
btrfs_ino(BTRFS_I(parent_inode)), index,
&fname.disk_name);
if (ret) {
@@ -1857,16 +1851,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
ret = qgroup_account_snapshot(trans, root, parent_root,
pending->inherit, objectid);
else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
- ret = btrfs_qgroup_inherit(trans, root->root_key.objectid, objectid,
- parent_root->root_key.objectid, pending->inherit);
+ ret = btrfs_qgroup_inherit(trans, btrfs_root_id(root), objectid,
+ btrfs_root_id(parent_root), pending->inherit);
if (ret < 0)
goto fail;
ret = btrfs_insert_dir_item(trans, &fname.disk_name,
BTRFS_I(parent_inode), &key, BTRFS_FT_DIR,
index);
- /* We have check then name at the beginning, so it is impossible. */
- BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
@@ -1958,19 +1950,6 @@ static void update_super_roots(struct btrfs_fs_info *fs_info)
super->uuid_tree_generation = root_item->generation;
}
-int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
-{
- struct btrfs_transaction *trans;
- int ret = 0;
-
- spin_lock(&info->trans_lock);
- trans = info->running_transaction;
- if (trans)
- ret = (trans->state >= TRANS_STATE_COMMIT_START);
- spin_unlock(&info->trans_lock);
- return ret;
-}
-
int btrfs_transaction_blocked(struct btrfs_fs_info *info)
{
struct btrfs_transaction *trans;
@@ -2640,7 +2619,7 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info)
list_del_init(&root->root_list);
spin_unlock(&fs_info->trans_lock);
- btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid);
+ btrfs_debug(fs_info, "cleaner removing %llu", btrfs_root_id(root));
btrfs_kill_all_delayed_nodes(root);
@@ -2685,9 +2664,7 @@ void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
int __init btrfs_transaction_init(void)
{
- btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
- sizeof(struct btrfs_trans_handle), 0,
- SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
+ btrfs_trans_handle_cachep = KMEM_CACHE(btrfs_trans_handle, SLAB_TEMPORARY);
if (!btrfs_trans_handle_cachep)
return -ENOMEM;
return 0;
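KMEM_CACHE() used above is the generic slab helper that derives the cache name, object size and alignment from the struct definition, replacing the open-coded kmem_cache_create() call. A hedged sketch with a made-up struct; KMEM_CACHE() and SLAB_TEMPORARY are real kernel symbols, everything named example_* is hypothetical:

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/slab.h>

struct example_handle {		/* hypothetical object type */
	u64 transid;
	int state;
};

static struct kmem_cache *example_cachep;

static int __init example_cache_init(void)
{
	/* Name, size and alignment all come from 'struct example_handle'. */
	example_cachep = KMEM_CACHE(example_handle, SLAB_TEMPORARY);
	return example_cachep ? 0 : -ENOMEM;
}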
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 2bf8bbdfd0..4e451ab173 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -6,12 +6,27 @@
#ifndef BTRFS_TRANSACTION_H
#define BTRFS_TRANSACTION_H
+#include <linux/atomic.h>
#include <linux/refcount.h>
+#include <linux/list.h>
+#include <linux/time64.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
#include "btrfs_inode.h"
#include "delayed-ref.h"
-#include "ctree.h"
+#include "extent-io-tree.h"
+#include "block-rsv.h"
+#include "messages.h"
#include "misc.h"
+struct dentry;
+struct inode;
+struct btrfs_pending_snapshot;
+struct btrfs_fs_info;
+struct btrfs_root_item;
+struct btrfs_root;
+struct btrfs_path;
+
/* Radix-tree tag for roots that are part of the transaction. */
#define BTRFS_ROOT_TRANS_TAG 0
@@ -262,7 +277,6 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
struct extent_io_tree *dirty_pages, int mark);
int btrfs_wait_tree_log_extents(struct btrfs_root *root, int mark);
int btrfs_transaction_blocked(struct btrfs_fs_info *info);
-int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
void btrfs_put_transaction(struct btrfs_transaction *transaction);
void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 75c78a5556..a2c3651a3d 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -21,7 +21,6 @@
#include "messages.h"
#include "ctree.h"
#include "tree-checker.h"
-#include "disk-io.h"
#include "compression.h"
#include "volumes.h"
#include "misc.h"
@@ -30,7 +29,6 @@
#include "file-item.h"
#include "inode-item.h"
#include "dir-item.h"
-#include "raid-stripe-tree.h"
#include "extent-tree.h"
/*
@@ -67,6 +65,7 @@ static void generic_err(const struct extent_buffer *eb, int slot,
vaf.fmt = fmt;
vaf.va = &args;
+ dump_page(folio_page(eb->folios[0], 0), "eb page dump");
btrfs_crit(fs_info,
"corrupt %s: root=%llu block=%llu slot=%d, %pV",
btrfs_header_level(eb) == 0 ? "leaf" : "node",
@@ -94,6 +93,7 @@ static void file_extent_err(const struct extent_buffer *eb, int slot,
vaf.fmt = fmt;
vaf.va = &args;
+ dump_page(folio_page(eb->folios[0], 0), "eb page dump");
btrfs_crit(fs_info,
"corrupt %s: root=%llu block=%llu slot=%d ino=%llu file_offset=%llu, %pV",
btrfs_header_level(eb) == 0 ? "leaf" : "node",
@@ -154,6 +154,7 @@ static void dir_item_err(const struct extent_buffer *eb, int slot,
vaf.fmt = fmt;
vaf.va = &args;
+ dump_page(folio_page(eb->folios[0], 0), "eb page dump");
btrfs_crit(fs_info,
"corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV",
btrfs_header_level(eb) == 0 ? "leaf" : "node",
@@ -649,6 +650,7 @@ static void block_group_err(const struct extent_buffer *eb, int slot,
vaf.fmt = fmt;
vaf.va = &args;
+ dump_page(folio_page(eb->folios[0], 0), "eb page dump");
btrfs_crit(fs_info,
"corrupt %s: root=%llu block=%llu slot=%d bg_start=%llu bg_len=%llu, %pV",
btrfs_header_level(eb) == 0 ? "leaf" : "node",
@@ -1005,6 +1007,7 @@ static void dev_item_err(const struct extent_buffer *eb, int slot,
vaf.fmt = fmt;
vaf.va = &args;
+ dump_page(folio_page(eb->folios[0], 0), "eb page dump");
btrfs_crit(eb->fs_info,
"corrupt %s: root=%llu block=%llu slot=%d devid=%llu %pV",
btrfs_header_level(eb) == 0 ? "leaf" : "node",
@@ -1260,6 +1263,7 @@ static void extent_err(const struct extent_buffer *eb, int slot,
vaf.fmt = fmt;
vaf.va = &args;
+ dump_page(folio_page(eb->folios[0], 0), "eb page dump");
btrfs_crit(eb->fs_info,
"corrupt %s: block=%llu slot=%d extent bytenr=%llu len=%llu %pV",
btrfs_header_level(eb) == 0 ? "leaf" : "node",
@@ -2017,7 +2021,7 @@ int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner)
* Skip dummy fs, as selftests don't create unique ebs for each dummy
* root.
*/
- if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &eb->fs_info->fs_state))
+ if (btrfs_is_testing(eb->fs_info))
return 0;
/*
* There are several call sites (backref walking, qgroup, and data
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
index 580ec4fde0..01669cfa65 100644
--- a/fs/btrfs/tree-checker.h
+++ b/fs/btrfs/tree-checker.h
@@ -6,10 +6,12 @@
#ifndef BTRFS_TREE_CHECKER_H
#define BTRFS_TREE_CHECKER_H
+#include <linux/types.h>
#include <uapi/linux/btrfs_tree.h>
struct extent_buffer;
struct btrfs_chunk;
+struct btrfs_key;
/* All the extra info needed to verify the parentness of a tree block. */
struct btrfs_tree_parent_check {
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 331fc74299..0bce1d45e2 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -13,13 +13,11 @@
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
-#include "print-tree.h"
#include "backref.h"
#include "compression.h"
#include "qgroup.h"
#include "block-group.h"
#include "space-info.h"
-#include "zoned.h"
#include "inode-item.h"
#include "fs.h"
#include "accessors.h"
@@ -140,6 +138,25 @@ static void wait_log_commit(struct btrfs_root *root, int transid);
* and once to do all the other items.
*/
+static struct inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root)
+{
+ unsigned int nofs_flag;
+ struct inode *inode;
+
+ /*
+ * We're holding a transaction handle whether we are logging or
+ * replaying a log tree, so we must make sure NOFS semantics apply
+ * because btrfs_alloc_inode() may be triggered and it uses GFP_KERNEL
+ * to allocate an inode, which can recurse back into the filesystem and
+ * attempt a transaction commit, resulting in a deadlock.
+ */
+ nofs_flag = memalloc_nofs_save();
+ inode = btrfs_iget(root->fs_info->sb, objectid, root);
+ memalloc_nofs_restore(nofs_flag);
+
+ return inode;
+}
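Call sites later in this patch switch from btrfs_iget() to this helper; the pattern, in sketch form, is:

	/* Before: may allocate with GFP_KERNEL and recurse into a transaction commit. */
	inode = btrfs_iget(root->fs_info->sb, objectid, root);

	/* After: the lookup runs under memalloc_nofs_save()/restore(). */
	inode = btrfs_iget_logging(objectid, root);
	if (IS_ERR(inode))
		return PTR_ERR(inode);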
+
/*
* start a sub transaction and setup the log tree
* this increments the log tree writer count to make the people
@@ -393,7 +410,7 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
* the leaf before writing into the log tree. See the comments at
* copy_items() for more details.
*/
- ASSERT(root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
+ ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
item_size = btrfs_item_size(eb, slot);
src_ptr = btrfs_item_ptr_offset(eb, slot);
@@ -602,7 +619,7 @@ static noinline struct inode *read_one_inode(struct btrfs_root *root,
{
struct inode *inode;
- inode = btrfs_iget(root->fs_info->sb, objectid, root);
+ inode = btrfs_iget_logging(objectid, root);
if (IS_ERR(inode))
inode = NULL;
return inode;
@@ -750,7 +767,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
goto out;
if (ins.objectid > 0) {
- struct btrfs_ref ref = { 0 };
u64 csum_start;
u64 csum_end;
LIST_HEAD(ordered_sums);
@@ -764,13 +780,15 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
if (ret < 0) {
goto out;
} else if (ret == 0) {
- btrfs_init_generic_ref(&ref,
- BTRFS_ADD_DELAYED_REF,
- ins.objectid, ins.offset, 0,
- root->root_key.objectid);
- btrfs_init_data_ref(&ref,
- root->root_key.objectid,
- key->objectid, offset, 0, false);
+ struct btrfs_ref ref = {
+ .action = BTRFS_ADD_DELAYED_REF,
+ .bytenr = ins.objectid,
+ .num_bytes = ins.offset,
+ .owning_root = btrfs_root_id(root),
+ .ref_root = btrfs_root_id(root),
+ };
+ btrfs_init_data_ref(&ref, key->objectid, offset,
+ 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret)
goto out;
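The designated initializer relies on the C guarantee that members not named in the initializer start out zeroed, which is what lets btrfs_init_generic_ref() be dropped in favour of a single declaration; in sketch form:

	struct btrfs_ref ref = {
		.action = BTRFS_ADD_DELAYED_REF,
		.bytenr = ins.objectid,
		.num_bytes = ins.offset,
		.owning_root = btrfs_root_id(root),
		.ref_root = btrfs_root_id(root),
		/* all other members start out zero-initialized */
	};
	btrfs_init_data_ref(&ref, key->objectid, offset, 0, false);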
@@ -780,7 +798,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
* allocation tree
*/
ret = btrfs_alloc_logged_file_extent(trans,
- root->root_key.objectid,
+ btrfs_root_id(root),
key->objectid, offset, &ins);
if (ret)
goto out;
@@ -799,9 +817,10 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_csums_list(root->log_root,
csum_start, csum_end - 1,
- &ordered_sums, 0, false);
- if (ret)
+ &ordered_sums, false);
+ if (ret < 0)
goto out;
+ ret = 0;
/*
* Now delete all existing csums in the csum root that
* cover our range. We do this because we can have an
@@ -2820,6 +2839,52 @@ static void wait_for_writer(struct btrfs_root *root)
finish_wait(&root->log_writer_wait, &wait);
}
+void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct inode *inode)
+{
+ ctx->log_ret = 0;
+ ctx->log_transid = 0;
+ ctx->log_new_dentries = false;
+ ctx->logging_new_name = false;
+ ctx->logging_new_delayed_dentries = false;
+ ctx->logged_before = false;
+ ctx->inode = inode;
+ INIT_LIST_HEAD(&ctx->list);
+ INIT_LIST_HEAD(&ctx->ordered_extents);
+ INIT_LIST_HEAD(&ctx->conflict_inodes);
+ ctx->num_conflict_inodes = 0;
+ ctx->logging_conflict_inodes = false;
+ ctx->scratch_eb = NULL;
+}
+
+void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_inode *inode = BTRFS_I(ctx->inode);
+
+ if (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
+ !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
+ return;
+
+ /*
+ * Don't care about allocation failure. This is just an optimization;
+ * if we fail to allocate here, we will try again later if needed.
+ */
+ ctx->scratch_eb = alloc_dummy_extent_buffer(inode->root->fs_info, 0);
+}
+
+void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_ordered_extent *tmp;
+
+ ASSERT(inode_is_locked(ctx->inode));
+
+ list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
+ list_del_init(&ordered->log_list);
+ btrfs_put_ordered_extent(ordered);
+ }
+}
+
+
static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
struct btrfs_log_ctx *ctx)
{
@@ -3001,7 +3066,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
if (ret != -ENOSPC)
btrfs_err(fs_info,
"failed to update log for root %llu ret %d",
- root->root_key.objectid, ret);
+ btrfs_root_id(root), ret);
btrfs_wait_tree_log_extents(log, mark);
mutex_unlock(&log_root_tree->log_mutex);
goto out;
@@ -3619,6 +3684,30 @@ out:
return ret;
}
+static int clone_leaf(struct btrfs_path *path, struct btrfs_log_ctx *ctx)
+{
+ const int slot = path->slots[0];
+
+ if (ctx->scratch_eb) {
+ copy_extent_buffer_full(ctx->scratch_eb, path->nodes[0]);
+ } else {
+ ctx->scratch_eb = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!ctx->scratch_eb)
+ return -ENOMEM;
+ }
+
+ btrfs_release_path(path);
+ path->nodes[0] = ctx->scratch_eb;
+ path->slots[0] = slot;
+ /*
+ * Add an extra ref to the scratch eb so that it is not freed when
+ * callers release the path, allowing us to reuse it later if needed.
+ */
+ atomic_inc(&ctx->scratch_eb->refs);
+
+ return 0;
+}
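Both callers converted below follow the same shape; a condensed sketch of the caller side:

	ret = clone_leaf(path, ctx);
	if (ret < 0)
		return ret;
	/* path->nodes[0] is now ctx->scratch_eb (or a freshly cloned leaf). */
	src = path->nodes[0];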
+
static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_path *path,
@@ -3633,23 +3722,20 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
bool last_found = false;
int batch_start = 0;
int batch_size = 0;
- int i;
+ int ret;
/*
* We need to clone the leaf, release the read lock on it, and use the
* clone before modifying the log tree. See the comment at copy_items()
* about why we need to do this.
*/
- src = btrfs_clone_extent_buffer(path->nodes[0]);
- if (!src)
- return -ENOMEM;
+ ret = clone_leaf(path, ctx);
+ if (ret < 0)
+ return ret;
- i = path->slots[0];
- btrfs_release_path(path);
- path->nodes[0] = src;
- path->slots[0] = i;
+ src = path->nodes[0];
- for (; i < nritems; i++) {
+ for (int i = path->slots[0]; i < nritems; i++) {
struct btrfs_dir_item *di;
struct btrfs_key key;
int ret;
@@ -4259,17 +4345,16 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
struct btrfs_path *dst_path,
struct btrfs_path *src_path,
int start_slot, int nr, int inode_only,
- u64 logged_isize)
+ u64 logged_isize, struct btrfs_log_ctx *ctx)
{
struct btrfs_root *log = inode->root->log_root;
struct btrfs_file_extent_item *extent;
struct extent_buffer *src;
- int ret = 0;
+ int ret;
struct btrfs_key *ins_keys;
u32 *ins_sizes;
struct btrfs_item_batch batch;
char *ins_data;
- int i;
int dst_index;
const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM);
const u64 i_size = i_size_read(&inode->vfs_inode);
@@ -4302,14 +4387,11 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
* while the other is holding the delayed node's mutex and wants to
* write lock the same subvolume leaf for flushing delayed items.
*/
- src = btrfs_clone_extent_buffer(src_path->nodes[0]);
- if (!src)
- return -ENOMEM;
+ ret = clone_leaf(src_path, ctx);
+ if (ret < 0)
+ return ret;
- i = src_path->slots[0];
- btrfs_release_path(src_path);
- src_path->nodes[0] = src;
- src_path->slots[0] = i;
+ src = src_path->nodes[0];
ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
nr * sizeof(u32), GFP_NOFS);
@@ -4324,7 +4406,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
batch.nr = 0;
dst_index = 0;
- for (i = 0; i < nr; i++) {
+ for (int i = 0; i < nr; i++) {
const int src_slot = start_slot + i;
struct btrfs_root *csum_root;
struct btrfs_ordered_sum *sums;
@@ -4399,9 +4481,10 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
disk_bytenr += extent_offset;
ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
disk_bytenr + extent_num_bytes - 1,
- &ordered_sums, 0, false);
- if (ret)
+ &ordered_sums, false);
+ if (ret < 0)
goto out;
+ ret = 0;
list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) {
if (!ret)
@@ -4431,7 +4514,7 @@ add_to_batch:
goto out;
dst_index = 0;
- for (i = 0; i < nr; i++) {
+ for (int i = 0; i < nr; i++) {
const int src_slot = start_slot + i;
const int dst_slot = dst_path->slots[0] + dst_index;
struct btrfs_key key;
@@ -4513,8 +4596,8 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *csum_root;
u64 csum_offset;
u64 csum_len;
- u64 mod_start = em->mod_start;
- u64 mod_len = em->mod_len;
+ u64 mod_start = em->start;
+ u64 mod_len = em->len;
LIST_HEAD(ordered_sums);
int ret = 0;
@@ -4594,9 +4677,10 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
csum_root = btrfs_csum_root(trans->fs_info, em->block_start);
ret = btrfs_lookup_csums_list(csum_root, em->block_start + csum_offset,
em->block_start + csum_offset +
- csum_len - 1, &ordered_sums, 0, false);
- if (ret)
+ csum_len - 1, &ordered_sums, false);
+ if (ret < 0)
return ret;
+ ret = 0;
while (!list_empty(&ordered_sums)) {
struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
@@ -4704,7 +4788,8 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
*/
static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
- struct btrfs_path *path)
+ struct btrfs_path *path,
+ struct btrfs_log_ctx *ctx)
{
struct btrfs_root *root = inode->root;
struct btrfs_key key;
@@ -4770,7 +4855,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
if (slot >= btrfs_header_nritems(leaf)) {
if (ins_nr > 0) {
ret = copy_items(trans, inode, dst_path, path,
- start_slot, ins_nr, 1, 0);
+ start_slot, ins_nr, 1, 0, ctx);
if (ret < 0)
goto out;
ins_nr = 0;
@@ -4794,18 +4879,23 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
path->slots[0]++;
continue;
}
- if (!dropped_extents) {
- /*
- * Avoid logging extent items logged in past fsync calls
- * and leading to duplicate keys in the log tree.
- */
+ /*
+ * Avoid overlapping items in the log tree. The first time we
+ * get here, get rid of everything from a past fsync. After
+ * that, if the current extent starts before the end of the last
+ * extent we copied, truncate the last one. This can happen if
+ * an ordered extent completion modifies the subvolume tree
+ * while btrfs_next_leaf() has the tree unlocked.
+ */
+ if (!dropped_extents || key.offset < truncate_offset) {
ret = truncate_inode_items(trans, root->log_root, inode,
- truncate_offset,
+ min(key.offset, truncate_offset),
BTRFS_EXTENT_DATA_KEY);
if (ret)
goto out;
dropped_extents = true;
}
+ truncate_offset = btrfs_file_extent_end(path);
if (ins_nr == 0)
start_slot = slot;
ins_nr++;
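An illustrative scenario for the comment above (numbers made up): if a previous iteration copied a prealloc extent covering [4M, 8M), truncate_offset is now 8M; should an ordered extent completion insert a new extent item starting at 6M while btrfs_next_leaf() had the tree unlocked, the next iteration sees key.offset (6M) below truncate_offset (8M) and truncates the log tree back to 6M before copying, so the two items never overlap in the log.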
@@ -4820,7 +4910,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
}
if (ins_nr > 0)
ret = copy_items(trans, inode, dst_path, path,
- start_slot, ins_nr, 1, 0);
+ start_slot, ins_nr, 1, 0, ctx);
out:
btrfs_release_path(path);
btrfs_free_path(dst_path);
@@ -4883,7 +4973,7 @@ process:
* private list.
*/
if (ret) {
- clear_em_logging(tree, em);
+ clear_em_logging(inode, em);
free_extent_map(em);
continue;
}
@@ -4892,14 +4982,14 @@ process:
ret = log_one_extent(trans, inode, em, path, ctx);
write_lock(&tree->lock);
- clear_em_logging(tree, em);
+ clear_em_logging(inode, em);
free_extent_map(em);
}
WARN_ON(!list_empty(&extents));
write_unlock(&tree->lock);
if (!ret)
- ret = btrfs_log_prealloc_extents(trans, inode, path);
+ ret = btrfs_log_prealloc_extents(trans, inode, path, ctx);
if (ret)
return ret;
@@ -4980,7 +5070,8 @@ static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_path *path,
- struct btrfs_path *dst_path)
+ struct btrfs_path *dst_path,
+ struct btrfs_log_ctx *ctx)
{
struct btrfs_root *root = inode->root;
int ret;
@@ -5009,7 +5100,7 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
if (slot >= nritems) {
if (ins_nr > 0) {
ret = copy_items(trans, inode, dst_path, path,
- start_slot, ins_nr, 1, 0);
+ start_slot, ins_nr, 1, 0, ctx);
if (ret < 0)
return ret;
ins_nr = 0;
@@ -5035,7 +5126,7 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
}
if (ins_nr > 0) {
ret = copy_items(trans, inode, dst_path, path,
- start_slot, ins_nr, 1, 0);
+ start_slot, ins_nr, 1, 0, ctx);
if (ret < 0)
return ret;
}
@@ -5366,7 +5457,6 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx)
{
struct btrfs_root *root = start_inode->root;
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
LIST_HEAD(dir_list);
struct btrfs_dir_list *dir_elem;
@@ -5427,7 +5517,7 @@ again:
continue;
btrfs_release_path(path);
- di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
+ di_inode = btrfs_iget_logging(di_key.objectid, root);
if (IS_ERR(di_inode)) {
ret = PTR_ERR(di_inode);
goto out;
@@ -5487,7 +5577,7 @@ again:
btrfs_add_delayed_iput(curr_inode);
curr_inode = NULL;
- vfs_inode = btrfs_iget(fs_info->sb, ino, root);
+ vfs_inode = btrfs_iget_logging(ino, root);
if (IS_ERR(vfs_inode)) {
ret = PTR_ERR(vfs_inode);
break;
@@ -5582,7 +5672,7 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans,
if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES)
return BTRFS_LOG_FORCE_COMMIT;
- inode = btrfs_iget(root->fs_info->sb, ino, root);
+ inode = btrfs_iget_logging(ino, root);
/*
* If the other inode that had a conflicting dir entry was deleted in
* the current transaction then we either:
@@ -5683,7 +5773,6 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_log_ctx *ctx)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
/*
@@ -5714,7 +5803,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
list_del(&curr->list);
kfree(curr);
- inode = btrfs_iget(fs_info->sb, ino, root);
+ inode = btrfs_iget_logging(ino, root);
/*
* If the other inode that had a conflicting dir entry was
* deleted in the current transaction, we need to log its parent
@@ -5725,7 +5814,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
if (ret != -ENOENT)
break;
- inode = btrfs_iget(fs_info->sb, parent, root);
+ inode = btrfs_iget_logging(parent, root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
break;
@@ -5847,7 +5936,7 @@ again:
}
ret = copy_items(trans, inode, dst_path, path,
ins_start_slot, ins_nr,
- inode_only, logged_isize);
+ inode_only, logged_isize, ctx);
if (ret < 0)
return ret;
ins_nr = 0;
@@ -5866,7 +5955,7 @@ again:
goto next_slot;
ret = copy_items(trans, inode, dst_path, path,
ins_start_slot,
- ins_nr, inode_only, logged_isize);
+ ins_nr, inode_only, logged_isize, ctx);
if (ret < 0)
return ret;
ins_nr = 0;
@@ -5883,7 +5972,7 @@ again:
}
ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
- ins_nr, inode_only, logged_isize);
+ ins_nr, inode_only, logged_isize, ctx);
if (ret < 0)
return ret;
ins_nr = 1;
@@ -5898,7 +5987,7 @@ next_slot:
if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path,
ins_start_slot, ins_nr, inode_only,
- logged_isize);
+ logged_isize, ctx);
if (ret < 0)
return ret;
ins_nr = 0;
@@ -5923,7 +6012,7 @@ next_key:
}
if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
- ins_nr, inode_only, logged_isize);
+ ins_nr, inode_only, logged_isize, ctx);
if (ret)
return ret;
}
@@ -5934,7 +6023,7 @@ next_key:
* lock the same leaf with btrfs_log_prealloc_extents() below.
*/
btrfs_release_path(path);
- ret = btrfs_log_prealloc_extents(trans, inode, dst_path);
+ ret = btrfs_log_prealloc_extents(trans, inode, dst_path, ctx);
}
return ret;
@@ -6247,7 +6336,6 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx)
{
const bool orig_log_new_dentries = ctx->log_new_dentries;
- struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_item *item;
int ret = 0;
@@ -6273,7 +6361,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
if (key.type == BTRFS_ROOT_ITEM_KEY)
continue;
- di_inode = btrfs_iget(fs_info->sb, key.objectid, inode->root);
+ di_inode = btrfs_iget_logging(key.objectid, inode->root);
if (IS_ERR(di_inode)) {
ret = PTR_ERR(di_inode);
break;
@@ -6526,7 +6614,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
btrfs_release_path(dst_path);
- ret = btrfs_log_all_xattrs(trans, inode, path, dst_path);
+ ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx);
if (ret)
goto out_unlock;
xattrs_logged = true;
@@ -6553,7 +6641,7 @@ log_extents:
* BTRFS_INODE_COPY_EVERYTHING set.
*/
if (!xattrs_logged && inode->logged_trans < trans->transid) {
- ret = btrfs_log_all_xattrs(trans, inode, path, dst_path);
+ ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx);
if (ret)
goto out_unlock;
btrfs_release_path(path);
@@ -6657,7 +6745,6 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_log_ctx *ctx)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
struct btrfs_path *path;
struct btrfs_key key;
@@ -6722,8 +6809,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
cur_offset = item_size;
}
- dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid,
- root);
+ dir_inode = btrfs_iget_logging(inode_key.objectid, root);
/*
* If the parent inode was deleted, return an error to
* fallback to a transaction commit. This is to prevent
@@ -6785,7 +6871,6 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
while (true) {
- struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf;
int slot;
struct btrfs_key search_key;
@@ -6800,7 +6885,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
search_key.objectid = found_key.offset;
search_key.type = BTRFS_INODE_ITEM_KEY;
search_key.offset = 0;
- inode = btrfs_iget(fs_info->sb, ino, root);
+ inode = btrfs_iget_logging(ino, root);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -7502,6 +7587,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
ctx.logging_new_name = true;
+ btrfs_init_log_ctx_scratch_eb(&ctx);
/*
* We don't care about the return value. If we fail to log the new name
* then we know the next attempt to sync the log will fallback to a full
@@ -7510,6 +7596,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
* inconsistent state after a rename operation.
*/
btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
+ free_extent_buffer(ctx.scratch_eb);
ASSERT(list_empty(&ctx.conflict_inodes));
out:
/*
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index a550a8a375..22e9cbc815 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -6,10 +6,18 @@
#ifndef BTRFS_TREE_LOG_H
#define BTRFS_TREE_LOG_H
+#include <linux/list.h>
+#include <linux/fs.h>
#include "messages.h"
#include "ctree.h"
#include "transaction.h"
+struct inode;
+struct dentry;
+struct btrfs_ordered_extent;
+struct btrfs_root;
+struct btrfs_trans_handle;
+
/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
#define BTRFS_NO_LOG_SYNC 256
@@ -36,37 +44,20 @@ struct btrfs_log_ctx {
struct list_head conflict_inodes;
int num_conflict_inodes;
bool logging_conflict_inodes;
+ /*
+ * Used for fsyncs that need to copy items from the subvolume tree to
+ * the log tree (full sync flag set or copy everything flag set) to
+ * avoid allocating a temporary extent buffer while holding a lock on
+ * an extent buffer of the subvolume tree and under the log transaction.
+ * Also helps to avoid allocating and freeing a temporary extent buffer
+ * in case we need to process multiple leaves from the subvolume tree.
+ */
+ struct extent_buffer *scratch_eb;
};
-static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
- struct inode *inode)
-{
- ctx->log_ret = 0;
- ctx->log_transid = 0;
- ctx->log_new_dentries = false;
- ctx->logging_new_name = false;
- ctx->logging_new_delayed_dentries = false;
- ctx->logged_before = false;
- ctx->inode = inode;
- INIT_LIST_HEAD(&ctx->list);
- INIT_LIST_HEAD(&ctx->ordered_extents);
- INIT_LIST_HEAD(&ctx->conflict_inodes);
- ctx->num_conflict_inodes = 0;
- ctx->logging_conflict_inodes = false;
-}
-
-static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
-{
- struct btrfs_ordered_extent *ordered;
- struct btrfs_ordered_extent *tmp;
-
- ASSERT(inode_is_locked(ctx->inode));
-
- list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
- list_del_init(&ordered->log_list);
- btrfs_put_ordered_extent(ordered);
- }
-}
+void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct inode *inode);
+void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx);
+void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx);
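Putting the three helpers together with the new scratch_eb member, the caller-side lifecycle (modelled on the btrfs_log_new_name() hunk earlier in this patch, error handling elided) is roughly:

	struct btrfs_log_ctx ctx;

	btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
	btrfs_init_log_ctx_scratch_eb(&ctx);	/* best effort; scratch_eb may stay NULL */

	btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);

	free_extent_buffer(ctx.scratch_eb);	/* tolerates NULL */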
static inline void btrfs_set_log_full_commit(struct btrfs_trans_handle *trans)
{
diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c
index 3df6153d5d..fa45b5fb96 100644
--- a/fs/btrfs/tree-mod-log.c
+++ b/fs/btrfs/tree-mod-log.c
@@ -44,7 +44,7 @@ struct tree_mod_elem {
/*
* Pull a new tree mod seq number for our operation.
*/
-static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
+static u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
return atomic64_inc_return(&fs_info->tree_mod_seq);
}
@@ -170,8 +170,7 @@ static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
* this until all tree mod log insertions are recorded in the rb tree and then
* write unlock fs_info::tree_mod_log_lock.
*/
-static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb)
+static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
{
if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
return true;
@@ -188,7 +187,7 @@ static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info,
}
/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
-static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
+static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
struct extent_buffer *eb)
{
if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
@@ -367,9 +366,9 @@ free_tms:
return ret;
}
-static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
- struct tree_mod_elem **tm_list,
- int nritems)
+static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
+ struct tree_mod_elem **tm_list,
+ int nritems)
{
int i, j;
int ret;
@@ -1005,7 +1004,7 @@ struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
free_extent_buffer(eb_root);
check.level = level;
- check.owner_root = root->root_key.objectid;
+ check.owner_root = btrfs_root_id(root);
old = read_tree_block(fs_info, logical, &check);
if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h
index 94f10afeee..ff00c8e8a3 100644
--- a/fs/btrfs/tree-mod-log.h
+++ b/fs/btrfs/tree-mod-log.h
@@ -3,7 +3,13 @@
#ifndef BTRFS_TREE_MOD_LOG_H
#define BTRFS_TREE_MOD_LOG_H
-#include "ctree.h"
+#include <linux/list.h>
+
+struct extent_buffer;
+struct btrfs_fs_info;
+struct btrfs_path;
+struct btrfs_root;
+struct btrfs_seq_list;
/* Represents a tree mod log user. */
struct btrfs_seq_list {
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index b4ac2b0cd2..183863f4bf 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -7,7 +7,6 @@
#include <linux/slab.h>
#include "messages.h"
#include "ulist.h"
-#include "ctree.h"
/*
* ulist is a generic data structure to hold a collection of unique u64
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index b2cef187ea..8e200fe1a2 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -7,6 +7,7 @@
#ifndef BTRFS_ULIST_H
#define BTRFS_ULIST_H
+#include <linux/types.h>
#include <linux/list.h>
#include <linux/rbtree.h>
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index 5be74f9e47..b0aff297d6 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -9,7 +9,6 @@
#include "ctree.h"
#include "transaction.h"
#include "disk-io.h"
-#include "print-tree.h"
#include "fs.h"
#include "accessors.h"
#include "uuid-tree.h"
@@ -114,7 +113,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
ret = btrfs_insert_empty_item(trans, uuid_root, path, &key,
sizeof(subid_le));
- if (ret >= 0) {
+ if (ret == 0) {
/* Add an item for the type for the first time */
eb = path->nodes[0];
slot = path->slots[0];
diff --git a/fs/btrfs/uuid-tree.h b/fs/btrfs/uuid-tree.h
index 5350c87fe2..080ede0227 100644
--- a/fs/btrfs/uuid-tree.h
+++ b/fs/btrfs/uuid-tree.h
@@ -3,6 +3,11 @@
#ifndef BTRFS_UUID_TREE_H
#define BTRFS_UUID_TREE_H
+#include <linux/types.h>
+
+struct btrfs_trans_handle;
+struct btrfs_fs_info;
+
int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
u64 subid);
int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index 66e2270b0d..4042dd6437 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -14,7 +14,6 @@
#include "ctree.h"
#include "btrfs_inode.h"
#include "transaction.h"
-#include "disk-io.h"
#include "locking.h"
#include "fs.h"
#include "accessors.h"
diff --git a/fs/btrfs/verity.h b/fs/btrfs/verity.h
index 91c10f7d0a..d696659e43 100644
--- a/fs/btrfs/verity.h
+++ b/fs/btrfs/verity.h
@@ -3,8 +3,13 @@
#ifndef BTRFS_VERITY_H
#define BTRFS_VERITY_H
+struct inode;
+struct btrfs_inode;
+
#ifdef CONFIG_FS_VERITY
+#include <linux/fsverity.h>
+
extern const struct fsverity_operations btrfs_verityops;
int btrfs_drop_verity_items(struct btrfs_inode *inode);
@@ -12,6 +17,8 @@ int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size)
#else
+#include <linux/errno.h>
+
static inline int btrfs_drop_verity_items(struct btrfs_inode *inode)
{
return 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3ebadc5ec8..c39145e8c4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -14,10 +14,8 @@
#include <linux/namei.h>
#include "misc.h"
#include "ctree.h"
-#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
-#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "rcu-string.h"
@@ -468,39 +466,42 @@ static noinline struct btrfs_fs_devices *find_fsid(
static int
btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
- int flush, struct bdev_handle **bdev_handle,
+ int flush, struct file **bdev_file,
struct btrfs_super_block **disk_super)
{
struct block_device *bdev;
int ret;
- *bdev_handle = bdev_open_by_path(device_path, flags, holder, NULL);
+ *bdev_file = bdev_file_open_by_path(device_path, flags, holder, NULL);
- if (IS_ERR(*bdev_handle)) {
- ret = PTR_ERR(*bdev_handle);
+ if (IS_ERR(*bdev_file)) {
+ ret = PTR_ERR(*bdev_file);
goto error;
}
- bdev = (*bdev_handle)->bdev;
+ bdev = file_bdev(*bdev_file);
if (flush)
sync_blockdev(bdev);
- ret = set_blocksize(bdev, BTRFS_BDEV_BLOCKSIZE);
- if (ret) {
- bdev_release(*bdev_handle);
- goto error;
+ if (holder) {
+ ret = set_blocksize(*bdev_file, BTRFS_BDEV_BLOCKSIZE);
+ if (ret) {
+ fput(*bdev_file);
+ goto error;
+ }
}
invalidate_bdev(bdev);
*disk_super = btrfs_read_dev_super(bdev);
if (IS_ERR(*disk_super)) {
ret = PTR_ERR(*disk_super);
- bdev_release(*bdev_handle);
+ fput(*bdev_file);
goto error;
}
return 0;
error:
- *bdev_handle = NULL;
+ *disk_super = NULL;
+ *bdev_file = NULL;
return ret;
}
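The conversion from struct bdev_handle to a plain struct file repeats throughout volumes.c below; the open/use/close pattern, in sketch form, is:

	struct file *bdev_file;
	struct block_device *bdev;

	bdev_file = bdev_file_open_by_path(path, flags, holder, NULL);
	if (IS_ERR(bdev_file))
		return PTR_ERR(bdev_file);
	bdev = file_bdev(bdev_file);

	/* ... read the super block, etc. ... */

	fput(bdev_file);	/* replaces bdev_release() */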
@@ -643,7 +644,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *device, blk_mode_t flags,
void *holder)
{
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
struct btrfs_super_block *disk_super;
u64 devid;
int ret;
@@ -654,7 +655,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
return -EINVAL;
ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
- &bdev_handle, &disk_super);
+ &bdev_file, &disk_super);
if (ret)
return ret;
@@ -678,20 +679,20 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
fs_devices->seeding = true;
} else {
- if (bdev_read_only(bdev_handle->bdev))
+ if (bdev_read_only(file_bdev(bdev_file)))
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
else
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
}
- if (!bdev_nonrot(bdev_handle->bdev))
+ if (!bdev_nonrot(file_bdev(bdev_file)))
fs_devices->rotating = true;
- if (bdev_max_discard_sectors(bdev_handle->bdev))
+ if (bdev_max_discard_sectors(file_bdev(bdev_file)))
fs_devices->discardable = true;
- device->bdev_handle = bdev_handle;
- device->bdev = bdev_handle->bdev;
+ device->bdev_file = bdev_file;
+ device->bdev = file_bdev(bdev_file);
clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
if (device->devt != device->bdev->bd_dev) {
@@ -716,7 +717,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
error_free_page:
btrfs_release_disk_super(disk_super);
- bdev_release(bdev_handle);
+ fput(bdev_file);
return -EINVAL;
}
@@ -779,8 +780,9 @@ static noinline struct btrfs_device *device_list_add(const char *path,
if (same_fsid_diff_dev) {
generate_random_uuid(fs_devices->fsid);
fs_devices->temp_fsid = true;
- pr_info("BTRFS: device %s using temp-fsid %pU\n",
- path, fs_devices->fsid);
+ pr_info("BTRFS: device %s (%d:%d) using temp-fsid %pU\n",
+ path, MAJOR(path_devt), MINOR(path_devt),
+ fs_devices->fsid);
}
mutex_lock(&fs_devices->device_list_mutex);
@@ -809,8 +811,9 @@ static noinline struct btrfs_device *device_list_add(const char *path,
if (fs_devices->opened) {
btrfs_err(NULL,
-"device %s belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)",
- path, fs_devices->fsid, current->comm,
+"device %s (%d:%d) belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)",
+ path, MAJOR(path_devt), MINOR(path_devt),
+ fs_devices->fsid, current->comm,
task_pid_nr(current));
mutex_unlock(&fs_devices->device_list_mutex);
return ERR_PTR(-EBUSY);
@@ -836,13 +839,15 @@ static noinline struct btrfs_device *device_list_add(const char *path,
if (disk_super->label[0])
pr_info(
- "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
+"BTRFS: device label %s devid %llu transid %llu %s (%d:%d) scanned by %s (%d)\n",
disk_super->label, devid, found_transid, path,
+ MAJOR(path_devt), MINOR(path_devt),
current->comm, task_pid_nr(current));
else
pr_info(
- "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
+"BTRFS: device fsid %pU devid %llu transid %llu %s (%d:%d) scanned by %s (%d)\n",
disk_super->fsid, devid, found_transid, path,
+ MAJOR(path_devt), MINOR(path_devt),
current->comm, task_pid_nr(current));
} else if (!device->name || strcmp(device->name->str, path)) {
@@ -1025,10 +1030,10 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
if (device->devid == BTRFS_DEV_REPLACE_DEVID)
continue;
- if (device->bdev_handle) {
- bdev_release(device->bdev_handle);
+ if (device->bdev_file) {
+ fput(device->bdev_file);
device->bdev = NULL;
- device->bdev_handle = NULL;
+ device->bdev_file = NULL;
fs_devices->open_devices--;
}
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
@@ -1073,7 +1078,7 @@ static void btrfs_close_bdev(struct btrfs_device *device)
invalidate_bdev(device->bdev);
}
- bdev_release(device->bdev_handle);
+ fput(device->bdev_file);
}
static void btrfs_close_one_device(struct btrfs_device *device)
@@ -1285,7 +1290,7 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev
return ERR_PTR(-EINVAL);
/* pull in the page with our super */
- page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);
+ page = read_cache_page_gfp(bdev->bd_mapping, index, GFP_KERNEL);
if (IS_ERR(page))
return ERR_CAST(page);
@@ -1374,8 +1379,9 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
struct btrfs_super_block *disk_super;
bool new_device_added = false;
struct btrfs_device *device = NULL;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
u64 bytenr, bytenr_orig;
+ dev_t devt;
int ret;
lockdep_assert_held(&uuid_mutex);
@@ -1397,31 +1403,30 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
* values temporarily, as the device paths of the fsid are the only
* required information for assembling the volume.
*/
- bdev_handle = bdev_open_by_path(path, flags, NULL, NULL);
- if (IS_ERR(bdev_handle))
- return ERR_CAST(bdev_handle);
+ bdev_file = bdev_file_open_by_path(path, flags, NULL, NULL);
+ if (IS_ERR(bdev_file))
+ return ERR_CAST(bdev_file);
bytenr_orig = btrfs_sb_offset(0);
- ret = btrfs_sb_log_location_bdev(bdev_handle->bdev, 0, READ, &bytenr);
+ ret = btrfs_sb_log_location_bdev(file_bdev(bdev_file), 0, READ, &bytenr);
if (ret) {
device = ERR_PTR(ret);
goto error_bdev_put;
}
- disk_super = btrfs_read_disk_super(bdev_handle->bdev, bytenr,
+ disk_super = btrfs_read_disk_super(file_bdev(bdev_file), bytenr,
bytenr_orig);
if (IS_ERR(disk_super)) {
device = ERR_CAST(disk_super);
goto error_bdev_put;
}
- if (btrfs_skip_registration(disk_super, path, bdev_handle->bdev->bd_dev,
- mount_arg_dev)) {
+ devt = file_bdev(bdev_file)->bd_dev;
+ if (btrfs_skip_registration(disk_super, path, devt, mount_arg_dev)) {
pr_debug("BTRFS: skip registering single non-seed device %s (%d:%d)\n",
- path, MAJOR(bdev_handle->bdev->bd_dev),
- MINOR(bdev_handle->bdev->bd_dev));
+ path, MAJOR(devt), MINOR(devt));
- btrfs_free_stale_devices(bdev_handle->bdev->bd_dev, NULL);
+ btrfs_free_stale_devices(devt, NULL);
device = NULL;
goto free_disk_super;
@@ -1435,7 +1440,7 @@ free_disk_super:
btrfs_release_disk_super(disk_super);
error_bdev_put:
- bdev_release(bdev_handle);
+ fput(bdev_file);
return device;
}
@@ -2086,11 +2091,10 @@ static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info,
copy_num, ret);
}
-void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
- struct block_device *bdev,
- const char *device_path)
+void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_device *device)
{
int copy_num;
+ struct block_device *bdev = device->bdev;
if (!bdev)
return;
@@ -2106,12 +2110,12 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
/* Update ctime/mtime for device path for libblkid */
- update_dev_time(device_path);
+ update_dev_time(device->name->str);
}
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
struct btrfs_dev_lookup_args *args,
- struct bdev_handle **bdev_handle)
+ struct file **bdev_file)
{
struct btrfs_trans_handle *trans;
struct btrfs_device *device;
@@ -2220,7 +2224,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
btrfs_assign_next_active_device(device, NULL);
- if (device->bdev_handle) {
+ if (device->bdev_file) {
cur_devices->open_devices--;
/* remove sysfs entry */
btrfs_sysfs_remove_device(device);
@@ -2236,20 +2240,19 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
* free the device.
*
* We cannot call btrfs_close_bdev() here because we're holding the sb
- * write lock, and bdev_release() will pull in the ->open_mutex on
- * the block device and it's dependencies. Instead just flush the
- * device and let the caller do the final bdev_release.
+ * write lock, and fput() on the block device will pull in the
+ * ->open_mutex on the block device and its dependencies. Instead
+ * just flush the device and let the caller do the final fput().
*/
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
- btrfs_scratch_superblocks(fs_info, device->bdev,
- device->name->str);
+ btrfs_scratch_superblocks(fs_info, device);
if (device->bdev) {
sync_blockdev(device->bdev);
invalidate_bdev(device->bdev);
}
}
- *bdev_handle = device->bdev_handle;
+ *bdev_file = device->bdev_file;
synchronize_rcu();
btrfs_free_device(device);
@@ -2355,8 +2358,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
mutex_unlock(&fs_devices->device_list_mutex);
- btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
- tgtdev->name->str);
+ btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev);
btrfs_close_bdev(tgtdev);
synchronize_rcu();
@@ -2386,7 +2388,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
const char *path)
{
struct btrfs_super_block *disk_super;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
int ret;
if (!path || !path[0])
@@ -2404,7 +2406,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
}
ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0,
- &bdev_handle, &disk_super);
+ &bdev_file, &disk_super);
if (ret) {
btrfs_put_dev_args_from_path(args);
return ret;
@@ -2417,7 +2419,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
else
memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
btrfs_release_disk_super(disk_super);
- bdev_release(bdev_handle);
+ fput(bdev_file);
return 0;
}
@@ -2637,7 +2639,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_trans_handle *trans;
struct btrfs_device *device;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
struct super_block *sb = fs_info->sb;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_fs_devices *seed_devices = NULL;
@@ -2650,12 +2652,12 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
if (sb_rdonly(sb) && !fs_devices->seeding)
return -EROFS;
- bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE,
+ bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE,
fs_info->bdev_holder, NULL);
- if (IS_ERR(bdev_handle))
- return PTR_ERR(bdev_handle);
+ if (IS_ERR(bdev_file))
+ return PTR_ERR(bdev_file);
- if (!btrfs_check_device_zone_type(fs_info, bdev_handle->bdev)) {
+ if (!btrfs_check_device_zone_type(fs_info, file_bdev(bdev_file))) {
ret = -EINVAL;
goto error;
}
@@ -2667,11 +2669,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
locked = true;
}
- sync_blockdev(bdev_handle->bdev);
+ sync_blockdev(file_bdev(bdev_file));
rcu_read_lock();
list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
- if (device->bdev == bdev_handle->bdev) {
+ if (device->bdev == file_bdev(bdev_file)) {
ret = -EEXIST;
rcu_read_unlock();
goto error;
@@ -2687,8 +2689,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
}
device->fs_info = fs_info;
- device->bdev_handle = bdev_handle;
- device->bdev = bdev_handle->bdev;
+ device->bdev_file = bdev_file;
+ device->bdev = file_bdev(bdev_file);
ret = lookup_bdev(device_path, &device->devt);
if (ret)
goto error_free_device;
@@ -2715,7 +2717,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
device->dev_stats_valid = 1;
- set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
+ set_blocksize(device->bdev_file, BTRFS_BDEV_BLOCKSIZE);
if (seeding_dev) {
btrfs_clear_sb_rdonly(sb);
@@ -2871,7 +2873,7 @@ error_free_zone:
error_free_device:
btrfs_free_device(device);
error:
- bdev_release(bdev_handle);
+ fput(bdev_file);
if (locked) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
@@ -3545,6 +3547,44 @@ static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
return 0;
}
+static void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
+ const struct btrfs_disk_balance_args *disk)
+{
+ memset(cpu, 0, sizeof(*cpu));
+
+ cpu->profiles = le64_to_cpu(disk->profiles);
+ cpu->usage = le64_to_cpu(disk->usage);
+ cpu->devid = le64_to_cpu(disk->devid);
+ cpu->pstart = le64_to_cpu(disk->pstart);
+ cpu->pend = le64_to_cpu(disk->pend);
+ cpu->vstart = le64_to_cpu(disk->vstart);
+ cpu->vend = le64_to_cpu(disk->vend);
+ cpu->target = le64_to_cpu(disk->target);
+ cpu->flags = le64_to_cpu(disk->flags);
+ cpu->limit = le64_to_cpu(disk->limit);
+ cpu->stripes_min = le32_to_cpu(disk->stripes_min);
+ cpu->stripes_max = le32_to_cpu(disk->stripes_max);
+}
+
+static void btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
+ const struct btrfs_balance_args *cpu)
+{
+ memset(disk, 0, sizeof(*disk));
+
+ disk->profiles = cpu_to_le64(cpu->profiles);
+ disk->usage = cpu_to_le64(cpu->usage);
+ disk->devid = cpu_to_le64(cpu->devid);
+ disk->pstart = cpu_to_le64(cpu->pstart);
+ disk->pend = cpu_to_le64(cpu->pend);
+ disk->vstart = cpu_to_le64(cpu->vstart);
+ disk->vend = cpu_to_le64(cpu->vend);
+ disk->target = cpu_to_le64(cpu->target);
+ disk->flags = cpu_to_le64(cpu->flags);
+ disk->limit = cpu_to_le64(cpu->limit);
+ disk->stripes_min = cpu_to_le32(cpu->stripes_min);
+ disk->stripes_max = cpu_to_le32(cpu->stripes_max);
+}
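These two helpers are plain little-endian/CPU converters for the on-disk balance item; assuming insert_balance_item() below and its read-side counterpart use them symmetrically, the round trip looks like:

	struct btrfs_disk_balance_args disk;
	struct btrfs_balance_args args;

	btrfs_cpu_balance_args_to_disk(&disk, &bctl->data);	/* before writing the item */
	/* ... balance item written to disk and later read back ... */
	btrfs_disk_balance_args_to_cpu(&args, &disk);		/* after reading it back */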
+
static int insert_balance_item(struct btrfs_fs_info *fs_info,
struct btrfs_balance_control *bctl)
{
@@ -3689,7 +3729,7 @@ static void reset_balance_state(struct btrfs_fs_info *fs_info)
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
int ret;
- BUG_ON(!fs_info->balance_ctl);
+ ASSERT(fs_info->balance_ctl);
spin_lock(&fs_info->balance_lock);
fs_info->balance_ctl = NULL;
@@ -5578,21 +5618,6 @@ struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp)
return map;
}
-struct btrfs_chunk_map *btrfs_clone_chunk_map(struct btrfs_chunk_map *map, gfp_t gfp)
-{
- const int size = btrfs_chunk_map_size(map->num_stripes);
- struct btrfs_chunk_map *clone;
-
- clone = kmemdup(map, size, gfp);
- if (!clone)
- return NULL;
-
- refcount_set(&clone->refs, 1);
- RB_CLEAR_NODE(&clone->rb_node);
-
- return clone;
-}
-
static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
struct alloc_chunk_ctl *ctl,
struct btrfs_device_info *devices_info)
@@ -6009,6 +6034,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
struct btrfs_chunk_map *map, int first,
int dev_replace_is_ongoing)
{
+ const enum btrfs_read_policy policy = READ_ONCE(fs_info->fs_devices->read_policy);
int i;
int num_stripes;
int preferred_mirror;
@@ -6023,13 +6049,12 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
else
num_stripes = map->num_stripes;
- switch (fs_info->fs_devices->read_policy) {
+ switch (policy) {
default:
/* Shouldn't happen, just warn and use pid instead of failing */
- btrfs_warn_rl(fs_info,
- "unknown read_policy type %u, reset to pid",
- fs_info->fs_devices->read_policy);
- fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
+ btrfs_warn_rl(fs_info, "unknown read_policy type %u, reset to pid",
+ policy);
+ WRITE_ONCE(fs_info->fs_devices->read_policy, BTRFS_READ_POLICY_PID);
fallthrough;
case BTRFS_READ_POLICY_PID:
preferred_mirror = first + (current->pid % num_stripes);
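Reading the policy once into a local means a concurrent update (for example through the read_policy sysfs attribute) cannot change the value between the switch and its use; the resulting shape of the function is roughly:

	const enum btrfs_read_policy policy = READ_ONCE(fs_info->fs_devices->read_policy);

	switch (policy) {
	case BTRFS_READ_POLICY_PID:
		preferred_mirror = first + (current->pid % num_stripes);
		break;
	default:
		/* Unknown value: warn (ratelimited), reset the stored policy, fall back to pid. */
		WRITE_ONCE(fs_info->fs_devices->read_policy, BTRFS_READ_POLICY_PID);
		preferred_mirror = first + (current->pid % num_stripes);
		break;
	}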
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 53f87f398d..66e6fc481e 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -6,13 +6,28 @@
#ifndef BTRFS_VOLUMES_H
#define BTRFS_VOLUMES_H
+#include <linux/blk_types.h>
+#include <linux/sizes.h>
+#include <linux/atomic.h>
#include <linux/sort.h>
-#include <linux/btrfs.h>
-#include "async-thread.h"
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/log2.h>
+#include <linux/kobject.h>
+#include <linux/refcount.h>
+#include <linux/completion.h>
+#include <linux/rbtree.h>
+#include <uapi/linux/btrfs.h>
#include "messages.h"
-#include "tree-checker.h"
#include "rcu-string.h"
+struct block_device;
+struct bdev_handle;
+struct btrfs_fs_info;
+struct btrfs_block_group;
+struct btrfs_trans_handle;
+struct btrfs_zoned_device_info;
+
#define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G)
extern struct mutex uuid_mutex;
@@ -77,7 +92,10 @@ enum btrfs_raid_types {
#define BTRFS_DEV_STATE_FLUSH_SENT (4)
#define BTRFS_DEV_STATE_NO_READA (5)
-struct btrfs_zoned_device_info;
+/* Special value encoding failure to write primary super block. */
+#define BTRFS_SUPER_PRIMARY_WRITE_ERROR (INT_MAX / 2)
+
+struct btrfs_fs_devices;
struct btrfs_device {
struct list_head dev_list; /* device_list_mutex */
@@ -90,7 +108,7 @@ struct btrfs_device {
u64 generation;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
struct block_device *bdev;
struct btrfs_zoned_device_info *zone_info;
@@ -127,6 +145,12 @@ struct btrfs_device {
/* type and info about this device */
u64 type;
+ /*
+ * Counter of super block write errors; values larger than
+ * BTRFS_SUPER_PRIMARY_WRITE_ERROR encode primary super block write failure.
+ */
+ atomic_t sb_write_errors;
+
/* minimal io size for this device */
u32 sector_size;
@@ -276,6 +300,25 @@ enum btrfs_read_policy {
BTRFS_NR_READ_POLICY,
};
+#ifdef CONFIG_BTRFS_DEBUG
+/*
+ * Checksum mode - offload it to workqueues or do it synchronously in
+ * btrfs_submit_chunk().
+ */
+enum btrfs_offload_csum_mode {
+ /*
+ * Automatically choose between offloading the checksum and doing it
+ * synchronously: do it synchronously if checksumming is fast, otherwise
+ * offload to workqueues.
+ */
+ BTRFS_OFFLOAD_CSUM_AUTO,
+ /* Always offload checksum to workqueues. */
+ BTRFS_OFFLOAD_CSUM_FORCE_ON,
+ /* Never offload checksum to workqueues. */
+ BTRFS_OFFLOAD_CSUM_FORCE_OFF,
+};
+#endif
+
struct btrfs_fs_devices {
u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
@@ -380,6 +423,11 @@ struct btrfs_fs_devices {
/* Policy used to read the mirrored stripes. */
enum btrfs_read_policy read_policy;
+
+#ifdef CONFIG_BTRFS_DEBUG
+ /* Checksum mode - offload it or do it synchronously. */
+ enum btrfs_offload_csum_mode offload_csum_mode;
+#endif
};
#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \
@@ -557,8 +605,6 @@ static inline void btrfs_free_chunk_map(struct btrfs_chunk_map *map)
}
}
-struct btrfs_balance_args;
-struct btrfs_balance_progress;
struct btrfs_balance_control {
struct btrfs_balance_args data;
struct btrfs_balance_args meta;
@@ -661,7 +707,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args);
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
struct btrfs_dev_lookup_args *args,
- struct bdev_handle **bdev_handle);
+ struct file **bdev_file);
void __exit btrfs_cleanup_fs_uuids(void);
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
@@ -706,7 +752,6 @@ struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp);
int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map);
#endif
-struct btrfs_chunk_map *btrfs_clone_chunk_map(struct btrfs_chunk_map *map, gfp_t gfp);
struct btrfs_chunk_map *btrfs_find_chunk_map(struct btrfs_fs_info *fs_info,
u64 logical, u64 length);
struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_info,
@@ -780,9 +825,7 @@ void btrfs_commit_device_sizes(struct btrfs_transaction *trans);
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void);
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
struct btrfs_device *failing_dev);
-void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
- struct block_device *bdev,
- const char *device_path);
+void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_device *device);
enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags);
int btrfs_bg_type_to_factor(u64 flags);
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 6287763fdc..15d0999e34 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -504,7 +504,7 @@ static int btrfs_initxattrs(struct inode *inode,
const struct xattr *xattr;
unsigned int nofs_flag;
char *name;
- int err = 0;
+ int ret = 0;
/*
* We're holding a transaction handle, so use a NOFS memory allocation
@@ -515,7 +515,7 @@ static int btrfs_initxattrs(struct inode *inode,
name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
strlen(xattr->name) + 1, GFP_KERNEL);
if (!name) {
- err = -ENOMEM;
+ ret = -ENOMEM;
break;
}
strcpy(name, XATTR_SECURITY_PREFIX);
@@ -524,14 +524,14 @@ static int btrfs_initxattrs(struct inode *inode,
if (strcmp(name, XATTR_NAME_CAPS) == 0)
clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
- err = btrfs_setxattr(trans, inode, name, xattr->value,
+ ret = btrfs_setxattr(trans, inode, name, xattr->value,
xattr->value_len, 0);
kfree(name);
- if (err < 0)
+ if (ret < 0)
break;
}
memalloc_nofs_restore(nofs_flag);
- return err;
+ return ret;
}
int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 118118ca3e..b9376ea258 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -6,7 +6,11 @@
#ifndef BTRFS_XATTR_H
#define BTRFS_XATTR_H
-#include <linux/xattr.h>
+struct dentry;
+struct inode;
+struct qstr;
+struct xattr_handler;
+struct btrfs_trans_handle;
extern const struct xattr_handler * const btrfs_xattr_handlers[];
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 8da66ea699..d9e5c88a0f 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -91,24 +91,24 @@ fail:
return ERR_PTR(-ENOMEM);
}
-int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
- unsigned long *total_in, unsigned long *total_out)
+int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct folio **folios, unsigned long *out_folios,
+ unsigned long *total_in, unsigned long *total_out)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret;
char *data_in = NULL;
- char *cpage_out;
- int nr_pages = 0;
- struct page *in_page = NULL;
- struct page *out_page = NULL;
+ char *cfolio_out;
+ int nr_folios = 0;
+ struct folio *in_folio = NULL;
+ struct folio *out_folio = NULL;
unsigned long bytes_left;
- unsigned int in_buf_pages;
+ unsigned int in_buf_folios;
unsigned long len = *total_out;
- unsigned long nr_dest_pages = *out_pages;
- const unsigned long max_out = nr_dest_pages * PAGE_SIZE;
+ unsigned long nr_dest_folios = *out_folios;
+ const unsigned long max_out = nr_dest_folios * PAGE_SIZE;
- *out_pages = 0;
+ *out_folios = 0;
*total_out = 0;
*total_in = 0;
@@ -121,18 +121,18 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
workspace->strm.total_in = 0;
workspace->strm.total_out = 0;
- out_page = btrfs_alloc_compr_page();
- if (out_page == NULL) {
+ out_folio = btrfs_alloc_compr_folio();
+ if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
}
- cpage_out = page_address(out_page);
- pages[0] = out_page;
- nr_pages = 1;
+ cfolio_out = folio_address(out_folio);
+ folios[0] = out_folio;
+ nr_folios = 1;
workspace->strm.next_in = workspace->buf;
workspace->strm.avail_in = 0;
- workspace->strm.next_out = cpage_out;
+ workspace->strm.next_out = cfolio_out;
workspace->strm.avail_out = PAGE_SIZE;
while (workspace->strm.total_in < len) {
@@ -142,19 +142,22 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
*/
if (workspace->strm.avail_in == 0) {
bytes_left = len - workspace->strm.total_in;
- in_buf_pages = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE),
- workspace->buf_size / PAGE_SIZE);
- if (in_buf_pages > 1) {
+ in_buf_folios = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE),
+ workspace->buf_size / PAGE_SIZE);
+ if (in_buf_folios > 1) {
int i;
- for (i = 0; i < in_buf_pages; i++) {
+ for (i = 0; i < in_buf_folios; i++) {
if (data_in) {
kunmap_local(data_in);
- put_page(in_page);
+ folio_put(in_folio);
+ data_in = NULL;
}
- in_page = find_get_page(mapping,
- start >> PAGE_SHIFT);
- data_in = kmap_local_page(in_page);
+ ret = btrfs_compress_filemap_get_folio(mapping,
+ start, &in_folio);
+ if (ret < 0)
+ goto out;
+ data_in = kmap_local_folio(in_folio, 0);
copy_page(workspace->buf + i * PAGE_SIZE,
data_in);
start += PAGE_SIZE;
@@ -163,11 +166,14 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
} else {
if (data_in) {
kunmap_local(data_in);
- put_page(in_page);
+ folio_put(in_folio);
+ data_in = NULL;
}
- in_page = find_get_page(mapping,
- start >> PAGE_SHIFT);
- data_in = kmap_local_page(in_page);
+ ret = btrfs_compress_filemap_get_folio(mapping,
+ start, &in_folio);
+ if (ret < 0)
+ goto out;
+ data_in = kmap_local_folio(in_folio, 0);
start += PAGE_SIZE;
workspace->strm.next_in = data_in;
}
@@ -196,20 +202,20 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
* the stream end if required
*/
if (workspace->strm.avail_out == 0) {
- if (nr_pages == nr_dest_pages) {
+ if (nr_folios == nr_dest_folios) {
ret = -E2BIG;
goto out;
}
- out_page = btrfs_alloc_compr_page();
- if (out_page == NULL) {
+ out_folio = btrfs_alloc_compr_folio();
+ if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
}
- cpage_out = page_address(out_page);
- pages[nr_pages] = out_page;
- nr_pages++;
+ cfolio_out = folio_address(out_folio);
+ folios[nr_folios] = out_folio;
+ nr_folios++;
workspace->strm.avail_out = PAGE_SIZE;
- workspace->strm.next_out = cpage_out;
+ workspace->strm.next_out = cfolio_out;
}
/* we're all done */
if (workspace->strm.total_in >= len)
@@ -231,21 +237,21 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -EIO;
goto out;
} else if (workspace->strm.avail_out == 0) {
- /* get another page for the stream end */
- if (nr_pages == nr_dest_pages) {
+ /* Get another folio for the stream end. */
+ if (nr_folios == nr_dest_folios) {
ret = -E2BIG;
goto out;
}
- out_page = btrfs_alloc_compr_page();
- if (out_page == NULL) {
+ out_folio = btrfs_alloc_compr_folio();
+ if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
}
- cpage_out = page_address(out_page);
- pages[nr_pages] = out_page;
- nr_pages++;
+ cfolio_out = folio_address(out_folio);
+ folios[nr_folios] = out_folio;
+ nr_folios++;
workspace->strm.avail_out = PAGE_SIZE;
- workspace->strm.next_out = cpage_out;
+ workspace->strm.next_out = cfolio_out;
}
}
zlib_deflateEnd(&workspace->strm);
@@ -259,10 +265,10 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
*total_out = workspace->strm.total_out;
*total_in = workspace->strm.total_in;
out:
- *out_pages = nr_pages;
+ *out_folios = nr_folios;
if (data_in) {
kunmap_local(data_in);
- put_page(in_page);
+ folio_put(in_folio);
}
return ret;
@@ -275,13 +281,13 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
int wbits = MAX_WBITS;
char *data_in;
size_t total_out = 0;
- unsigned long page_in_index = 0;
+ unsigned long folio_in_index = 0;
size_t srclen = cb->compressed_len;
- unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
+ unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
unsigned long buf_start;
- struct page **pages_in = cb->compressed_pages;
+ struct folio **folios_in = cb->compressed_folios;
- data_in = kmap_local_page(pages_in[page_in_index]);
+ data_in = kmap_local_folio(folios_in[folio_in_index], 0);
workspace->strm.next_in = data_in;
workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE);
workspace->strm.total_in = 0;
@@ -331,12 +337,12 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
if (workspace->strm.avail_in == 0) {
unsigned long tmp;
kunmap_local(data_in);
- page_in_index++;
- if (page_in_index >= total_pages_in) {
+ folio_in_index++;
+ if (folio_in_index >= total_folios_in) {
data_in = NULL;
break;
}
- data_in = kmap_local_page(pages_in[page_in_index]);
+ data_in = kmap_local_folio(folios_in[folio_in_index], 0);
workspace->strm.next_in = data_in;
tmp = srclen - workspace->strm.total_in;
workspace->strm.avail_in = min(tmp, PAGE_SIZE);
@@ -398,7 +404,7 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in,
out:
if (unlikely(to_copy != destlen)) {
- pr_warn_ratelimited("BTRFS: infalte failed, decompressed=%lu expected=%zu\n",
+ pr_warn_ratelimited("BTRFS: inflate failed, decompressed=%lu expected=%zu\n",
to_copy, destlen);
ret = -EIO;
} else {
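
The zlib conversion above swaps the page-based input/output handling for folios: find_get_page() + kmap_local_page() becomes btrfs_compress_filemap_get_folio() + kmap_local_folio(), and the previously mapped folio is released before the next one is grabbed (note the added data_in = NULL, which keeps the error path from double-unmapping). The zstd conversion further below follows the same pattern. A minimal sketch of that input-mapping step, assuming only the helpers visible in this patch and that btrfs_compress_filemap_get_folio() is declared in compression.h:

#include <linux/highmem.h>
#include <linux/pagemap.h>
#include "compression.h"	/* btrfs_compress_filemap_get_folio(), per this series */

/* Sketch only: the per-folio input mapping used by the converted compressors. */
static int map_next_input_folio(struct address_space *mapping, u64 start,
				struct folio **in_folio, char **data_in)
{
	int ret;

	/* Drop the previous mapping before grabbing the next folio. */
	if (*data_in) {
		kunmap_local(*data_in);
		folio_put(*in_folio);
		*data_in = NULL;
	}

	ret = btrfs_compress_filemap_get_folio(mapping, start, in_folio);
	if (ret < 0)
		return ret;

	*data_in = kmap_local_folio(*in_folio, 0);
	return 0;
}
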
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 9729fa29c7..947a87576f 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -12,10 +12,8 @@
#include "rcu-string.h"
#include "disk-io.h"
#include "block-group.h"
-#include "transaction.h"
#include "dev-replace.h"
#include "space-info.h"
-#include "super.h"
#include "fs.h"
#include "accessors.h"
#include "bio.h"
@@ -120,7 +118,7 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
return -ENOENT;
} else if (full[0] && full[1]) {
/* Compare two super blocks */
- struct address_space *mapping = bdev->bd_inode->i_mapping;
+ struct address_space *mapping = bdev->bd_mapping;
struct page *page[BTRFS_NR_SB_LOG_ZONES];
struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
int i;
@@ -824,11 +822,14 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
reset = &zones[1];
if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
+ unsigned int nofs_flags;
+
ASSERT(sb_zone_is_full(reset));
+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
- reset->start, reset->len,
- GFP_NOFS);
+ reset->start, reset->len);
+ memalloc_nofs_restore(nofs_flags);
if (ret)
return ret;
@@ -974,11 +975,14 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
* explicit ZONE_FINISH is not necessary.
*/
if (zone->wp != zone->start + zone->capacity) {
+ unsigned int nofs_flags;
int ret;
+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(device->bdev,
REQ_OP_ZONE_FINISH, zone->start,
- zone->len, GFP_NOFS);
+ zone->len);
+ memalloc_nofs_restore(nofs_flags);
if (ret)
return ret;
}
@@ -996,11 +1000,13 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
{
+ unsigned int nofs_flags;
sector_t zone_sectors;
sector_t nr_sectors;
u8 zone_sectors_shift;
u32 sb_zone;
u32 nr_zones;
+ int ret;
zone_sectors = bdev_zone_sectors(bdev);
zone_sectors_shift = ilog2(zone_sectors);
@@ -1011,9 +1017,12 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
if (sb_zone + 1 >= nr_zones)
return -ENOENT;
- return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
- zone_start_sector(sb_zone, bdev),
- zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
+ nofs_flags = memalloc_nofs_save();
+ ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
+ zone_start_sector(sb_zone, bdev),
+ zone_sectors * BTRFS_NR_SB_LOG_ZONES);
+ memalloc_nofs_restore(nofs_flags);
+ return ret;
}
/*
@@ -1124,12 +1133,14 @@ static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
u64 length, u64 *bytes)
{
+ unsigned int nofs_flags;
int ret;
*bytes = 0;
+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
- physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
- GFP_NOFS);
+ physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT);
+ memalloc_nofs_restore(nofs_flags);
if (ret)
return ret;
@@ -1279,7 +1290,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
struct btrfs_chunk_map *map)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
- struct btrfs_device *device = map->stripes[zone_idx].dev;
+ struct btrfs_device *device;
int dev_replace_is_ongoing = 0;
unsigned int nofs_flag;
struct blk_zone zone;
@@ -1287,7 +1298,11 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
info->physical = map->stripes[zone_idx].physical;
+ down_read(&dev_replace->rwsem);
+ device = map->stripes[zone_idx].dev;
+
if (!device->bdev) {
+ up_read(&dev_replace->rwsem);
info->alloc_offset = WP_MISSING_DEV;
return 0;
}
@@ -1297,6 +1312,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
__set_bit(zone_idx, active);
if (!btrfs_dev_is_sequential(device, info->physical)) {
+ up_read(&dev_replace->rwsem);
info->alloc_offset = WP_CONVENTIONAL;
return 0;
}
@@ -1304,11 +1320,9 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
/* This zone will be used for allocation, so mark this zone non-empty. */
btrfs_dev_clear_zone_empty(device, info->physical);
- down_read(&dev_replace->rwsem);
dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
btrfs_dev_clear_zone_empty(dev_replace->tgtdev, info->physical);
- up_read(&dev_replace->rwsem);
/*
* The group is mapped to a sequential zone. Get the zone write pointer
@@ -1319,6 +1333,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
ret = btrfs_get_dev_zone(device, info->physical, &zone);
memalloc_nofs_restore(nofs_flag);
if (ret) {
+ up_read(&dev_replace->rwsem);
if (ret != -EIO && ret != -EOPNOTSUPP)
return ret;
info->alloc_offset = WP_MISSING_DEV;
@@ -1330,6 +1345,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
"zoned: unexpected conventional zone %llu on device %s (devid %llu)",
zone.start << SECTOR_SHIFT, rcu_str_deref(device->name),
device->devid);
+ up_read(&dev_replace->rwsem);
return -EIO;
}
@@ -1357,6 +1373,8 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
break;
}
+ up_read(&dev_replace->rwsem);
+
return 0;
}
@@ -2241,14 +2259,16 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
struct btrfs_device *device = map->stripes[i].dev;
const u64 physical = map->stripes[i].physical;
struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ unsigned int nofs_flags;
if (zinfo->max_active_zones == 0)
continue;
+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
physical >> SECTOR_SHIFT,
- zinfo->zone_size >> SECTOR_SHIFT,
- GFP_NOFS);
+ zinfo->zone_size >> SECTOR_SHIFT);
+ memalloc_nofs_restore(nofs_flags);
if (ret) {
up_read(&dev_replace->rwsem);
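
Every zoned.c hunk above applies the same conversion: blkdev_zone_mgmt() has dropped its gfp_t argument, so callers that used to pass GFP_NOFS now bracket the call with memalloc_nofs_save()/memalloc_nofs_restore() to keep reclaim from recursing into the filesystem. A minimal sketch of the wrapper shape (the helper name is illustrative, not part of the patch):

#include <linux/blkdev.h>
#include <linux/sched/mm.h>

/* Illustrative only: reset a zone while the task is in NOFS scope,
 * matching the converted callers above. */
static int reset_zone_nofs(struct block_device *bdev, sector_t start,
			   sector_t nr_sectors)
{
	unsigned int nofs_flags;
	int ret;

	nofs_flags = memalloc_nofs_save();
	ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, start, nr_sectors);
	memalloc_nofs_restore(nofs_flags);

	return ret;
}
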
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index f573bda496..77c4321e33 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -4,12 +4,27 @@
#define BTRFS_ZONED_H
#include <linux/types.h>
+#include <linux/atomic.h>
#include <linux/blkdev.h>
+#include <linux/blkzoned.h>
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
#include "messages.h"
#include "volumes.h"
#include "disk-io.h"
#include "block-group.h"
#include "btrfs_inode.h"
+#include "fs.h"
+
+struct block_device;
+struct extent_buffer;
+struct btrfs_bio;
+struct btrfs_ordered_extent;
+struct btrfs_fs_info;
+struct btrfs_space_info;
+struct btrfs_eb_write_context;
+struct btrfs_fs_devices;
#define BTRFS_DEFAULT_RECLAIM_THRESH (75)
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 0d66db8bc1..2b232b82c3 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -18,8 +18,9 @@
#include <linux/slab.h>
#include <linux/zstd.h>
#include "misc.h"
+#include "fs.h"
#include "compression.h"
-#include "ctree.h"
+#include "super.h"
#define ZSTD_BTRFS_MAX_WINDOWLOG 17
#define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG)
@@ -373,25 +374,25 @@ fail:
return ERR_PTR(-ENOMEM);
}
-int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
- unsigned long *total_in, unsigned long *total_out)
+int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct folio **folios, unsigned long *out_folios,
+ unsigned long *total_in, unsigned long *total_out)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
zstd_cstream *stream;
int ret = 0;
- int nr_pages = 0;
- struct page *in_page = NULL; /* The current page to read */
- struct page *out_page = NULL; /* The current page to write to */
+ int nr_folios = 0;
+ struct folio *in_folio = NULL; /* The current folio to read. */
+ struct folio *out_folio = NULL; /* The current folio to write to. */
unsigned long tot_in = 0;
unsigned long tot_out = 0;
unsigned long len = *total_out;
- const unsigned long nr_dest_pages = *out_pages;
- unsigned long max_out = nr_dest_pages * PAGE_SIZE;
+ const unsigned long nr_dest_folios = *out_folios;
+ unsigned long max_out = nr_dest_folios * PAGE_SIZE;
zstd_parameters params = zstd_get_btrfs_parameters(workspace->req_level,
len);
- *out_pages = 0;
+ *out_folios = 0;
*total_out = 0;
*total_in = 0;
@@ -405,19 +406,21 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
}
/* map in the first page of input data */
- in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- workspace->in_buf.src = kmap_local_page(in_page);
+ ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio);
+ if (ret < 0)
+ goto out;
+ workspace->in_buf.src = kmap_local_folio(in_folio, 0);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
/* Allocate and map in the output buffer */
- out_page = btrfs_alloc_compr_page();
- if (out_page == NULL) {
+ out_folio = btrfs_alloc_compr_folio();
+ if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
}
- pages[nr_pages++] = out_page;
- workspace->out_buf.dst = page_address(out_page);
+ folios[nr_folios++] = out_folio;
+ workspace->out_buf.dst = folio_address(out_folio);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
@@ -452,17 +455,17 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
if (workspace->out_buf.pos == workspace->out_buf.size) {
tot_out += PAGE_SIZE;
max_out -= PAGE_SIZE;
- if (nr_pages == nr_dest_pages) {
+ if (nr_folios == nr_dest_folios) {
ret = -E2BIG;
goto out;
}
- out_page = btrfs_alloc_compr_page();
- if (out_page == NULL) {
+ out_folio = btrfs_alloc_compr_folio();
+ if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
}
- pages[nr_pages++] = out_page;
- workspace->out_buf.dst = page_address(out_page);
+ folios[nr_folios++] = out_folio;
+ workspace->out_buf.dst = folio_address(out_folio);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out,
PAGE_SIZE);
@@ -478,11 +481,14 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
if (workspace->in_buf.pos == workspace->in_buf.size) {
tot_in += PAGE_SIZE;
kunmap_local(workspace->in_buf.src);
- put_page(in_page);
+ workspace->in_buf.src = NULL;
+ folio_put(in_folio);
start += PAGE_SIZE;
len -= PAGE_SIZE;
- in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- workspace->in_buf.src = kmap_local_page(in_page);
+ ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio);
+ if (ret < 0)
+ goto out;
+ workspace->in_buf.src = kmap_local_folio(in_folio, 0);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
}
@@ -509,17 +515,17 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
tot_out += PAGE_SIZE;
max_out -= PAGE_SIZE;
- if (nr_pages == nr_dest_pages) {
+ if (nr_folios == nr_dest_folios) {
ret = -E2BIG;
goto out;
}
- out_page = btrfs_alloc_compr_page();
- if (out_page == NULL) {
+ out_folio = btrfs_alloc_compr_folio();
+ if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
}
- pages[nr_pages++] = out_page;
- workspace->out_buf.dst = page_address(out_page);
+ folios[nr_folios++] = out_folio;
+ workspace->out_buf.dst = folio_address(out_folio);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
}
@@ -533,10 +539,10 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
*total_in = tot_in;
*total_out = tot_out;
out:
- *out_pages = nr_pages;
+ *out_folios = nr_folios;
if (workspace->in_buf.src) {
kunmap_local(workspace->in_buf.src);
- put_page(in_page);
+ folio_put(in_folio);
}
return ret;
}
@@ -544,12 +550,12 @@ out:
int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
- struct page **pages_in = cb->compressed_pages;
+ struct folio **folios_in = cb->compressed_folios;
size_t srclen = cb->compressed_len;
zstd_dstream *stream;
int ret = 0;
- unsigned long page_in_index = 0;
- unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
+ unsigned long folio_in_index = 0;
+ unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
unsigned long buf_start;
unsigned long total_out = 0;
@@ -561,7 +567,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
goto done;
}
- workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]);
+ workspace->in_buf.src = kmap_local_folio(folios_in[folio_in_index], 0);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
@@ -598,14 +604,15 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
if (workspace->in_buf.pos == workspace->in_buf.size) {
kunmap_local(workspace->in_buf.src);
- page_in_index++;
- if (page_in_index >= total_pages_in) {
+ folio_in_index++;
+ if (folio_in_index >= total_folios_in) {
workspace->in_buf.src = NULL;
ret = -EIO;
goto done;
}
srclen -= PAGE_SIZE;
- workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]);
+ workspace->in_buf.src =
+ kmap_local_folio(folios_in[folio_in_index], 0);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
}
@@ -618,80 +625,48 @@ done:
}
int zstd_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
+ struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
+ struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb);
+ const u32 sectorsize = fs_info->sectorsize;
zstd_dstream *stream;
int ret = 0;
- size_t ret2;
- unsigned long total_out = 0;
- unsigned long pg_offset = 0;
+ unsigned long to_copy = 0;
stream = zstd_init_dstream(
ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size);
if (!stream) {
pr_warn("BTRFS: zstd_init_dstream failed\n");
- ret = -EIO;
goto finish;
}
- destlen = min_t(size_t, destlen, PAGE_SIZE);
-
workspace->in_buf.src = data_in;
workspace->in_buf.pos = 0;
workspace->in_buf.size = srclen;
workspace->out_buf.dst = workspace->buf;
workspace->out_buf.pos = 0;
- workspace->out_buf.size = PAGE_SIZE;
-
- ret2 = 1;
- while (pg_offset < destlen
- && workspace->in_buf.pos < workspace->in_buf.size) {
- unsigned long buf_start;
- unsigned long buf_offset;
- unsigned long bytes;
-
- /* Check if the frame is over and we still need more input */
- if (ret2 == 0) {
- pr_debug("BTRFS: zstd_decompress_stream ended early\n");
- ret = -EIO;
- goto finish;
- }
- ret2 = zstd_decompress_stream(stream, &workspace->out_buf,
- &workspace->in_buf);
- if (zstd_is_error(ret2)) {
- pr_debug("BTRFS: zstd_decompress_stream returned %d\n",
- zstd_get_error_code(ret2));
- ret = -EIO;
- goto finish;
- }
-
- buf_start = total_out;
- total_out += workspace->out_buf.pos;
- workspace->out_buf.pos = 0;
-
- if (total_out <= start_byte)
- continue;
-
- if (total_out > start_byte && buf_start < start_byte)
- buf_offset = start_byte - buf_start;
- else
- buf_offset = 0;
-
- bytes = min_t(unsigned long, destlen - pg_offset,
- workspace->out_buf.size - buf_offset);
-
- memcpy_to_page(dest_page, pg_offset,
- workspace->out_buf.dst + buf_offset, bytes);
-
- pg_offset += bytes;
+ workspace->out_buf.size = sectorsize;
+
+ /*
+ * Since both input and output buffers should not exceed one sector,
+ * one call should end the decompression.
+ */
+ ret = zstd_decompress_stream(stream, &workspace->out_buf, &workspace->in_buf);
+ if (zstd_is_error(ret)) {
+ pr_warn_ratelimited("BTRFS: zstd_decompress_stream return %d\n",
+ zstd_get_error_code(ret));
+ goto finish;
}
- ret = 0;
+ to_copy = workspace->out_buf.pos;
+ memcpy_to_page(dest_page, dest_pgoff, workspace->out_buf.dst, to_copy);
finish:
- if (pg_offset < destlen) {
- memzero_page(dest_page, pg_offset, destlen - pg_offset);
+ /* Error or early end. */
+ if (unlikely(to_copy < destlen)) {
+ ret = -EIO;
+ memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy);
}
return ret;
}
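
The rewritten zstd_decompress() above drops the page-sized streaming loop: btrfs sub-page decompression now operates on at most one sector, so the output buffer is sized to fs_info->sectorsize, a single zstd_decompress_stream() call is expected to finish the frame, and any shortfall against destlen is zero-filled and returned as -EIO. A reduced sketch of that single-shot contract, with a hypothetical helper name and the kernel zstd wrappers from <linux/zstd.h>:

#include <linux/zstd.h>
#include <linux/highmem.h>

/* Sketch only: decompress one sector-or-smaller frame into dest_page.
 * 'buf' stands in for workspace->buf in the patch above. */
static int decompress_one_sector(zstd_dstream *stream, const u8 *src,
				 size_t srclen, char *buf, u32 sectorsize,
				 struct page *dest_page,
				 unsigned long dest_pgoff, size_t destlen)
{
	zstd_in_buffer in = { .src = src, .pos = 0, .size = srclen };
	zstd_out_buffer out = { .dst = buf, .pos = 0, .size = sectorsize };
	unsigned long to_copy;
	size_t ret;

	/* One call is expected to consume the whole frame. */
	ret = zstd_decompress_stream(stream, &out, &in);
	if (zstd_is_error(ret))
		return -EIO;

	to_copy = out.pos;
	memcpy_to_page(dest_page, dest_pgoff, buf, to_copy);

	/* Short output: zero the tail and report it, as the patch does. */
	if (to_copy < destlen) {
		memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy);
		return -EIO;
	}
	return 0;
}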