summaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:49:45 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:49:45 +0000
commit2c3c1048746a4622d8c89a29670120dc8fab93c4 (patch)
tree848558de17fb3008cdf4d861b01ac7781903ce39 /fs/btrfs
parentInitial commit. (diff)
downloadlinux-2c3c1048746a4622d8c89a29670120dc8fab93c4.tar.xz
linux-2c3c1048746a4622d8c89a29670120dc8fab93c4.zip
Adding upstream version 6.1.76.upstream/6.1.76upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/Kconfig109
-rw-r--r--fs/btrfs/Makefile45
-rw-r--r--fs/btrfs/acl.c129
-rw-r--r--fs/btrfs/async-thread.c341
-rw-r--r--fs/btrfs/async-thread.h45
-rw-r--r--fs/btrfs/backref.c3355
-rw-r--r--fs/btrfs/backref.h398
-rw-r--r--fs/btrfs/block-group.c4232
-rw-r--r--fs/btrfs/block-group.h359
-rw-r--r--fs/btrfs/block-rsv.c562
-rw-r--r--fs/btrfs/block-rsv.h104
-rw-r--r--fs/btrfs/btrfs_inode.h439
-rw-r--r--fs/btrfs/check-integrity.c2864
-rw-r--r--fs/btrfs/check-integrity.h20
-rw-r--r--fs/btrfs/compression.c1747
-rw-r--r--fs/btrfs/compression.h183
-rw-r--r--fs/btrfs/ctree.c5011
-rw-r--r--fs/btrfs/ctree.h4132
-rw-r--r--fs/btrfs/delalloc-space.c493
-rw-r--r--fs/btrfs/delalloc-space.h24
-rw-r--r--fs/btrfs/delayed-inode.c2198
-rw-r--r--fs/btrfs/delayed-inode.h177
-rw-r--r--fs/btrfs/delayed-ref.c1171
-rw-r--r--fs/btrfs/delayed-ref.h405
-rw-r--r--fs/btrfs/dev-replace.c1305
-rw-r--r--fs/btrfs/dev-replace.h29
-rw-r--r--fs/btrfs/dir-item.c439
-rw-r--r--fs/btrfs/discard.c758
-rw-r--r--fs/btrfs/discard.h40
-rw-r--r--fs/btrfs/disk-io.c5355
-rw-r--r--fs/btrfs/disk-io.h145
-rw-r--r--fs/btrfs/export.c281
-rw-r--r--fs/btrfs/export.h26
-rw-r--r--fs/btrfs/extent-io-tree.c1680
-rw-r--r--fs/btrfs/extent-io-tree.h239
-rw-r--r--fs/btrfs/extent-tree.c6264
-rw-r--r--fs/btrfs/extent_io.c5897
-rw-r--r--fs/btrfs/extent_io.h296
-rw-r--r--fs/btrfs/extent_map.c985
-rw-r--r--fs/btrfs/extent_map.h118
-rw-r--r--fs/btrfs/file-item.c1304
-rw-r--r--fs/btrfs/file.c3856
-rw-r--r--fs/btrfs/free-space-cache.c4292
-rw-r--r--fs/btrfs/free-space-cache.h151
-rw-r--r--fs/btrfs/free-space-tree.c1658
-rw-r--r--fs/btrfs/free-space-tree.h54
-rw-r--r--fs/btrfs/inode-item.c748
-rw-r--r--fs/btrfs/inode-item.h94
-rw-r--r--fs/btrfs/inode.c11522
-rw-r--r--fs/btrfs/ioctl.c5626
-rw-r--r--fs/btrfs/locking.c405
-rw-r--r--fs/btrfs/locking.h149
-rw-r--r--fs/btrfs/lzo.c494
-rw-r--r--fs/btrfs/misc.h150
-rw-r--r--fs/btrfs/ordered-data.c1172
-rw-r--r--fs/btrfs/ordered-data.h215
-rw-r--r--fs/btrfs/orphan.c58
-rw-r--r--fs/btrfs/print-tree.c411
-rw-r--r--fs/btrfs/print-tree.h16
-rw-r--r--fs/btrfs/props.c467
-rw-r--r--fs/btrfs/props.h26
-rw-r--r--fs/btrfs/qgroup.c4431
-rw-r--r--fs/btrfs/qgroup.h440
-rw-r--r--fs/btrfs/raid56.c2761
-rw-r--r--fs/btrfs/raid56.h186
-rw-r--r--fs/btrfs/rcu-string.h52
-rw-r--r--fs/btrfs/ref-verify.c1025
-rw-r--r--fs/btrfs/ref-verify.h49
-rw-r--r--fs/btrfs/reflink.c930
-rw-r--r--fs/btrfs/reflink.h12
-rw-r--r--fs/btrfs/relocation.c4532
-rw-r--r--fs/btrfs/root-tree.c544
-rw-r--r--fs/btrfs/scrub.c4558
-rw-r--r--fs/btrfs/send.c8136
-rw-r--r--fs/btrfs/send.h183
-rw-r--r--fs/btrfs/space-info.c1786
-rw-r--r--fs/btrfs/space-info.h163
-rw-r--r--fs/btrfs/struct-funcs.c166
-rw-r--r--fs/btrfs/subpage.c768
-rw-r--r--fs/btrfs/subpage.h158
-rw-r--r--fs/btrfs/super.c2857
-rw-r--r--fs/btrfs/sysfs.c2354
-rw-r--r--fs/btrfs/sysfs.h43
-rw-r--r--fs/btrfs/tests/btrfs-tests.c303
-rw-r--r--fs/btrfs/tests/btrfs-tests.h57
-rw-r--r--fs/btrfs/tests/extent-buffer-tests.c218
-rw-r--r--fs/btrfs/tests/extent-io-tests.c612
-rw-r--r--fs/btrfs/tests/extent-map-tests.c636
-rw-r--r--fs/btrfs/tests/free-space-tests.c1063
-rw-r--r--fs/btrfs/tests/free-space-tree-tests.c588
-rw-r--r--fs/btrfs/tests/inode-tests.c1118
-rw-r--r--fs/btrfs/tests/qgroup-tests.c527
-rw-r--r--fs/btrfs/transaction.c2649
-rw-r--r--fs/btrfs/transaction.h239
-rw-r--r--fs/btrfs/tree-checker.c1947
-rw-r--r--fs/btrfs/tree-checker.h30
-rw-r--r--fs/btrfs/tree-defrag.c471
-rw-r--r--fs/btrfs/tree-log.c7562
-rw-r--r--fs/btrfs/tree-log.h104
-rw-r--r--fs/btrfs/tree-mod-log.c929
-rw-r--r--fs/btrfs/tree-mod-log.h53
-rw-r--r--fs/btrfs/ulist.c276
-rw-r--r--fs/btrfs/ulist.h74
-rw-r--r--fs/btrfs/uuid-tree.c390
-rw-r--r--fs/btrfs/verity.c812
-rw-r--r--fs/btrfs/volumes.c8497
-rw-r--r--fs/btrfs/volumes.h762
-rw-r--r--fs/btrfs/xattr.c492
-rw-r--r--fs/btrfs/xattr.h25
-rw-r--r--fs/btrfs/zlib.c456
-rw-r--r--fs/btrfs/zoned.c2364
-rw-r--r--fs/btrfs/zoned.h420
-rw-r--r--fs/btrfs/zstd.c703
113 files changed, 155779 insertions, 0 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
new file mode 100644
index 000000000..183e5c4ae
--- /dev/null
+++ b/fs/btrfs/Kconfig
@@ -0,0 +1,109 @@
+# SPDX-License-Identifier: GPL-2.0
+
+config BTRFS_FS
+ tristate "Btrfs filesystem support"
+ select CRYPTO
+ select CRYPTO_CRC32C
+ select LIBCRC32C
+ select CRYPTO_XXHASH
+ select CRYPTO_SHA256
+ select CRYPTO_BLAKE2B
+ select ZLIB_INFLATE
+ select ZLIB_DEFLATE
+ select LZO_COMPRESS
+ select LZO_DECOMPRESS
+ select ZSTD_COMPRESS
+ select ZSTD_DECOMPRESS
+ select FS_IOMAP
+ select RAID6_PQ
+ select XOR_BLOCKS
+ select SRCU
+ depends on PAGE_SIZE_LESS_THAN_256KB
+
+ help
+ Btrfs is a general purpose copy-on-write filesystem with extents,
+ writable snapshotting, support for multiple devices and many more
+ features focused on fault tolerance, repair and easy administration.
+
+ The filesystem disk format is no longer unstable, and it's not
+ expected to change unless there are strong reasons to do so. If there
+ is a format change, file systems with a unchanged format will
+ continue to be mountable and usable by newer kernels.
+
+ For more information, please see the web pages at
+ http://btrfs.wiki.kernel.org.
+
+ To compile this file system support as a module, choose M here. The
+ module will be called btrfs.
+
+ If unsure, say N.
+
+config BTRFS_FS_POSIX_ACL
+ bool "Btrfs POSIX Access Control Lists"
+ depends on BTRFS_FS
+ select FS_POSIX_ACL
+ help
+ POSIX Access Control Lists (ACLs) support permissions for users and
+ groups beyond the owner/group/world scheme.
+
+ If you don't know what Access Control Lists are, say N
+
+config BTRFS_FS_CHECK_INTEGRITY
+ bool "Btrfs with integrity check tool compiled in (DANGEROUS)"
+ depends on BTRFS_FS
+ help
+ Adds code that examines all block write requests (including
+ writes of the super block). The goal is to verify that the
+ state of the filesystem on disk is always consistent, i.e.,
+ after a power-loss or kernel panic event the filesystem is
+ in a consistent state.
+
+ If the integrity check tool is included and activated in
+ the mount options, plenty of kernel memory is used, and
+ plenty of additional CPU cycles are spent. Enabling this
+ functionality is not intended for normal use.
+
+ In most cases, unless you are a btrfs developer who needs
+ to verify the integrity of (super)-block write requests
+ during the run of a regression test, say N
+
+config BTRFS_FS_RUN_SANITY_TESTS
+ bool "Btrfs will run sanity tests upon loading"
+ depends on BTRFS_FS
+ help
+ This will run some basic sanity tests on the free space cache
+ code to make sure it is acting as it should. These are mostly
+ regression tests and are only really interesting to btrfs
+ developers.
+
+ If unsure, say N.
+
+config BTRFS_DEBUG
+ bool "Btrfs debugging support"
+ depends on BTRFS_FS
+ help
+ Enable run-time debugging support for the btrfs filesystem. This may
+ enable additional and expensive checks with negative impact on
+ performance, or export extra information via sysfs.
+
+ If unsure, say N.
+
+config BTRFS_ASSERT
+ bool "Btrfs assert support"
+ depends on BTRFS_FS
+ help
+ Enable run-time assertion checking. This will result in panics if
+ any of the assertions trip. This is meant for btrfs developers only.
+
+ If unsure, say N.
+
+config BTRFS_FS_REF_VERIFY
+ bool "Btrfs with the ref verify tool compiled in"
+ depends on BTRFS_FS
+ default n
+ help
+ Enable run-time extent reference verification instrumentation. This
+ is meant to be used by btrfs developers for tracking down extent
+ reference problems or verifying they didn't break something.
+
+ If unsure, say N.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
new file mode 100644
index 000000000..fa9ddcc9e
--- /dev/null
+++ b/fs/btrfs/Makefile
@@ -0,0 +1,45 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# Subset of W=1 warnings
+subdir-ccflags-y += -Wextra -Wunused -Wno-unused-parameter
+subdir-ccflags-y += -Wmissing-declarations
+subdir-ccflags-y += -Wmissing-format-attribute
+subdir-ccflags-y += -Wmissing-prototypes
+subdir-ccflags-y += -Wold-style-definition
+subdir-ccflags-y += -Wmissing-include-dirs
+condflags := \
+ $(call cc-option, -Wunused-but-set-variable) \
+ $(call cc-option, -Wunused-const-variable) \
+ $(call cc-option, -Wpacked-not-aligned) \
+ $(call cc-option, -Wstringop-truncation)
+subdir-ccflags-y += $(condflags)
+# The following turn off the warnings enabled by -Wextra
+subdir-ccflags-y += -Wno-missing-field-initializers
+subdir-ccflags-y += -Wno-sign-compare
+subdir-ccflags-y += -Wno-type-limits
+subdir-ccflags-y += -Wno-shift-negative-value
+
+obj-$(CONFIG_BTRFS_FS) := btrfs.o
+
+btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
+ file-item.o inode-item.o disk-io.o \
+ transaction.o inode.o file.o tree-defrag.o \
+ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
+ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
+ export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \
+ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
+ backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
+ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
+ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
+ subpage.o tree-mod-log.o extent-io-tree.o
+
+btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
+btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
+btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o
+btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o
+btrfs-$(CONFIG_FS_VERITY) += verity.o
+
+btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
+ tests/extent-buffer-tests.o tests/btrfs-tests.o \
+ tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \
+ tests/free-space-tree-tests.o tests/extent-map-tests.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
new file mode 100644
index 000000000..548d6a547
--- /dev/null
+++ b/fs/btrfs/acl.c
@@ -0,0 +1,129 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2007 Red Hat. All rights reserved.
+ */
+
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/posix_acl.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/slab.h>
+
+#include "ctree.h"
+#include "btrfs_inode.h"
+#include "xattr.h"
+
+struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu)
+{
+ int size;
+ const char *name;
+ char *value = NULL;
+ struct posix_acl *acl;
+
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
+ break;
+ case ACL_TYPE_DEFAULT:
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
+ break;
+ default:
+ return ERR_PTR(-EINVAL);
+ }
+
+ size = btrfs_getxattr(inode, name, NULL, 0);
+ if (size > 0) {
+ value = kzalloc(size, GFP_KERNEL);
+ if (!value)
+ return ERR_PTR(-ENOMEM);
+ size = btrfs_getxattr(inode, name, value, size);
+ }
+ if (size > 0)
+ acl = posix_acl_from_xattr(&init_user_ns, value, size);
+ else if (size == -ENODATA || size == 0)
+ acl = NULL;
+ else
+ acl = ERR_PTR(size);
+ kfree(value);
+
+ return acl;
+}
+
+int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
+ struct posix_acl *acl, int type)
+{
+ int ret, size = 0;
+ const char *name;
+ char *value = NULL;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
+ break;
+ case ACL_TYPE_DEFAULT:
+ if (!S_ISDIR(inode->i_mode))
+ return acl ? -EINVAL : 0;
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (acl) {
+ unsigned int nofs_flag;
+
+ size = posix_acl_xattr_size(acl->a_count);
+ /*
+ * We're holding a transaction handle, so use a NOFS memory
+ * allocation context to avoid deadlock if reclaim happens.
+ */
+ nofs_flag = memalloc_nofs_save();
+ value = kmalloc(size, GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
+ if (!value) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
+ if (ret < 0)
+ goto out;
+ }
+
+ if (trans)
+ ret = btrfs_setxattr(trans, inode, name, value, size, 0);
+ else
+ ret = btrfs_setxattr_trans(inode, name, value, size, 0);
+
+out:
+ kfree(value);
+
+ if (!ret)
+ set_cached_acl(inode, type, acl);
+
+ return ret;
+}
+
+int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+ struct posix_acl *acl, int type)
+{
+ int ret;
+ umode_t old_mode = inode->i_mode;
+
+ if (type == ACL_TYPE_ACCESS && acl) {
+ ret = posix_acl_update_mode(mnt_userns, inode,
+ &inode->i_mode, &acl);
+ if (ret)
+ return ret;
+ }
+ ret = __btrfs_set_acl(NULL, inode, acl, type);
+ if (ret)
+ inode->i_mode = old_mode;
+ return ret;
+}
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
new file mode 100644
index 000000000..aac240430
--- /dev/null
+++ b/fs/btrfs/async-thread.c
@@ -0,0 +1,341 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ * Copyright (C) 2014 Fujitsu. All rights reserved.
+ */
+
+#include <linux/kthread.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/freezer.h>
+#include "async-thread.h"
+#include "ctree.h"
+
+enum {
+ WORK_DONE_BIT,
+ WORK_ORDER_DONE_BIT,
+};
+
+#define NO_THRESHOLD (-1)
+#define DFT_THRESHOLD (32)
+
+struct btrfs_workqueue {
+ struct workqueue_struct *normal_wq;
+
+ /* File system this workqueue services */
+ struct btrfs_fs_info *fs_info;
+
+ /* List head pointing to ordered work list */
+ struct list_head ordered_list;
+
+ /* Spinlock for ordered_list */
+ spinlock_t list_lock;
+
+ /* Thresholding related variants */
+ atomic_t pending;
+
+ /* Up limit of concurrency workers */
+ int limit_active;
+
+ /* Current number of concurrency workers */
+ int current_active;
+
+ /* Threshold to change current_active */
+ int thresh;
+ unsigned int count;
+ spinlock_t thres_lock;
+};
+
+struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct btrfs_workqueue *wq)
+{
+ return wq->fs_info;
+}
+
+struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work)
+{
+ return work->wq->fs_info;
+}
+
+bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq)
+{
+ /*
+ * We could compare wq->pending with num_online_cpus()
+ * to support "thresh == NO_THRESHOLD" case, but it requires
+ * moving up atomic_inc/dec in thresh_queue/exec_hook. Let's
+ * postpone it until someone needs the support of that case.
+ */
+ if (wq->thresh == NO_THRESHOLD)
+ return false;
+
+ return atomic_read(&wq->pending) > wq->thresh * 2;
+}
+
+struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
+ const char *name, unsigned int flags,
+ int limit_active, int thresh)
+{
+ struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
+
+ if (!ret)
+ return NULL;
+
+ ret->fs_info = fs_info;
+ ret->limit_active = limit_active;
+ atomic_set(&ret->pending, 0);
+ if (thresh == 0)
+ thresh = DFT_THRESHOLD;
+ /* For low threshold, disabling threshold is a better choice */
+ if (thresh < DFT_THRESHOLD) {
+ ret->current_active = limit_active;
+ ret->thresh = NO_THRESHOLD;
+ } else {
+ /*
+ * For threshold-able wq, let its concurrency grow on demand.
+ * Use minimal max_active at alloc time to reduce resource
+ * usage.
+ */
+ ret->current_active = 1;
+ ret->thresh = thresh;
+ }
+
+ ret->normal_wq = alloc_workqueue("btrfs-%s", flags, ret->current_active,
+ name);
+ if (!ret->normal_wq) {
+ kfree(ret);
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&ret->ordered_list);
+ spin_lock_init(&ret->list_lock);
+ spin_lock_init(&ret->thres_lock);
+ trace_btrfs_workqueue_alloc(ret, name);
+ return ret;
+}
+
+/*
+ * Hook for threshold which will be called in btrfs_queue_work.
+ * This hook WILL be called in IRQ handler context,
+ * so workqueue_set_max_active MUST NOT be called in this hook
+ */
+static inline void thresh_queue_hook(struct btrfs_workqueue *wq)
+{
+ if (wq->thresh == NO_THRESHOLD)
+ return;
+ atomic_inc(&wq->pending);
+}
+
+/*
+ * Hook for threshold which will be called before executing the work,
+ * This hook is called in kthread content.
+ * So workqueue_set_max_active is called here.
+ */
+static inline void thresh_exec_hook(struct btrfs_workqueue *wq)
+{
+ int new_current_active;
+ long pending;
+ int need_change = 0;
+
+ if (wq->thresh == NO_THRESHOLD)
+ return;
+
+ atomic_dec(&wq->pending);
+ spin_lock(&wq->thres_lock);
+ /*
+ * Use wq->count to limit the calling frequency of
+ * workqueue_set_max_active.
+ */
+ wq->count++;
+ wq->count %= (wq->thresh / 4);
+ if (!wq->count)
+ goto out;
+ new_current_active = wq->current_active;
+
+ /*
+ * pending may be changed later, but it's OK since we really
+ * don't need it so accurate to calculate new_max_active.
+ */
+ pending = atomic_read(&wq->pending);
+ if (pending > wq->thresh)
+ new_current_active++;
+ if (pending < wq->thresh / 2)
+ new_current_active--;
+ new_current_active = clamp_val(new_current_active, 1, wq->limit_active);
+ if (new_current_active != wq->current_active) {
+ need_change = 1;
+ wq->current_active = new_current_active;
+ }
+out:
+ spin_unlock(&wq->thres_lock);
+
+ if (need_change) {
+ workqueue_set_max_active(wq->normal_wq, wq->current_active);
+ }
+}
+
+static void run_ordered_work(struct btrfs_workqueue *wq,
+ struct btrfs_work *self)
+{
+ struct list_head *list = &wq->ordered_list;
+ struct btrfs_work *work;
+ spinlock_t *lock = &wq->list_lock;
+ unsigned long flags;
+ bool free_self = false;
+
+ while (1) {
+ spin_lock_irqsave(lock, flags);
+ if (list_empty(list))
+ break;
+ work = list_entry(list->next, struct btrfs_work,
+ ordered_list);
+ if (!test_bit(WORK_DONE_BIT, &work->flags))
+ break;
+ /*
+ * Orders all subsequent loads after reading WORK_DONE_BIT,
+ * paired with the smp_mb__before_atomic in btrfs_work_helper
+ * this guarantees that the ordered function will see all
+ * updates from ordinary work function.
+ */
+ smp_rmb();
+
+ /*
+ * we are going to call the ordered done function, but
+ * we leave the work item on the list as a barrier so
+ * that later work items that are done don't have their
+ * functions called before this one returns
+ */
+ if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
+ break;
+ trace_btrfs_ordered_sched(work);
+ spin_unlock_irqrestore(lock, flags);
+ work->ordered_func(work);
+
+ /* now take the lock again and drop our item from the list */
+ spin_lock_irqsave(lock, flags);
+ list_del(&work->ordered_list);
+ spin_unlock_irqrestore(lock, flags);
+
+ if (work == self) {
+ /*
+ * This is the work item that the worker is currently
+ * executing.
+ *
+ * The kernel workqueue code guarantees non-reentrancy
+ * of work items. I.e., if a work item with the same
+ * address and work function is queued twice, the second
+ * execution is blocked until the first one finishes. A
+ * work item may be freed and recycled with the same
+ * work function; the workqueue code assumes that the
+ * original work item cannot depend on the recycled work
+ * item in that case (see find_worker_executing_work()).
+ *
+ * Note that different types of Btrfs work can depend on
+ * each other, and one type of work on one Btrfs
+ * filesystem may even depend on the same type of work
+ * on another Btrfs filesystem via, e.g., a loop device.
+ * Therefore, we must not allow the current work item to
+ * be recycled until we are really done, otherwise we
+ * break the above assumption and can deadlock.
+ */
+ free_self = true;
+ } else {
+ /*
+ * We don't want to call the ordered free functions with
+ * the lock held.
+ */
+ work->ordered_free(work);
+ /* NB: work must not be dereferenced past this point. */
+ trace_btrfs_all_work_done(wq->fs_info, work);
+ }
+ }
+ spin_unlock_irqrestore(lock, flags);
+
+ if (free_self) {
+ self->ordered_free(self);
+ /* NB: self must not be dereferenced past this point. */
+ trace_btrfs_all_work_done(wq->fs_info, self);
+ }
+}
+
+static void btrfs_work_helper(struct work_struct *normal_work)
+{
+ struct btrfs_work *work = container_of(normal_work, struct btrfs_work,
+ normal_work);
+ struct btrfs_workqueue *wq = work->wq;
+ int need_order = 0;
+
+ /*
+ * We should not touch things inside work in the following cases:
+ * 1) after work->func() if it has no ordered_free
+ * Since the struct is freed in work->func().
+ * 2) after setting WORK_DONE_BIT
+ * The work may be freed in other threads almost instantly.
+ * So we save the needed things here.
+ */
+ if (work->ordered_func)
+ need_order = 1;
+
+ trace_btrfs_work_sched(work);
+ thresh_exec_hook(wq);
+ work->func(work);
+ if (need_order) {
+ /*
+ * Ensures all memory accesses done in the work function are
+ * ordered before setting the WORK_DONE_BIT. Ensuring the thread
+ * which is going to executed the ordered work sees them.
+ * Pairs with the smp_rmb in run_ordered_work.
+ */
+ smp_mb__before_atomic();
+ set_bit(WORK_DONE_BIT, &work->flags);
+ run_ordered_work(wq, work);
+ } else {
+ /* NB: work must not be dereferenced past this point. */
+ trace_btrfs_all_work_done(wq->fs_info, work);
+ }
+}
+
+void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
+ btrfs_func_t ordered_func, btrfs_func_t ordered_free)
+{
+ work->func = func;
+ work->ordered_func = ordered_func;
+ work->ordered_free = ordered_free;
+ INIT_WORK(&work->normal_work, btrfs_work_helper);
+ INIT_LIST_HEAD(&work->ordered_list);
+ work->flags = 0;
+}
+
+void btrfs_queue_work(struct btrfs_workqueue *wq, struct btrfs_work *work)
+{
+ unsigned long flags;
+
+ work->wq = wq;
+ thresh_queue_hook(wq);
+ if (work->ordered_func) {
+ spin_lock_irqsave(&wq->list_lock, flags);
+ list_add_tail(&work->ordered_list, &wq->ordered_list);
+ spin_unlock_irqrestore(&wq->list_lock, flags);
+ }
+ trace_btrfs_work_queued(work);
+ queue_work(wq->normal_wq, &work->normal_work);
+}
+
+void btrfs_destroy_workqueue(struct btrfs_workqueue *wq)
+{
+ if (!wq)
+ return;
+ destroy_workqueue(wq->normal_wq);
+ trace_btrfs_workqueue_destroy(wq);
+ kfree(wq);
+}
+
+void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int limit_active)
+{
+ if (wq)
+ wq->limit_active = limit_active;
+}
+
+void btrfs_flush_workqueue(struct btrfs_workqueue *wq)
+{
+ flush_workqueue(wq->normal_wq);
+}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
new file mode 100644
index 000000000..6e2596dda
--- /dev/null
+++ b/fs/btrfs/async-thread.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ * Copyright (C) 2014 Fujitsu. All rights reserved.
+ */
+
+#ifndef BTRFS_ASYNC_THREAD_H
+#define BTRFS_ASYNC_THREAD_H
+
+#include <linux/workqueue.h>
+
+struct btrfs_fs_info;
+struct btrfs_workqueue;
+struct btrfs_work;
+typedef void (*btrfs_func_t)(struct btrfs_work *arg);
+
+struct btrfs_work {
+ btrfs_func_t func;
+ btrfs_func_t ordered_func;
+ btrfs_func_t ordered_free;
+
+ /* Don't touch things below */
+ struct work_struct normal_work;
+ struct list_head ordered_list;
+ struct btrfs_workqueue *wq;
+ unsigned long flags;
+};
+
+struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
+ const char *name,
+ unsigned int flags,
+ int limit_active,
+ int thresh);
+void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
+ btrfs_func_t ordered_func, btrfs_func_t ordered_free);
+void btrfs_queue_work(struct btrfs_workqueue *wq,
+ struct btrfs_work *work);
+void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
+void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
+struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work);
+struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct btrfs_workqueue *wq);
+bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq);
+void btrfs_flush_workqueue(struct btrfs_workqueue *wq);
+
+#endif
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
new file mode 100644
index 000000000..18cf801ab
--- /dev/null
+++ b/fs/btrfs/backref.c
@@ -0,0 +1,3355 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2011 STRATO. All rights reserved.
+ */
+
+#include <linux/mm.h>
+#include <linux/rbtree.h>
+#include <trace/events/btrfs.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "backref.h"
+#include "ulist.h"
+#include "transaction.h"
+#include "delayed-ref.h"
+#include "locking.h"
+#include "misc.h"
+#include "tree-mod-log.h"
+
+/* Just an arbitrary number so we can be sure this happened */
+#define BACKREF_FOUND_SHARED 6
+
+struct extent_inode_elem {
+ u64 inum;
+ u64 offset;
+ struct extent_inode_elem *next;
+};
+
+static int check_extent_in_eb(const struct btrfs_key *key,
+ const struct extent_buffer *eb,
+ const struct btrfs_file_extent_item *fi,
+ u64 extent_item_pos,
+ struct extent_inode_elem **eie,
+ bool ignore_offset)
+{
+ u64 offset = 0;
+ struct extent_inode_elem *e;
+
+ if (!ignore_offset &&
+ !btrfs_file_extent_compression(eb, fi) &&
+ !btrfs_file_extent_encryption(eb, fi) &&
+ !btrfs_file_extent_other_encoding(eb, fi)) {
+ u64 data_offset;
+ u64 data_len;
+
+ data_offset = btrfs_file_extent_offset(eb, fi);
+ data_len = btrfs_file_extent_num_bytes(eb, fi);
+
+ if (extent_item_pos < data_offset ||
+ extent_item_pos >= data_offset + data_len)
+ return 1;
+ offset = extent_item_pos - data_offset;
+ }
+
+ e = kmalloc(sizeof(*e), GFP_NOFS);
+ if (!e)
+ return -ENOMEM;
+
+ e->next = *eie;
+ e->inum = key->objectid;
+ e->offset = key->offset + offset;
+ *eie = e;
+
+ return 0;
+}
+
+static void free_inode_elem_list(struct extent_inode_elem *eie)
+{
+ struct extent_inode_elem *eie_next;
+
+ for (; eie; eie = eie_next) {
+ eie_next = eie->next;
+ kfree(eie);
+ }
+}
+
+static int find_extent_in_eb(const struct extent_buffer *eb,
+ u64 wanted_disk_byte, u64 extent_item_pos,
+ struct extent_inode_elem **eie,
+ bool ignore_offset)
+{
+ u64 disk_byte;
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *fi;
+ int slot;
+ int nritems;
+ int extent_type;
+ int ret;
+
+ /*
+ * from the shared data ref, we only have the leaf but we need
+ * the key. thus, we must look into all items and see that we
+ * find one (some) with a reference to our extent item.
+ */
+ nritems = btrfs_header_nritems(eb);
+ for (slot = 0; slot < nritems; ++slot) {
+ btrfs_item_key_to_cpu(eb, &key, slot);
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+ fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(eb, fi);
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+ continue;
+ /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
+ disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+ if (disk_byte != wanted_disk_byte)
+ continue;
+
+ ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie, ignore_offset);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+struct preftree {
+ struct rb_root_cached root;
+ unsigned int count;
+};
+
+#define PREFTREE_INIT { .root = RB_ROOT_CACHED, .count = 0 }
+
+struct preftrees {
+ struct preftree direct; /* BTRFS_SHARED_[DATA|BLOCK]_REF_KEY */
+ struct preftree indirect; /* BTRFS_[TREE_BLOCK|EXTENT_DATA]_REF_KEY */
+ struct preftree indirect_missing_keys;
+};
+
+/*
+ * Checks for a shared extent during backref search.
+ *
+ * The share_count tracks prelim_refs (direct and indirect) having a
+ * ref->count >0:
+ * - incremented when a ref->count transitions to >0
+ * - decremented when a ref->count transitions to <1
+ */
+struct share_check {
+ u64 root_objectid;
+ u64 inum;
+ int share_count;
+ bool have_delayed_delete_refs;
+};
+
+static inline int extent_is_shared(struct share_check *sc)
+{
+ return (sc && sc->share_count > 1) ? BACKREF_FOUND_SHARED : 0;
+}
+
+static struct kmem_cache *btrfs_prelim_ref_cache;
+
+int __init btrfs_prelim_ref_init(void)
+{
+ btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref",
+ sizeof(struct prelim_ref),
+ 0,
+ SLAB_MEM_SPREAD,
+ NULL);
+ if (!btrfs_prelim_ref_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void __cold btrfs_prelim_ref_exit(void)
+{
+ kmem_cache_destroy(btrfs_prelim_ref_cache);
+}
+
+static void free_pref(struct prelim_ref *ref)
+{
+ kmem_cache_free(btrfs_prelim_ref_cache, ref);
+}
+
+/*
+ * Return 0 when both refs are for the same block (and can be merged).
+ * A -1 return indicates ref1 is a 'lower' block than ref2, while 1
+ * indicates a 'higher' block.
+ */
+static int prelim_ref_compare(struct prelim_ref *ref1,
+ struct prelim_ref *ref2)
+{
+ if (ref1->level < ref2->level)
+ return -1;
+ if (ref1->level > ref2->level)
+ return 1;
+ if (ref1->root_id < ref2->root_id)
+ return -1;
+ if (ref1->root_id > ref2->root_id)
+ return 1;
+ if (ref1->key_for_search.type < ref2->key_for_search.type)
+ return -1;
+ if (ref1->key_for_search.type > ref2->key_for_search.type)
+ return 1;
+ if (ref1->key_for_search.objectid < ref2->key_for_search.objectid)
+ return -1;
+ if (ref1->key_for_search.objectid > ref2->key_for_search.objectid)
+ return 1;
+ if (ref1->key_for_search.offset < ref2->key_for_search.offset)
+ return -1;
+ if (ref1->key_for_search.offset > ref2->key_for_search.offset)
+ return 1;
+ if (ref1->parent < ref2->parent)
+ return -1;
+ if (ref1->parent > ref2->parent)
+ return 1;
+
+ return 0;
+}
+
+static void update_share_count(struct share_check *sc, int oldcount,
+ int newcount)
+{
+ if ((!sc) || (oldcount == 0 && newcount < 1))
+ return;
+
+ if (oldcount > 0 && newcount < 1)
+ sc->share_count--;
+ else if (oldcount < 1 && newcount > 0)
+ sc->share_count++;
+}
+
+/*
+ * Add @newref to the @root rbtree, merging identical refs.
+ *
+ * Callers should assume that newref has been freed after calling.
+ */
+static void prelim_ref_insert(const struct btrfs_fs_info *fs_info,
+ struct preftree *preftree,
+ struct prelim_ref *newref,
+ struct share_check *sc)
+{
+ struct rb_root_cached *root;
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ struct prelim_ref *ref;
+ int result;
+ bool leftmost = true;
+
+ root = &preftree->root;
+ p = &root->rb_root.rb_node;
+
+ while (*p) {
+ parent = *p;
+ ref = rb_entry(parent, struct prelim_ref, rbnode);
+ result = prelim_ref_compare(ref, newref);
+ if (result < 0) {
+ p = &(*p)->rb_left;
+ } else if (result > 0) {
+ p = &(*p)->rb_right;
+ leftmost = false;
+ } else {
+ /* Identical refs, merge them and free @newref */
+ struct extent_inode_elem *eie = ref->inode_list;
+
+ while (eie && eie->next)
+ eie = eie->next;
+
+ if (!eie)
+ ref->inode_list = newref->inode_list;
+ else
+ eie->next = newref->inode_list;
+ trace_btrfs_prelim_ref_merge(fs_info, ref, newref,
+ preftree->count);
+ /*
+ * A delayed ref can have newref->count < 0.
+ * The ref->count is updated to follow any
+ * BTRFS_[ADD|DROP]_DELAYED_REF actions.
+ */
+ update_share_count(sc, ref->count,
+ ref->count + newref->count);
+ ref->count += newref->count;
+ free_pref(newref);
+ return;
+ }
+ }
+
+ update_share_count(sc, 0, newref->count);
+ preftree->count++;
+ trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count);
+ rb_link_node(&newref->rbnode, parent, p);
+ rb_insert_color_cached(&newref->rbnode, root, leftmost);
+}
+
+/*
+ * Release the entire tree. We don't care about internal consistency so
+ * just free everything and then reset the tree root.
+ */
+static void prelim_release(struct preftree *preftree)
+{
+ struct prelim_ref *ref, *next_ref;
+
+ rbtree_postorder_for_each_entry_safe(ref, next_ref,
+ &preftree->root.rb_root, rbnode) {
+ free_inode_elem_list(ref->inode_list);
+ free_pref(ref);
+ }
+
+ preftree->root = RB_ROOT_CACHED;
+ preftree->count = 0;
+}
+
+/*
+ * the rules for all callers of this function are:
+ * - obtaining the parent is the goal
+ * - if you add a key, you must know that it is a correct key
+ * - if you cannot add the parent or a correct key, then we will look into the
+ * block later to set a correct key
+ *
+ * delayed refs
+ * ============
+ * backref type | shared | indirect | shared | indirect
+ * information | tree | tree | data | data
+ * --------------------+--------+----------+--------+----------
+ * parent logical | y | - | - | -
+ * key to resolve | - | y | y | y
+ * tree block logical | - | - | - | -
+ * root for resolving | y | y | y | y
+ *
+ * - column 1: we've the parent -> done
+ * - column 2, 3, 4: we use the key to find the parent
+ *
+ * on disk refs (inline or keyed)
+ * ==============================
+ * backref type | shared | indirect | shared | indirect
+ * information | tree | tree | data | data
+ * --------------------+--------+----------+--------+----------
+ * parent logical | y | - | y | -
+ * key to resolve | - | - | - | y
+ * tree block logical | y | y | y | y
+ * root for resolving | - | y | y | y
+ *
+ * - column 1, 3: we've the parent -> done
+ * - column 2: we take the first key from the block to find the parent
+ * (see add_missing_keys)
+ * - column 4: we use the key to find the parent
+ *
+ * additional information that's available but not required to find the parent
+ * block might help in merging entries to gain some speed.
+ */
+static int add_prelim_ref(const struct btrfs_fs_info *fs_info,
+ struct preftree *preftree, u64 root_id,
+ const struct btrfs_key *key, int level, u64 parent,
+ u64 wanted_disk_byte, int count,
+ struct share_check *sc, gfp_t gfp_mask)
+{
+ struct prelim_ref *ref;
+
+ if (root_id == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ return 0;
+
+ ref = kmem_cache_alloc(btrfs_prelim_ref_cache, gfp_mask);
+ if (!ref)
+ return -ENOMEM;
+
+ ref->root_id = root_id;
+ if (key)
+ ref->key_for_search = *key;
+ else
+ memset(&ref->key_for_search, 0, sizeof(ref->key_for_search));
+
+ ref->inode_list = NULL;
+ ref->level = level;
+ ref->count = count;
+ ref->parent = parent;
+ ref->wanted_disk_byte = wanted_disk_byte;
+ prelim_ref_insert(fs_info, preftree, ref, sc);
+ return extent_is_shared(sc);
+}
+
+/* direct refs use root == 0, key == NULL */
+static int add_direct_ref(const struct btrfs_fs_info *fs_info,
+ struct preftrees *preftrees, int level, u64 parent,
+ u64 wanted_disk_byte, int count,
+ struct share_check *sc, gfp_t gfp_mask)
+{
+ return add_prelim_ref(fs_info, &preftrees->direct, 0, NULL, level,
+ parent, wanted_disk_byte, count, sc, gfp_mask);
+}
+
+/* indirect refs use parent == 0 */
+static int add_indirect_ref(const struct btrfs_fs_info *fs_info,
+ struct preftrees *preftrees, u64 root_id,
+ const struct btrfs_key *key, int level,
+ u64 wanted_disk_byte, int count,
+ struct share_check *sc, gfp_t gfp_mask)
+{
+ struct preftree *tree = &preftrees->indirect;
+
+ if (!key)
+ tree = &preftrees->indirect_missing_keys;
+ return add_prelim_ref(fs_info, tree, root_id, key, level, 0,
+ wanted_disk_byte, count, sc, gfp_mask);
+}
+
+static int is_shared_data_backref(struct preftrees *preftrees, u64 bytenr)
+{
+ struct rb_node **p = &preftrees->direct.root.rb_root.rb_node;
+ struct rb_node *parent = NULL;
+ struct prelim_ref *ref = NULL;
+ struct prelim_ref target = {};
+ int result;
+
+ target.parent = bytenr;
+
+ while (*p) {
+ parent = *p;
+ ref = rb_entry(parent, struct prelim_ref, rbnode);
+ result = prelim_ref_compare(ref, &target);
+
+ if (result < 0)
+ p = &(*p)->rb_left;
+ else if (result > 0)
+ p = &(*p)->rb_right;
+ else
+ return 1;
+ }
+ return 0;
+}
+
+static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
+ struct ulist *parents,
+ struct preftrees *preftrees, struct prelim_ref *ref,
+ int level, u64 time_seq, const u64 *extent_item_pos,
+ bool ignore_offset)
+{
+ int ret = 0;
+ int slot;
+ struct extent_buffer *eb;
+ struct btrfs_key key;
+ struct btrfs_key *key_for_search = &ref->key_for_search;
+ struct btrfs_file_extent_item *fi;
+ struct extent_inode_elem *eie = NULL, *old = NULL;
+ u64 disk_byte;
+ u64 wanted_disk_byte = ref->wanted_disk_byte;
+ u64 count = 0;
+ u64 data_offset;
+ u8 type;
+
+ if (level != 0) {
+ eb = path->nodes[level];
+ ret = ulist_add(parents, eb->start, 0, GFP_NOFS);
+ if (ret < 0)
+ return ret;
+ return 0;
+ }
+
+ /*
+ * 1. We normally enter this function with the path already pointing to
+ * the first item to check. But sometimes, we may enter it with
+ * slot == nritems.
+ * 2. We are searching for normal backref but bytenr of this leaf
+ * matches shared data backref
+ * 3. The leaf owner is not equal to the root we are searching
+ *
+ * For these cases, go to the next leaf before we continue.
+ */
+ eb = path->nodes[0];
+ if (path->slots[0] >= btrfs_header_nritems(eb) ||
+ is_shared_data_backref(preftrees, eb->start) ||
+ ref->root_id != btrfs_header_owner(eb)) {
+ if (time_seq == BTRFS_SEQ_LAST)
+ ret = btrfs_next_leaf(root, path);
+ else
+ ret = btrfs_next_old_leaf(root, path, time_seq);
+ }
+
+ while (!ret && count < ref->count) {
+ eb = path->nodes[0];
+ slot = path->slots[0];
+
+ btrfs_item_key_to_cpu(eb, &key, slot);
+
+ if (key.objectid != key_for_search->objectid ||
+ key.type != BTRFS_EXTENT_DATA_KEY)
+ break;
+
+ /*
+ * We are searching for normal backref but bytenr of this leaf
+ * matches shared data backref, OR
+ * the leaf owner is not equal to the root we are searching for
+ */
+ if (slot == 0 &&
+ (is_shared_data_backref(preftrees, eb->start) ||
+ ref->root_id != btrfs_header_owner(eb))) {
+ if (time_seq == BTRFS_SEQ_LAST)
+ ret = btrfs_next_leaf(root, path);
+ else
+ ret = btrfs_next_old_leaf(root, path, time_seq);
+ continue;
+ }
+ fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+ type = btrfs_file_extent_type(eb, fi);
+ if (type == BTRFS_FILE_EXTENT_INLINE)
+ goto next;
+ disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+ data_offset = btrfs_file_extent_offset(eb, fi);
+
+ if (disk_byte == wanted_disk_byte) {
+ eie = NULL;
+ old = NULL;
+ if (ref->key_for_search.offset == key.offset - data_offset)
+ count++;
+ else
+ goto next;
+ if (extent_item_pos) {
+ ret = check_extent_in_eb(&key, eb, fi,
+ *extent_item_pos,
+ &eie, ignore_offset);
+ if (ret < 0)
+ break;
+ }
+ if (ret > 0)
+ goto next;
+ ret = ulist_add_merge_ptr(parents, eb->start,
+ eie, (void **)&old, GFP_NOFS);
+ if (ret < 0)
+ break;
+ if (!ret && extent_item_pos) {
+ while (old->next)
+ old = old->next;
+ old->next = eie;
+ }
+ eie = NULL;
+ }
+next:
+ if (time_seq == BTRFS_SEQ_LAST)
+ ret = btrfs_next_item(root, path);
+ else
+ ret = btrfs_next_old_item(root, path, time_seq);
+ }
+
+ if (ret > 0)
+ ret = 0;
+ else if (ret < 0)
+ free_inode_elem_list(eie);
+ return ret;
+}
+
+/*
+ * resolve an indirect backref in the form (root_id, key, level)
+ * to a logical address
+ */
+static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path, u64 time_seq,
+ struct preftrees *preftrees,
+ struct prelim_ref *ref, struct ulist *parents,
+ const u64 *extent_item_pos, bool ignore_offset)
+{
+ struct btrfs_root *root;
+ struct extent_buffer *eb;
+ int ret = 0;
+ int root_level;
+ int level = ref->level;
+ struct btrfs_key search_key = ref->key_for_search;
+
+ /*
+ * If we're search_commit_root we could possibly be holding locks on
+ * other tree nodes. This happens when qgroups does backref walks when
+ * adding new delayed refs. To deal with this we need to look in cache
+ * for the root, and if we don't find it then we need to search the
+ * tree_root's commit root, thus the btrfs_get_fs_root_commit_root usage
+ * here.
+ */
+ if (path->search_commit_root)
+ root = btrfs_get_fs_root_commit_root(fs_info, path, ref->root_id);
+ else
+ root = btrfs_get_fs_root(fs_info, ref->root_id, false);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ goto out_free;
+ }
+
+ if (!path->search_commit_root &&
+ test_bit(BTRFS_ROOT_DELETING, &root->state)) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ if (btrfs_is_testing(fs_info)) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ if (path->search_commit_root)
+ root_level = btrfs_header_level(root->commit_root);
+ else if (time_seq == BTRFS_SEQ_LAST)
+ root_level = btrfs_header_level(root->node);
+ else
+ root_level = btrfs_old_root_level(root, time_seq);
+
+ if (root_level + 1 == level)
+ goto out;
+
+ /*
+ * We can often find data backrefs with an offset that is too large
+ * (>= LLONG_MAX, maximum allowed file offset) due to underflows when
+ * subtracting a file's offset with the data offset of its
+ * corresponding extent data item. This can happen for example in the
+ * clone ioctl.
+ *
+ * So if we detect such case we set the search key's offset to zero to
+ * make sure we will find the matching file extent item at
+ * add_all_parents(), otherwise we will miss it because the offset
+ * taken form the backref is much larger then the offset of the file
+ * extent item. This can make us scan a very large number of file
+ * extent items, but at least it will not make us miss any.
+ *
+ * This is an ugly workaround for a behaviour that should have never
+ * existed, but it does and a fix for the clone ioctl would touch a lot
+ * of places, cause backwards incompatibility and would not fix the
+ * problem for extents cloned with older kernels.
+ */
+ if (search_key.type == BTRFS_EXTENT_DATA_KEY &&
+ search_key.offset >= LLONG_MAX)
+ search_key.offset = 0;
+ path->lowest_level = level;
+ if (time_seq == BTRFS_SEQ_LAST)
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ else
+ ret = btrfs_search_old_slot(root, &search_key, path, time_seq);
+
+ btrfs_debug(fs_info,
+ "search slot in root %llu (level %d, ref count %d) returned %d for key (%llu %u %llu)",
+ ref->root_id, level, ref->count, ret,
+ ref->key_for_search.objectid, ref->key_for_search.type,
+ ref->key_for_search.offset);
+ if (ret < 0)
+ goto out;
+
+ eb = path->nodes[level];
+ while (!eb) {
+ if (WARN_ON(!level)) {
+ ret = 1;
+ goto out;
+ }
+ level--;
+ eb = path->nodes[level];
+ }
+
+ ret = add_all_parents(root, path, parents, preftrees, ref, level,
+ time_seq, extent_item_pos, ignore_offset);
+out:
+ btrfs_put_root(root);
+out_free:
+ path->lowest_level = 0;
+ btrfs_release_path(path);
+ return ret;
+}
+
+static struct extent_inode_elem *
+unode_aux_to_inode_list(struct ulist_node *node)
+{
+ if (!node)
+ return NULL;
+ return (struct extent_inode_elem *)(uintptr_t)node->aux;
+}
+
+static void free_leaf_list(struct ulist *ulist)
+{
+ struct ulist_node *node;
+ struct ulist_iterator uiter;
+
+ ULIST_ITER_INIT(&uiter);
+ while ((node = ulist_next(ulist, &uiter)))
+ free_inode_elem_list(unode_aux_to_inode_list(node));
+
+ ulist_free(ulist);
+}
+
+/*
+ * We maintain three separate rbtrees: one for direct refs, one for
+ * indirect refs which have a key, and one for indirect refs which do not
+ * have a key. Each tree does merge on insertion.
+ *
+ * Once all of the references are located, we iterate over the tree of
+ * indirect refs with missing keys. An appropriate key is located and
+ * the ref is moved onto the tree for indirect refs. After all missing
+ * keys are thus located, we iterate over the indirect ref tree, resolve
+ * each reference, and then insert the resolved reference onto the
+ * direct tree (merging there too).
+ *
+ * New backrefs (i.e., for parent nodes) are added to the appropriate
+ * rbtree as they are encountered. The new backrefs are subsequently
+ * resolved as above.
+ */
+static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path, u64 time_seq,
+ struct preftrees *preftrees,
+ const u64 *extent_item_pos,
+ struct share_check *sc, bool ignore_offset)
+{
+ int err;
+ int ret = 0;
+ struct ulist *parents;
+ struct ulist_node *node;
+ struct ulist_iterator uiter;
+ struct rb_node *rnode;
+
+ parents = ulist_alloc(GFP_NOFS);
+ if (!parents)
+ return -ENOMEM;
+
+ /*
+ * We could trade memory usage for performance here by iterating
+ * the tree, allocating new refs for each insertion, and then
+ * freeing the entire indirect tree when we're done. In some test
+ * cases, the tree can grow quite large (~200k objects).
+ */
+ while ((rnode = rb_first_cached(&preftrees->indirect.root))) {
+ struct prelim_ref *ref;
+
+ ref = rb_entry(rnode, struct prelim_ref, rbnode);
+ if (WARN(ref->parent,
+ "BUG: direct ref found in indirect tree")) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ rb_erase_cached(&ref->rbnode, &preftrees->indirect.root);
+ preftrees->indirect.count--;
+
+ if (ref->count == 0) {
+ free_pref(ref);
+ continue;
+ }
+
+ if (sc && sc->root_objectid &&
+ ref->root_id != sc->root_objectid) {
+ free_pref(ref);
+ ret = BACKREF_FOUND_SHARED;
+ goto out;
+ }
+ err = resolve_indirect_ref(fs_info, path, time_seq, preftrees,
+ ref, parents, extent_item_pos,
+ ignore_offset);
+ /*
+ * we can only tolerate ENOENT,otherwise,we should catch error
+ * and return directly.
+ */
+ if (err == -ENOENT) {
+ prelim_ref_insert(fs_info, &preftrees->direct, ref,
+ NULL);
+ continue;
+ } else if (err) {
+ free_pref(ref);
+ ret = err;
+ goto out;
+ }
+
+ /* we put the first parent into the ref at hand */
+ ULIST_ITER_INIT(&uiter);
+ node = ulist_next(parents, &uiter);
+ ref->parent = node ? node->val : 0;
+ ref->inode_list = unode_aux_to_inode_list(node);
+
+ /* Add a prelim_ref(s) for any other parent(s). */
+ while ((node = ulist_next(parents, &uiter))) {
+ struct prelim_ref *new_ref;
+
+ new_ref = kmem_cache_alloc(btrfs_prelim_ref_cache,
+ GFP_NOFS);
+ if (!new_ref) {
+ free_pref(ref);
+ ret = -ENOMEM;
+ goto out;
+ }
+ memcpy(new_ref, ref, sizeof(*ref));
+ new_ref->parent = node->val;
+ new_ref->inode_list = unode_aux_to_inode_list(node);
+ prelim_ref_insert(fs_info, &preftrees->direct,
+ new_ref, NULL);
+ }
+
+ /*
+ * Now it's a direct ref, put it in the direct tree. We must
+ * do this last because the ref could be merged/freed here.
+ */
+ prelim_ref_insert(fs_info, &preftrees->direct, ref, NULL);
+
+ ulist_reinit(parents);
+ cond_resched();
+ }
+out:
+ /*
+ * We may have inode lists attached to refs in the parents ulist, so we
+ * must free them before freeing the ulist and its refs.
+ */
+ free_leaf_list(parents);
+ return ret;
+}
+
+/*
+ * read tree blocks and add keys where required.
+ */
+static int add_missing_keys(struct btrfs_fs_info *fs_info,
+ struct preftrees *preftrees, bool lock)
+{
+ struct prelim_ref *ref;
+ struct extent_buffer *eb;
+ struct preftree *tree = &preftrees->indirect_missing_keys;
+ struct rb_node *node;
+
+ while ((node = rb_first_cached(&tree->root))) {
+ ref = rb_entry(node, struct prelim_ref, rbnode);
+ rb_erase_cached(node, &tree->root);
+
+ BUG_ON(ref->parent); /* should not be a direct ref */
+ BUG_ON(ref->key_for_search.type);
+ BUG_ON(!ref->wanted_disk_byte);
+
+ eb = read_tree_block(fs_info, ref->wanted_disk_byte,
+ ref->root_id, 0, ref->level - 1, NULL);
+ if (IS_ERR(eb)) {
+ free_pref(ref);
+ return PTR_ERR(eb);
+ }
+ if (!extent_buffer_uptodate(eb)) {
+ free_pref(ref);
+ free_extent_buffer(eb);
+ return -EIO;
+ }
+
+ if (lock)
+ btrfs_tree_read_lock(eb);
+ if (btrfs_header_level(eb) == 0)
+ btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
+ else
+ btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
+ if (lock)
+ btrfs_tree_read_unlock(eb);
+ free_extent_buffer(eb);
+ prelim_ref_insert(fs_info, &preftrees->indirect, ref, NULL);
+ cond_resched();
+ }
+ return 0;
+}
+
+/*
+ * add all currently queued delayed refs from this head whose seq nr is
+ * smaller or equal that seq to the list
+ */
+static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_head *head, u64 seq,
+ struct preftrees *preftrees, struct share_check *sc)
+{
+ struct btrfs_delayed_ref_node *node;
+ struct btrfs_key key;
+ struct rb_node *n;
+ int count;
+ int ret = 0;
+
+ spin_lock(&head->lock);
+ for (n = rb_first_cached(&head->ref_tree); n; n = rb_next(n)) {
+ node = rb_entry(n, struct btrfs_delayed_ref_node,
+ ref_node);
+ if (node->seq > seq)
+ continue;
+
+ switch (node->action) {
+ case BTRFS_ADD_DELAYED_EXTENT:
+ case BTRFS_UPDATE_DELAYED_HEAD:
+ WARN_ON(1);
+ continue;
+ case BTRFS_ADD_DELAYED_REF:
+ count = node->ref_mod;
+ break;
+ case BTRFS_DROP_DELAYED_REF:
+ count = node->ref_mod * -1;
+ break;
+ default:
+ BUG();
+ }
+ switch (node->type) {
+ case BTRFS_TREE_BLOCK_REF_KEY: {
+ /* NORMAL INDIRECT METADATA backref */
+ struct btrfs_delayed_tree_ref *ref;
+ struct btrfs_key *key_ptr = NULL;
+
+ if (head->extent_op && head->extent_op->update_key) {
+ btrfs_disk_key_to_cpu(&key, &head->extent_op->key);
+ key_ptr = &key;
+ }
+
+ ref = btrfs_delayed_node_to_tree_ref(node);
+ ret = add_indirect_ref(fs_info, preftrees, ref->root,
+ key_ptr, ref->level + 1,
+ node->bytenr, count, sc,
+ GFP_ATOMIC);
+ break;
+ }
+ case BTRFS_SHARED_BLOCK_REF_KEY: {
+ /* SHARED DIRECT METADATA backref */
+ struct btrfs_delayed_tree_ref *ref;
+
+ ref = btrfs_delayed_node_to_tree_ref(node);
+
+ ret = add_direct_ref(fs_info, preftrees, ref->level + 1,
+ ref->parent, node->bytenr, count,
+ sc, GFP_ATOMIC);
+ break;
+ }
+ case BTRFS_EXTENT_DATA_REF_KEY: {
+ /* NORMAL INDIRECT DATA backref */
+ struct btrfs_delayed_data_ref *ref;
+ ref = btrfs_delayed_node_to_data_ref(node);
+
+ key.objectid = ref->objectid;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = ref->offset;
+
+ /*
+ * If we have a share check context and a reference for
+ * another inode, we can't exit immediately. This is
+ * because even if this is a BTRFS_ADD_DELAYED_REF
+ * reference we may find next a BTRFS_DROP_DELAYED_REF
+ * which cancels out this ADD reference.
+ *
+ * If this is a DROP reference and there was no previous
+ * ADD reference, then we need to signal that when we
+ * process references from the extent tree (through
+ * add_inline_refs() and add_keyed_refs()), we should
+ * not exit early if we find a reference for another
+ * inode, because one of the delayed DROP references
+ * may cancel that reference in the extent tree.
+ */
+ if (sc && count < 0)
+ sc->have_delayed_delete_refs = true;
+
+ ret = add_indirect_ref(fs_info, preftrees, ref->root,
+ &key, 0, node->bytenr, count, sc,
+ GFP_ATOMIC);
+ break;
+ }
+ case BTRFS_SHARED_DATA_REF_KEY: {
+ /* SHARED DIRECT FULL backref */
+ struct btrfs_delayed_data_ref *ref;
+
+ ref = btrfs_delayed_node_to_data_ref(node);
+
+ ret = add_direct_ref(fs_info, preftrees, 0, ref->parent,
+ node->bytenr, count, sc,
+ GFP_ATOMIC);
+ break;
+ }
+ default:
+ WARN_ON(1);
+ }
+ /*
+ * We must ignore BACKREF_FOUND_SHARED until all delayed
+ * refs have been checked.
+ */
+ if (ret && (ret != BACKREF_FOUND_SHARED))
+ break;
+ }
+ if (!ret)
+ ret = extent_is_shared(sc);
+
+ spin_unlock(&head->lock);
+ return ret;
+}
+
+/*
+ * add all inline backrefs for bytenr to the list
+ *
+ * Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED.
+ */
+static int add_inline_refs(const struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path, u64 bytenr,
+ int *info_level, struct preftrees *preftrees,
+ struct share_check *sc)
+{
+ int ret = 0;
+ int slot;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ unsigned long ptr;
+ unsigned long end;
+ struct btrfs_extent_item *ei;
+ u64 flags;
+ u64 item_size;
+
+ /*
+ * enumerate all inline refs
+ */
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ item_size = btrfs_item_size(leaf, slot);
+ BUG_ON(item_size < sizeof(*ei));
+
+ ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+ flags = btrfs_extent_flags(leaf, ei);
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+ ptr = (unsigned long)(ei + 1);
+ end = (unsigned long)ei + item_size;
+
+ if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
+ flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ struct btrfs_tree_block_info *info;
+
+ info = (struct btrfs_tree_block_info *)ptr;
+ *info_level = btrfs_tree_block_level(leaf, info);
+ ptr += sizeof(struct btrfs_tree_block_info);
+ BUG_ON(ptr > end);
+ } else if (found_key.type == BTRFS_METADATA_ITEM_KEY) {
+ *info_level = found_key.offset;
+ } else {
+ BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
+ }
+
+ while (ptr < end) {
+ struct btrfs_extent_inline_ref *iref;
+ u64 offset;
+ int type;
+
+ iref = (struct btrfs_extent_inline_ref *)ptr;
+ type = btrfs_get_extent_inline_ref_type(leaf, iref,
+ BTRFS_REF_TYPE_ANY);
+ if (type == BTRFS_REF_TYPE_INVALID)
+ return -EUCLEAN;
+
+ offset = btrfs_extent_inline_ref_offset(leaf, iref);
+
+ switch (type) {
+ case BTRFS_SHARED_BLOCK_REF_KEY:
+ ret = add_direct_ref(fs_info, preftrees,
+ *info_level + 1, offset,
+ bytenr, 1, NULL, GFP_NOFS);
+ break;
+ case BTRFS_SHARED_DATA_REF_KEY: {
+ struct btrfs_shared_data_ref *sdref;
+ int count;
+
+ sdref = (struct btrfs_shared_data_ref *)(iref + 1);
+ count = btrfs_shared_data_ref_count(leaf, sdref);
+
+ ret = add_direct_ref(fs_info, preftrees, 0, offset,
+ bytenr, count, sc, GFP_NOFS);
+ break;
+ }
+ case BTRFS_TREE_BLOCK_REF_KEY:
+ ret = add_indirect_ref(fs_info, preftrees, offset,
+ NULL, *info_level + 1,
+ bytenr, 1, NULL, GFP_NOFS);
+ break;
+ case BTRFS_EXTENT_DATA_REF_KEY: {
+ struct btrfs_extent_data_ref *dref;
+ int count;
+ u64 root;
+
+ dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+ count = btrfs_extent_data_ref_count(leaf, dref);
+ key.objectid = btrfs_extent_data_ref_objectid(leaf,
+ dref);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+
+ if (sc && sc->inum && key.objectid != sc->inum &&
+ !sc->have_delayed_delete_refs) {
+ ret = BACKREF_FOUND_SHARED;
+ break;
+ }
+
+ root = btrfs_extent_data_ref_root(leaf, dref);
+
+ ret = add_indirect_ref(fs_info, preftrees, root,
+ &key, 0, bytenr, count,
+ sc, GFP_NOFS);
+
+ break;
+ }
+ default:
+ WARN_ON(1);
+ }
+ if (ret)
+ return ret;
+ ptr += btrfs_extent_inline_ref_size(type);
+ }
+
+ return 0;
+}
+
+/*
+ * add all non-inline backrefs for bytenr to the list
+ *
+ * Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED.
+ */
+static int add_keyed_refs(struct btrfs_root *extent_root,
+ struct btrfs_path *path, u64 bytenr,
+ int info_level, struct preftrees *preftrees,
+ struct share_check *sc)
+{
+ struct btrfs_fs_info *fs_info = extent_root->fs_info;
+ int ret;
+ int slot;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+
+ while (1) {
+ ret = btrfs_next_item(extent_root, path);
+ if (ret < 0)
+ break;
+ if (ret) {
+ ret = 0;
+ break;
+ }
+
+ slot = path->slots[0];
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ if (key.objectid != bytenr)
+ break;
+ if (key.type < BTRFS_TREE_BLOCK_REF_KEY)
+ continue;
+ if (key.type > BTRFS_SHARED_DATA_REF_KEY)
+ break;
+
+ switch (key.type) {
+ case BTRFS_SHARED_BLOCK_REF_KEY:
+ /* SHARED DIRECT METADATA backref */
+ ret = add_direct_ref(fs_info, preftrees,
+ info_level + 1, key.offset,
+ bytenr, 1, NULL, GFP_NOFS);
+ break;
+ case BTRFS_SHARED_DATA_REF_KEY: {
+ /* SHARED DIRECT FULL backref */
+ struct btrfs_shared_data_ref *sdref;
+ int count;
+
+ sdref = btrfs_item_ptr(leaf, slot,
+ struct btrfs_shared_data_ref);
+ count = btrfs_shared_data_ref_count(leaf, sdref);
+ ret = add_direct_ref(fs_info, preftrees, 0,
+ key.offset, bytenr, count,
+ sc, GFP_NOFS);
+ break;
+ }
+ case BTRFS_TREE_BLOCK_REF_KEY:
+ /* NORMAL INDIRECT METADATA backref */
+ ret = add_indirect_ref(fs_info, preftrees, key.offset,
+ NULL, info_level + 1, bytenr,
+ 1, NULL, GFP_NOFS);
+ break;
+ case BTRFS_EXTENT_DATA_REF_KEY: {
+ /* NORMAL INDIRECT DATA backref */
+ struct btrfs_extent_data_ref *dref;
+ int count;
+ u64 root;
+
+ dref = btrfs_item_ptr(leaf, slot,
+ struct btrfs_extent_data_ref);
+ count = btrfs_extent_data_ref_count(leaf, dref);
+ key.objectid = btrfs_extent_data_ref_objectid(leaf,
+ dref);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+
+ if (sc && sc->inum && key.objectid != sc->inum &&
+ !sc->have_delayed_delete_refs) {
+ ret = BACKREF_FOUND_SHARED;
+ break;
+ }
+
+ root = btrfs_extent_data_ref_root(leaf, dref);
+ ret = add_indirect_ref(fs_info, preftrees, root,
+ &key, 0, bytenr, count,
+ sc, GFP_NOFS);
+ break;
+ }
+ default:
+ WARN_ON(1);
+ }
+ if (ret)
+ return ret;
+
+ }
+
+ return ret;
+}
+
+/*
+ * this adds all existing backrefs (inline backrefs, backrefs and delayed
+ * refs) for the given bytenr to the refs list, merges duplicates and resolves
+ * indirect refs to their parent bytenr.
+ * When roots are found, they're added to the roots list
+ *
+ * If time_seq is set to BTRFS_SEQ_LAST, it will not search delayed_refs, and
+ * behave much like trans == NULL case, the difference only lies in it will not
+ * commit root.
+ * The special case is for qgroup to search roots in commit_transaction().
+ *
+ * @sc - if !NULL, then immediately return BACKREF_FOUND_SHARED when a
+ * shared extent is detected.
+ *
+ * Otherwise this returns 0 for success and <0 for an error.
+ *
+ * If ignore_offset is set to false, only extent refs whose offsets match
+ * extent_item_pos are returned. If true, every extent ref is returned
+ * and extent_item_pos is ignored.
+ *
+ * FIXME some caching might speed things up
+ */
+static int find_parent_nodes(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist *refs,
+ struct ulist *roots, const u64 *extent_item_pos,
+ struct share_check *sc, bool ignore_offset)
+{
+ struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr);
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ struct btrfs_delayed_ref_root *delayed_refs = NULL;
+ struct btrfs_delayed_ref_head *head;
+ int info_level = 0;
+ int ret;
+ struct prelim_ref *ref;
+ struct rb_node *node;
+ struct extent_inode_elem *eie = NULL;
+ struct preftrees preftrees = {
+ .direct = PREFTREE_INIT,
+ .indirect = PREFTREE_INIT,
+ .indirect_missing_keys = PREFTREE_INIT
+ };
+
+ key.objectid = bytenr;
+ key.offset = (u64)-1;
+ if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ else
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ if (!trans) {
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+ }
+
+ if (time_seq == BTRFS_SEQ_LAST)
+ path->skip_locking = 1;
+
+again:
+ head = NULL;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ if (ret == 0) {
+ /* This shouldn't happen, indicates a bug or fs corruption. */
+ ASSERT(ret != 0);
+ ret = -EUCLEAN;
+ goto out;
+ }
+
+ if (trans && likely(trans->type != __TRANS_DUMMY) &&
+ time_seq != BTRFS_SEQ_LAST) {
+ /*
+ * We have a specific time_seq we care about and trans which
+ * means we have the path lock, we need to grab the ref head and
+ * lock it so we have a consistent view of the refs at the given
+ * time.
+ */
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+ head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
+ if (head) {
+ if (!mutex_trylock(&head->mutex)) {
+ refcount_inc(&head->refs);
+ spin_unlock(&delayed_refs->lock);
+
+ btrfs_release_path(path);
+
+ /*
+ * Mutex was contended, block until it's
+ * released and try again
+ */
+ mutex_lock(&head->mutex);
+ mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref_head(head);
+ goto again;
+ }
+ spin_unlock(&delayed_refs->lock);
+ ret = add_delayed_refs(fs_info, head, time_seq,
+ &preftrees, sc);
+ mutex_unlock(&head->mutex);
+ if (ret)
+ goto out;
+ } else {
+ spin_unlock(&delayed_refs->lock);
+ }
+ }
+
+ if (path->slots[0]) {
+ struct extent_buffer *leaf;
+ int slot;
+
+ path->slots[0]--;
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid == bytenr &&
+ (key.type == BTRFS_EXTENT_ITEM_KEY ||
+ key.type == BTRFS_METADATA_ITEM_KEY)) {
+ ret = add_inline_refs(fs_info, path, bytenr,
+ &info_level, &preftrees, sc);
+ if (ret)
+ goto out;
+ ret = add_keyed_refs(root, path, bytenr, info_level,
+ &preftrees, sc);
+ if (ret)
+ goto out;
+ }
+ }
+
+ btrfs_release_path(path);
+
+ ret = add_missing_keys(fs_info, &preftrees, path->skip_locking == 0);
+ if (ret)
+ goto out;
+
+ WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root));
+
+ ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees,
+ extent_item_pos, sc, ignore_offset);
+ if (ret)
+ goto out;
+
+ WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect.root.rb_root));
+
+ /*
+ * This walks the tree of merged and resolved refs. Tree blocks are
+ * read in as needed. Unique entries are added to the ulist, and
+ * the list of found roots is updated.
+ *
+ * We release the entire tree in one go before returning.
+ */
+ node = rb_first_cached(&preftrees.direct.root);
+ while (node) {
+ ref = rb_entry(node, struct prelim_ref, rbnode);
+ node = rb_next(&ref->rbnode);
+ /*
+ * ref->count < 0 can happen here if there are delayed
+ * refs with a node->action of BTRFS_DROP_DELAYED_REF.
+ * prelim_ref_insert() relies on this when merging
+ * identical refs to keep the overall count correct.
+ * prelim_ref_insert() will merge only those refs
+ * which compare identically. Any refs having
+ * e.g. different offsets would not be merged,
+ * and would retain their original ref->count < 0.
+ */
+ if (roots && ref->count && ref->root_id && ref->parent == 0) {
+ if (sc && sc->root_objectid &&
+ ref->root_id != sc->root_objectid) {
+ ret = BACKREF_FOUND_SHARED;
+ goto out;
+ }
+
+ /* no parent == root of tree */
+ ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
+ if (ret < 0)
+ goto out;
+ }
+ if (ref->count && ref->parent) {
+ if (extent_item_pos && !ref->inode_list &&
+ ref->level == 0) {
+ struct extent_buffer *eb;
+
+ eb = read_tree_block(fs_info, ref->parent, 0,
+ 0, ref->level, NULL);
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
+ goto out;
+ }
+ if (!extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ ret = -EIO;
+ goto out;
+ }
+
+ if (!path->skip_locking)
+ btrfs_tree_read_lock(eb);
+ ret = find_extent_in_eb(eb, bytenr,
+ *extent_item_pos, &eie, ignore_offset);
+ if (!path->skip_locking)
+ btrfs_tree_read_unlock(eb);
+ free_extent_buffer(eb);
+ if (ret < 0)
+ goto out;
+ ref->inode_list = eie;
+ /*
+ * We transferred the list ownership to the ref,
+ * so set to NULL to avoid a double free in case
+ * an error happens after this.
+ */
+ eie = NULL;
+ }
+ ret = ulist_add_merge_ptr(refs, ref->parent,
+ ref->inode_list,
+ (void **)&eie, GFP_NOFS);
+ if (ret < 0)
+ goto out;
+ if (!ret && extent_item_pos) {
+ /*
+ * We've recorded that parent, so we must extend
+ * its inode list here.
+ *
+ * However if there was corruption we may not
+ * have found an eie, return an error in this
+ * case.
+ */
+ ASSERT(eie);
+ if (!eie) {
+ ret = -EUCLEAN;
+ goto out;
+ }
+ while (eie->next)
+ eie = eie->next;
+ eie->next = ref->inode_list;
+ }
+ eie = NULL;
+ /*
+ * We have transferred the inode list ownership from
+ * this ref to the ref we added to the 'refs' ulist.
+ * So set this ref's inode list to NULL to avoid
+ * use-after-free when our caller uses it or double
+ * frees in case an error happens before we return.
+ */
+ ref->inode_list = NULL;
+ }
+ cond_resched();
+ }
+
+out:
+ btrfs_free_path(path);
+
+ prelim_release(&preftrees.direct);
+ prelim_release(&preftrees.indirect);
+ prelim_release(&preftrees.indirect_missing_keys);
+
+ if (ret < 0)
+ free_inode_elem_list(eie);
+ return ret;
+}
+
+/*
+ * Finds all leafs with a reference to the specified combination of bytenr and
+ * offset. key_list_head will point to a list of corresponding keys (caller must
+ * free each list element). The leafs will be stored in the leafs ulist, which
+ * must be freed with ulist_free.
+ *
+ * returns 0 on success, <0 on error
+ */
+int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist **leafs,
+ const u64 *extent_item_pos, bool ignore_offset)
+{
+ int ret;
+
+ *leafs = ulist_alloc(GFP_NOFS);
+ if (!*leafs)
+ return -ENOMEM;
+
+ ret = find_parent_nodes(trans, fs_info, bytenr, time_seq,
+ *leafs, NULL, extent_item_pos, NULL, ignore_offset);
+ if (ret < 0 && ret != -ENOENT) {
+ free_leaf_list(*leafs);
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * walk all backrefs for a given extent to find all roots that reference this
+ * extent. Walking a backref means finding all extents that reference this
+ * extent and in turn walk the backrefs of those, too. Naturally this is a
+ * recursive process, but here it is implemented in an iterative fashion: We
+ * find all referencing extents for the extent in question and put them on a
+ * list. In turn, we find all referencing extents for those, further appending
+ * to the list. The way we iterate the list allows adding more elements after
+ * the current while iterating. The process stops when we reach the end of the
+ * list. Found roots are added to the roots list.
+ *
+ * returns 0 on success, < 0 on error.
+ */
+static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist **roots,
+ bool ignore_offset)
+{
+ struct ulist *tmp;
+ struct ulist_node *node = NULL;
+ struct ulist_iterator uiter;
+ int ret;
+
+ tmp = ulist_alloc(GFP_NOFS);
+ if (!tmp)
+ return -ENOMEM;
+ *roots = ulist_alloc(GFP_NOFS);
+ if (!*roots) {
+ ulist_free(tmp);
+ return -ENOMEM;
+ }
+
+ ULIST_ITER_INIT(&uiter);
+ while (1) {
+ ret = find_parent_nodes(trans, fs_info, bytenr, time_seq,
+ tmp, *roots, NULL, NULL, ignore_offset);
+ if (ret < 0 && ret != -ENOENT) {
+ ulist_free(tmp);
+ ulist_free(*roots);
+ *roots = NULL;
+ return ret;
+ }
+ node = ulist_next(tmp, &uiter);
+ if (!node)
+ break;
+ bytenr = node->val;
+ cond_resched();
+ }
+
+ ulist_free(tmp);
+ return 0;
+}
+
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist **roots,
+ bool skip_commit_root_sem)
+{
+ int ret;
+
+ if (!trans && !skip_commit_root_sem)
+ down_read(&fs_info->commit_root_sem);
+ ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr,
+ time_seq, roots, false);
+ if (!trans && !skip_commit_root_sem)
+ up_read(&fs_info->commit_root_sem);
+ return ret;
+}
+
+/*
+ * The caller has joined a transaction or is holding a read lock on the
+ * fs_info->commit_root_sem semaphore, so no need to worry about the root's last
+ * snapshot field changing while updating or checking the cache.
+ */
+static bool lookup_backref_shared_cache(struct btrfs_backref_shared_cache *cache,
+ struct btrfs_root *root,
+ u64 bytenr, int level, bool *is_shared)
+{
+ struct btrfs_backref_shared_cache_entry *entry;
+
+ if (!cache->use_cache)
+ return false;
+
+ if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL))
+ return false;
+
+ /*
+ * Level -1 is used for the data extent, which is not reliable to cache
+ * because its reference count can increase or decrease without us
+ * realizing. We cache results only for extent buffers that lead from
+ * the root node down to the leaf with the file extent item.
+ */
+ ASSERT(level >= 0);
+
+ entry = &cache->entries[level];
+
+ /* Unused cache entry or being used for some other extent buffer. */
+ if (entry->bytenr != bytenr)
+ return false;
+
+ /*
+ * We cached a false result, but the last snapshot generation of the
+ * root changed, so we now have a snapshot. Don't trust the result.
+ */
+ if (!entry->is_shared &&
+ entry->gen != btrfs_root_last_snapshot(&root->root_item))
+ return false;
+
+ /*
+ * If we cached a true result and the last generation used for dropping
+ * a root changed, we can not trust the result, because the dropped root
+ * could be a snapshot sharing this extent buffer.
+ */
+ if (entry->is_shared &&
+ entry->gen != btrfs_get_last_root_drop_gen(root->fs_info))
+ return false;
+
+ *is_shared = entry->is_shared;
+ /*
+ * If the node at this level is shared, than all nodes below are also
+ * shared. Currently some of the nodes below may be marked as not shared
+ * because we have just switched from one leaf to another, and switched
+ * also other nodes above the leaf and below the current level, so mark
+ * them as shared.
+ */
+ if (*is_shared) {
+ for (int i = 0; i < level; i++) {
+ cache->entries[i].is_shared = true;
+ cache->entries[i].gen = entry->gen;
+ }
+ }
+
+ return true;
+}
+
+/*
+ * The caller has joined a transaction or is holding a read lock on the
+ * fs_info->commit_root_sem semaphore, so no need to worry about the root's last
+ * snapshot field changing while updating or checking the cache.
+ */
+static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache,
+ struct btrfs_root *root,
+ u64 bytenr, int level, bool is_shared)
+{
+ struct btrfs_backref_shared_cache_entry *entry;
+ u64 gen;
+
+ if (!cache->use_cache)
+ return;
+
+ if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL))
+ return;
+
+ /*
+ * Level -1 is used for the data extent, which is not reliable to cache
+ * because its reference count can increase or decrease without us
+ * realizing. We cache results only for extent buffers that lead from
+ * the root node down to the leaf with the file extent item.
+ */
+ ASSERT(level >= 0);
+
+ if (is_shared)
+ gen = btrfs_get_last_root_drop_gen(root->fs_info);
+ else
+ gen = btrfs_root_last_snapshot(&root->root_item);
+
+ entry = &cache->entries[level];
+ entry->bytenr = bytenr;
+ entry->is_shared = is_shared;
+ entry->gen = gen;
+
+ /*
+ * If we found an extent buffer is shared, set the cache result for all
+ * extent buffers below it to true. As nodes in the path are COWed,
+ * their sharedness is moved to their children, and if a leaf is COWed,
+ * then the sharedness of a data extent becomes direct, the refcount of
+ * data extent is increased in the extent item at the extent tree.
+ */
+ if (is_shared) {
+ for (int i = 0; i < level; i++) {
+ entry = &cache->entries[i];
+ entry->is_shared = is_shared;
+ entry->gen = gen;
+ }
+ }
+}
+
+/*
+ * Check if a data extent is shared or not.
+ *
+ * @root: The root the inode belongs to.
+ * @inum: Number of the inode whose extent we are checking.
+ * @bytenr: Logical bytenr of the extent we are checking.
+ * @extent_gen: Generation of the extent (file extent item) or 0 if it is
+ * not known.
+ * @roots: List of roots this extent is shared among.
+ * @tmp: Temporary list used for iteration.
+ * @cache: A backref lookup result cache.
+ *
+ * btrfs_is_data_extent_shared uses the backref walking code but will short
+ * circuit as soon as it finds a root or inode that doesn't match the
+ * one passed in. This provides a significant performance benefit for
+ * callers (such as fiemap) which want to know whether the extent is
+ * shared but do not need a ref count.
+ *
+ * This attempts to attach to the running transaction in order to account for
+ * delayed refs, but continues on even when no running transaction exists.
+ *
+ * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error.
+ */
+int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
+ u64 extent_gen,
+ struct ulist *roots, struct ulist *tmp,
+ struct btrfs_backref_shared_cache *cache)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_trans_handle *trans;
+ struct ulist_iterator uiter;
+ struct ulist_node *node;
+ struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
+ int ret = 0;
+ struct share_check shared = {
+ .root_objectid = root->root_key.objectid,
+ .inum = inum,
+ .share_count = 0,
+ .have_delayed_delete_refs = false,
+ };
+ int level;
+
+ ulist_init(roots);
+ ulist_init(tmp);
+
+ trans = btrfs_join_transaction_nostart(root);
+ if (IS_ERR(trans)) {
+ if (PTR_ERR(trans) != -ENOENT && PTR_ERR(trans) != -EROFS) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+ trans = NULL;
+ down_read(&fs_info->commit_root_sem);
+ } else {
+ btrfs_get_tree_mod_seq(fs_info, &elem);
+ }
+
+ /* -1 means we are in the bytenr of the data extent. */
+ level = -1;
+ ULIST_ITER_INIT(&uiter);
+ cache->use_cache = true;
+ while (1) {
+ bool is_shared;
+ bool cached;
+
+ ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp,
+ roots, NULL, &shared, false);
+ if (ret == BACKREF_FOUND_SHARED) {
+ /* this is the only condition under which we return 1 */
+ ret = 1;
+ if (level >= 0)
+ store_backref_shared_cache(cache, root, bytenr,
+ level, true);
+ break;
+ }
+ if (ret < 0 && ret != -ENOENT)
+ break;
+ ret = 0;
+ /*
+ * If our data extent is not shared through reflinks and it was
+ * created in a generation after the last one used to create a
+ * snapshot of the inode's root, then it can not be shared
+ * indirectly through subtrees, as that can only happen with
+ * snapshots. In this case bail out, no need to check for the
+ * sharedness of extent buffers.
+ */
+ if (level == -1 &&
+ extent_gen > btrfs_root_last_snapshot(&root->root_item))
+ break;
+
+ /*
+ * If our data extent was not directly shared (without multiple
+ * reference items), than it might have a single reference item
+ * with a count > 1 for the same offset, which means there are 2
+ * (or more) file extent items that point to the data extent -
+ * this happens when a file extent item needs to be split and
+ * then one item gets moved to another leaf due to a b+tree leaf
+ * split when inserting some item. In this case the file extent
+ * items may be located in different leaves and therefore some
+ * of the leaves may be referenced through shared subtrees while
+ * others are not. Since our extent buffer cache only works for
+ * a single path (by far the most common case and simpler to
+ * deal with), we can not use it if we have multiple leaves
+ * (which implies multiple paths).
+ */
+ if (level == -1 && tmp->nnodes > 1)
+ cache->use_cache = false;
+
+ if (level >= 0)
+ store_backref_shared_cache(cache, root, bytenr,
+ level, false);
+ node = ulist_next(tmp, &uiter);
+ if (!node)
+ break;
+ bytenr = node->val;
+ level++;
+ cached = lookup_backref_shared_cache(cache, root, bytenr, level,
+ &is_shared);
+ if (cached) {
+ ret = (is_shared ? 1 : 0);
+ break;
+ }
+ shared.share_count = 0;
+ shared.have_delayed_delete_refs = false;
+ cond_resched();
+ }
+
+ if (trans) {
+ btrfs_put_tree_mod_seq(fs_info, &elem);
+ btrfs_end_transaction(trans);
+ } else {
+ up_read(&fs_info->commit_root_sem);
+ }
+out:
+ ulist_release(roots);
+ ulist_release(tmp);
+ return ret;
+}
+
+int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
+ u64 start_off, struct btrfs_path *path,
+ struct btrfs_inode_extref **ret_extref,
+ u64 *found_off)
+{
+ int ret, slot;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_inode_extref *extref;
+ const struct extent_buffer *leaf;
+ unsigned long ptr;
+
+ key.objectid = inode_objectid;
+ key.type = BTRFS_INODE_EXTREF_KEY;
+ key.offset = start_off;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ while (1) {
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(leaf)) {
+ /*
+ * If the item at offset is not found,
+ * btrfs_search_slot will point us to the slot
+ * where it should be inserted. In our case
+ * that will be the slot directly before the
+ * next INODE_REF_KEY_V2 item. In the case
+ * that we're pointing to the last slot in a
+ * leaf, we must move one leaf over.
+ */
+ ret = btrfs_next_leaf(root, path);
+ if (ret) {
+ if (ret >= 1)
+ ret = -ENOENT;
+ break;
+ }
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+ /*
+ * Check that we're still looking at an extended ref key for
+ * this particular objectid. If we have different
+ * objectid or type then there are no more to be found
+ * in the tree and we can exit.
+ */
+ ret = -ENOENT;
+ if (found_key.objectid != inode_objectid)
+ break;
+ if (found_key.type != BTRFS_INODE_EXTREF_KEY)
+ break;
+
+ ret = 0;
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ extref = (struct btrfs_inode_extref *)ptr;
+ *ret_extref = extref;
+ if (found_off)
+ *found_off = found_key.offset;
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * this iterates to turn a name (from iref/extref) into a full filesystem path.
+ * Elements of the path are separated by '/' and the path is guaranteed to be
+ * 0-terminated. the path is only given within the current file system.
+ * Therefore, it never starts with a '/'. the caller is responsible to provide
+ * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
+ * the start point of the resulting string is returned. this pointer is within
+ * dest, normally.
+ * in case the path buffer would overflow, the pointer is decremented further
+ * as if output was written to the buffer, though no more output is actually
+ * generated. that way, the caller can determine how much space would be
+ * required for the path to fit into the buffer. in that case, the returned
+ * value will be smaller than dest. callers must check this!
+ */
+char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
+ u32 name_len, unsigned long name_off,
+ struct extent_buffer *eb_in, u64 parent,
+ char *dest, u32 size)
+{
+ int slot;
+ u64 next_inum;
+ int ret;
+ s64 bytes_left = ((s64)size) - 1;
+ struct extent_buffer *eb = eb_in;
+ struct btrfs_key found_key;
+ struct btrfs_inode_ref *iref;
+
+ if (bytes_left >= 0)
+ dest[bytes_left] = '\0';
+
+ while (1) {
+ bytes_left -= name_len;
+ if (bytes_left >= 0)
+ read_extent_buffer(eb, dest + bytes_left,
+ name_off, name_len);
+ if (eb != eb_in) {
+ if (!path->skip_locking)
+ btrfs_tree_read_unlock(eb);
+ free_extent_buffer(eb);
+ }
+ ret = btrfs_find_item(fs_root, path, parent, 0,
+ BTRFS_INODE_REF_KEY, &found_key);
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret)
+ break;
+
+ next_inum = found_key.offset;
+
+ /* regular exit ahead */
+ if (parent == next_inum)
+ break;
+
+ slot = path->slots[0];
+ eb = path->nodes[0];
+ /* make sure we can use eb after releasing the path */
+ if (eb != eb_in) {
+ path->nodes[0] = NULL;
+ path->locks[0] = 0;
+ }
+ btrfs_release_path(path);
+ iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
+
+ name_len = btrfs_inode_ref_name_len(eb, iref);
+ name_off = (unsigned long)(iref + 1);
+
+ parent = next_inum;
+ --bytes_left;
+ if (bytes_left >= 0)
+ dest[bytes_left] = '/';
+ }
+
+ btrfs_release_path(path);
+
+ if (ret)
+ return ERR_PTR(ret);
+
+ return dest + bytes_left;
+}
+
+/*
+ * this makes the path point to (logical EXTENT_ITEM *)
+ * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
+ * tree blocks and <0 on error.
+ */
+int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
+ struct btrfs_path *path, struct btrfs_key *found_key,
+ u64 *flags_ret)
+{
+ struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
+ int ret;
+ u64 flags;
+ u64 size = 0;
+ u32 item_size;
+ const struct extent_buffer *eb;
+ struct btrfs_extent_item *ei;
+ struct btrfs_key key;
+
+ if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ else
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.objectid = logical;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ ret = btrfs_previous_extent_item(extent_root, path, 0);
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+ return ret;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
+ if (found_key->type == BTRFS_METADATA_ITEM_KEY)
+ size = fs_info->nodesize;
+ else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
+ size = found_key->offset;
+
+ if (found_key->objectid > logical ||
+ found_key->objectid + size <= logical) {
+ btrfs_debug(fs_info,
+ "logical %llu is not within any extent", logical);
+ return -ENOENT;
+ }
+
+ eb = path->nodes[0];
+ item_size = btrfs_item_size(eb, path->slots[0]);
+ BUG_ON(item_size < sizeof(*ei));
+
+ ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
+ flags = btrfs_extent_flags(eb, ei);
+
+ btrfs_debug(fs_info,
+ "logical %llu is at position %llu within the extent (%llu EXTENT_ITEM %llu) flags %#llx size %u",
+ logical, logical - found_key->objectid, found_key->objectid,
+ found_key->offset, flags, item_size);
+
+ WARN_ON(!flags_ret);
+ if (flags_ret) {
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+ *flags_ret = BTRFS_EXTENT_FLAG_TREE_BLOCK;
+ else if (flags & BTRFS_EXTENT_FLAG_DATA)
+ *flags_ret = BTRFS_EXTENT_FLAG_DATA;
+ else
+ BUG();
+ return 0;
+ }
+
+ return -EIO;
+}
+
+/*
+ * helper function to iterate extent inline refs. ptr must point to a 0 value
+ * for the first call and may be modified. it is used to track state.
+ * if more refs exist, 0 is returned and the next call to
+ * get_extent_inline_ref must pass the modified ptr parameter to get the
+ * next ref. after the last ref was processed, 1 is returned.
+ * returns <0 on error
+ */
+static int get_extent_inline_ref(unsigned long *ptr,
+ const struct extent_buffer *eb,
+ const struct btrfs_key *key,
+ const struct btrfs_extent_item *ei,
+ u32 item_size,
+ struct btrfs_extent_inline_ref **out_eiref,
+ int *out_type)
+{
+ unsigned long end;
+ u64 flags;
+ struct btrfs_tree_block_info *info;
+
+ if (!*ptr) {
+ /* first call */
+ flags = btrfs_extent_flags(eb, ei);
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ if (key->type == BTRFS_METADATA_ITEM_KEY) {
+ /* a skinny metadata extent */
+ *out_eiref =
+ (struct btrfs_extent_inline_ref *)(ei + 1);
+ } else {
+ WARN_ON(key->type != BTRFS_EXTENT_ITEM_KEY);
+ info = (struct btrfs_tree_block_info *)(ei + 1);
+ *out_eiref =
+ (struct btrfs_extent_inline_ref *)(info + 1);
+ }
+ } else {
+ *out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1);
+ }
+ *ptr = (unsigned long)*out_eiref;
+ if ((unsigned long)(*ptr) >= (unsigned long)ei + item_size)
+ return -ENOENT;
+ }
+
+ end = (unsigned long)ei + item_size;
+ *out_eiref = (struct btrfs_extent_inline_ref *)(*ptr);
+ *out_type = btrfs_get_extent_inline_ref_type(eb, *out_eiref,
+ BTRFS_REF_TYPE_ANY);
+ if (*out_type == BTRFS_REF_TYPE_INVALID)
+ return -EUCLEAN;
+
+ *ptr += btrfs_extent_inline_ref_size(*out_type);
+ WARN_ON(*ptr > end);
+ if (*ptr == end)
+ return 1; /* last */
+
+ return 0;
+}
+
+/*
+ * reads the tree block backref for an extent. tree level and root are returned
+ * through out_level and out_root. ptr must point to a 0 value for the first
+ * call and may be modified (see get_extent_inline_ref comment).
+ * returns 0 if data was provided, 1 if there was no more data to provide or
+ * <0 on error.
+ */
+int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
+ struct btrfs_key *key, struct btrfs_extent_item *ei,
+ u32 item_size, u64 *out_root, u8 *out_level)
+{
+ int ret;
+ int type;
+ struct btrfs_extent_inline_ref *eiref;
+
+ if (*ptr == (unsigned long)-1)
+ return 1;
+
+ while (1) {
+ ret = get_extent_inline_ref(ptr, eb, key, ei, item_size,
+ &eiref, &type);
+ if (ret < 0)
+ return ret;
+
+ if (type == BTRFS_TREE_BLOCK_REF_KEY ||
+ type == BTRFS_SHARED_BLOCK_REF_KEY)
+ break;
+
+ if (ret == 1)
+ return 1;
+ }
+
+ /* we can treat both ref types equally here */
+ *out_root = btrfs_extent_inline_ref_offset(eb, eiref);
+
+ if (key->type == BTRFS_EXTENT_ITEM_KEY) {
+ struct btrfs_tree_block_info *info;
+
+ info = (struct btrfs_tree_block_info *)(ei + 1);
+ *out_level = btrfs_tree_block_level(eb, info);
+ } else {
+ ASSERT(key->type == BTRFS_METADATA_ITEM_KEY);
+ *out_level = (u8)key->offset;
+ }
+
+ if (ret == 1)
+ *ptr = (unsigned long)-1;
+
+ return 0;
+}
+
+static int iterate_leaf_refs(struct btrfs_fs_info *fs_info,
+ struct extent_inode_elem *inode_list,
+ u64 root, u64 extent_item_objectid,
+ iterate_extent_inodes_t *iterate, void *ctx)
+{
+ struct extent_inode_elem *eie;
+ int ret = 0;
+
+ for (eie = inode_list; eie; eie = eie->next) {
+ btrfs_debug(fs_info,
+ "ref for %llu resolved, key (%llu EXTEND_DATA %llu), root %llu",
+ extent_item_objectid, eie->inum,
+ eie->offset, root);
+ ret = iterate(eie->inum, eie->offset, root, ctx);
+ if (ret) {
+ btrfs_debug(fs_info,
+ "stopping iteration for %llu due to ret=%d",
+ extent_item_objectid, ret);
+ break;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * calls iterate() for every inode that references the extent identified by
+ * the given parameters.
+ * when the iterator function returns a non-zero value, iteration stops.
+ */
+int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
+ u64 extent_item_objectid, u64 extent_item_pos,
+ int search_commit_root,
+ iterate_extent_inodes_t *iterate, void *ctx,
+ bool ignore_offset)
+{
+ int ret;
+ struct btrfs_trans_handle *trans = NULL;
+ struct ulist *refs = NULL;
+ struct ulist *roots = NULL;
+ struct ulist_node *ref_node = NULL;
+ struct ulist_node *root_node = NULL;
+ struct btrfs_seq_list seq_elem = BTRFS_SEQ_LIST_INIT(seq_elem);
+ struct ulist_iterator ref_uiter;
+ struct ulist_iterator root_uiter;
+
+ btrfs_debug(fs_info, "resolving all inodes for extent %llu",
+ extent_item_objectid);
+
+ if (!search_commit_root) {
+ trans = btrfs_attach_transaction(fs_info->tree_root);
+ if (IS_ERR(trans)) {
+ if (PTR_ERR(trans) != -ENOENT &&
+ PTR_ERR(trans) != -EROFS)
+ return PTR_ERR(trans);
+ trans = NULL;
+ }
+ }
+
+ if (trans)
+ btrfs_get_tree_mod_seq(fs_info, &seq_elem);
+ else
+ down_read(&fs_info->commit_root_sem);
+
+ ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
+ seq_elem.seq, &refs,
+ &extent_item_pos, ignore_offset);
+ if (ret)
+ goto out;
+
+ ULIST_ITER_INIT(&ref_uiter);
+ while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
+ ret = btrfs_find_all_roots_safe(trans, fs_info, ref_node->val,
+ seq_elem.seq, &roots,
+ ignore_offset);
+ if (ret)
+ break;
+ ULIST_ITER_INIT(&root_uiter);
+ while (!ret && (root_node = ulist_next(roots, &root_uiter))) {
+ btrfs_debug(fs_info,
+ "root %llu references leaf %llu, data list %#llx",
+ root_node->val, ref_node->val,
+ ref_node->aux);
+ ret = iterate_leaf_refs(fs_info,
+ (struct extent_inode_elem *)
+ (uintptr_t)ref_node->aux,
+ root_node->val,
+ extent_item_objectid,
+ iterate, ctx);
+ }
+ ulist_free(roots);
+ }
+
+ free_leaf_list(refs);
+out:
+ if (trans) {
+ btrfs_put_tree_mod_seq(fs_info, &seq_elem);
+ btrfs_end_transaction(trans);
+ } else {
+ up_read(&fs_info->commit_root_sem);
+ }
+
+ return ret;
+}
+
+static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
+{
+ struct btrfs_data_container *inodes = ctx;
+ const size_t c = 3 * sizeof(u64);
+
+ if (inodes->bytes_left >= c) {
+ inodes->bytes_left -= c;
+ inodes->val[inodes->elem_cnt] = inum;
+ inodes->val[inodes->elem_cnt + 1] = offset;
+ inodes->val[inodes->elem_cnt + 2] = root;
+ inodes->elem_cnt += 3;
+ } else {
+ inodes->bytes_missing += c - inodes->bytes_left;
+ inodes->bytes_left = 0;
+ inodes->elem_missed += 3;
+ }
+
+ return 0;
+}
+
+int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ void *ctx, bool ignore_offset)
+{
+ int ret;
+ u64 extent_item_pos;
+ u64 flags = 0;
+ struct btrfs_key found_key;
+ int search_commit_root = path->search_commit_root;
+
+ ret = extent_from_logical(fs_info, logical, path, &found_key, &flags);
+ btrfs_release_path(path);
+ if (ret < 0)
+ return ret;
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+ return -EINVAL;
+
+ extent_item_pos = logical - found_key.objectid;
+ ret = iterate_extent_inodes(fs_info, found_key.objectid,
+ extent_item_pos, search_commit_root,
+ build_ino_list, ctx, ignore_offset);
+
+ return ret;
+}
+
+static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
+ struct extent_buffer *eb, struct inode_fs_paths *ipath);
+
+static int iterate_inode_refs(u64 inum, struct inode_fs_paths *ipath)
+{
+ int ret = 0;
+ int slot;
+ u32 cur;
+ u32 len;
+ u32 name_len;
+ u64 parent = 0;
+ int found = 0;
+ struct btrfs_root *fs_root = ipath->fs_root;
+ struct btrfs_path *path = ipath->btrfs_path;
+ struct extent_buffer *eb;
+ struct btrfs_inode_ref *iref;
+ struct btrfs_key found_key;
+
+ while (!ret) {
+ ret = btrfs_find_item(fs_root, path, inum,
+ parent ? parent + 1 : 0, BTRFS_INODE_REF_KEY,
+ &found_key);
+
+ if (ret < 0)
+ break;
+ if (ret) {
+ ret = found ? 0 : -ENOENT;
+ break;
+ }
+ ++found;
+
+ parent = found_key.offset;
+ slot = path->slots[0];
+ eb = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!eb) {
+ ret = -ENOMEM;
+ break;
+ }
+ btrfs_release_path(path);
+
+ iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
+
+ for (cur = 0; cur < btrfs_item_size(eb, slot); cur += len) {
+ name_len = btrfs_inode_ref_name_len(eb, iref);
+ /* path must be released before calling iterate()! */
+ btrfs_debug(fs_root->fs_info,
+ "following ref at offset %u for inode %llu in tree %llu",
+ cur, found_key.objectid,
+ fs_root->root_key.objectid);
+ ret = inode_to_path(parent, name_len,
+ (unsigned long)(iref + 1), eb, ipath);
+ if (ret)
+ break;
+ len = sizeof(*iref) + name_len;
+ iref = (struct btrfs_inode_ref *)((char *)iref + len);
+ }
+ free_extent_buffer(eb);
+ }
+
+ btrfs_release_path(path);
+
+ return ret;
+}
+
+static int iterate_inode_extrefs(u64 inum, struct inode_fs_paths *ipath)
+{
+ int ret;
+ int slot;
+ u64 offset = 0;
+ u64 parent;
+ int found = 0;
+ struct btrfs_root *fs_root = ipath->fs_root;
+ struct btrfs_path *path = ipath->btrfs_path;
+ struct extent_buffer *eb;
+ struct btrfs_inode_extref *extref;
+ u32 item_size;
+ u32 cur_offset;
+ unsigned long ptr;
+
+ while (1) {
+ ret = btrfs_find_one_extref(fs_root, inum, offset, path, &extref,
+ &offset);
+ if (ret < 0)
+ break;
+ if (ret) {
+ ret = found ? 0 : -ENOENT;
+ break;
+ }
+ ++found;
+
+ slot = path->slots[0];
+ eb = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!eb) {
+ ret = -ENOMEM;
+ break;
+ }
+ btrfs_release_path(path);
+
+ item_size = btrfs_item_size(eb, slot);
+ ptr = btrfs_item_ptr_offset(eb, slot);
+ cur_offset = 0;
+
+ while (cur_offset < item_size) {
+ u32 name_len;
+
+ extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
+ parent = btrfs_inode_extref_parent(eb, extref);
+ name_len = btrfs_inode_extref_name_len(eb, extref);
+ ret = inode_to_path(parent, name_len,
+ (unsigned long)&extref->name, eb, ipath);
+ if (ret)
+ break;
+
+ cur_offset += btrfs_inode_extref_name_len(eb, extref);
+ cur_offset += sizeof(*extref);
+ }
+ free_extent_buffer(eb);
+
+ offset++;
+ }
+
+ btrfs_release_path(path);
+
+ return ret;
+}
+
+/*
+ * returns 0 if the path could be dumped (probably truncated)
+ * returns <0 in case of an error
+ */
+static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
+ struct extent_buffer *eb, struct inode_fs_paths *ipath)
+{
+ char *fspath;
+ char *fspath_min;
+ int i = ipath->fspath->elem_cnt;
+ const int s_ptr = sizeof(char *);
+ u32 bytes_left;
+
+ bytes_left = ipath->fspath->bytes_left > s_ptr ?
+ ipath->fspath->bytes_left - s_ptr : 0;
+
+ fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
+ fspath = btrfs_ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len,
+ name_off, eb, inum, fspath_min, bytes_left);
+ if (IS_ERR(fspath))
+ return PTR_ERR(fspath);
+
+ if (fspath > fspath_min) {
+ ipath->fspath->val[i] = (u64)(unsigned long)fspath;
+ ++ipath->fspath->elem_cnt;
+ ipath->fspath->bytes_left = fspath - fspath_min;
+ } else {
+ ++ipath->fspath->elem_missed;
+ ipath->fspath->bytes_missing += fspath_min - fspath;
+ ipath->fspath->bytes_left = 0;
+ }
+
+ return 0;
+}
+
+/*
+ * this dumps all file system paths to the inode into the ipath struct, provided
+ * is has been created large enough. each path is zero-terminated and accessed
+ * from ipath->fspath->val[i].
+ * when it returns, there are ipath->fspath->elem_cnt number of paths available
+ * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the
+ * number of missed paths is recorded in ipath->fspath->elem_missed, otherwise,
+ * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would
+ * have been needed to return all paths.
+ */
+int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
+{
+ int ret;
+ int found_refs = 0;
+
+ ret = iterate_inode_refs(inum, ipath);
+ if (!ret)
+ ++found_refs;
+ else if (ret != -ENOENT)
+ return ret;
+
+ ret = iterate_inode_extrefs(inum, ipath);
+ if (ret == -ENOENT && found_refs)
+ return 0;
+
+ return ret;
+}
+
+struct btrfs_data_container *init_data_container(u32 total_bytes)
+{
+ struct btrfs_data_container *data;
+ size_t alloc_bytes;
+
+ alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
+ data = kvmalloc(alloc_bytes, GFP_KERNEL);
+ if (!data)
+ return ERR_PTR(-ENOMEM);
+
+ if (total_bytes >= sizeof(*data)) {
+ data->bytes_left = total_bytes - sizeof(*data);
+ data->bytes_missing = 0;
+ } else {
+ data->bytes_missing = sizeof(*data) - total_bytes;
+ data->bytes_left = 0;
+ }
+
+ data->elem_cnt = 0;
+ data->elem_missed = 0;
+
+ return data;
+}
+
+/*
+ * allocates space to return multiple file system paths for an inode.
+ * total_bytes to allocate are passed, note that space usable for actual path
+ * information will be total_bytes - sizeof(struct inode_fs_paths).
+ * the returned pointer must be freed with free_ipath() in the end.
+ */
+struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
+ struct btrfs_path *path)
+{
+ struct inode_fs_paths *ifp;
+ struct btrfs_data_container *fspath;
+
+ fspath = init_data_container(total_bytes);
+ if (IS_ERR(fspath))
+ return ERR_CAST(fspath);
+
+ ifp = kmalloc(sizeof(*ifp), GFP_KERNEL);
+ if (!ifp) {
+ kvfree(fspath);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ ifp->btrfs_path = path;
+ ifp->fspath = fspath;
+ ifp->fs_root = fs_root;
+
+ return ifp;
+}
+
+void free_ipath(struct inode_fs_paths *ipath)
+{
+ if (!ipath)
+ return;
+ kvfree(ipath->fspath);
+ kfree(ipath);
+}
+
+struct btrfs_backref_iter *btrfs_backref_iter_alloc(
+ struct btrfs_fs_info *fs_info, gfp_t gfp_flag)
+{
+ struct btrfs_backref_iter *ret;
+
+ ret = kzalloc(sizeof(*ret), gfp_flag);
+ if (!ret)
+ return NULL;
+
+ ret->path = btrfs_alloc_path();
+ if (!ret->path) {
+ kfree(ret);
+ return NULL;
+ }
+
+ /* Current backref iterator only supports iteration in commit root */
+ ret->path->search_commit_root = 1;
+ ret->path->skip_locking = 1;
+ ret->fs_info = fs_info;
+
+ return ret;
+}
+
+int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
+{
+ struct btrfs_fs_info *fs_info = iter->fs_info;
+ struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
+ struct btrfs_path *path = iter->path;
+ struct btrfs_extent_item *ei;
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = bytenr;
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ key.offset = (u64)-1;
+ iter->bytenr = bytenr;
+
+ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+ if (ret == 0) {
+ ret = -EUCLEAN;
+ goto release;
+ }
+ if (path->slots[0] == 0) {
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ ret = -EUCLEAN;
+ goto release;
+ }
+ path->slots[0]--;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if ((key.type != BTRFS_EXTENT_ITEM_KEY &&
+ key.type != BTRFS_METADATA_ITEM_KEY) || key.objectid != bytenr) {
+ ret = -ENOENT;
+ goto release;
+ }
+ memcpy(&iter->cur_key, &key, sizeof(key));
+ iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0],
+ path->slots[0]);
+ iter->end_ptr = (u32)(iter->item_ptr +
+ btrfs_item_size(path->nodes[0], path->slots[0]));
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_extent_item);
+
+ /*
+ * Only support iteration on tree backref yet.
+ *
+ * This is an extra precaution for non skinny-metadata, where
+ * EXTENT_ITEM is also used for tree blocks, that we can only use
+ * extent flags to determine if it's a tree block.
+ */
+ if (btrfs_extent_flags(path->nodes[0], ei) & BTRFS_EXTENT_FLAG_DATA) {
+ ret = -ENOTSUPP;
+ goto release;
+ }
+ iter->cur_ptr = (u32)(iter->item_ptr + sizeof(*ei));
+
+ /* If there is no inline backref, go search for keyed backref */
+ if (iter->cur_ptr >= iter->end_ptr) {
+ ret = btrfs_next_item(extent_root, path);
+
+ /* No inline nor keyed ref */
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto release;
+ }
+ if (ret < 0)
+ goto release;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &iter->cur_key,
+ path->slots[0]);
+ if (iter->cur_key.objectid != bytenr ||
+ (iter->cur_key.type != BTRFS_SHARED_BLOCK_REF_KEY &&
+ iter->cur_key.type != BTRFS_TREE_BLOCK_REF_KEY)) {
+ ret = -ENOENT;
+ goto release;
+ }
+ iter->cur_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0],
+ path->slots[0]);
+ iter->item_ptr = iter->cur_ptr;
+ iter->end_ptr = (u32)(iter->item_ptr + btrfs_item_size(
+ path->nodes[0], path->slots[0]));
+ }
+
+ return 0;
+release:
+ btrfs_backref_iter_release(iter);
+ return ret;
+}
+
+/*
+ * Go to the next backref item of current bytenr, can be either inlined or
+ * keyed.
+ *
+ * Caller needs to check whether it's inline ref or not by iter->cur_key.
+ *
+ * Return 0 if we get next backref without problem.
+ * Return >0 if there is no extra backref for this bytenr.
+ * Return <0 if there is something wrong happened.
+ */
+int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
+{
+ struct extent_buffer *eb = btrfs_backref_get_eb(iter);
+ struct btrfs_root *extent_root;
+ struct btrfs_path *path = iter->path;
+ struct btrfs_extent_inline_ref *iref;
+ int ret;
+ u32 size;
+
+ if (btrfs_backref_iter_is_inline_ref(iter)) {
+ /* We're still inside the inline refs */
+ ASSERT(iter->cur_ptr < iter->end_ptr);
+
+ if (btrfs_backref_has_tree_block_info(iter)) {
+ /* First tree block info */
+ size = sizeof(struct btrfs_tree_block_info);
+ } else {
+ /* Use inline ref type to determine the size */
+ int type;
+
+ iref = (struct btrfs_extent_inline_ref *)
+ ((unsigned long)iter->cur_ptr);
+ type = btrfs_extent_inline_ref_type(eb, iref);
+
+ size = btrfs_extent_inline_ref_size(type);
+ }
+ iter->cur_ptr += size;
+ if (iter->cur_ptr < iter->end_ptr)
+ return 0;
+
+ /* All inline items iterated, fall through */
+ }
+
+ /* We're at keyed items, there is no inline item, go to the next one */
+ extent_root = btrfs_extent_root(iter->fs_info, iter->bytenr);
+ ret = btrfs_next_item(extent_root, iter->path);
+ if (ret)
+ return ret;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &iter->cur_key, path->slots[0]);
+ if (iter->cur_key.objectid != iter->bytenr ||
+ (iter->cur_key.type != BTRFS_TREE_BLOCK_REF_KEY &&
+ iter->cur_key.type != BTRFS_SHARED_BLOCK_REF_KEY))
+ return 1;
+ iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0],
+ path->slots[0]);
+ iter->cur_ptr = iter->item_ptr;
+ iter->end_ptr = iter->item_ptr + (u32)btrfs_item_size(path->nodes[0],
+ path->slots[0]);
+ return 0;
+}
+
+void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info,
+ struct btrfs_backref_cache *cache, int is_reloc)
+{
+ int i;
+
+ cache->rb_root = RB_ROOT;
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++)
+ INIT_LIST_HEAD(&cache->pending[i]);
+ INIT_LIST_HEAD(&cache->changed);
+ INIT_LIST_HEAD(&cache->detached);
+ INIT_LIST_HEAD(&cache->leaves);
+ INIT_LIST_HEAD(&cache->pending_edge);
+ INIT_LIST_HEAD(&cache->useless_node);
+ cache->fs_info = fs_info;
+ cache->is_reloc = is_reloc;
+}
+
+struct btrfs_backref_node *btrfs_backref_alloc_node(
+ struct btrfs_backref_cache *cache, u64 bytenr, int level)
+{
+ struct btrfs_backref_node *node;
+
+ ASSERT(level >= 0 && level < BTRFS_MAX_LEVEL);
+ node = kzalloc(sizeof(*node), GFP_NOFS);
+ if (!node)
+ return node;
+
+ INIT_LIST_HEAD(&node->list);
+ INIT_LIST_HEAD(&node->upper);
+ INIT_LIST_HEAD(&node->lower);
+ RB_CLEAR_NODE(&node->rb_node);
+ cache->nr_nodes++;
+ node->level = level;
+ node->bytenr = bytenr;
+
+ return node;
+}
+
+struct btrfs_backref_edge *btrfs_backref_alloc_edge(
+ struct btrfs_backref_cache *cache)
+{
+ struct btrfs_backref_edge *edge;
+
+ edge = kzalloc(sizeof(*edge), GFP_NOFS);
+ if (edge)
+ cache->nr_edges++;
+ return edge;
+}
+
+/*
+ * Drop the backref node from cache, also cleaning up all its
+ * upper edges and any uncached nodes in the path.
+ *
+ * This cleanup happens bottom up, thus the node should either
+ * be the lowest node in the cache or a detached node.
+ */
+void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_node *node)
+{
+ struct btrfs_backref_node *upper;
+ struct btrfs_backref_edge *edge;
+
+ if (!node)
+ return;
+
+ BUG_ON(!node->lowest && !node->detached);
+ while (!list_empty(&node->upper)) {
+ edge = list_entry(node->upper.next, struct btrfs_backref_edge,
+ list[LOWER]);
+ upper = edge->node[UPPER];
+ list_del(&edge->list[LOWER]);
+ list_del(&edge->list[UPPER]);
+ btrfs_backref_free_edge(cache, edge);
+
+ /*
+ * Add the node to leaf node list if no other child block
+ * cached.
+ */
+ if (list_empty(&upper->lower)) {
+ list_add_tail(&upper->lower, &cache->leaves);
+ upper->lowest = 1;
+ }
+ }
+
+ btrfs_backref_drop_node(cache, node);
+}
+
+/*
+ * Release all nodes/edges from current cache
+ */
+void btrfs_backref_release_cache(struct btrfs_backref_cache *cache)
+{
+ struct btrfs_backref_node *node;
+ int i;
+
+ while (!list_empty(&cache->detached)) {
+ node = list_entry(cache->detached.next,
+ struct btrfs_backref_node, list);
+ btrfs_backref_cleanup_node(cache, node);
+ }
+
+ while (!list_empty(&cache->leaves)) {
+ node = list_entry(cache->leaves.next,
+ struct btrfs_backref_node, lower);
+ btrfs_backref_cleanup_node(cache, node);
+ }
+
+ cache->last_trans = 0;
+
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++)
+ ASSERT(list_empty(&cache->pending[i]));
+ ASSERT(list_empty(&cache->pending_edge));
+ ASSERT(list_empty(&cache->useless_node));
+ ASSERT(list_empty(&cache->changed));
+ ASSERT(list_empty(&cache->detached));
+ ASSERT(RB_EMPTY_ROOT(&cache->rb_root));
+ ASSERT(!cache->nr_nodes);
+ ASSERT(!cache->nr_edges);
+}
+
+/*
+ * Handle direct tree backref
+ *
+ * Direct tree backref means, the backref item shows its parent bytenr
+ * directly. This is for SHARED_BLOCK_REF backref (keyed or inlined).
+ *
+ * @ref_key: The converted backref key.
+ * For keyed backref, it's the item key.
+ * For inlined backref, objectid is the bytenr,
+ * type is btrfs_inline_ref_type, offset is
+ * btrfs_inline_ref_offset.
+ */
+static int handle_direct_tree_backref(struct btrfs_backref_cache *cache,
+ struct btrfs_key *ref_key,
+ struct btrfs_backref_node *cur)
+{
+ struct btrfs_backref_edge *edge;
+ struct btrfs_backref_node *upper;
+ struct rb_node *rb_node;
+
+ ASSERT(ref_key->type == BTRFS_SHARED_BLOCK_REF_KEY);
+
+ /* Only reloc root uses backref pointing to itself */
+ if (ref_key->objectid == ref_key->offset) {
+ struct btrfs_root *root;
+
+ cur->is_reloc_root = 1;
+ /* Only reloc backref cache cares about a specific root */
+ if (cache->is_reloc) {
+ root = find_reloc_root(cache->fs_info, cur->bytenr);
+ if (!root)
+ return -ENOENT;
+ cur->root = root;
+ } else {
+ /*
+ * For generic purpose backref cache, reloc root node
+ * is useless.
+ */
+ list_add(&cur->list, &cache->useless_node);
+ }
+ return 0;
+ }
+
+ edge = btrfs_backref_alloc_edge(cache);
+ if (!edge)
+ return -ENOMEM;
+
+ rb_node = rb_simple_search(&cache->rb_root, ref_key->offset);
+ if (!rb_node) {
+ /* Parent node not yet cached */
+ upper = btrfs_backref_alloc_node(cache, ref_key->offset,
+ cur->level + 1);
+ if (!upper) {
+ btrfs_backref_free_edge(cache, edge);
+ return -ENOMEM;
+ }
+
+ /*
+ * Backrefs for the upper level block isn't cached, add the
+ * block to pending list
+ */
+ list_add_tail(&edge->list[UPPER], &cache->pending_edge);
+ } else {
+ /* Parent node already cached */
+ upper = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
+ ASSERT(upper->checked);
+ INIT_LIST_HEAD(&edge->list[UPPER]);
+ }
+ btrfs_backref_link_edge(edge, cur, upper, LINK_LOWER);
+ return 0;
+}
+
+/*
+ * Handle indirect tree backref
+ *
+ * Indirect tree backref means, we only know which tree the node belongs to.
+ * We still need to do a tree search to find out the parents. This is for
+ * TREE_BLOCK_REF backref (keyed or inlined).
+ *
+ * @ref_key: The same as @ref_key in handle_direct_tree_backref()
+ * @tree_key: The first key of this tree block.
+ * @path: A clean (released) path, to avoid allocating path every time
+ * the function get called.
+ */
+static int handle_indirect_tree_backref(struct btrfs_backref_cache *cache,
+ struct btrfs_path *path,
+ struct btrfs_key *ref_key,
+ struct btrfs_key *tree_key,
+ struct btrfs_backref_node *cur)
+{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct btrfs_backref_node *upper;
+ struct btrfs_backref_node *lower;
+ struct btrfs_backref_edge *edge;
+ struct extent_buffer *eb;
+ struct btrfs_root *root;
+ struct rb_node *rb_node;
+ int level;
+ bool need_check = true;
+ int ret;
+
+ root = btrfs_get_fs_root(fs_info, ref_key->offset, false);
+ if (IS_ERR(root))
+ return PTR_ERR(root);
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
+ cur->cowonly = 1;
+
+ if (btrfs_root_level(&root->root_item) == cur->level) {
+ /* Tree root */
+ ASSERT(btrfs_root_bytenr(&root->root_item) == cur->bytenr);
+ /*
+ * For reloc backref cache, we may ignore reloc root. But for
+ * general purpose backref cache, we can't rely on
+ * btrfs_should_ignore_reloc_root() as it may conflict with
+ * current running relocation and lead to missing root.
+ *
+ * For general purpose backref cache, reloc root detection is
+ * completely relying on direct backref (key->offset is parent
+ * bytenr), thus only do such check for reloc cache.
+ */
+ if (btrfs_should_ignore_reloc_root(root) && cache->is_reloc) {
+ btrfs_put_root(root);
+ list_add(&cur->list, &cache->useless_node);
+ } else {
+ cur->root = root;
+ }
+ return 0;
+ }
+
+ level = cur->level + 1;
+
+ /* Search the tree to find parent blocks referring to the block */
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+ path->lowest_level = level;
+ ret = btrfs_search_slot(NULL, root, tree_key, path, 0, 0);
+ path->lowest_level = 0;
+ if (ret < 0) {
+ btrfs_put_root(root);
+ return ret;
+ }
+ if (ret > 0 && path->slots[level] > 0)
+ path->slots[level]--;
+
+ eb = path->nodes[level];
+ if (btrfs_node_blockptr(eb, path->slots[level]) != cur->bytenr) {
+ btrfs_err(fs_info,
+"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
+ cur->bytenr, level - 1, root->root_key.objectid,
+ tree_key->objectid, tree_key->type, tree_key->offset);
+ btrfs_put_root(root);
+ ret = -ENOENT;
+ goto out;
+ }
+ lower = cur;
+
+ /* Add all nodes and edges in the path */
+ for (; level < BTRFS_MAX_LEVEL; level++) {
+ if (!path->nodes[level]) {
+ ASSERT(btrfs_root_bytenr(&root->root_item) ==
+ lower->bytenr);
+ /* Same as previous should_ignore_reloc_root() call */
+ if (btrfs_should_ignore_reloc_root(root) &&
+ cache->is_reloc) {
+ btrfs_put_root(root);
+ list_add(&lower->list, &cache->useless_node);
+ } else {
+ lower->root = root;
+ }
+ break;
+ }
+
+ edge = btrfs_backref_alloc_edge(cache);
+ if (!edge) {
+ btrfs_put_root(root);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ eb = path->nodes[level];
+ rb_node = rb_simple_search(&cache->rb_root, eb->start);
+ if (!rb_node) {
+ upper = btrfs_backref_alloc_node(cache, eb->start,
+ lower->level + 1);
+ if (!upper) {
+ btrfs_put_root(root);
+ btrfs_backref_free_edge(cache, edge);
+ ret = -ENOMEM;
+ goto out;
+ }
+ upper->owner = btrfs_header_owner(eb);
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
+ upper->cowonly = 1;
+
+ /*
+ * If we know the block isn't shared we can avoid
+ * checking its backrefs.
+ */
+ if (btrfs_block_can_be_shared(root, eb))
+ upper->checked = 0;
+ else
+ upper->checked = 1;
+
+ /*
+ * Add the block to pending list if we need to check its
+ * backrefs, we only do this once while walking up a
+ * tree as we will catch anything else later on.
+ */
+ if (!upper->checked && need_check) {
+ need_check = false;
+ list_add_tail(&edge->list[UPPER],
+ &cache->pending_edge);
+ } else {
+ if (upper->checked)
+ need_check = true;
+ INIT_LIST_HEAD(&edge->list[UPPER]);
+ }
+ } else {
+ upper = rb_entry(rb_node, struct btrfs_backref_node,
+ rb_node);
+ ASSERT(upper->checked);
+ INIT_LIST_HEAD(&edge->list[UPPER]);
+ if (!upper->owner)
+ upper->owner = btrfs_header_owner(eb);
+ }
+ btrfs_backref_link_edge(edge, lower, upper, LINK_LOWER);
+
+ if (rb_node) {
+ btrfs_put_root(root);
+ break;
+ }
+ lower = upper;
+ upper = NULL;
+ }
+out:
+ btrfs_release_path(path);
+ return ret;
+}
+
+/*
+ * Add backref node @cur into @cache.
+ *
+ * NOTE: Even if the function returned 0, @cur is not yet cached as its upper
+ * links aren't yet bi-directional. Needs to finish such links.
+ * Use btrfs_backref_finish_upper_links() to finish such linkage.
+ *
+ * @path: Released path for indirect tree backref lookup
+ * @iter: Released backref iter for extent tree search
+ * @node_key: The first key of the tree block
+ */
+int btrfs_backref_add_tree_node(struct btrfs_backref_cache *cache,
+ struct btrfs_path *path,
+ struct btrfs_backref_iter *iter,
+ struct btrfs_key *node_key,
+ struct btrfs_backref_node *cur)
+{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct btrfs_backref_edge *edge;
+ struct btrfs_backref_node *exist;
+ int ret;
+
+ ret = btrfs_backref_iter_start(iter, cur->bytenr);
+ if (ret < 0)
+ return ret;
+ /*
+ * We skip the first btrfs_tree_block_info, as we don't use the key
+ * stored in it, but fetch it from the tree block
+ */
+ if (btrfs_backref_has_tree_block_info(iter)) {
+ ret = btrfs_backref_iter_next(iter);
+ if (ret < 0)
+ goto out;
+ /* No extra backref? This means the tree block is corrupted */
+ if (ret > 0) {
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+ WARN_ON(cur->checked);
+ if (!list_empty(&cur->upper)) {
+ /*
+ * The backref was added previously when processing backref of
+ * type BTRFS_TREE_BLOCK_REF_KEY
+ */
+ ASSERT(list_is_singular(&cur->upper));
+ edge = list_entry(cur->upper.next, struct btrfs_backref_edge,
+ list[LOWER]);
+ ASSERT(list_empty(&edge->list[UPPER]));
+ exist = edge->node[UPPER];
+ /*
+ * Add the upper level block to pending list if we need check
+ * its backrefs
+ */
+ if (!exist->checked)
+ list_add_tail(&edge->list[UPPER], &cache->pending_edge);
+ } else {
+ exist = NULL;
+ }
+
+ for (; ret == 0; ret = btrfs_backref_iter_next(iter)) {
+ struct extent_buffer *eb;
+ struct btrfs_key key;
+ int type;
+
+ cond_resched();
+ eb = btrfs_backref_get_eb(iter);
+
+ key.objectid = iter->bytenr;
+ if (btrfs_backref_iter_is_inline_ref(iter)) {
+ struct btrfs_extent_inline_ref *iref;
+
+ /* Update key for inline backref */
+ iref = (struct btrfs_extent_inline_ref *)
+ ((unsigned long)iter->cur_ptr);
+ type = btrfs_get_extent_inline_ref_type(eb, iref,
+ BTRFS_REF_TYPE_BLOCK);
+ if (type == BTRFS_REF_TYPE_INVALID) {
+ ret = -EUCLEAN;
+ goto out;
+ }
+ key.type = type;
+ key.offset = btrfs_extent_inline_ref_offset(eb, iref);
+ } else {
+ key.type = iter->cur_key.type;
+ key.offset = iter->cur_key.offset;
+ }
+
+ /*
+ * Parent node found and matches current inline ref, no need to
+ * rebuild this node for this inline ref
+ */
+ if (exist &&
+ ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
+ exist->owner == key.offset) ||
+ (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
+ exist->bytenr == key.offset))) {
+ exist = NULL;
+ continue;
+ }
+
+ /* SHARED_BLOCK_REF means key.offset is the parent bytenr */
+ if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
+ ret = handle_direct_tree_backref(cache, &key, cur);
+ if (ret < 0)
+ goto out;
+ continue;
+ } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
+ ret = -EINVAL;
+ btrfs_print_v0_err(fs_info);
+ btrfs_handle_fs_error(fs_info, ret, NULL);
+ goto out;
+ } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
+ continue;
+ }
+
+ /*
+ * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref offset
+ * means the root objectid. We need to search the tree to get
+ * its parent bytenr.
+ */
+ ret = handle_indirect_tree_backref(cache, path, &key, node_key,
+ cur);
+ if (ret < 0)
+ goto out;
+ }
+ ret = 0;
+ cur->checked = 1;
+ WARN_ON(exist);
+out:
+ btrfs_backref_iter_release(iter);
+ return ret;
+}
+
+/*
+ * Finish the upwards linkage created by btrfs_backref_add_tree_node()
+ */
+int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_node *start)
+{
+ struct list_head *useless_node = &cache->useless_node;
+ struct btrfs_backref_edge *edge;
+ struct rb_node *rb_node;
+ LIST_HEAD(pending_edge);
+
+ ASSERT(start->checked);
+
+ /* Insert this node to cache if it's not COW-only */
+ if (!start->cowonly) {
+ rb_node = rb_simple_insert(&cache->rb_root, start->bytenr,
+ &start->rb_node);
+ if (rb_node)
+ btrfs_backref_panic(cache->fs_info, start->bytenr,
+ -EEXIST);
+ list_add_tail(&start->lower, &cache->leaves);
+ }
+
+ /*
+ * Use breadth first search to iterate all related edges.
+ *
+ * The starting points are all the edges of this node
+ */
+ list_for_each_entry(edge, &start->upper, list[LOWER])
+ list_add_tail(&edge->list[UPPER], &pending_edge);
+
+ while (!list_empty(&pending_edge)) {
+ struct btrfs_backref_node *upper;
+ struct btrfs_backref_node *lower;
+
+ edge = list_first_entry(&pending_edge,
+ struct btrfs_backref_edge, list[UPPER]);
+ list_del_init(&edge->list[UPPER]);
+ upper = edge->node[UPPER];
+ lower = edge->node[LOWER];
+
+ /* Parent is detached, no need to keep any edges */
+ if (upper->detached) {
+ list_del(&edge->list[LOWER]);
+ btrfs_backref_free_edge(cache, edge);
+
+ /* Lower node is orphan, queue for cleanup */
+ if (list_empty(&lower->upper))
+ list_add(&lower->list, useless_node);
+ continue;
+ }
+
+ /*
+ * All new nodes added in current build_backref_tree() haven't
+ * been linked to the cache rb tree.
+ * So if we have upper->rb_node populated, this means a cache
+ * hit. We only need to link the edge, as @upper and all its
+ * parents have already been linked.
+ */
+ if (!RB_EMPTY_NODE(&upper->rb_node)) {
+ if (upper->lowest) {
+ list_del_init(&upper->lower);
+ upper->lowest = 0;
+ }
+
+ list_add_tail(&edge->list[UPPER], &upper->lower);
+ continue;
+ }
+
+ /* Sanity check, we shouldn't have any unchecked nodes */
+ if (!upper->checked) {
+ ASSERT(0);
+ return -EUCLEAN;
+ }
+
+ /* Sanity check, COW-only node has non-COW-only parent */
+ if (start->cowonly != upper->cowonly) {
+ ASSERT(0);
+ return -EUCLEAN;
+ }
+
+ /* Only cache non-COW-only (subvolume trees) tree blocks */
+ if (!upper->cowonly) {
+ rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr,
+ &upper->rb_node);
+ if (rb_node) {
+ btrfs_backref_panic(cache->fs_info,
+ upper->bytenr, -EEXIST);
+ return -EUCLEAN;
+ }
+ }
+
+ list_add_tail(&edge->list[UPPER], &upper->lower);
+
+ /*
+ * Also queue all the parent edges of this uncached node
+ * to finish the upper linkage
+ */
+ list_for_each_entry(edge, &upper->upper, list[LOWER])
+ list_add_tail(&edge->list[UPPER], &pending_edge);
+ }
+ return 0;
+}
+
+void btrfs_backref_error_cleanup(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_node *node)
+{
+ struct btrfs_backref_node *lower;
+ struct btrfs_backref_node *upper;
+ struct btrfs_backref_edge *edge;
+
+ while (!list_empty(&cache->useless_node)) {
+ lower = list_first_entry(&cache->useless_node,
+ struct btrfs_backref_node, list);
+ list_del_init(&lower->list);
+ }
+ while (!list_empty(&cache->pending_edge)) {
+ edge = list_first_entry(&cache->pending_edge,
+ struct btrfs_backref_edge, list[UPPER]);
+ list_del(&edge->list[UPPER]);
+ list_del(&edge->list[LOWER]);
+ lower = edge->node[LOWER];
+ upper = edge->node[UPPER];
+ btrfs_backref_free_edge(cache, edge);
+
+ /*
+ * Lower is no longer linked to any upper backref nodes and
+ * isn't in the cache, we can free it ourselves.
+ */
+ if (list_empty(&lower->upper) &&
+ RB_EMPTY_NODE(&lower->rb_node))
+ list_add(&lower->list, &cache->useless_node);
+
+ if (!RB_EMPTY_NODE(&upper->rb_node))
+ continue;
+
+ /* Add this guy's upper edges to the list to process */
+ list_for_each_entry(edge, &upper->upper, list[LOWER])
+ list_add_tail(&edge->list[UPPER],
+ &cache->pending_edge);
+ if (list_empty(&upper->upper))
+ list_add(&upper->list, &cache->useless_node);
+ }
+
+ while (!list_empty(&cache->useless_node)) {
+ lower = list_first_entry(&cache->useless_node,
+ struct btrfs_backref_node, list);
+ list_del_init(&lower->list);
+ if (lower == node)
+ node = NULL;
+ btrfs_backref_drop_node(cache, lower);
+ }
+
+ btrfs_backref_cleanup_node(cache, node);
+ ASSERT(list_empty(&cache->useless_node) &&
+ list_empty(&cache->pending_edge));
+}
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
new file mode 100644
index 000000000..8e69584d5
--- /dev/null
+++ b/fs/btrfs/backref.h
@@ -0,0 +1,398 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2011 STRATO. All rights reserved.
+ */
+
+#ifndef BTRFS_BACKREF_H
+#define BTRFS_BACKREF_H
+
+#include <linux/btrfs.h>
+#include "ulist.h"
+#include "disk-io.h"
+#include "extent_io.h"
+
+struct inode_fs_paths {
+ struct btrfs_path *btrfs_path;
+ struct btrfs_root *fs_root;
+ struct btrfs_data_container *fspath;
+};
+
+struct btrfs_backref_shared_cache_entry {
+ u64 bytenr;
+ u64 gen;
+ bool is_shared;
+};
+
+struct btrfs_backref_shared_cache {
+ /*
+ * A path from a root to a leaf that has a file extent item pointing to
+ * a given data extent should never exceed the maximum b+tree height.
+ */
+ struct btrfs_backref_shared_cache_entry entries[BTRFS_MAX_LEVEL];
+ bool use_cache;
+};
+
+typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
+ void *ctx);
+
+int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
+ struct btrfs_path *path, struct btrfs_key *found_key,
+ u64 *flags);
+
+int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
+ struct btrfs_key *key, struct btrfs_extent_item *ei,
+ u32 item_size, u64 *out_root, u8 *out_level);
+
+int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
+ u64 extent_item_objectid,
+ u64 extent_offset, int search_commit_root,
+ iterate_extent_inodes_t *iterate, void *ctx,
+ bool ignore_offset);
+
+int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path, void *ctx,
+ bool ignore_offset);
+
+int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
+
+int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist **leafs,
+ const u64 *extent_item_pos, bool ignore_offset);
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist **roots,
+ bool skip_commit_root_sem);
+char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
+ u32 name_len, unsigned long name_off,
+ struct extent_buffer *eb_in, u64 parent,
+ char *dest, u32 size);
+
+struct btrfs_data_container *init_data_container(u32 total_bytes);
+struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
+ struct btrfs_path *path);
+void free_ipath(struct inode_fs_paths *ipath);
+
+int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
+ u64 start_off, struct btrfs_path *path,
+ struct btrfs_inode_extref **ret_extref,
+ u64 *found_off);
+int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
+ u64 extent_gen,
+ struct ulist *roots, struct ulist *tmp,
+ struct btrfs_backref_shared_cache *cache);
+
+int __init btrfs_prelim_ref_init(void);
+void __cold btrfs_prelim_ref_exit(void);
+
+struct prelim_ref {
+ struct rb_node rbnode;
+ u64 root_id;
+ struct btrfs_key key_for_search;
+ int level;
+ int count;
+ struct extent_inode_elem *inode_list;
+ u64 parent;
+ u64 wanted_disk_byte;
+};
+
+/*
+ * Iterate backrefs of one extent.
+ *
+ * Now it only supports iteration of tree block in commit root.
+ */
+struct btrfs_backref_iter {
+ u64 bytenr;
+ struct btrfs_path *path;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_key cur_key;
+ u32 item_ptr;
+ u32 cur_ptr;
+ u32 end_ptr;
+};
+
+struct btrfs_backref_iter *btrfs_backref_iter_alloc(
+ struct btrfs_fs_info *fs_info, gfp_t gfp_flag);
+
+static inline void btrfs_backref_iter_free(struct btrfs_backref_iter *iter)
+{
+ if (!iter)
+ return;
+ btrfs_free_path(iter->path);
+ kfree(iter);
+}
+
+static inline struct extent_buffer *btrfs_backref_get_eb(
+ struct btrfs_backref_iter *iter)
+{
+ if (!iter)
+ return NULL;
+ return iter->path->nodes[0];
+}
+
+/*
+ * For metadata with EXTENT_ITEM key (non-skinny) case, the first inline data
+ * is btrfs_tree_block_info, without a btrfs_extent_inline_ref header.
+ *
+ * This helper determines if that's the case.
+ */
+static inline bool btrfs_backref_has_tree_block_info(
+ struct btrfs_backref_iter *iter)
+{
+ if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY &&
+ iter->cur_ptr - iter->item_ptr == sizeof(struct btrfs_extent_item))
+ return true;
+ return false;
+}
+
+int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr);
+
+int btrfs_backref_iter_next(struct btrfs_backref_iter *iter);
+
+static inline bool btrfs_backref_iter_is_inline_ref(
+ struct btrfs_backref_iter *iter)
+{
+ if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY ||
+ iter->cur_key.type == BTRFS_METADATA_ITEM_KEY)
+ return true;
+ return false;
+}
+
+static inline void btrfs_backref_iter_release(struct btrfs_backref_iter *iter)
+{
+ iter->bytenr = 0;
+ iter->item_ptr = 0;
+ iter->cur_ptr = 0;
+ iter->end_ptr = 0;
+ btrfs_release_path(iter->path);
+ memset(&iter->cur_key, 0, sizeof(iter->cur_key));
+}
+
+/*
+ * Backref cache related structures
+ *
+ * The whole objective of backref_cache is to build a bi-directional map
+ * of tree blocks (represented by backref_node) and all their parents.
+ */
+
+/*
+ * Represent a tree block in the backref cache
+ */
+struct btrfs_backref_node {
+ struct {
+ struct rb_node rb_node;
+ u64 bytenr;
+ }; /* Use rb_simple_node for search/insert */
+
+ u64 new_bytenr;
+ /* Objectid of tree block owner, can be not uptodate */
+ u64 owner;
+ /* Link to pending, changed or detached list */
+ struct list_head list;
+
+ /* List of upper level edges, which link this node to its parents */
+ struct list_head upper;
+ /* List of lower level edges, which link this node to its children */
+ struct list_head lower;
+
+ /* NULL if this node is not tree root */
+ struct btrfs_root *root;
+ /* Extent buffer got by COWing the block */
+ struct extent_buffer *eb;
+ /* Level of the tree block */
+ unsigned int level:8;
+ /* Is the block in a non-shareable tree */
+ unsigned int cowonly:1;
+ /* 1 if no child node is in the cache */
+ unsigned int lowest:1;
+ /* Is the extent buffer locked */
+ unsigned int locked:1;
+ /* Has the block been processed */
+ unsigned int processed:1;
+ /* Have backrefs of this block been checked */
+ unsigned int checked:1;
+ /*
+ * 1 if corresponding block has been COWed but some upper level block
+ * pointers may not point to the new location
+ */
+ unsigned int pending:1;
+ /* 1 if the backref node isn't connected to any other backref node */
+ unsigned int detached:1;
+
+ /*
+ * For generic purpose backref cache, where we only care if it's a reloc
+ * root, doesn't care the source subvolid.
+ */
+ unsigned int is_reloc_root:1;
+};
+
+#define LOWER 0
+#define UPPER 1
+
+/*
+ * Represent an edge connecting upper and lower backref nodes.
+ */
+struct btrfs_backref_edge {
+ /*
+ * list[LOWER] is linked to btrfs_backref_node::upper of lower level
+ * node, and list[UPPER] is linked to btrfs_backref_node::lower of
+ * upper level node.
+ *
+ * Also, build_backref_tree() uses list[UPPER] for pending edges, before
+ * linking list[UPPER] to its upper level nodes.
+ */
+ struct list_head list[2];
+
+ /* Two related nodes */
+ struct btrfs_backref_node *node[2];
+};
+
+struct btrfs_backref_cache {
+ /* Red black tree of all backref nodes in the cache */
+ struct rb_root rb_root;
+ /* For passing backref nodes to btrfs_reloc_cow_block */
+ struct btrfs_backref_node *path[BTRFS_MAX_LEVEL];
+ /*
+ * List of blocks that have been COWed but some block pointers in upper
+ * level blocks may not reflect the new location
+ */
+ struct list_head pending[BTRFS_MAX_LEVEL];
+ /* List of backref nodes with no child node */
+ struct list_head leaves;
+ /* List of blocks that have been COWed in current transaction */
+ struct list_head changed;
+ /* List of detached backref node. */
+ struct list_head detached;
+
+ u64 last_trans;
+
+ int nr_nodes;
+ int nr_edges;
+
+ /* List of unchecked backref edges during backref cache build */
+ struct list_head pending_edge;
+
+ /* List of useless backref nodes during backref cache build */
+ struct list_head useless_node;
+
+ struct btrfs_fs_info *fs_info;
+
+ /*
+ * Whether this cache is for relocation
+ *
+ * Reloction backref cache require more info for reloc root compared
+ * to generic backref cache.
+ */
+ unsigned int is_reloc;
+};
+
+void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info,
+ struct btrfs_backref_cache *cache, int is_reloc);
+struct btrfs_backref_node *btrfs_backref_alloc_node(
+ struct btrfs_backref_cache *cache, u64 bytenr, int level);
+struct btrfs_backref_edge *btrfs_backref_alloc_edge(
+ struct btrfs_backref_cache *cache);
+
+#define LINK_LOWER (1 << 0)
+#define LINK_UPPER (1 << 1)
+static inline void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
+ struct btrfs_backref_node *lower,
+ struct btrfs_backref_node *upper,
+ int link_which)
+{
+ ASSERT(upper && lower && upper->level == lower->level + 1);
+ edge->node[LOWER] = lower;
+ edge->node[UPPER] = upper;
+ if (link_which & LINK_LOWER)
+ list_add_tail(&edge->list[LOWER], &lower->upper);
+ if (link_which & LINK_UPPER)
+ list_add_tail(&edge->list[UPPER], &upper->lower);
+}
+
+static inline void btrfs_backref_free_node(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_node *node)
+{
+ if (node) {
+ ASSERT(list_empty(&node->list));
+ ASSERT(list_empty(&node->lower));
+ ASSERT(node->eb == NULL);
+ cache->nr_nodes--;
+ btrfs_put_root(node->root);
+ kfree(node);
+ }
+}
+
+static inline void btrfs_backref_free_edge(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_edge *edge)
+{
+ if (edge) {
+ cache->nr_edges--;
+ kfree(edge);
+ }
+}
+
+static inline void btrfs_backref_unlock_node_buffer(
+ struct btrfs_backref_node *node)
+{
+ if (node->locked) {
+ btrfs_tree_unlock(node->eb);
+ node->locked = 0;
+ }
+}
+
+static inline void btrfs_backref_drop_node_buffer(
+ struct btrfs_backref_node *node)
+{
+ if (node->eb) {
+ btrfs_backref_unlock_node_buffer(node);
+ free_extent_buffer(node->eb);
+ node->eb = NULL;
+ }
+}
+
+/*
+ * Drop the backref node from cache without cleaning up its children
+ * edges.
+ *
+ * This can only be called on node without parent edges.
+ * The children edges are still kept as is.
+ */
+static inline void btrfs_backref_drop_node(struct btrfs_backref_cache *tree,
+ struct btrfs_backref_node *node)
+{
+ ASSERT(list_empty(&node->upper));
+
+ btrfs_backref_drop_node_buffer(node);
+ list_del_init(&node->list);
+ list_del_init(&node->lower);
+ if (!RB_EMPTY_NODE(&node->rb_node))
+ rb_erase(&node->rb_node, &tree->rb_root);
+ btrfs_backref_free_node(tree, node);
+}
+
+void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_node *node);
+
+void btrfs_backref_release_cache(struct btrfs_backref_cache *cache);
+
+static inline void btrfs_backref_panic(struct btrfs_fs_info *fs_info,
+ u64 bytenr, int errno)
+{
+ btrfs_panic(fs_info, errno,
+ "Inconsistency in backref cache found at offset %llu",
+ bytenr);
+}
+
+int btrfs_backref_add_tree_node(struct btrfs_backref_cache *cache,
+ struct btrfs_path *path,
+ struct btrfs_backref_iter *iter,
+ struct btrfs_key *node_key,
+ struct btrfs_backref_node *cur);
+
+int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_node *start);
+
+void btrfs_backref_error_cleanup(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_node *node);
+
+#endif
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
new file mode 100644
index 000000000..08017b180
--- /dev/null
+++ b/fs/btrfs/block-group.c
@@ -0,0 +1,4232 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/list_sort.h>
+#include "misc.h"
+#include "ctree.h"
+#include "block-group.h"
+#include "space-info.h"
+#include "disk-io.h"
+#include "free-space-cache.h"
+#include "free-space-tree.h"
+#include "volumes.h"
+#include "transaction.h"
+#include "ref-verify.h"
+#include "sysfs.h"
+#include "tree-log.h"
+#include "delalloc-space.h"
+#include "discard.h"
+#include "raid56.h"
+#include "zoned.h"
+
+/*
+ * Return target flags in extended format or 0 if restripe for this chunk_type
+ * is not in progress
+ *
+ * Should be called with balance_lock held
+ */
+static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
+{
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+ u64 target = 0;
+
+ if (!bctl)
+ return 0;
+
+ if (flags & BTRFS_BLOCK_GROUP_DATA &&
+ bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
+ } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
+ bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
+ } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
+ bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
+ }
+
+ return target;
+}
+
+/*
+ * @flags: available profiles in extended format (see ctree.h)
+ *
+ * Return reduced profile in chunk format. If profile changing is in progress
+ * (either running or paused) picks the target profile (if it's already
+ * available), otherwise falls back to plain reducing.
+ */
+static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
+{
+ u64 num_devices = fs_info->fs_devices->rw_devices;
+ u64 target;
+ u64 raid_type;
+ u64 allowed = 0;
+
+ /*
+ * See if restripe for this chunk_type is in progress, if so try to
+ * reduce to the target profile
+ */
+ spin_lock(&fs_info->balance_lock);
+ target = get_restripe_target(fs_info, flags);
+ if (target) {
+ spin_unlock(&fs_info->balance_lock);
+ return extended_to_chunk(target);
+ }
+ spin_unlock(&fs_info->balance_lock);
+
+ /* First, mask out the RAID levels which aren't possible */
+ for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
+ if (num_devices >= btrfs_raid_array[raid_type].devs_min)
+ allowed |= btrfs_raid_array[raid_type].bg_flag;
+ }
+ allowed &= flags;
+
+ /* Select the highest-redundancy RAID level. */
+ if (allowed & BTRFS_BLOCK_GROUP_RAID1C4)
+ allowed = BTRFS_BLOCK_GROUP_RAID1C4;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID6)
+ allowed = BTRFS_BLOCK_GROUP_RAID6;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID1C3)
+ allowed = BTRFS_BLOCK_GROUP_RAID1C3;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
+ allowed = BTRFS_BLOCK_GROUP_RAID5;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
+ allowed = BTRFS_BLOCK_GROUP_RAID10;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
+ allowed = BTRFS_BLOCK_GROUP_RAID1;
+ else if (allowed & BTRFS_BLOCK_GROUP_DUP)
+ allowed = BTRFS_BLOCK_GROUP_DUP;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
+ allowed = BTRFS_BLOCK_GROUP_RAID0;
+
+ flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+ return extended_to_chunk(flags | allowed);
+}
+
+u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
+{
+ unsigned seq;
+ u64 flags;
+
+ do {
+ flags = orig_flags;
+ seq = read_seqbegin(&fs_info->profiles_lock);
+
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ flags |= fs_info->avail_data_alloc_bits;
+ else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ flags |= fs_info->avail_system_alloc_bits;
+ else if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ flags |= fs_info->avail_metadata_alloc_bits;
+ } while (read_seqretry(&fs_info->profiles_lock, seq));
+
+ return btrfs_reduce_alloc_profile(fs_info, flags);
+}
+
+void btrfs_get_block_group(struct btrfs_block_group *cache)
+{
+ refcount_inc(&cache->refs);
+}
+
+void btrfs_put_block_group(struct btrfs_block_group *cache)
+{
+ if (refcount_dec_and_test(&cache->refs)) {
+ WARN_ON(cache->pinned > 0);
+ /*
+ * If there was a failure to cleanup a log tree, very likely due
+ * to an IO failure on a writeback attempt of one or more of its
+ * extent buffers, we could not do proper (and cheap) unaccounting
+ * of their reserved space, so don't warn on reserved > 0 in that
+ * case.
+ */
+ if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) ||
+ !BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info))
+ WARN_ON(cache->reserved > 0);
+
+ /*
+ * A block_group shouldn't be on the discard_list anymore.
+ * Remove the block_group from the discard_list to prevent us
+ * from causing a panic due to NULL pointer dereference.
+ */
+ if (WARN_ON(!list_empty(&cache->discard_list)))
+ btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
+ cache);
+
+ /*
+ * If not empty, someone is still holding mutex of
+ * full_stripe_lock, which can only be released by caller.
+ * And it will definitely cause use-after-free when caller
+ * tries to release full stripe lock.
+ *
+ * No better way to resolve, but only to warn.
+ */
+ WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
+ kfree(cache->free_space_ctl);
+ kfree(cache->physical_map);
+ kfree(cache);
+ }
+}
+
+/*
+ * This adds the block group to the fs_info rb tree for the block group cache
+ */
+static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
+ struct btrfs_block_group *block_group)
+{
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ struct btrfs_block_group *cache;
+ bool leftmost = true;
+
+ ASSERT(block_group->length != 0);
+
+ write_lock(&info->block_group_cache_lock);
+ p = &info->block_group_cache_tree.rb_root.rb_node;
+
+ while (*p) {
+ parent = *p;
+ cache = rb_entry(parent, struct btrfs_block_group, cache_node);
+ if (block_group->start < cache->start) {
+ p = &(*p)->rb_left;
+ } else if (block_group->start > cache->start) {
+ p = &(*p)->rb_right;
+ leftmost = false;
+ } else {
+ write_unlock(&info->block_group_cache_lock);
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(&block_group->cache_node, parent, p);
+ rb_insert_color_cached(&block_group->cache_node,
+ &info->block_group_cache_tree, leftmost);
+
+ write_unlock(&info->block_group_cache_lock);
+
+ return 0;
+}
+
+/*
+ * This will return the block group at or after bytenr if contains is 0, else
+ * it will return the block group that contains the bytenr
+ */
+static struct btrfs_block_group *block_group_cache_tree_search(
+ struct btrfs_fs_info *info, u64 bytenr, int contains)
+{
+ struct btrfs_block_group *cache, *ret = NULL;
+ struct rb_node *n;
+ u64 end, start;
+
+ read_lock(&info->block_group_cache_lock);
+ n = info->block_group_cache_tree.rb_root.rb_node;
+
+ while (n) {
+ cache = rb_entry(n, struct btrfs_block_group, cache_node);
+ end = cache->start + cache->length - 1;
+ start = cache->start;
+
+ if (bytenr < start) {
+ if (!contains && (!ret || start < ret->start))
+ ret = cache;
+ n = n->rb_left;
+ } else if (bytenr > start) {
+ if (contains && bytenr <= end) {
+ ret = cache;
+ break;
+ }
+ n = n->rb_right;
+ } else {
+ ret = cache;
+ break;
+ }
+ }
+ if (ret)
+ btrfs_get_block_group(ret);
+ read_unlock(&info->block_group_cache_lock);
+
+ return ret;
+}
+
+/*
+ * Return the block group that starts at or after bytenr
+ */
+struct btrfs_block_group *btrfs_lookup_first_block_group(
+ struct btrfs_fs_info *info, u64 bytenr)
+{
+ return block_group_cache_tree_search(info, bytenr, 0);
+}
+
+/*
+ * Return the block group that contains the given bytenr
+ */
+struct btrfs_block_group *btrfs_lookup_block_group(
+ struct btrfs_fs_info *info, u64 bytenr)
+{
+ return block_group_cache_tree_search(info, bytenr, 1);
+}
+
+struct btrfs_block_group *btrfs_next_block_group(
+ struct btrfs_block_group *cache)
+{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct rb_node *node;
+
+ read_lock(&fs_info->block_group_cache_lock);
+
+ /* If our block group was removed, we need a full search. */
+ if (RB_EMPTY_NODE(&cache->cache_node)) {
+ const u64 next_bytenr = cache->start + cache->length;
+
+ read_unlock(&fs_info->block_group_cache_lock);
+ btrfs_put_block_group(cache);
+ return btrfs_lookup_first_block_group(fs_info, next_bytenr);
+ }
+ node = rb_next(&cache->cache_node);
+ btrfs_put_block_group(cache);
+ if (node) {
+ cache = rb_entry(node, struct btrfs_block_group, cache_node);
+ btrfs_get_block_group(cache);
+ } else
+ cache = NULL;
+ read_unlock(&fs_info->block_group_cache_lock);
+ return cache;
+}
+
+/**
+ * Check if we can do a NOCOW write for a given extent.
+ *
+ * @fs_info: The filesystem information object.
+ * @bytenr: Logical start address of the extent.
+ *
+ * Check if we can do a NOCOW write for the given extent, and increments the
+ * number of NOCOW writers in the block group that contains the extent, as long
+ * as the block group exists and it's currently not in read-only mode.
+ *
+ * Returns: A non-NULL block group pointer if we can do a NOCOW write, the caller
+ * is responsible for calling btrfs_dec_nocow_writers() later.
+ *
+ * Or NULL if we can not do a NOCOW write
+ */
+struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
+ u64 bytenr)
+{
+ struct btrfs_block_group *bg;
+ bool can_nocow = true;
+
+ bg = btrfs_lookup_block_group(fs_info, bytenr);
+ if (!bg)
+ return NULL;
+
+ spin_lock(&bg->lock);
+ if (bg->ro)
+ can_nocow = false;
+ else
+ atomic_inc(&bg->nocow_writers);
+ spin_unlock(&bg->lock);
+
+ if (!can_nocow) {
+ btrfs_put_block_group(bg);
+ return NULL;
+ }
+
+ /* No put on block group, done by btrfs_dec_nocow_writers(). */
+ return bg;
+}
+
+/**
+ * Decrement the number of NOCOW writers in a block group.
+ *
+ * @bg: The block group.
+ *
+ * This is meant to be called after a previous call to btrfs_inc_nocow_writers(),
+ * and on the block group returned by that call. Typically this is called after
+ * creating an ordered extent for a NOCOW write, to prevent races with scrub and
+ * relocation.
+ *
+ * After this call, the caller should not use the block group anymore. It it wants
+ * to use it, then it should get a reference on it before calling this function.
+ */
+void btrfs_dec_nocow_writers(struct btrfs_block_group *bg)
+{
+ if (atomic_dec_and_test(&bg->nocow_writers))
+ wake_up_var(&bg->nocow_writers);
+
+ /* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */
+ btrfs_put_block_group(bg);
+}
+
+void btrfs_wait_nocow_writers(struct btrfs_block_group *bg)
+{
+ wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
+}
+
+void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
+ const u64 start)
+{
+ struct btrfs_block_group *bg;
+
+ bg = btrfs_lookup_block_group(fs_info, start);
+ ASSERT(bg);
+ if (atomic_dec_and_test(&bg->reservations))
+ wake_up_var(&bg->reservations);
+ btrfs_put_block_group(bg);
+}
+
+void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg)
+{
+ struct btrfs_space_info *space_info = bg->space_info;
+
+ ASSERT(bg->ro);
+
+ if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
+ return;
+
+ /*
+ * Our block group is read only but before we set it to read only,
+ * some task might have had allocated an extent from it already, but it
+ * has not yet created a respective ordered extent (and added it to a
+ * root's list of ordered extents).
+ * Therefore wait for any task currently allocating extents, since the
+ * block group's reservations counter is incremented while a read lock
+ * on the groups' semaphore is held and decremented after releasing
+ * the read access on that semaphore and creating the ordered extent.
+ */
+ down_write(&space_info->groups_sem);
+ up_write(&space_info->groups_sem);
+
+ wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
+}
+
+struct btrfs_caching_control *btrfs_get_caching_control(
+ struct btrfs_block_group *cache)
+{
+ struct btrfs_caching_control *ctl;
+
+ spin_lock(&cache->lock);
+ if (!cache->caching_ctl) {
+ spin_unlock(&cache->lock);
+ return NULL;
+ }
+
+ ctl = cache->caching_ctl;
+ refcount_inc(&ctl->count);
+ spin_unlock(&cache->lock);
+ return ctl;
+}
+
+void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
+{
+ if (refcount_dec_and_test(&ctl->count))
+ kfree(ctl);
+}
+
+/*
+ * When we wait for progress in the block group caching, its because our
+ * allocation attempt failed at least once. So, we must sleep and let some
+ * progress happen before we try again.
+ *
+ * This function will sleep at least once waiting for new free space to show
+ * up, and then it will check the block group free space numbers for our min
+ * num_bytes. Another option is to have it go ahead and look in the rbtree for
+ * a free extent of a given size, but this is a good start.
+ *
+ * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
+ * any of the information in this block group.
+ */
+void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
+ u64 num_bytes)
+{
+ struct btrfs_caching_control *caching_ctl;
+ int progress;
+
+ caching_ctl = btrfs_get_caching_control(cache);
+ if (!caching_ctl)
+ return;
+
+ /*
+ * We've already failed to allocate from this block group, so even if
+ * there's enough space in the block group it isn't contiguous enough to
+ * allow for an allocation, so wait for at least the next wakeup tick,
+ * or for the thing to be done.
+ */
+ progress = atomic_read(&caching_ctl->progress);
+
+ wait_event(caching_ctl->wait, btrfs_block_group_done(cache) ||
+ (progress != atomic_read(&caching_ctl->progress) &&
+ (cache->free_space_ctl->free_space >= num_bytes)));
+
+ btrfs_put_caching_control(caching_ctl);
+}
+
+static int btrfs_caching_ctl_wait_done(struct btrfs_block_group *cache,
+ struct btrfs_caching_control *caching_ctl)
+{
+ wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
+ return cache->cached == BTRFS_CACHE_ERROR ? -EIO : 0;
+}
+
+static int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
+{
+ struct btrfs_caching_control *caching_ctl;
+ int ret;
+
+ caching_ctl = btrfs_get_caching_control(cache);
+ if (!caching_ctl)
+ return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
+ ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
+ btrfs_put_caching_control(caching_ctl);
+ return ret;
+}
+
+#ifdef CONFIG_BTRFS_DEBUG
+static void fragment_free_space(struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ u64 start = block_group->start;
+ u64 len = block_group->length;
+ u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
+ fs_info->nodesize : fs_info->sectorsize;
+ u64 step = chunk << 1;
+
+ while (len > chunk) {
+ btrfs_remove_free_space(block_group, start, chunk);
+ start += step;
+ if (len < step)
+ len = 0;
+ else
+ len -= step;
+ }
+}
+#endif
+
+/*
+ * This is only called by btrfs_cache_block_group, since we could have freed
+ * extents we need to check the pinned_extents for any extents that can't be
+ * used yet since their free space will be released as soon as the transaction
+ * commits.
+ */
+int add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end,
+ u64 *total_added_ret)
+{
+ struct btrfs_fs_info *info = block_group->fs_info;
+ u64 extent_start, extent_end, size;
+ int ret;
+
+ if (total_added_ret)
+ *total_added_ret = 0;
+
+ while (start < end) {
+ ret = find_first_extent_bit(&info->excluded_extents, start,
+ &extent_start, &extent_end,
+ EXTENT_DIRTY | EXTENT_UPTODATE,
+ NULL);
+ if (ret)
+ break;
+
+ if (extent_start <= start) {
+ start = extent_end + 1;
+ } else if (extent_start > start && extent_start < end) {
+ size = extent_start - start;
+ ret = btrfs_add_free_space_async_trimmed(block_group,
+ start, size);
+ if (ret)
+ return ret;
+ if (total_added_ret)
+ *total_added_ret += size;
+ start = extent_end + 1;
+ } else {
+ break;
+ }
+ }
+
+ if (start < end) {
+ size = end - start;
+ ret = btrfs_add_free_space_async_trimmed(block_group, start,
+ size);
+ if (ret)
+ return ret;
+ if (total_added_ret)
+ *total_added_ret += size;
+ }
+
+ return 0;
+}
+
+static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
+{
+ struct btrfs_block_group *block_group = caching_ctl->block_group;
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_root *extent_root;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ u64 total_found = 0;
+ u64 last = 0;
+ u32 nritems;
+ int ret;
+ bool wakeup = true;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
+ extent_root = btrfs_extent_root(fs_info, last);
+
+#ifdef CONFIG_BTRFS_DEBUG
+ /*
+ * If we're fragmenting we don't want to make anybody think we can
+ * allocate from this block group until we've had a chance to fragment
+ * the free space.
+ */
+ if (btrfs_should_fragment_free_space(block_group))
+ wakeup = false;
+#endif
+ /*
+ * We don't want to deadlock with somebody trying to allocate a new
+ * extent for the extent root while also trying to search the extent
+ * root to add free space. So we skip locking and search the commit
+ * root, since its read-only
+ */
+ path->skip_locking = 1;
+ path->search_commit_root = 1;
+ path->reada = READA_FORWARD;
+
+ key.objectid = last;
+ key.offset = 0;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+
+next:
+ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+
+ while (1) {
+ if (btrfs_fs_closing(fs_info) > 1) {
+ last = (u64)-1;
+ break;
+ }
+
+ if (path->slots[0] < nritems) {
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ } else {
+ ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
+ if (ret)
+ break;
+
+ if (need_resched() ||
+ rwsem_is_contended(&fs_info->commit_root_sem)) {
+ btrfs_release_path(path);
+ up_read(&fs_info->commit_root_sem);
+ mutex_unlock(&caching_ctl->mutex);
+ cond_resched();
+ mutex_lock(&caching_ctl->mutex);
+ down_read(&fs_info->commit_root_sem);
+ goto next;
+ }
+
+ ret = btrfs_next_leaf(extent_root, path);
+ if (ret < 0)
+ goto out;
+ if (ret)
+ break;
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ continue;
+ }
+
+ if (key.objectid < last) {
+ key.objectid = last;
+ key.offset = 0;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ btrfs_release_path(path);
+ goto next;
+ }
+
+ if (key.objectid < block_group->start) {
+ path->slots[0]++;
+ continue;
+ }
+
+ if (key.objectid >= block_group->start + block_group->length)
+ break;
+
+ if (key.type == BTRFS_EXTENT_ITEM_KEY ||
+ key.type == BTRFS_METADATA_ITEM_KEY) {
+ u64 space_added;
+
+ ret = add_new_free_space(block_group, last, key.objectid,
+ &space_added);
+ if (ret)
+ goto out;
+ total_found += space_added;
+ if (key.type == BTRFS_METADATA_ITEM_KEY)
+ last = key.objectid +
+ fs_info->nodesize;
+ else
+ last = key.objectid + key.offset;
+
+ if (total_found > CACHING_CTL_WAKE_UP) {
+ total_found = 0;
+ if (wakeup) {
+ atomic_inc(&caching_ctl->progress);
+ wake_up(&caching_ctl->wait);
+ }
+ }
+ }
+ path->slots[0]++;
+ }
+
+ ret = add_new_free_space(block_group, last,
+ block_group->start + block_group->length,
+ NULL);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static noinline void caching_thread(struct btrfs_work *work)
+{
+ struct btrfs_block_group *block_group;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_caching_control *caching_ctl;
+ int ret;
+
+ caching_ctl = container_of(work, struct btrfs_caching_control, work);
+ block_group = caching_ctl->block_group;
+ fs_info = block_group->fs_info;
+
+ mutex_lock(&caching_ctl->mutex);
+ down_read(&fs_info->commit_root_sem);
+
+ if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
+ ret = load_free_space_cache(block_group);
+ if (ret == 1) {
+ ret = 0;
+ goto done;
+ }
+
+ /*
+ * We failed to load the space cache, set ourselves to
+ * CACHE_STARTED and carry on.
+ */
+ spin_lock(&block_group->lock);
+ block_group->cached = BTRFS_CACHE_STARTED;
+ spin_unlock(&block_group->lock);
+ wake_up(&caching_ctl->wait);
+ }
+
+ /*
+ * If we are in the transaction that populated the free space tree we
+ * can't actually cache from the free space tree as our commit root and
+ * real root are the same, so we could change the contents of the blocks
+ * while caching. Instead do the slow caching in this case, and after
+ * the transaction has committed we will be safe.
+ */
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
+ !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
+ ret = load_free_space_tree(caching_ctl);
+ else
+ ret = load_extent_tree_free(caching_ctl);
+done:
+ spin_lock(&block_group->lock);
+ block_group->caching_ctl = NULL;
+ block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
+ spin_unlock(&block_group->lock);
+
+#ifdef CONFIG_BTRFS_DEBUG
+ if (btrfs_should_fragment_free_space(block_group)) {
+ u64 bytes_used;
+
+ spin_lock(&block_group->space_info->lock);
+ spin_lock(&block_group->lock);
+ bytes_used = block_group->length - block_group->used;
+ block_group->space_info->bytes_used += bytes_used >> 1;
+ spin_unlock(&block_group->lock);
+ spin_unlock(&block_group->space_info->lock);
+ fragment_free_space(block_group);
+ }
+#endif
+
+ up_read(&fs_info->commit_root_sem);
+ btrfs_free_excluded_extents(block_group);
+ mutex_unlock(&caching_ctl->mutex);
+
+ wake_up(&caching_ctl->wait);
+
+ btrfs_put_caching_control(caching_ctl);
+ btrfs_put_block_group(block_group);
+}
+
+int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
+{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct btrfs_caching_control *caching_ctl = NULL;
+ int ret = 0;
+
+ /* Allocator for zoned filesystems does not use the cache at all */
+ if (btrfs_is_zoned(fs_info))
+ return 0;
+
+ caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
+ if (!caching_ctl)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&caching_ctl->list);
+ mutex_init(&caching_ctl->mutex);
+ init_waitqueue_head(&caching_ctl->wait);
+ caching_ctl->block_group = cache;
+ refcount_set(&caching_ctl->count, 2);
+ atomic_set(&caching_ctl->progress, 0);
+ btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
+
+ spin_lock(&cache->lock);
+ if (cache->cached != BTRFS_CACHE_NO) {
+ kfree(caching_ctl);
+
+ caching_ctl = cache->caching_ctl;
+ if (caching_ctl)
+ refcount_inc(&caching_ctl->count);
+ spin_unlock(&cache->lock);
+ goto out;
+ }
+ WARN_ON(cache->caching_ctl);
+ cache->caching_ctl = caching_ctl;
+ cache->cached = BTRFS_CACHE_STARTED;
+ spin_unlock(&cache->lock);
+
+ write_lock(&fs_info->block_group_cache_lock);
+ refcount_inc(&caching_ctl->count);
+ list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
+ write_unlock(&fs_info->block_group_cache_lock);
+
+ btrfs_get_block_group(cache);
+
+ btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
+out:
+ if (wait && caching_ctl)
+ ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
+ if (caching_ctl)
+ btrfs_put_caching_control(caching_ctl);
+
+ return ret;
+}
+
+static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+ u64 extra_flags = chunk_to_extended(flags) &
+ BTRFS_EXTENDED_PROFILE_MASK;
+
+ write_seqlock(&fs_info->profiles_lock);
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ fs_info->avail_data_alloc_bits &= ~extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ fs_info->avail_metadata_alloc_bits &= ~extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ fs_info->avail_system_alloc_bits &= ~extra_flags;
+ write_sequnlock(&fs_info->profiles_lock);
+}
+
+/*
+ * Clear incompat bits for the following feature(s):
+ *
+ * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
+ * in the whole filesystem
+ *
+ * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups
+ */
+static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+ bool found_raid56 = false;
+ bool found_raid1c34 = false;
+
+ if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) ||
+ (flags & BTRFS_BLOCK_GROUP_RAID1C3) ||
+ (flags & BTRFS_BLOCK_GROUP_RAID1C4)) {
+ struct list_head *head = &fs_info->space_info;
+ struct btrfs_space_info *sinfo;
+
+ list_for_each_entry_rcu(sinfo, head, list) {
+ down_read(&sinfo->groups_sem);
+ if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
+ found_raid56 = true;
+ if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
+ found_raid56 = true;
+ if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3]))
+ found_raid1c34 = true;
+ if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4]))
+ found_raid1c34 = true;
+ up_read(&sinfo->groups_sem);
+ }
+ if (!found_raid56)
+ btrfs_clear_fs_incompat(fs_info, RAID56);
+ if (!found_raid1c34)
+ btrfs_clear_fs_incompat(fs_info, RAID1C34);
+ }
+}
+
+static int remove_block_group_item(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *root;
+ struct btrfs_key key;
+ int ret;
+
+ root = btrfs_block_group_root(fs_info);
+ key.objectid = block_group->start;
+ key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ key.offset = block_group->length;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret < 0)
+ return ret;
+
+ ret = btrfs_del_item(trans, root, path);
+ return ret;
+}
+
+int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
+ u64 group_start, struct extent_map *em)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_path *path;
+ struct btrfs_block_group *block_group;
+ struct btrfs_free_cluster *cluster;
+ struct inode *inode;
+ struct kobject *kobj = NULL;
+ int ret;
+ int index;
+ int factor;
+ struct btrfs_caching_control *caching_ctl = NULL;
+ bool remove_em;
+ bool remove_rsv = false;
+
+ block_group = btrfs_lookup_block_group(fs_info, group_start);
+ BUG_ON(!block_group);
+ BUG_ON(!block_group->ro);
+
+ trace_btrfs_remove_block_group(block_group);
+ /*
+ * Free the reserved super bytes from this block group before
+ * remove it.
+ */
+ btrfs_free_excluded_extents(block_group);
+ btrfs_free_ref_tree_range(fs_info, block_group->start,
+ block_group->length);
+
+ index = btrfs_bg_flags_to_raid_index(block_group->flags);
+ factor = btrfs_bg_type_to_factor(block_group->flags);
+
+ /* make sure this block group isn't part of an allocation cluster */
+ cluster = &fs_info->data_alloc_cluster;
+ spin_lock(&cluster->refill_lock);
+ btrfs_return_cluster_to_free_space(block_group, cluster);
+ spin_unlock(&cluster->refill_lock);
+
+ /*
+ * make sure this block group isn't part of a metadata
+ * allocation cluster
+ */
+ cluster = &fs_info->meta_alloc_cluster;
+ spin_lock(&cluster->refill_lock);
+ btrfs_return_cluster_to_free_space(block_group, cluster);
+ spin_unlock(&cluster->refill_lock);
+
+ btrfs_clear_treelog_bg(block_group);
+ btrfs_clear_data_reloc_bg(block_group);
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * get the inode first so any iput calls done for the io_list
+ * aren't the final iput (no unlinks allowed now)
+ */
+ inode = lookup_free_space_inode(block_group, path);
+
+ mutex_lock(&trans->transaction->cache_write_mutex);
+ /*
+ * Make sure our free space cache IO is done before removing the
+ * free space inode
+ */
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ if (!list_empty(&block_group->io_list)) {
+ list_del_init(&block_group->io_list);
+
+ WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
+
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+ btrfs_wait_cache_io(trans, block_group, path);
+ btrfs_put_block_group(block_group);
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ }
+
+ if (!list_empty(&block_group->dirty_list)) {
+ list_del_init(&block_group->dirty_list);
+ remove_rsv = true;
+ btrfs_put_block_group(block_group);
+ }
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+ mutex_unlock(&trans->transaction->cache_write_mutex);
+
+ ret = btrfs_remove_free_space_inode(trans, inode, block_group);
+ if (ret)
+ goto out;
+
+ write_lock(&fs_info->block_group_cache_lock);
+ rb_erase_cached(&block_group->cache_node,
+ &fs_info->block_group_cache_tree);
+ RB_CLEAR_NODE(&block_group->cache_node);
+
+ /* Once for the block groups rbtree */
+ btrfs_put_block_group(block_group);
+
+ write_unlock(&fs_info->block_group_cache_lock);
+
+ down_write(&block_group->space_info->groups_sem);
+ /*
+ * we must use list_del_init so people can check to see if they
+ * are still on the list after taking the semaphore
+ */
+ list_del_init(&block_group->list);
+ if (list_empty(&block_group->space_info->block_groups[index])) {
+ kobj = block_group->space_info->block_group_kobjs[index];
+ block_group->space_info->block_group_kobjs[index] = NULL;
+ clear_avail_alloc_bits(fs_info, block_group->flags);
+ }
+ up_write(&block_group->space_info->groups_sem);
+ clear_incompat_bg_bits(fs_info, block_group->flags);
+ if (kobj) {
+ kobject_del(kobj);
+ kobject_put(kobj);
+ }
+
+ if (block_group->cached == BTRFS_CACHE_STARTED)
+ btrfs_wait_block_group_cache_done(block_group);
+
+ write_lock(&fs_info->block_group_cache_lock);
+ caching_ctl = btrfs_get_caching_control(block_group);
+ if (!caching_ctl) {
+ struct btrfs_caching_control *ctl;
+
+ list_for_each_entry(ctl, &fs_info->caching_block_groups, list) {
+ if (ctl->block_group == block_group) {
+ caching_ctl = ctl;
+ refcount_inc(&caching_ctl->count);
+ break;
+ }
+ }
+ }
+ if (caching_ctl)
+ list_del_init(&caching_ctl->list);
+ write_unlock(&fs_info->block_group_cache_lock);
+
+ if (caching_ctl) {
+ /* Once for the caching bgs list and once for us. */
+ btrfs_put_caching_control(caching_ctl);
+ btrfs_put_caching_control(caching_ctl);
+ }
+
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ WARN_ON(!list_empty(&block_group->dirty_list));
+ WARN_ON(!list_empty(&block_group->io_list));
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+
+ btrfs_remove_free_space_cache(block_group);
+
+ spin_lock(&block_group->space_info->lock);
+ list_del_init(&block_group->ro_list);
+
+ if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
+ WARN_ON(block_group->space_info->total_bytes
+ < block_group->length);
+ WARN_ON(block_group->space_info->bytes_readonly
+ < block_group->length - block_group->zone_unusable);
+ WARN_ON(block_group->space_info->bytes_zone_unusable
+ < block_group->zone_unusable);
+ WARN_ON(block_group->space_info->disk_total
+ < block_group->length * factor);
+ WARN_ON(test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ &block_group->runtime_flags) &&
+ block_group->space_info->active_total_bytes
+ < block_group->length);
+ }
+ block_group->space_info->total_bytes -= block_group->length;
+ if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
+ block_group->space_info->active_total_bytes -= block_group->length;
+ block_group->space_info->bytes_readonly -=
+ (block_group->length - block_group->zone_unusable);
+ block_group->space_info->bytes_zone_unusable -=
+ block_group->zone_unusable;
+ block_group->space_info->disk_total -= block_group->length * factor;
+
+ spin_unlock(&block_group->space_info->lock);
+
+ /*
+ * Remove the free space for the block group from the free space tree
+ * and the block group's item from the extent tree before marking the
+ * block group as removed. This is to prevent races with tasks that
+ * freeze and unfreeze a block group, this task and another task
+ * allocating a new block group - the unfreeze task ends up removing
+ * the block group's extent map before the task calling this function
+ * deletes the block group item from the extent tree, allowing for
+ * another task to attempt to create another block group with the same
+ * item key (and failing with -EEXIST and a transaction abort).
+ */
+ ret = remove_block_group_free_space(trans, block_group);
+ if (ret)
+ goto out;
+
+ ret = remove_block_group_item(trans, path, block_group);
+ if (ret < 0)
+ goto out;
+
+ spin_lock(&block_group->lock);
+ set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags);
+
+ /*
+ * At this point trimming or scrub can't start on this block group,
+ * because we removed the block group from the rbtree
+ * fs_info->block_group_cache_tree so no one can't find it anymore and
+ * even if someone already got this block group before we removed it
+ * from the rbtree, they have already incremented block_group->frozen -
+ * if they didn't, for the trimming case they won't find any free space
+ * entries because we already removed them all when we called
+ * btrfs_remove_free_space_cache().
+ *
+ * And we must not remove the extent map from the fs_info->mapping_tree
+ * to prevent the same logical address range and physical device space
+ * ranges from being reused for a new block group. This is needed to
+ * avoid races with trimming and scrub.
+ *
+ * An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
+ * completely transactionless, so while it is trimming a range the
+ * currently running transaction might finish and a new one start,
+ * allowing for new block groups to be created that can reuse the same
+ * physical device locations unless we take this special care.
+ *
+ * There may also be an implicit trim operation if the file system
+ * is mounted with -odiscard. The same protections must remain
+ * in place until the extents have been discarded completely when
+ * the transaction commit has completed.
+ */
+ remove_em = (atomic_read(&block_group->frozen) == 0);
+ spin_unlock(&block_group->lock);
+
+ if (remove_em) {
+ struct extent_map_tree *em_tree;
+
+ em_tree = &fs_info->mapping_tree;
+ write_lock(&em_tree->lock);
+ remove_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+ /* once for the tree */
+ free_extent_map(em);
+ }
+
+out:
+ /* Once for the lookup reference */
+ btrfs_put_block_group(block_group);
+ if (remove_rsv)
+ btrfs_delayed_refs_rsv_release(fs_info, 1);
+ btrfs_free_path(path);
+ return ret;
+}
+
+struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
+ struct btrfs_fs_info *fs_info, const u64 chunk_offset)
+{
+ struct btrfs_root *root = btrfs_block_group_root(fs_info);
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ unsigned int num_items;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, chunk_offset, 1);
+ read_unlock(&em_tree->lock);
+ ASSERT(em && em->start == chunk_offset);
+
+ /*
+ * We need to reserve 3 + N units from the metadata space info in order
+ * to remove a block group (done at btrfs_remove_chunk() and at
+ * btrfs_remove_block_group()), which are used for:
+ *
+ * 1 unit for adding the free space inode's orphan (located in the tree
+ * of tree roots).
+ * 1 unit for deleting the block group item (located in the extent
+ * tree).
+ * 1 unit for deleting the free space item (located in tree of tree
+ * roots).
+ * N units for deleting N device extent items corresponding to each
+ * stripe (located in the device tree).
+ *
+ * In order to remove a block group we also need to reserve units in the
+ * system space info in order to update the chunk tree (update one or
+ * more device items and remove one chunk item), but this is done at
+ * btrfs_remove_chunk() through a call to check_system_chunk().
+ */
+ map = em->map_lookup;
+ num_items = 3 + map->num_stripes;
+ free_extent_map(em);
+
+ return btrfs_start_transaction_fallback_global_rsv(root, num_items);
+}
+
+/*
+ * Mark block group @cache read-only, so later write won't happen to block
+ * group @cache.
+ *
+ * If @force is not set, this function will only mark the block group readonly
+ * if we have enough free space (1M) in other metadata/system block groups.
+ * If @force is not set, this function will mark the block group readonly
+ * without checking free space.
+ *
+ * NOTE: This function doesn't care if other block groups can contain all the
+ * data in this block group. That check should be done by relocation routine,
+ * not this function.
+ */
+static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
+{
+ struct btrfs_space_info *sinfo = cache->space_info;
+ u64 num_bytes;
+ int ret = -ENOSPC;
+
+ spin_lock(&sinfo->lock);
+ spin_lock(&cache->lock);
+
+ if (cache->swap_extents) {
+ ret = -ETXTBSY;
+ goto out;
+ }
+
+ if (cache->ro) {
+ cache->ro++;
+ ret = 0;
+ goto out;
+ }
+
+ num_bytes = cache->length - cache->reserved - cache->pinned -
+ cache->bytes_super - cache->zone_unusable - cache->used;
+
+ /*
+ * Data never overcommits, even in mixed mode, so do just the straight
+ * check of left over space in how much we have allocated.
+ */
+ if (force) {
+ ret = 0;
+ } else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
+ u64 sinfo_used = btrfs_space_info_used(sinfo, true);
+
+ /*
+ * Here we make sure if we mark this bg RO, we still have enough
+ * free space as buffer.
+ */
+ if (sinfo_used + num_bytes <= sinfo->total_bytes)
+ ret = 0;
+ } else {
+ /*
+ * We overcommit metadata, so we need to do the
+ * btrfs_can_overcommit check here, and we need to pass in
+ * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
+ * leeway to allow us to mark this block group as read only.
+ */
+ if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
+ BTRFS_RESERVE_NO_FLUSH))
+ ret = 0;
+ }
+
+ if (!ret) {
+ sinfo->bytes_readonly += num_bytes;
+ if (btrfs_is_zoned(cache->fs_info)) {
+ /* Migrate zone_unusable bytes to readonly */
+ sinfo->bytes_readonly += cache->zone_unusable;
+ sinfo->bytes_zone_unusable -= cache->zone_unusable;
+ cache->zone_unusable = 0;
+ }
+ cache->ro++;
+ list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
+ }
+out:
+ spin_unlock(&cache->lock);
+ spin_unlock(&sinfo->lock);
+ if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
+ btrfs_info(cache->fs_info,
+ "unable to make block group %llu ro", cache->start);
+ btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
+ }
+ return ret;
+}
+
+static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *bg)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+ struct btrfs_transaction *prev_trans = NULL;
+ const u64 start = bg->start;
+ const u64 end = start + bg->length - 1;
+ int ret;
+
+ spin_lock(&fs_info->trans_lock);
+ if (trans->transaction->list.prev != &fs_info->trans_list) {
+ prev_trans = list_last_entry(&trans->transaction->list,
+ struct btrfs_transaction, list);
+ refcount_inc(&prev_trans->use_count);
+ }
+ spin_unlock(&fs_info->trans_lock);
+
+ /*
+ * Hold the unused_bg_unpin_mutex lock to avoid racing with
+ * btrfs_finish_extent_commit(). If we are at transaction N, another
+ * task might be running finish_extent_commit() for the previous
+ * transaction N - 1, and have seen a range belonging to the block
+ * group in pinned_extents before we were able to clear the whole block
+ * group range from pinned_extents. This means that task can lookup for
+ * the block group after we unpinned it from pinned_extents and removed
+ * it, leading to a BUG_ON() at unpin_extent_range().
+ */
+ mutex_lock(&fs_info->unused_bg_unpin_mutex);
+ if (prev_trans) {
+ ret = clear_extent_bits(&prev_trans->pinned_extents, start, end,
+ EXTENT_DIRTY);
+ if (ret)
+ goto out;
+ }
+
+ ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end,
+ EXTENT_DIRTY);
+out:
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+ if (prev_trans)
+ btrfs_put_transaction(prev_trans);
+
+ return ret == 0;
+}
+
+/*
+ * Process the unused_bgs list and remove any that don't have any allocated
+ * space inside of them.
+ */
+void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_group *block_group;
+ struct btrfs_space_info *space_info;
+ struct btrfs_trans_handle *trans;
+ const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
+ int ret = 0;
+
+ if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
+ return;
+
+ if (btrfs_fs_closing(fs_info))
+ return;
+
+ /*
+ * Long running balances can keep us blocked here for eternity, so
+ * simply skip deletion if we're unable to get the mutex.
+ */
+ if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
+ return;
+
+ spin_lock(&fs_info->unused_bgs_lock);
+ while (!list_empty(&fs_info->unused_bgs)) {
+ int trimming;
+
+ block_group = list_first_entry(&fs_info->unused_bgs,
+ struct btrfs_block_group,
+ bg_list);
+ list_del_init(&block_group->bg_list);
+
+ space_info = block_group->space_info;
+
+ if (ret || btrfs_mixed_space_info(space_info)) {
+ btrfs_put_block_group(block_group);
+ continue;
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+
+ btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
+
+ /* Don't want to race with allocators so take the groups_sem */
+ down_write(&space_info->groups_sem);
+
+ /*
+ * Async discard moves the final block group discard to be prior
+ * to the unused_bgs code path. Therefore, if it's not fully
+ * trimmed, punt it back to the async discard lists.
+ */
+ if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
+ !btrfs_is_free_space_trimmed(block_group)) {
+ trace_btrfs_skip_unused_block_group(block_group);
+ up_write(&space_info->groups_sem);
+ /* Requeue if we failed because of async discard */
+ btrfs_discard_queue_work(&fs_info->discard_ctl,
+ block_group);
+ goto next;
+ }
+
+ spin_lock(&block_group->lock);
+ if (block_group->reserved || block_group->pinned ||
+ block_group->used || block_group->ro ||
+ list_is_singular(&block_group->list)) {
+ /*
+ * We want to bail if we made new allocations or have
+ * outstanding allocations in this block group. We do
+ * the ro check in case balance is currently acting on
+ * this block group.
+ */
+ trace_btrfs_skip_unused_block_group(block_group);
+ spin_unlock(&block_group->lock);
+ up_write(&space_info->groups_sem);
+ goto next;
+ }
+ spin_unlock(&block_group->lock);
+
+ /* We don't want to force the issue, only flip if it's ok. */
+ ret = inc_block_group_ro(block_group, 0);
+ up_write(&space_info->groups_sem);
+ if (ret < 0) {
+ ret = 0;
+ goto next;
+ }
+
+ ret = btrfs_zone_finish(block_group);
+ if (ret < 0) {
+ btrfs_dec_block_group_ro(block_group);
+ if (ret == -EAGAIN)
+ ret = 0;
+ goto next;
+ }
+
+ /*
+ * Want to do this before we do anything else so we can recover
+ * properly if we fail to join the transaction.
+ */
+ trans = btrfs_start_trans_remove_block_group(fs_info,
+ block_group->start);
+ if (IS_ERR(trans)) {
+ btrfs_dec_block_group_ro(block_group);
+ ret = PTR_ERR(trans);
+ goto next;
+ }
+
+ /*
+ * We could have pending pinned extents for this block group,
+ * just delete them, we don't care about them anymore.
+ */
+ if (!clean_pinned_extents(trans, block_group)) {
+ btrfs_dec_block_group_ro(block_group);
+ goto end_trans;
+ }
+
+ /*
+ * At this point, the block_group is read only and should fail
+ * new allocations. However, btrfs_finish_extent_commit() can
+ * cause this block_group to be placed back on the discard
+ * lists because now the block_group isn't fully discarded.
+ * Bail here and try again later after discarding everything.
+ */
+ spin_lock(&fs_info->discard_ctl.lock);
+ if (!list_empty(&block_group->discard_list)) {
+ spin_unlock(&fs_info->discard_ctl.lock);
+ btrfs_dec_block_group_ro(block_group);
+ btrfs_discard_queue_work(&fs_info->discard_ctl,
+ block_group);
+ goto end_trans;
+ }
+ spin_unlock(&fs_info->discard_ctl.lock);
+
+ /* Reset pinned so btrfs_put_block_group doesn't complain */
+ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+
+ btrfs_space_info_update_bytes_pinned(fs_info, space_info,
+ -block_group->pinned);
+ space_info->bytes_readonly += block_group->pinned;
+ block_group->pinned = 0;
+
+ spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
+
+ /*
+ * The normal path here is an unused block group is passed here,
+ * then trimming is handled in the transaction commit path.
+ * Async discard interposes before this to do the trimming
+ * before coming down the unused block group path as trimming
+ * will no longer be done later in the transaction commit path.
+ */
+ if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
+ goto flip_async;
+
+ /*
+ * DISCARD can flip during remount. On zoned filesystems, we
+ * need to reset sequential-required zones.
+ */
+ trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) ||
+ btrfs_is_zoned(fs_info);
+
+ /* Implicit trim during transaction commit. */
+ if (trimming)
+ btrfs_freeze_block_group(block_group);
+
+ /*
+ * Btrfs_remove_chunk will abort the transaction if things go
+ * horribly wrong.
+ */
+ ret = btrfs_remove_chunk(trans, block_group->start);
+
+ if (ret) {
+ if (trimming)
+ btrfs_unfreeze_block_group(block_group);
+ goto end_trans;
+ }
+
+ /*
+ * If we're not mounted with -odiscard, we can just forget
+ * about this block group. Otherwise we'll need to wait
+ * until transaction commit to do the actual discard.
+ */
+ if (trimming) {
+ spin_lock(&fs_info->unused_bgs_lock);
+ /*
+ * A concurrent scrub might have added us to the list
+ * fs_info->unused_bgs, so use a list_move operation
+ * to add the block group to the deleted_bgs list.
+ */
+ list_move(&block_group->bg_list,
+ &trans->transaction->deleted_bgs);
+ spin_unlock(&fs_info->unused_bgs_lock);
+ btrfs_get_block_group(block_group);
+ }
+end_trans:
+ btrfs_end_transaction(trans);
+next:
+ btrfs_put_block_group(block_group);
+ spin_lock(&fs_info->unused_bgs_lock);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ return;
+
+flip_async:
+ btrfs_end_transaction(trans);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ btrfs_put_block_group(block_group);
+ btrfs_discard_punt_unused_bgs_list(fs_info);
+}
+
+void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ spin_lock(&fs_info->unused_bgs_lock);
+ if (list_empty(&bg->bg_list)) {
+ btrfs_get_block_group(bg);
+ trace_btrfs_add_unused_block_group(bg);
+ list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
+ } else if (!test_bit(BLOCK_GROUP_FLAG_NEW, &bg->runtime_flags)) {
+ /* Pull out the block group from the reclaim_bgs list. */
+ trace_btrfs_add_unused_block_group(bg);
+ list_move_tail(&bg->bg_list, &fs_info->unused_bgs);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+}
+
+/*
+ * We want block groups with a low number of used bytes to be in the beginning
+ * of the list, so they will get reclaimed first.
+ */
+static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
+ const struct list_head *b)
+{
+ const struct btrfs_block_group *bg1, *bg2;
+
+ bg1 = list_entry(a, struct btrfs_block_group, bg_list);
+ bg2 = list_entry(b, struct btrfs_block_group, bg_list);
+
+ return bg1->used > bg2->used;
+}
+
+static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
+{
+ if (btrfs_is_zoned(fs_info))
+ return btrfs_zoned_should_reclaim(fs_info);
+ return true;
+}
+
+void btrfs_reclaim_bgs_work(struct work_struct *work)
+{
+ struct btrfs_fs_info *fs_info =
+ container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
+ struct btrfs_block_group *bg;
+ struct btrfs_space_info *space_info;
+
+ if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
+ return;
+
+ if (btrfs_fs_closing(fs_info))
+ return;
+
+ if (!btrfs_should_reclaim(fs_info))
+ return;
+
+ sb_start_write(fs_info->sb);
+
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
+ sb_end_write(fs_info->sb);
+ return;
+ }
+
+ /*
+ * Long running balances can keep us blocked here for eternity, so
+ * simply skip reclaim if we're unable to get the mutex.
+ */
+ if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
+ btrfs_exclop_finish(fs_info);
+ sb_end_write(fs_info->sb);
+ return;
+ }
+
+ spin_lock(&fs_info->unused_bgs_lock);
+ /*
+ * Sort happens under lock because we can't simply splice it and sort.
+ * The block groups might still be in use and reachable via bg_list,
+ * and their presence in the reclaim_bgs list must be preserved.
+ */
+ list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
+ while (!list_empty(&fs_info->reclaim_bgs)) {
+ u64 zone_unusable;
+ int ret = 0;
+
+ bg = list_first_entry(&fs_info->reclaim_bgs,
+ struct btrfs_block_group,
+ bg_list);
+ list_del_init(&bg->bg_list);
+
+ space_info = bg->space_info;
+ spin_unlock(&fs_info->unused_bgs_lock);
+
+ /* Don't race with allocators so take the groups_sem */
+ down_write(&space_info->groups_sem);
+
+ spin_lock(&bg->lock);
+ if (bg->reserved || bg->pinned || bg->ro) {
+ /*
+ * We want to bail if we made new allocations or have
+ * outstanding allocations in this block group. We do
+ * the ro check in case balance is currently acting on
+ * this block group.
+ */
+ spin_unlock(&bg->lock);
+ up_write(&space_info->groups_sem);
+ goto next;
+ }
+ spin_unlock(&bg->lock);
+
+ /*
+ * Get out fast, in case we're read-only or unmounting the
+ * filesystem. It is OK to drop block groups from the list even
+ * for the read-only case. As we did sb_start_write(),
+ * "mount -o remount,ro" won't happen and read-only filesystem
+ * means it is forced read-only due to a fatal error. So, it
+ * never gets back to read-write to let us reclaim again.
+ */
+ if (btrfs_need_cleaner_sleep(fs_info)) {
+ up_write(&space_info->groups_sem);
+ goto next;
+ }
+
+ /*
+ * Cache the zone_unusable value before turning the block group
+ * to read only. As soon as the blog group is read only it's
+ * zone_unusable value gets moved to the block group's read-only
+ * bytes and isn't available for calculations anymore.
+ */
+ zone_unusable = bg->zone_unusable;
+ ret = inc_block_group_ro(bg, 0);
+ up_write(&space_info->groups_sem);
+ if (ret < 0)
+ goto next;
+
+ btrfs_info(fs_info,
+ "reclaiming chunk %llu with %llu%% used %llu%% unusable",
+ bg->start,
+ div64_u64(bg->used * 100, bg->length),
+ div64_u64(zone_unusable * 100, bg->length));
+ trace_btrfs_reclaim_block_group(bg);
+ ret = btrfs_relocate_chunk(fs_info, bg->start);
+ if (ret) {
+ btrfs_dec_block_group_ro(bg);
+ btrfs_err(fs_info, "error relocating chunk %llu",
+ bg->start);
+ }
+
+next:
+ if (ret)
+ btrfs_mark_bg_to_reclaim(bg);
+ btrfs_put_block_group(bg);
+
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ /*
+ * Reclaiming all the block groups in the list can take really
+ * long. Prioritize cleaning up unused block groups.
+ */
+ btrfs_delete_unused_bgs(fs_info);
+ /*
+ * If we are interrupted by a balance, we can just bail out. The
+ * cleaner thread restart again if necessary.
+ */
+ if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
+ goto end;
+ spin_lock(&fs_info->unused_bgs_lock);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+end:
+ btrfs_exclop_finish(fs_info);
+ sb_end_write(fs_info->sb);
+}
+
+void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
+{
+ spin_lock(&fs_info->unused_bgs_lock);
+ if (!list_empty(&fs_info->reclaim_bgs))
+ queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
+ spin_unlock(&fs_info->unused_bgs_lock);
+}
+
+void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ spin_lock(&fs_info->unused_bgs_lock);
+ if (list_empty(&bg->bg_list)) {
+ btrfs_get_block_group(bg);
+ trace_btrfs_add_reclaim_block_group(bg);
+ list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+}
+
+static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
+ struct btrfs_path *path)
+{
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct btrfs_block_group_item bg;
+ struct extent_buffer *leaf;
+ int slot;
+ u64 flags;
+ int ret = 0;
+
+ slot = path->slots[0];
+ leaf = path->nodes[0];
+
+ em_tree = &fs_info->mapping_tree;
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, key->objectid, key->offset);
+ read_unlock(&em_tree->lock);
+ if (!em) {
+ btrfs_err(fs_info,
+ "logical %llu len %llu found bg but no related chunk",
+ key->objectid, key->offset);
+ return -ENOENT;
+ }
+
+ if (em->start != key->objectid || em->len != key->offset) {
+ btrfs_err(fs_info,
+ "block group %llu len %llu mismatch with chunk %llu len %llu",
+ key->objectid, key->offset, em->start, em->len);
+ ret = -EUCLEAN;
+ goto out_free_em;
+ }
+
+ read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
+ sizeof(bg));
+ flags = btrfs_stack_block_group_flags(&bg) &
+ BTRFS_BLOCK_GROUP_TYPE_MASK;
+
+ if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ btrfs_err(fs_info,
+"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
+ key->objectid, key->offset, flags,
+ (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type));
+ ret = -EUCLEAN;
+ }
+
+out_free_em:
+ free_extent_map(em);
+ return ret;
+}
+
+static int find_first_block_group(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ struct btrfs_key *key)
+{
+ struct btrfs_root *root = btrfs_block_group_root(fs_info);
+ int ret;
+ struct btrfs_key found_key;
+
+ btrfs_for_each_slot(root, key, &found_key, path, ret) {
+ if (found_key.objectid >= key->objectid &&
+ found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
+ return read_bg_from_eb(fs_info, &found_key, path);
+ }
+ }
+ return ret;
+}
+
+static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+ u64 extra_flags = chunk_to_extended(flags) &
+ BTRFS_EXTENDED_PROFILE_MASK;
+
+ write_seqlock(&fs_info->profiles_lock);
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ fs_info->avail_data_alloc_bits |= extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ fs_info->avail_metadata_alloc_bits |= extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ fs_info->avail_system_alloc_bits |= extra_flags;
+ write_sequnlock(&fs_info->profiles_lock);
+}
+
+/**
+ * Map a physical disk address to a list of logical addresses
+ *
+ * @fs_info: the filesystem
+ * @chunk_start: logical address of block group
+ * @bdev: physical device to resolve, can be NULL to indicate any device
+ * @physical: physical address to map to logical addresses
+ * @logical: return array of logical addresses which map to @physical
+ * @naddrs: length of @logical
+ * @stripe_len: size of IO stripe for the given block group
+ *
+ * Maps a particular @physical disk address to a list of @logical addresses.
+ * Used primarily to exclude those portions of a block group that contain super
+ * block copies.
+ */
+int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
+ struct block_device *bdev, u64 physical, u64 **logical,
+ int *naddrs, int *stripe_len)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ u64 *buf;
+ u64 bytenr;
+ u64 data_stripe_length;
+ u64 io_stripe_size;
+ int i, nr = 0;
+ int ret = 0;
+
+ em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
+ if (IS_ERR(em))
+ return -EIO;
+
+ map = em->map_lookup;
+ data_stripe_length = em->orig_block_len;
+ io_stripe_size = map->stripe_len;
+ chunk_start = em->start;
+
+ /* For RAID5/6 adjust to a full IO stripe length */
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+ io_stripe_size = map->stripe_len * nr_data_stripes(map);
+
+ buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
+ if (!buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < map->num_stripes; i++) {
+ bool already_inserted = false;
+ u64 stripe_nr;
+ u64 offset;
+ int j;
+
+ if (!in_range(physical, map->stripes[i].physical,
+ data_stripe_length))
+ continue;
+
+ if (bdev && map->stripes[i].dev->bdev != bdev)
+ continue;
+
+ stripe_nr = physical - map->stripes[i].physical;
+ stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset);
+
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10)) {
+ stripe_nr = stripe_nr * map->num_stripes + i;
+ stripe_nr = div_u64(stripe_nr, map->sub_stripes);
+ }
+ /*
+ * The remaining case would be for RAID56, multiply by
+ * nr_data_stripes(). Alternatively, just use rmap_len below
+ * instead of map->stripe_len
+ */
+
+ bytenr = chunk_start + stripe_nr * io_stripe_size + offset;
+
+ /* Ensure we don't add duplicate addresses */
+ for (j = 0; j < nr; j++) {
+ if (buf[j] == bytenr) {
+ already_inserted = true;
+ break;
+ }
+ }
+
+ if (!already_inserted)
+ buf[nr++] = bytenr;
+ }
+
+ *logical = buf;
+ *naddrs = nr;
+ *stripe_len = io_stripe_size;
+out:
+ free_extent_map(em);
+ return ret;
+}
+
+static int exclude_super_stripes(struct btrfs_block_group *cache)
+{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ const bool zoned = btrfs_is_zoned(fs_info);
+ u64 bytenr;
+ u64 *logical;
+ int stripe_len;
+ int i, nr, ret;
+
+ if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
+ stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
+ cache->bytes_super += stripe_len;
+ ret = btrfs_add_excluded_extent(fs_info, cache->start,
+ stripe_len);
+ if (ret)
+ return ret;
+ }
+
+ for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+ bytenr = btrfs_sb_offset(i);
+ ret = btrfs_rmap_block(fs_info, cache->start, NULL,
+ bytenr, &logical, &nr, &stripe_len);
+ if (ret)
+ return ret;
+
+ /* Shouldn't have super stripes in sequential zones */
+ if (zoned && nr) {
+ kfree(logical);
+ btrfs_err(fs_info,
+ "zoned: block group %llu must not contain super block",
+ cache->start);
+ return -EUCLEAN;
+ }
+
+ while (nr--) {
+ u64 len = min_t(u64, stripe_len,
+ cache->start + cache->length - logical[nr]);
+
+ cache->bytes_super += len;
+ ret = btrfs_add_excluded_extent(fs_info, logical[nr],
+ len);
+ if (ret) {
+ kfree(logical);
+ return ret;
+ }
+ }
+
+ kfree(logical);
+ }
+ return 0;
+}
+
+static struct btrfs_block_group *btrfs_create_block_group_cache(
+ struct btrfs_fs_info *fs_info, u64 start)
+{
+ struct btrfs_block_group *cache;
+
+ cache = kzalloc(sizeof(*cache), GFP_NOFS);
+ if (!cache)
+ return NULL;
+
+ cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
+ GFP_NOFS);
+ if (!cache->free_space_ctl) {
+ kfree(cache);
+ return NULL;
+ }
+
+ cache->start = start;
+
+ cache->fs_info = fs_info;
+ cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
+
+ cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
+
+ refcount_set(&cache->refs, 1);
+ spin_lock_init(&cache->lock);
+ init_rwsem(&cache->data_rwsem);
+ INIT_LIST_HEAD(&cache->list);
+ INIT_LIST_HEAD(&cache->cluster_list);
+ INIT_LIST_HEAD(&cache->bg_list);
+ INIT_LIST_HEAD(&cache->ro_list);
+ INIT_LIST_HEAD(&cache->discard_list);
+ INIT_LIST_HEAD(&cache->dirty_list);
+ INIT_LIST_HEAD(&cache->io_list);
+ INIT_LIST_HEAD(&cache->active_bg_list);
+ btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
+ atomic_set(&cache->frozen, 0);
+ mutex_init(&cache->free_space_lock);
+ cache->full_stripe_locks_root.root = RB_ROOT;
+ mutex_init(&cache->full_stripe_locks_root.lock);
+
+ return cache;
+}
+
+/*
+ * Iterate all chunks and verify that each of them has the corresponding block
+ * group
+ */
+static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
+{
+ struct extent_map_tree *map_tree = &fs_info->mapping_tree;
+ struct extent_map *em;
+ struct btrfs_block_group *bg;
+ u64 start = 0;
+ int ret = 0;
+
+ while (1) {
+ read_lock(&map_tree->lock);
+ /*
+ * lookup_extent_mapping will return the first extent map
+ * intersecting the range, so setting @len to 1 is enough to
+ * get the first chunk.
+ */
+ em = lookup_extent_mapping(map_tree, start, 1);
+ read_unlock(&map_tree->lock);
+ if (!em)
+ break;
+
+ bg = btrfs_lookup_block_group(fs_info, em->start);
+ if (!bg) {
+ btrfs_err(fs_info,
+ "chunk start=%llu len=%llu doesn't have corresponding block group",
+ em->start, em->len);
+ ret = -EUCLEAN;
+ free_extent_map(em);
+ break;
+ }
+ if (bg->start != em->start || bg->length != em->len ||
+ (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
+ (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ btrfs_err(fs_info,
+"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
+ em->start, em->len,
+ em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
+ bg->start, bg->length,
+ bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
+ ret = -EUCLEAN;
+ free_extent_map(em);
+ btrfs_put_block_group(bg);
+ break;
+ }
+ start = em->start + em->len;
+ free_extent_map(em);
+ btrfs_put_block_group(bg);
+ }
+ return ret;
+}
+
+static int read_one_block_group(struct btrfs_fs_info *info,
+ struct btrfs_block_group_item *bgi,
+ const struct btrfs_key *key,
+ int need_clear)
+{
+ struct btrfs_block_group *cache;
+ const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
+ int ret;
+
+ ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY);
+
+ cache = btrfs_create_block_group_cache(info, key->objectid);
+ if (!cache)
+ return -ENOMEM;
+
+ cache->length = key->offset;
+ cache->used = btrfs_stack_block_group_used(bgi);
+ cache->flags = btrfs_stack_block_group_flags(bgi);
+ cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);
+
+ set_free_space_tree_thresholds(cache);
+
+ if (need_clear) {
+ /*
+ * When we mount with old space cache, we need to
+ * set BTRFS_DC_CLEAR and set dirty flag.
+ *
+ * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
+ * truncate the old free space cache inode and
+ * setup a new one.
+ * b) Setting 'dirty flag' makes sure that we flush
+ * the new space cache info onto disk.
+ */
+ if (btrfs_test_opt(info, SPACE_CACHE))
+ cache->disk_cache_state = BTRFS_DC_CLEAR;
+ }
+ if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
+ (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
+ btrfs_err(info,
+"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
+ cache->start);
+ ret = -EINVAL;
+ goto error;
+ }
+
+ ret = btrfs_load_block_group_zone_info(cache, false);
+ if (ret) {
+ btrfs_err(info, "zoned: failed to load zone info of bg %llu",
+ cache->start);
+ goto error;
+ }
+
+ /*
+ * We need to exclude the super stripes now so that the space info has
+ * super bytes accounted for, otherwise we'll think we have more space
+ * than we actually do.
+ */
+ ret = exclude_super_stripes(cache);
+ if (ret) {
+ /* We may have excluded something, so call this just in case. */
+ btrfs_free_excluded_extents(cache);
+ goto error;
+ }
+
+ /*
+ * For zoned filesystem, space after the allocation offset is the only
+ * free space for a block group. So, we don't need any caching work.
+ * btrfs_calc_zone_unusable() will set the amount of free space and
+ * zone_unusable space.
+ *
+ * For regular filesystem, check for two cases, either we are full, and
+ * therefore don't need to bother with the caching work since we won't
+ * find any space, or we are empty, and we can just add all the space
+ * in and be done with it. This saves us _a_lot_ of time, particularly
+ * in the full case.
+ */
+ if (btrfs_is_zoned(info)) {
+ btrfs_calc_zone_unusable(cache);
+ /* Should not have any excluded extents. Just in case, though. */
+ btrfs_free_excluded_extents(cache);
+ } else if (cache->length == cache->used) {
+ cache->cached = BTRFS_CACHE_FINISHED;
+ btrfs_free_excluded_extents(cache);
+ } else if (cache->used == 0) {
+ cache->cached = BTRFS_CACHE_FINISHED;
+ ret = add_new_free_space(cache, cache->start,
+ cache->start + cache->length, NULL);
+ btrfs_free_excluded_extents(cache);
+ if (ret)
+ goto error;
+ }
+
+ ret = btrfs_add_block_group_cache(info, cache);
+ if (ret) {
+ btrfs_remove_free_space_cache(cache);
+ goto error;
+ }
+ trace_btrfs_add_block_group(info, cache, 0);
+ btrfs_add_bg_to_space_info(info, cache);
+
+ set_avail_alloc_bits(info, cache->flags);
+ if (btrfs_chunk_writeable(info, cache->start)) {
+ if (cache->used == 0) {
+ ASSERT(list_empty(&cache->bg_list));
+ if (btrfs_test_opt(info, DISCARD_ASYNC))
+ btrfs_discard_queue_work(&info->discard_ctl, cache);
+ else
+ btrfs_mark_bg_unused(cache);
+ }
+ } else {
+ inc_block_group_ro(cache, 1);
+ }
+
+ return 0;
+error:
+ btrfs_put_block_group(cache);
+ return ret;
+}
+
+static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
+{
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree;
+ struct rb_node *node;
+ int ret = 0;
+
+ for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct btrfs_block_group *bg;
+
+ em = rb_entry(node, struct extent_map, rb_node);
+ map = em->map_lookup;
+ bg = btrfs_create_block_group_cache(fs_info, em->start);
+ if (!bg) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ /* Fill dummy cache as FULL */
+ bg->length = em->len;
+ bg->flags = map->type;
+ bg->cached = BTRFS_CACHE_FINISHED;
+ bg->used = em->len;
+ bg->flags = map->type;
+ ret = btrfs_add_block_group_cache(fs_info, bg);
+ /*
+ * We may have some valid block group cache added already, in
+ * that case we skip to the next one.
+ */
+ if (ret == -EEXIST) {
+ ret = 0;
+ btrfs_put_block_group(bg);
+ continue;
+ }
+
+ if (ret) {
+ btrfs_remove_free_space_cache(bg);
+ btrfs_put_block_group(bg);
+ break;
+ }
+
+ btrfs_add_bg_to_space_info(fs_info, bg);
+
+ set_avail_alloc_bits(fs_info, bg->flags);
+ }
+ if (!ret)
+ btrfs_init_global_block_rsv(fs_info);
+ return ret;
+}
+
+int btrfs_read_block_groups(struct btrfs_fs_info *info)
+{
+ struct btrfs_root *root = btrfs_block_group_root(info);
+ struct btrfs_path *path;
+ int ret;
+ struct btrfs_block_group *cache;
+ struct btrfs_space_info *space_info;
+ struct btrfs_key key;
+ int need_clear = 0;
+ u64 cache_gen;
+
+ /*
+ * Either no extent root (with ibadroots rescue option) or we have
+ * unsupported RO options. The fs can never be mounted read-write, so no
+ * need to waste time searching block group items.
+ *
+ * This also allows new extent tree related changes to be RO compat,
+ * no need for a full incompat flag.
+ */
+ if (!root || (btrfs_super_compat_ro_flags(info->super_copy) &
+ ~BTRFS_FEATURE_COMPAT_RO_SUPP))
+ return fill_dummy_bgs(info);
+
+ key.objectid = 0;
+ key.offset = 0;
+ key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ cache_gen = btrfs_super_cache_generation(info->super_copy);
+ if (btrfs_test_opt(info, SPACE_CACHE) &&
+ btrfs_super_generation(info->super_copy) != cache_gen)
+ need_clear = 1;
+ if (btrfs_test_opt(info, CLEAR_CACHE))
+ need_clear = 1;
+
+ while (1) {
+ struct btrfs_block_group_item bgi;
+ struct extent_buffer *leaf;
+ int slot;
+
+ ret = find_first_block_group(info, path, &key);
+ if (ret > 0)
+ break;
+ if (ret != 0)
+ goto error;
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
+ sizeof(bgi));
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ btrfs_release_path(path);
+ ret = read_one_block_group(info, &bgi, &key, need_clear);
+ if (ret < 0)
+ goto error;
+ key.objectid += key.offset;
+ key.offset = 0;
+ }
+ btrfs_release_path(path);
+
+ list_for_each_entry(space_info, &info->space_info, list) {
+ int i;
+
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+ if (list_empty(&space_info->block_groups[i]))
+ continue;
+ cache = list_first_entry(&space_info->block_groups[i],
+ struct btrfs_block_group,
+ list);
+ btrfs_sysfs_add_block_group_type(cache);
+ }
+
+ if (!(btrfs_get_alloc_profile(info, space_info->flags) &
+ (BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID1_MASK |
+ BTRFS_BLOCK_GROUP_RAID56_MASK |
+ BTRFS_BLOCK_GROUP_DUP)))
+ continue;
+ /*
+ * Avoid allocating from un-mirrored block group if there are
+ * mirrored block groups.
+ */
+ list_for_each_entry(cache,
+ &space_info->block_groups[BTRFS_RAID_RAID0],
+ list)
+ inc_block_group_ro(cache, 1);
+ list_for_each_entry(cache,
+ &space_info->block_groups[BTRFS_RAID_SINGLE],
+ list)
+ inc_block_group_ro(cache, 1);
+ }
+
+ btrfs_init_global_block_rsv(info);
+ ret = check_chunk_block_group_mappings(info);
+error:
+ btrfs_free_path(path);
+ /*
+ * We've hit some error while reading the extent tree, and have
+ * rescue=ibadroots mount option.
+ * Try to fill the tree using dummy block groups so that the user can
+ * continue to mount and grab their data.
+ */
+ if (ret && btrfs_test_opt(info, IGNOREBADROOTS))
+ ret = fill_dummy_bgs(info);
+ return ret;
+}
+
+/*
+ * This function, insert_block_group_item(), belongs to the phase 2 of chunk
+ * allocation.
+ *
+ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
+ * phases.
+ */
+static int insert_block_group_item(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_block_group_item bgi;
+ struct btrfs_root *root = btrfs_block_group_root(fs_info);
+ struct btrfs_key key;
+
+ spin_lock(&block_group->lock);
+ btrfs_set_stack_block_group_used(&bgi, block_group->used);
+ btrfs_set_stack_block_group_chunk_objectid(&bgi,
+ block_group->global_root_id);
+ btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
+ key.objectid = block_group->start;
+ key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ key.offset = block_group->length;
+ spin_unlock(&block_group->lock);
+
+ return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
+}
+
+static int insert_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 chunk_offset,
+ u64 start, u64 num_bytes)
+{
+ struct btrfs_fs_info *fs_info = device->fs_info;
+ struct btrfs_root *root = fs_info->dev_root;
+ struct btrfs_path *path;
+ struct btrfs_dev_extent *extent;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ int ret;
+
+ WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
+ WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = device->devid;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+ key.offset = start;
+ ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent));
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
+ btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID);
+ btrfs_set_dev_extent_chunk_objectid(leaf, extent,
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID);
+ btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
+
+ btrfs_set_dev_extent_length(leaf, extent, num_bytes);
+ btrfs_mark_buffer_dirty(leaf);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * This function belongs to phase 2.
+ *
+ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
+ * phases.
+ */
+static int insert_dev_extents(struct btrfs_trans_handle *trans,
+ u64 chunk_offset, u64 chunk_size)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_device *device;
+ struct extent_map *em;
+ struct map_lookup *map;
+ u64 dev_offset;
+ u64 stripe_size;
+ int i;
+ int ret = 0;
+
+ em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+
+ map = em->map_lookup;
+ stripe_size = em->orig_block_len;
+
+ /*
+ * Take the device list mutex to prevent races with the final phase of
+ * a device replace operation that replaces the device object associated
+ * with the map's stripes, because the device object's id can change
+ * at any time during that final phase of the device replace operation
+ * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
+ * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
+ * resulting in persisting a device extent item with such ID.
+ */
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ for (i = 0; i < map->num_stripes; i++) {
+ device = map->stripes[i].dev;
+ dev_offset = map->stripes[i].physical;
+
+ ret = insert_dev_extent(trans, device, chunk_offset, dev_offset,
+ stripe_size);
+ if (ret)
+ break;
+ }
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+ free_extent_map(em);
+ return ret;
+}
+
+/*
+ * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of
+ * chunk allocation.
+ *
+ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
+ * phases.
+ */
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_block_group *block_group;
+ int ret = 0;
+
+ while (!list_empty(&trans->new_bgs)) {
+ int index;
+
+ block_group = list_first_entry(&trans->new_bgs,
+ struct btrfs_block_group,
+ bg_list);
+ if (ret)
+ goto next;
+
+ index = btrfs_bg_flags_to_raid_index(block_group->flags);
+
+ ret = insert_block_group_item(trans, block_group);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ if (!test_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
+ &block_group->runtime_flags)) {
+ mutex_lock(&fs_info->chunk_mutex);
+ ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
+ mutex_unlock(&fs_info->chunk_mutex);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ }
+ ret = insert_dev_extents(trans, block_group->start,
+ block_group->length);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ add_block_group_free_space(trans, block_group);
+
+ /*
+ * If we restriped during balance, we may have added a new raid
+ * type, so now add the sysfs entries when it is safe to do so.
+ * We don't have to worry about locking here as it's handled in
+ * btrfs_sysfs_add_block_group_type.
+ */
+ if (block_group->space_info->block_group_kobjs[index] == NULL)
+ btrfs_sysfs_add_block_group_type(block_group);
+
+ /* Already aborted the transaction if it failed. */
+next:
+ btrfs_delayed_refs_rsv_release(fs_info, 1);
+ list_del_init(&block_group->bg_list);
+ clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
+ }
+ btrfs_trans_release_chunk_metadata(trans);
+}
+
+/*
+ * For extent tree v2 we use the block_group_item->chunk_offset to point at our
+ * global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID.
+ */
+static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset)
+{
+ u64 div = SZ_1G;
+ u64 index;
+
+ if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ return BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+
+ /* If we have a smaller fs index based on 128MiB. */
+ if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL))
+ div = SZ_128M;
+
+ offset = div64_u64(offset, div);
+ div64_u64_rem(offset, fs_info->nr_global_roots, &index);
+ return index;
+}
+
+struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
+ u64 bytes_used, u64 type,
+ u64 chunk_offset, u64 size)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_block_group *cache;
+ int ret;
+
+ btrfs_set_log_full_commit(trans);
+
+ cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
+ if (!cache)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Mark it as new before adding it to the rbtree of block groups or any
+ * list, so that no other task finds it and calls btrfs_mark_bg_unused()
+ * before the new flag is set.
+ */
+ set_bit(BLOCK_GROUP_FLAG_NEW, &cache->runtime_flags);
+
+ cache->length = size;
+ set_free_space_tree_thresholds(cache);
+ cache->used = bytes_used;
+ cache->flags = type;
+ cache->cached = BTRFS_CACHE_FINISHED;
+ cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
+
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags);
+
+ ret = btrfs_load_block_group_zone_info(cache, true);
+ if (ret) {
+ btrfs_put_block_group(cache);
+ return ERR_PTR(ret);
+ }
+
+ ret = exclude_super_stripes(cache);
+ if (ret) {
+ /* We may have excluded something, so call this just in case */
+ btrfs_free_excluded_extents(cache);
+ btrfs_put_block_group(cache);
+ return ERR_PTR(ret);
+ }
+
+ ret = add_new_free_space(cache, chunk_offset, chunk_offset + size, NULL);
+ btrfs_free_excluded_extents(cache);
+ if (ret) {
+ btrfs_put_block_group(cache);
+ return ERR_PTR(ret);
+ }
+
+ /*
+ * Ensure the corresponding space_info object is created and
+ * assigned to our block group. We want our bg to be added to the rbtree
+ * with its ->space_info set.
+ */
+ cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
+ ASSERT(cache->space_info);
+
+ ret = btrfs_add_block_group_cache(fs_info, cache);
+ if (ret) {
+ btrfs_remove_free_space_cache(cache);
+ btrfs_put_block_group(cache);
+ return ERR_PTR(ret);
+ }
+
+ /*
+ * Now that our block group has its ->space_info set and is inserted in
+ * the rbtree, update the space info's counters.
+ */
+ trace_btrfs_add_block_group(fs_info, cache, 1);
+ btrfs_add_bg_to_space_info(fs_info, cache);
+ btrfs_update_global_block_rsv(fs_info);
+
+#ifdef CONFIG_BTRFS_DEBUG
+ if (btrfs_should_fragment_free_space(cache)) {
+ u64 new_bytes_used = size - bytes_used;
+
+ cache->space_info->bytes_used += new_bytes_used >> 1;
+ fragment_free_space(cache);
+ }
+#endif
+
+ list_add_tail(&cache->bg_list, &trans->new_bgs);
+ trans->delayed_ref_updates++;
+ btrfs_update_delayed_refs_rsv(trans);
+
+ set_avail_alloc_bits(fs_info, type);
+ return cache;
+}
+
+/*
+ * Mark one block group RO, can be called several times for the same block
+ * group.
+ *
+ * @cache: the destination block group
+ * @do_chunk_alloc: whether need to do chunk pre-allocation, this is to
+ * ensure we still have some free space after marking this
+ * block group RO.
+ */
+int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
+ bool do_chunk_alloc)
+{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = btrfs_block_group_root(fs_info);
+ u64 alloc_flags;
+ int ret;
+ bool dirty_bg_running;
+
+ /*
+ * This can only happen when we are doing read-only scrub on read-only
+ * mount.
+ * In that case we should not start a new transaction on read-only fs.
+ * Thus here we skip all chunk allocations.
+ */
+ if (sb_rdonly(fs_info->sb)) {
+ mutex_lock(&fs_info->ro_block_group_mutex);
+ ret = inc_block_group_ro(cache, 0);
+ mutex_unlock(&fs_info->ro_block_group_mutex);
+ return ret;
+ }
+
+ do {
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ dirty_bg_running = false;
+
+ /*
+ * We're not allowed to set block groups readonly after the dirty
+ * block group cache has started writing. If it already started,
+ * back off and let this transaction commit.
+ */
+ mutex_lock(&fs_info->ro_block_group_mutex);
+ if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
+ u64 transid = trans->transid;
+
+ mutex_unlock(&fs_info->ro_block_group_mutex);
+ btrfs_end_transaction(trans);
+
+ ret = btrfs_wait_for_commit(fs_info, transid);
+ if (ret)
+ return ret;
+ dirty_bg_running = true;
+ }
+ } while (dirty_bg_running);
+
+ if (do_chunk_alloc) {
+ /*
+ * If we are changing raid levels, try to allocate a
+ * corresponding block group with the new raid level.
+ */
+ alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
+ if (alloc_flags != cache->flags) {
+ ret = btrfs_chunk_alloc(trans, alloc_flags,
+ CHUNK_ALLOC_FORCE);
+ /*
+ * ENOSPC is allowed here, we may have enough space
+ * already allocated at the new raid level to carry on
+ */
+ if (ret == -ENOSPC)
+ ret = 0;
+ if (ret < 0)
+ goto out;
+ }
+ }
+
+ ret = inc_block_group_ro(cache, 0);
+ if (!ret)
+ goto out;
+ if (ret == -ETXTBSY)
+ goto unlock_out;
+
+ /*
+ * Skip chunk alloction if the bg is SYSTEM, this is to avoid system
+ * chunk allocation storm to exhaust the system chunk array. Otherwise
+ * we still want to try our best to mark the block group read-only.
+ */
+ if (!do_chunk_alloc && ret == -ENOSPC &&
+ (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM))
+ goto unlock_out;
+
+ alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags);
+ ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
+ if (ret < 0)
+ goto out;
+ /*
+ * We have allocated a new chunk. We also need to activate that chunk to
+ * grant metadata tickets for zoned filesystem.
+ */
+ ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true);
+ if (ret < 0)
+ goto out;
+
+ ret = inc_block_group_ro(cache, 0);
+ if (ret == -ETXTBSY)
+ goto unlock_out;
+out:
+ if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
+ alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
+ mutex_lock(&fs_info->chunk_mutex);
+ check_system_chunk(trans, alloc_flags);
+ mutex_unlock(&fs_info->chunk_mutex);
+ }
+unlock_out:
+ mutex_unlock(&fs_info->ro_block_group_mutex);
+
+ btrfs_end_transaction(trans);
+ return ret;
+}
+
+void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
+{
+ struct btrfs_space_info *sinfo = cache->space_info;
+ u64 num_bytes;
+
+ BUG_ON(!cache->ro);
+
+ spin_lock(&sinfo->lock);
+ spin_lock(&cache->lock);
+ if (!--cache->ro) {
+ if (btrfs_is_zoned(cache->fs_info)) {
+ /* Migrate zone_unusable bytes back */
+ cache->zone_unusable =
+ (cache->alloc_offset - cache->used) +
+ (cache->length - cache->zone_capacity);
+ sinfo->bytes_zone_unusable += cache->zone_unusable;
+ sinfo->bytes_readonly -= cache->zone_unusable;
+ }
+ num_bytes = cache->length - cache->reserved -
+ cache->pinned - cache->bytes_super -
+ cache->zone_unusable - cache->used;
+ sinfo->bytes_readonly -= num_bytes;
+ list_del_init(&cache->ro_list);
+ }
+ spin_unlock(&cache->lock);
+ spin_unlock(&sinfo->lock);
+}
+
+static int update_block_group_item(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_block_group *cache)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ int ret;
+ struct btrfs_root *root = btrfs_block_group_root(fs_info);
+ unsigned long bi;
+ struct extent_buffer *leaf;
+ struct btrfs_block_group_item bgi;
+ struct btrfs_key key;
+
+ key.objectid = cache->start;
+ key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ key.offset = cache->length;
+
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+ goto fail;
+ }
+
+ leaf = path->nodes[0];
+ bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ btrfs_set_stack_block_group_used(&bgi, cache->used);
+ btrfs_set_stack_block_group_chunk_objectid(&bgi,
+ cache->global_root_id);
+ btrfs_set_stack_block_group_flags(&bgi, cache->flags);
+ write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
+ btrfs_mark_buffer_dirty(leaf);
+fail:
+ btrfs_release_path(path);
+ return ret;
+
+}
+
+static int cache_save_setup(struct btrfs_block_group *block_group,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_root *root = fs_info->tree_root;
+ struct inode *inode = NULL;
+ struct extent_changeset *data_reserved = NULL;
+ u64 alloc_hint = 0;
+ int dcs = BTRFS_DC_ERROR;
+ u64 cache_size = 0;
+ int retries = 0;
+ int ret = 0;
+
+ if (!btrfs_test_opt(fs_info, SPACE_CACHE))
+ return 0;
+
+ /*
+ * If this block group is smaller than 100 megs don't bother caching the
+ * block group.
+ */
+ if (block_group->length < (100 * SZ_1M)) {
+ spin_lock(&block_group->lock);
+ block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
+
+ if (TRANS_ABORTED(trans))
+ return 0;
+again:
+ inode = lookup_free_space_inode(block_group, path);
+ if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
+ ret = PTR_ERR(inode);
+ btrfs_release_path(path);
+ goto out;
+ }
+
+ if (IS_ERR(inode)) {
+ BUG_ON(retries);
+ retries++;
+
+ if (block_group->ro)
+ goto out_free;
+
+ ret = create_free_space_inode(trans, block_group, path);
+ if (ret)
+ goto out_free;
+ goto again;
+ }
+
+ /*
+ * We want to set the generation to 0, that way if anything goes wrong
+ * from here on out we know not to trust this cache when we load up next
+ * time.
+ */
+ BTRFS_I(inode)->generation = 0;
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ if (ret) {
+ /*
+ * So theoretically we could recover from this, simply set the
+ * super cache generation to 0 so we know to invalidate the
+ * cache, but then we'd have to keep track of the block groups
+ * that fail this way so we know we _have_ to reset this cache
+ * before the next commit or risk reading stale cache. So to
+ * limit our exposure to horrible edge cases lets just abort the
+ * transaction, this only happens in really bad situations
+ * anyway.
+ */
+ btrfs_abort_transaction(trans, ret);
+ goto out_put;
+ }
+ WARN_ON(ret);
+
+ /* We've already setup this transaction, go ahead and exit */
+ if (block_group->cache_generation == trans->transid &&
+ i_size_read(inode)) {
+ dcs = BTRFS_DC_SETUP;
+ goto out_put;
+ }
+
+ if (i_size_read(inode) > 0) {
+ ret = btrfs_check_trunc_cache_free_space(fs_info,
+ &fs_info->global_block_rsv);
+ if (ret)
+ goto out_put;
+
+ ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
+ if (ret)
+ goto out_put;
+ }
+
+ spin_lock(&block_group->lock);
+ if (block_group->cached != BTRFS_CACHE_FINISHED ||
+ !btrfs_test_opt(fs_info, SPACE_CACHE)) {
+ /*
+ * don't bother trying to write stuff out _if_
+ * a) we're not cached,
+ * b) we're with nospace_cache mount option,
+ * c) we're with v2 space_cache (FREE_SPACE_TREE).
+ */
+ dcs = BTRFS_DC_WRITTEN;
+ spin_unlock(&block_group->lock);
+ goto out_put;
+ }
+ spin_unlock(&block_group->lock);
+
+ /*
+ * We hit an ENOSPC when setting up the cache in this transaction, just
+ * skip doing the setup, we've already cleared the cache so we're safe.
+ */
+ if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
+ ret = -ENOSPC;
+ goto out_put;
+ }
+
+ /*
+ * Try to preallocate enough space based on how big the block group is.
+ * Keep in mind this has to include any pinned space which could end up
+ * taking up quite a bit since it's not folded into the other space
+ * cache.
+ */
+ cache_size = div_u64(block_group->length, SZ_256M);
+ if (!cache_size)
+ cache_size = 1;
+
+ cache_size *= 16;
+ cache_size *= fs_info->sectorsize;
+
+ ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
+ cache_size, false);
+ if (ret)
+ goto out_put;
+
+ ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size,
+ cache_size, cache_size,
+ &alloc_hint);
+ /*
+ * Our cache requires contiguous chunks so that we don't modify a bunch
+ * of metadata or split extents when writing the cache out, which means
+ * we can enospc if we are heavily fragmented in addition to just normal
+ * out of space conditions. So if we hit this just skip setting up any
+ * other block groups for this transaction, maybe we'll unpin enough
+ * space the next time around.
+ */
+ if (!ret)
+ dcs = BTRFS_DC_SETUP;
+ else if (ret == -ENOSPC)
+ set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
+
+out_put:
+ iput(inode);
+out_free:
+ btrfs_release_path(path);
+out:
+ spin_lock(&block_group->lock);
+ if (!ret && dcs == BTRFS_DC_SETUP)
+ block_group->cache_generation = trans->transid;
+ block_group->disk_cache_state = dcs;
+ spin_unlock(&block_group->lock);
+
+ extent_changeset_free(data_reserved);
+ return ret;
+}
+
+int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_block_group *cache, *tmp;
+ struct btrfs_transaction *cur_trans = trans->transaction;
+ struct btrfs_path *path;
+
+ if (list_empty(&cur_trans->dirty_bgs) ||
+ !btrfs_test_opt(fs_info, SPACE_CACHE))
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /* Could add new block groups, use _safe just in case */
+ list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
+ dirty_list) {
+ if (cache->disk_cache_state == BTRFS_DC_CLEAR)
+ cache_save_setup(cache, trans, path);
+ }
+
+ btrfs_free_path(path);
+ return 0;
+}
+
+/*
+ * Transaction commit does final block group cache writeback during a critical
+ * section where nothing is allowed to change the FS. This is required in
+ * order for the cache to actually match the block group, but can introduce a
+ * lot of latency into the commit.
+ *
+ * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO.
+ * There's a chance we'll have to redo some of it if the block group changes
+ * again during the commit, but it greatly reduces the commit latency by
+ * getting rid of the easy block groups while we're still allowing others to
+ * join the commit.
+ */
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_block_group *cache;
+ struct btrfs_transaction *cur_trans = trans->transaction;
+ int ret = 0;
+ int should_put;
+ struct btrfs_path *path = NULL;
+ LIST_HEAD(dirty);
+ struct list_head *io = &cur_trans->io_bgs;
+ int loops = 0;
+
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ if (list_empty(&cur_trans->dirty_bgs)) {
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+ return 0;
+ }
+ list_splice_init(&cur_trans->dirty_bgs, &dirty);
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+
+again:
+ /* Make sure all the block groups on our dirty list actually exist */
+ btrfs_create_pending_block_groups(trans);
+
+ if (!path) {
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ /*
+ * cache_write_mutex is here only to save us from balance or automatic
+ * removal of empty block groups deleting this block group while we are
+ * writing out the cache
+ */
+ mutex_lock(&trans->transaction->cache_write_mutex);
+ while (!list_empty(&dirty)) {
+ bool drop_reserve = true;
+
+ cache = list_first_entry(&dirty, struct btrfs_block_group,
+ dirty_list);
+ /*
+ * This can happen if something re-dirties a block group that
+ * is already under IO. Just wait for it to finish and then do
+ * it all again
+ */
+ if (!list_empty(&cache->io_list)) {
+ list_del_init(&cache->io_list);
+ btrfs_wait_cache_io(trans, cache, path);
+ btrfs_put_block_group(cache);
+ }
+
+
+ /*
+ * btrfs_wait_cache_io uses the cache->dirty_list to decide if
+ * it should update the cache_state. Don't delete until after
+ * we wait.
+ *
+ * Since we're not running in the commit critical section
+ * we need the dirty_bgs_lock to protect from update_block_group
+ */
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ list_del_init(&cache->dirty_list);
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+
+ should_put = 1;
+
+ cache_save_setup(cache, trans, path);
+
+ if (cache->disk_cache_state == BTRFS_DC_SETUP) {
+ cache->io_ctl.inode = NULL;
+ ret = btrfs_write_out_cache(trans, cache, path);
+ if (ret == 0 && cache->io_ctl.inode) {
+ should_put = 0;
+
+ /*
+ * The cache_write_mutex is protecting the
+ * io_list, also refer to the definition of
+ * btrfs_transaction::io_bgs for more details
+ */
+ list_add_tail(&cache->io_list, io);
+ } else {
+ /*
+ * If we failed to write the cache, the
+ * generation will be bad and life goes on
+ */
+ ret = 0;
+ }
+ }
+ if (!ret) {
+ ret = update_block_group_item(trans, path, cache);
+ /*
+ * Our block group might still be attached to the list
+ * of new block groups in the transaction handle of some
+ * other task (struct btrfs_trans_handle->new_bgs). This
+ * means its block group item isn't yet in the extent
+ * tree. If this happens ignore the error, as we will
+ * try again later in the critical section of the
+ * transaction commit.
+ */
+ if (ret == -ENOENT) {
+ ret = 0;
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ if (list_empty(&cache->dirty_list)) {
+ list_add_tail(&cache->dirty_list,
+ &cur_trans->dirty_bgs);
+ btrfs_get_block_group(cache);
+ drop_reserve = false;
+ }
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+ } else if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ }
+ }
+
+ /* If it's not on the io list, we need to put the block group */
+ if (should_put)
+ btrfs_put_block_group(cache);
+ if (drop_reserve)
+ btrfs_delayed_refs_rsv_release(fs_info, 1);
+ /*
+ * Avoid blocking other tasks for too long. It might even save
+ * us from writing caches for block groups that are going to be
+ * removed.
+ */
+ mutex_unlock(&trans->transaction->cache_write_mutex);
+ if (ret)
+ goto out;
+ mutex_lock(&trans->transaction->cache_write_mutex);
+ }
+ mutex_unlock(&trans->transaction->cache_write_mutex);
+
+ /*
+ * Go through delayed refs for all the stuff we've just kicked off
+ * and then loop back (just once)
+ */
+ if (!ret)
+ ret = btrfs_run_delayed_refs(trans, 0);
+ if (!ret && loops == 0) {
+ loops++;
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ list_splice_init(&cur_trans->dirty_bgs, &dirty);
+ /*
+ * dirty_bgs_lock protects us from concurrent block group
+ * deletes too (not just cache_write_mutex).
+ */
+ if (!list_empty(&dirty)) {
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+ goto again;
+ }
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+ }
+out:
+ if (ret < 0) {
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ list_splice_init(&dirty, &cur_trans->dirty_bgs);
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+ btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
+ }
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_block_group *cache;
+ struct btrfs_transaction *cur_trans = trans->transaction;
+ int ret = 0;
+ int should_put;
+ struct btrfs_path *path;
+ struct list_head *io = &cur_trans->io_bgs;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /*
+ * Even though we are in the critical section of the transaction commit,
+ * we can still have concurrent tasks adding elements to this
+ * transaction's list of dirty block groups. These tasks correspond to
+ * endio free space workers started when writeback finishes for a
+ * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
+ * allocate new block groups as a result of COWing nodes of the root
+ * tree when updating the free space inode. The writeback for the space
+ * caches is triggered by an earlier call to
+ * btrfs_start_dirty_block_groups() and iterations of the following
+ * loop.
+ * Also we want to do the cache_save_setup first and then run the
+ * delayed refs to make sure we have the best chance at doing this all
+ * in one shot.
+ */
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ while (!list_empty(&cur_trans->dirty_bgs)) {
+ cache = list_first_entry(&cur_trans->dirty_bgs,
+ struct btrfs_block_group,
+ dirty_list);
+
+ /*
+ * This can happen if cache_save_setup re-dirties a block group
+ * that is already under IO. Just wait for it to finish and
+ * then do it all again
+ */
+ if (!list_empty(&cache->io_list)) {
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+ list_del_init(&cache->io_list);
+ btrfs_wait_cache_io(trans, cache, path);
+ btrfs_put_block_group(cache);
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ }
+
+ /*
+ * Don't remove from the dirty list until after we've waited on
+ * any pending IO
+ */
+ list_del_init(&cache->dirty_list);
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+ should_put = 1;
+
+ cache_save_setup(cache, trans, path);
+
+ if (!ret)
+ ret = btrfs_run_delayed_refs(trans,
+ (unsigned long) -1);
+
+ if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
+ cache->io_ctl.inode = NULL;
+ ret = btrfs_write_out_cache(trans, cache, path);
+ if (ret == 0 && cache->io_ctl.inode) {
+ should_put = 0;
+ list_add_tail(&cache->io_list, io);
+ } else {
+ /*
+ * If we failed to write the cache, the
+ * generation will be bad and life goes on
+ */
+ ret = 0;
+ }
+ }
+ if (!ret) {
+ ret = update_block_group_item(trans, path, cache);
+ /*
+ * One of the free space endio workers might have
+ * created a new block group while updating a free space
+ * cache's inode (at inode.c:btrfs_finish_ordered_io())
+ * and hasn't released its transaction handle yet, in
+ * which case the new block group is still attached to
+ * its transaction handle and its creation has not
+ * finished yet (no block group item in the extent tree
+ * yet, etc). If this is the case, wait for all free
+ * space endio workers to finish and retry. This is a
+ * very rare case so no need for a more efficient and
+ * complex approach.
+ */
+ if (ret == -ENOENT) {
+ wait_event(cur_trans->writer_wait,
+ atomic_read(&cur_trans->num_writers) == 1);
+ ret = update_block_group_item(trans, path, cache);
+ }
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ }
+
+ /* If its not on the io list, we need to put the block group */
+ if (should_put)
+ btrfs_put_block_group(cache);
+ btrfs_delayed_refs_rsv_release(fs_info, 1);
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ }
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+
+ /*
+ * Refer to the definition of io_bgs member for details why it's safe
+ * to use it without any locking
+ */
+ while (!list_empty(io)) {
+ cache = list_first_entry(io, struct btrfs_block_group,
+ io_list);
+ list_del_init(&cache->io_list);
+ btrfs_wait_cache_io(trans, cache, path);
+ btrfs_put_block_group(cache);
+ }
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+static inline bool should_reclaim_block_group(struct btrfs_block_group *bg,
+ u64 bytes_freed)
+{
+ const struct btrfs_space_info *space_info = bg->space_info;
+ const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
+ const u64 new_val = bg->used;
+ const u64 old_val = new_val + bytes_freed;
+ u64 thresh;
+
+ if (reclaim_thresh == 0)
+ return false;
+
+ thresh = div_factor_fine(bg->length, reclaim_thresh);
+
+ /*
+ * If we were below the threshold before don't reclaim, we are likely a
+ * brand new block group and we don't want to relocate new block groups.
+ */
+ if (old_val < thresh)
+ return false;
+ if (new_val >= thresh)
+ return false;
+ return true;
+}
+
+int btrfs_update_block_group(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes, bool alloc)
+{
+ struct btrfs_fs_info *info = trans->fs_info;
+ struct btrfs_block_group *cache = NULL;
+ u64 total = num_bytes;
+ u64 old_val;
+ u64 byte_in_group;
+ int factor;
+ int ret = 0;
+
+ /* Block accounting for super block */
+ spin_lock(&info->delalloc_root_lock);
+ old_val = btrfs_super_bytes_used(info->super_copy);
+ if (alloc)
+ old_val += num_bytes;
+ else
+ old_val -= num_bytes;
+ btrfs_set_super_bytes_used(info->super_copy, old_val);
+ spin_unlock(&info->delalloc_root_lock);
+
+ while (total) {
+ struct btrfs_space_info *space_info;
+ bool reclaim = false;
+
+ cache = btrfs_lookup_block_group(info, bytenr);
+ if (!cache) {
+ ret = -ENOENT;
+ break;
+ }
+ space_info = cache->space_info;
+ factor = btrfs_bg_type_to_factor(cache->flags);
+
+ /*
+ * If this block group has free space cache written out, we
+ * need to make sure to load it if we are removing space. This
+ * is because we need the unpinning stage to actually add the
+ * space back to the block group, otherwise we will leak space.
+ */
+ if (!alloc && !btrfs_block_group_done(cache))
+ btrfs_cache_block_group(cache, true);
+
+ byte_in_group = bytenr - cache->start;
+ WARN_ON(byte_in_group > cache->length);
+
+ spin_lock(&space_info->lock);
+ spin_lock(&cache->lock);
+
+ if (btrfs_test_opt(info, SPACE_CACHE) &&
+ cache->disk_cache_state < BTRFS_DC_CLEAR)
+ cache->disk_cache_state = BTRFS_DC_CLEAR;
+
+ old_val = cache->used;
+ num_bytes = min(total, cache->length - byte_in_group);
+ if (alloc) {
+ old_val += num_bytes;
+ cache->used = old_val;
+ cache->reserved -= num_bytes;
+ space_info->bytes_reserved -= num_bytes;
+ space_info->bytes_used += num_bytes;
+ space_info->disk_used += num_bytes * factor;
+ spin_unlock(&cache->lock);
+ spin_unlock(&space_info->lock);
+ } else {
+ old_val -= num_bytes;
+ cache->used = old_val;
+ cache->pinned += num_bytes;
+ btrfs_space_info_update_bytes_pinned(info, space_info,
+ num_bytes);
+ space_info->bytes_used -= num_bytes;
+ space_info->disk_used -= num_bytes * factor;
+
+ reclaim = should_reclaim_block_group(cache, num_bytes);
+ spin_unlock(&cache->lock);
+ spin_unlock(&space_info->lock);
+
+ set_extent_dirty(&trans->transaction->pinned_extents,
+ bytenr, bytenr + num_bytes - 1,
+ GFP_NOFS | __GFP_NOFAIL);
+ }
+
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ if (list_empty(&cache->dirty_list)) {
+ list_add_tail(&cache->dirty_list,
+ &trans->transaction->dirty_bgs);
+ trans->delayed_ref_updates++;
+ btrfs_get_block_group(cache);
+ }
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+
+ /*
+ * No longer have used bytes in this block group, queue it for
+ * deletion. We do this after adding the block group to the
+ * dirty list to avoid races between cleaner kthread and space
+ * cache writeout.
+ */
+ if (!alloc && old_val == 0) {
+ if (!btrfs_test_opt(info, DISCARD_ASYNC))
+ btrfs_mark_bg_unused(cache);
+ } else if (!alloc && reclaim) {
+ btrfs_mark_bg_to_reclaim(cache);
+ }
+
+ btrfs_put_block_group(cache);
+ total -= num_bytes;
+ bytenr += num_bytes;
+ }
+
+ /* Modified block groups are accounted for in the delayed_refs_rsv. */
+ btrfs_update_delayed_refs_rsv(trans);
+ return ret;
+}
+
+/**
+ * btrfs_add_reserved_bytes - update the block_group and space info counters
+ * @cache: The cache we are manipulating
+ * @ram_bytes: The number of bytes of file content, and will be same to
+ * @num_bytes except for the compress path.
+ * @num_bytes: The number of bytes in question
+ * @delalloc: The blocks are allocated for the delalloc write
+ *
+ * This is called by the allocator when it reserves space. If this is a
+ * reservation and the block group has become read only we cannot make the
+ * reservation and return -EAGAIN, otherwise this function always succeeds.
+ */
+int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
+ u64 ram_bytes, u64 num_bytes, int delalloc)
+{
+ struct btrfs_space_info *space_info = cache->space_info;
+ int ret = 0;
+
+ spin_lock(&space_info->lock);
+ spin_lock(&cache->lock);
+ if (cache->ro) {
+ ret = -EAGAIN;
+ } else {
+ cache->reserved += num_bytes;
+ space_info->bytes_reserved += num_bytes;
+ trace_btrfs_space_reservation(cache->fs_info, "space_info",
+ space_info->flags, num_bytes, 1);
+ btrfs_space_info_update_bytes_may_use(cache->fs_info,
+ space_info, -ram_bytes);
+ if (delalloc)
+ cache->delalloc_bytes += num_bytes;
+
+ /*
+ * Compression can use less space than we reserved, so wake
+ * tickets if that happens
+ */
+ if (num_bytes < ram_bytes)
+ btrfs_try_granting_tickets(cache->fs_info, space_info);
+ }
+ spin_unlock(&cache->lock);
+ spin_unlock(&space_info->lock);
+ return ret;
+}
+
+/**
+ * btrfs_free_reserved_bytes - update the block_group and space info counters
+ * @cache: The cache we are manipulating
+ * @num_bytes: The number of bytes in question
+ * @delalloc: The blocks are allocated for the delalloc write
+ *
+ * This is called by somebody who is freeing space that was never actually used
+ * on disk. For example if you reserve some space for a new leaf in transaction
+ * A and before transaction A commits you free that leaf, you call this with
+ * reserve set to 0 in order to clear the reservation.
+ */
+void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
+ u64 num_bytes, int delalloc)
+{
+ struct btrfs_space_info *space_info = cache->space_info;
+
+ spin_lock(&space_info->lock);
+ spin_lock(&cache->lock);
+ if (cache->ro)
+ space_info->bytes_readonly += num_bytes;
+ cache->reserved -= num_bytes;
+ space_info->bytes_reserved -= num_bytes;
+ space_info->max_extent_size = 0;
+
+ if (delalloc)
+ cache->delalloc_bytes -= num_bytes;
+ spin_unlock(&cache->lock);
+
+ btrfs_try_granting_tickets(cache->fs_info, space_info);
+ spin_unlock(&space_info->lock);
+}
+
+static void force_metadata_allocation(struct btrfs_fs_info *info)
+{
+ struct list_head *head = &info->space_info;
+ struct btrfs_space_info *found;
+
+ list_for_each_entry(found, head, list) {
+ if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
+ found->force_alloc = CHUNK_ALLOC_FORCE;
+ }
+}
+
+static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *sinfo, int force)
+{
+ u64 bytes_used = btrfs_space_info_used(sinfo, false);
+ u64 thresh;
+
+ if (force == CHUNK_ALLOC_FORCE)
+ return 1;
+
+ /*
+ * in limited mode, we want to have some free space up to
+ * about 1% of the FS size.
+ */
+ if (force == CHUNK_ALLOC_LIMITED) {
+ thresh = btrfs_super_total_bytes(fs_info->super_copy);
+ thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
+
+ if (sinfo->total_bytes - bytes_used < thresh)
+ return 1;
+ }
+
+ if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
+ return 0;
+ return 1;
+}
+
+int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
+{
+ u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type);
+
+ return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
+}
+
+static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
+{
+ struct btrfs_block_group *bg;
+ int ret;
+
+ /*
+ * Check if we have enough space in the system space info because we
+ * will need to update device items in the chunk btree and insert a new
+ * chunk item in the chunk btree as well. This will allocate a new
+ * system block group if needed.
+ */
+ check_system_chunk(trans, flags);
+
+ bg = btrfs_create_chunk(trans, flags);
+ if (IS_ERR(bg)) {
+ ret = PTR_ERR(bg);
+ goto out;
+ }
+
+ ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
+ /*
+ * Normally we are not expected to fail with -ENOSPC here, since we have
+ * previously reserved space in the system space_info and allocated one
+ * new system chunk if necessary. However there are three exceptions:
+ *
+ * 1) We may have enough free space in the system space_info but all the
+ * existing system block groups have a profile which can not be used
+ * for extent allocation.
+ *
+ * This happens when mounting in degraded mode. For example we have a
+ * RAID1 filesystem with 2 devices, lose one device and mount the fs
+ * using the other device in degraded mode. If we then allocate a chunk,
+ * we may have enough free space in the existing system space_info, but
+ * none of the block groups can be used for extent allocation since they
+ * have a RAID1 profile, and because we are in degraded mode with a
+ * single device, we are forced to allocate a new system chunk with a
+ * SINGLE profile. Making check_system_chunk() iterate over all system
+ * block groups and check if they have a usable profile and enough space
+ * can be slow on very large filesystems, so we tolerate the -ENOSPC and
+ * try again after forcing allocation of a new system chunk. Like this
+ * we avoid paying the cost of that search in normal circumstances, when
+ * we were not mounted in degraded mode;
+ *
+ * 2) We had enough free space info the system space_info, and one suitable
+ * block group to allocate from when we called check_system_chunk()
+ * above. However right after we called it, the only system block group
+ * with enough free space got turned into RO mode by a running scrub,
+ * and in this case we have to allocate a new one and retry. We only
+ * need do this allocate and retry once, since we have a transaction
+ * handle and scrub uses the commit root to search for block groups;
+ *
+ * 3) We had one system block group with enough free space when we called
+ * check_system_chunk(), but after that, right before we tried to
+ * allocate the last extent buffer we needed, a discard operation came
+ * in and it temporarily removed the last free space entry from the
+ * block group (discard removes a free space entry, discards it, and
+ * then adds back the entry to the block group cache).
+ */
+ if (ret == -ENOSPC) {
+ const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
+ struct btrfs_block_group *sys_bg;
+
+ sys_bg = btrfs_create_chunk(trans, sys_flags);
+ if (IS_ERR(sys_bg)) {
+ ret = PTR_ERR(sys_bg);
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ } else if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+out:
+ btrfs_trans_release_chunk_metadata(trans);
+
+ if (ret)
+ return ERR_PTR(ret);
+
+ btrfs_get_block_group(bg);
+ return bg;
+}
+
+/*
+ * Chunk allocation is done in 2 phases:
+ *
+ * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for
+ * the chunk, the chunk mapping, create its block group and add the items
+ * that belong in the chunk btree to it - more specifically, we need to
+ * update device items in the chunk btree and add a new chunk item to it.
+ *
+ * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
+ * group item to the extent btree and the device extent items to the devices
+ * btree.
+ *
+ * This is done to prevent deadlocks. For example when COWing a node from the
+ * extent btree we are holding a write lock on the node's parent and if we
+ * trigger chunk allocation and attempted to insert the new block group item
+ * in the extent btree right way, we could deadlock because the path for the
+ * insertion can include that parent node. At first glance it seems impossible
+ * to trigger chunk allocation after starting a transaction since tasks should
+ * reserve enough transaction units (metadata space), however while that is true
+ * most of the time, chunk allocation may still be triggered for several reasons:
+ *
+ * 1) When reserving metadata, we check if there is enough free space in the
+ * metadata space_info and therefore don't trigger allocation of a new chunk.
+ * However later when the task actually tries to COW an extent buffer from
+ * the extent btree or from the device btree for example, it is forced to
+ * allocate a new block group (chunk) because the only one that had enough
+ * free space was just turned to RO mode by a running scrub for example (or
+ * device replace, block group reclaim thread, etc), so we can not use it
+ * for allocating an extent and end up being forced to allocate a new one;
+ *
+ * 2) Because we only check that the metadata space_info has enough free bytes,
+ * we end up not allocating a new metadata chunk in that case. However if
+ * the filesystem was mounted in degraded mode, none of the existing block
+ * groups might be suitable for extent allocation due to their incompatible
+ * profile (for e.g. mounting a 2 devices filesystem, where all block groups
+ * use a RAID1 profile, in degraded mode using a single device). In this case
+ * when the task attempts to COW some extent buffer of the extent btree for
+ * example, it will trigger allocation of a new metadata block group with a
+ * suitable profile (SINGLE profile in the example of the degraded mount of
+ * the RAID1 filesystem);
+ *
+ * 3) The task has reserved enough transaction units / metadata space, but when
+ * it attempts to COW an extent buffer from the extent or device btree for
+ * example, it does not find any free extent in any metadata block group,
+ * therefore forced to try to allocate a new metadata block group.
+ * This is because some other task allocated all available extents in the
+ * meanwhile - this typically happens with tasks that don't reserve space
+ * properly, either intentionally or as a bug. One example where this is
+ * done intentionally is fsync, as it does not reserve any transaction units
+ * and ends up allocating a variable number of metadata extents for log
+ * tree extent buffers;
+ *
+ * 4) The task has reserved enough transaction units / metadata space, but right
+ * before it tries to allocate the last extent buffer it needs, a discard
+ * operation comes in and, temporarily, removes the last free space entry from
+ * the only metadata block group that had free space (discard starts by
+ * removing a free space entry from a block group, then does the discard
+ * operation and, once it's done, it adds back the free space entry to the
+ * block group).
+ *
+ * We also need this 2 phases setup when adding a device to a filesystem with
+ * a seed device - we must create new metadata and system chunks without adding
+ * any of the block group items to the chunk, extent and device btrees. If we
+ * did not do it this way, we would get ENOSPC when attempting to update those
+ * btrees, since all the chunks from the seed device are read-only.
+ *
+ * Phase 1 does the updates and insertions to the chunk btree because if we had
+ * it done in phase 2 and have a thundering herd of tasks allocating chunks in
+ * parallel, we risk having too many system chunks allocated by many tasks if
+ * many tasks reach phase 1 without the previous ones completing phase 2. In the
+ * extreme case this leads to exhaustion of the system chunk array in the
+ * superblock. This is easier to trigger if using a btree node/leaf size of 64K
+ * and with RAID filesystems (so we have more device items in the chunk btree).
+ * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
+ * the system chunk array due to concurrent allocations") provides more details.
+ *
+ * Allocation of system chunks does not happen through this function. A task that
+ * needs to update the chunk btree (the only btree that uses system chunks), must
+ * preallocate chunk space by calling either check_system_chunk() or
+ * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
+ * metadata chunk or when removing a chunk, while the later is used before doing
+ * a modification to the chunk btree - use cases for the later are adding,
+ * removing and resizing a device as well as relocation of a system chunk.
+ * See the comment below for more details.
+ *
+ * The reservation of system space, done through check_system_chunk(), as well
+ * as all the updates and insertions into the chunk btree must be done while
+ * holding fs_info->chunk_mutex. This is important to guarantee that while COWing
+ * an extent buffer from the chunks btree we never trigger allocation of a new
+ * system chunk, which would result in a deadlock (trying to lock twice an
+ * extent buffer of the chunk btree, first time before triggering the chunk
+ * allocation and the second time during chunk allocation while attempting to
+ * update the chunks btree). The system chunk array is also updated while holding
+ * that mutex. The same logic applies to removing chunks - we must reserve system
+ * space, update the chunk btree and the system chunk array in the superblock
+ * while holding fs_info->chunk_mutex.
+ *
+ * This function, btrfs_chunk_alloc(), belongs to phase 1.
+ *
+ * If @force is CHUNK_ALLOC_FORCE:
+ * - return 1 if it successfully allocates a chunk,
+ * - return errors including -ENOSPC otherwise.
+ * If @force is NOT CHUNK_ALLOC_FORCE:
+ * - return 0 if it doesn't need to allocate a new chunk,
+ * - return 1 if it successfully allocates a chunk,
+ * - return errors including -ENOSPC otherwise.
+ */
+int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
+ enum btrfs_chunk_alloc_enum force)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_space_info *space_info;
+ struct btrfs_block_group *ret_bg;
+ bool wait_for_alloc = false;
+ bool should_alloc = false;
+ bool from_extent_allocation = false;
+ int ret = 0;
+
+ if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) {
+ from_extent_allocation = true;
+ force = CHUNK_ALLOC_FORCE;
+ }
+
+ /* Don't re-enter if we're already allocating a chunk */
+ if (trans->allocating_chunk)
+ return -ENOSPC;
+ /*
+ * Allocation of system chunks can not happen through this path, as we
+ * could end up in a deadlock if we are allocating a data or metadata
+ * chunk and there is another task modifying the chunk btree.
+ *
+ * This is because while we are holding the chunk mutex, we will attempt
+ * to add the new chunk item to the chunk btree or update an existing
+ * device item in the chunk btree, while the other task that is modifying
+ * the chunk btree is attempting to COW an extent buffer while holding a
+ * lock on it and on its parent - if the COW operation triggers a system
+ * chunk allocation, then we can deadlock because we are holding the
+ * chunk mutex and we may need to access that extent buffer or its parent
+ * in order to add the chunk item or update a device item.
+ *
+ * Tasks that want to modify the chunk tree should reserve system space
+ * before updating the chunk btree, by calling either
+ * btrfs_reserve_chunk_metadata() or check_system_chunk().
+ * It's possible that after a task reserves the space, it still ends up
+ * here - this happens in the cases described above at do_chunk_alloc().
+ * The task will have to either retry or fail.
+ */
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ return -ENOSPC;
+
+ space_info = btrfs_find_space_info(fs_info, flags);
+ ASSERT(space_info);
+
+ do {
+ spin_lock(&space_info->lock);
+ if (force < space_info->force_alloc)
+ force = space_info->force_alloc;
+ should_alloc = should_alloc_chunk(fs_info, space_info, force);
+ if (space_info->full) {
+ /* No more free physical space */
+ if (should_alloc)
+ ret = -ENOSPC;
+ else
+ ret = 0;
+ spin_unlock(&space_info->lock);
+ return ret;
+ } else if (!should_alloc) {
+ spin_unlock(&space_info->lock);
+ return 0;
+ } else if (space_info->chunk_alloc) {
+ /*
+ * Someone is already allocating, so we need to block
+ * until this someone is finished and then loop to
+ * recheck if we should continue with our allocation
+ * attempt.
+ */
+ wait_for_alloc = true;
+ force = CHUNK_ALLOC_NO_FORCE;
+ spin_unlock(&space_info->lock);
+ mutex_lock(&fs_info->chunk_mutex);
+ mutex_unlock(&fs_info->chunk_mutex);
+ } else {
+ /* Proceed with allocation */
+ space_info->chunk_alloc = 1;
+ wait_for_alloc = false;
+ spin_unlock(&space_info->lock);
+ }
+
+ cond_resched();
+ } while (wait_for_alloc);
+
+ mutex_lock(&fs_info->chunk_mutex);
+ trans->allocating_chunk = true;
+
+ /*
+ * If we have mixed data/metadata chunks we want to make sure we keep
+ * allocating mixed chunks instead of individual chunks.
+ */
+ if (btrfs_mixed_space_info(space_info))
+ flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
+
+ /*
+ * if we're doing a data chunk, go ahead and make sure that
+ * we keep a reasonable number of metadata chunks allocated in the
+ * FS as well.
+ */
+ if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
+ fs_info->data_chunk_allocations++;
+ if (!(fs_info->data_chunk_allocations %
+ fs_info->metadata_ratio))
+ force_metadata_allocation(fs_info);
+ }
+
+ ret_bg = do_chunk_alloc(trans, flags);
+ trans->allocating_chunk = false;
+
+ if (IS_ERR(ret_bg)) {
+ ret = PTR_ERR(ret_bg);
+ } else if (from_extent_allocation) {
+ /*
+ * New block group is likely to be used soon. Try to activate
+ * it now. Failure is OK for now.
+ */
+ btrfs_zone_activate(ret_bg);
+ }
+
+ if (!ret)
+ btrfs_put_block_group(ret_bg);
+
+ spin_lock(&space_info->lock);
+ if (ret < 0) {
+ if (ret == -ENOSPC)
+ space_info->full = 1;
+ else
+ goto out;
+ } else {
+ ret = 1;
+ space_info->max_extent_size = 0;
+ }
+
+ space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
+out:
+ space_info->chunk_alloc = 0;
+ spin_unlock(&space_info->lock);
+ mutex_unlock(&fs_info->chunk_mutex);
+
+ return ret;
+}
+
+static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
+{
+ u64 num_dev;
+
+ num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
+ if (!num_dev)
+ num_dev = fs_info->fs_devices->rw_devices;
+
+ return num_dev;
+}
+
+static void reserve_chunk_space(struct btrfs_trans_handle *trans,
+ u64 bytes,
+ u64 type)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_space_info *info;
+ u64 left;
+ int ret = 0;
+
+ /*
+ * Needed because we can end up allocating a system chunk and for an
+ * atomic and race free space reservation in the chunk block reserve.
+ */
+ lockdep_assert_held(&fs_info->chunk_mutex);
+
+ info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+ spin_lock(&info->lock);
+ left = info->total_bytes - btrfs_space_info_used(info, true);
+ spin_unlock(&info->lock);
+
+ if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
+ btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
+ left, bytes, type);
+ btrfs_dump_space_info(fs_info, info, 0, 0);
+ }
+
+ if (left < bytes) {
+ u64 flags = btrfs_system_alloc_profile(fs_info);
+ struct btrfs_block_group *bg;
+
+ /*
+ * Ignore failure to create system chunk. We might end up not
+ * needing it, as we might not need to COW all nodes/leafs from
+ * the paths we visit in the chunk tree (they were already COWed
+ * or created in the current transaction for example).
+ */
+ bg = btrfs_create_chunk(trans, flags);
+ if (IS_ERR(bg)) {
+ ret = PTR_ERR(bg);
+ } else {
+ /*
+ * We have a new chunk. We also need to activate it for
+ * zoned filesystem.
+ */
+ ret = btrfs_zoned_activate_one_bg(fs_info, info, true);
+ if (ret < 0)
+ return;
+
+ /*
+ * If we fail to add the chunk item here, we end up
+ * trying again at phase 2 of chunk allocation, at
+ * btrfs_create_pending_block_groups(). So ignore
+ * any error here. An ENOSPC here could happen, due to
+ * the cases described at do_chunk_alloc() - the system
+ * block group we just created was just turned into RO
+ * mode by a scrub for example, or a running discard
+ * temporarily removed its free space entries, etc.
+ */
+ btrfs_chunk_alloc_add_chunk_item(trans, bg);
+ }
+ }
+
+ if (!ret) {
+ ret = btrfs_block_rsv_add(fs_info,
+ &fs_info->chunk_block_rsv,
+ bytes, BTRFS_RESERVE_NO_FLUSH);
+ if (!ret)
+ trans->chunk_bytes_reserved += bytes;
+ }
+}
+
+/*
+ * Reserve space in the system space for allocating or removing a chunk.
+ * The caller must be holding fs_info->chunk_mutex.
+ */
+void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ const u64 num_devs = get_profile_num_devs(fs_info, type);
+ u64 bytes;
+
+ /* num_devs device items to update and 1 chunk item to add or remove. */
+ bytes = btrfs_calc_metadata_size(fs_info, num_devs) +
+ btrfs_calc_insert_metadata_size(fs_info, 1);
+
+ reserve_chunk_space(trans, bytes, type);
+}
+
+/*
+ * Reserve space in the system space, if needed, for doing a modification to the
+ * chunk btree.
+ *
+ * @trans: A transaction handle.
+ * @is_item_insertion: Indicate if the modification is for inserting a new item
+ * in the chunk btree or if it's for the deletion or update
+ * of an existing item.
+ *
+ * This is used in a context where we need to update the chunk btree outside
+ * block group allocation and removal, to avoid a deadlock with a concurrent
+ * task that is allocating a metadata or data block group and therefore needs to
+ * update the chunk btree while holding the chunk mutex. After the update to the
+ * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called.
+ *
+ */
+void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
+ bool is_item_insertion)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ u64 bytes;
+
+ if (is_item_insertion)
+ bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
+ else
+ bytes = btrfs_calc_metadata_size(fs_info, 1);
+
+ mutex_lock(&fs_info->chunk_mutex);
+ reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM);
+ mutex_unlock(&fs_info->chunk_mutex);
+}
+
+void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
+{
+ struct btrfs_block_group *block_group;
+
+ block_group = btrfs_lookup_first_block_group(info, 0);
+ while (block_group) {
+ btrfs_wait_block_group_cache_done(block_group);
+ spin_lock(&block_group->lock);
+ if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF,
+ &block_group->runtime_flags)) {
+ struct inode *inode = block_group->inode;
+
+ block_group->inode = NULL;
+ spin_unlock(&block_group->lock);
+
+ ASSERT(block_group->io_ctl.inode == NULL);
+ iput(inode);
+ } else {
+ spin_unlock(&block_group->lock);
+ }
+ block_group = btrfs_next_block_group(block_group);
+ }
+}
+
+/*
+ * Must be called only after stopping all workers, since we could have block
+ * group caching kthreads running, and therefore they could race with us if we
+ * freed the block groups before stopping them.
+ */
+int btrfs_free_block_groups(struct btrfs_fs_info *info)
+{
+ struct btrfs_block_group *block_group;
+ struct btrfs_space_info *space_info;
+ struct btrfs_caching_control *caching_ctl;
+ struct rb_node *n;
+
+ write_lock(&info->block_group_cache_lock);
+ while (!list_empty(&info->caching_block_groups)) {
+ caching_ctl = list_entry(info->caching_block_groups.next,
+ struct btrfs_caching_control, list);
+ list_del(&caching_ctl->list);
+ btrfs_put_caching_control(caching_ctl);
+ }
+ write_unlock(&info->block_group_cache_lock);
+
+ spin_lock(&info->unused_bgs_lock);
+ while (!list_empty(&info->unused_bgs)) {
+ block_group = list_first_entry(&info->unused_bgs,
+ struct btrfs_block_group,
+ bg_list);
+ list_del_init(&block_group->bg_list);
+ btrfs_put_block_group(block_group);
+ }
+
+ while (!list_empty(&info->reclaim_bgs)) {
+ block_group = list_first_entry(&info->reclaim_bgs,
+ struct btrfs_block_group,
+ bg_list);
+ list_del_init(&block_group->bg_list);
+ btrfs_put_block_group(block_group);
+ }
+ spin_unlock(&info->unused_bgs_lock);
+
+ spin_lock(&info->zone_active_bgs_lock);
+ while (!list_empty(&info->zone_active_bgs)) {
+ block_group = list_first_entry(&info->zone_active_bgs,
+ struct btrfs_block_group,
+ active_bg_list);
+ list_del_init(&block_group->active_bg_list);
+ btrfs_put_block_group(block_group);
+ }
+ spin_unlock(&info->zone_active_bgs_lock);
+
+ write_lock(&info->block_group_cache_lock);
+ while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) {
+ block_group = rb_entry(n, struct btrfs_block_group,
+ cache_node);
+ rb_erase_cached(&block_group->cache_node,
+ &info->block_group_cache_tree);
+ RB_CLEAR_NODE(&block_group->cache_node);
+ write_unlock(&info->block_group_cache_lock);
+
+ down_write(&block_group->space_info->groups_sem);
+ list_del(&block_group->list);
+ up_write(&block_group->space_info->groups_sem);
+
+ /*
+ * We haven't cached this block group, which means we could
+ * possibly have excluded extents on this block group.
+ */
+ if (block_group->cached == BTRFS_CACHE_NO ||
+ block_group->cached == BTRFS_CACHE_ERROR)
+ btrfs_free_excluded_extents(block_group);
+
+ btrfs_remove_free_space_cache(block_group);
+ ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
+ ASSERT(list_empty(&block_group->dirty_list));
+ ASSERT(list_empty(&block_group->io_list));
+ ASSERT(list_empty(&block_group->bg_list));
+ ASSERT(refcount_read(&block_group->refs) == 1);
+ ASSERT(block_group->swap_extents == 0);
+ btrfs_put_block_group(block_group);
+
+ write_lock(&info->block_group_cache_lock);
+ }
+ write_unlock(&info->block_group_cache_lock);
+
+ btrfs_release_global_block_rsv(info);
+
+ while (!list_empty(&info->space_info)) {
+ space_info = list_entry(info->space_info.next,
+ struct btrfs_space_info,
+ list);
+
+ /*
+ * Do not hide this behind enospc_debug, this is actually
+ * important and indicates a real bug if this happens.
+ */
+ if (WARN_ON(space_info->bytes_pinned > 0 ||
+ space_info->bytes_may_use > 0))
+ btrfs_dump_space_info(info, space_info, 0, 0);
+
+ /*
+ * If there was a failure to cleanup a log tree, very likely due
+ * to an IO failure on a writeback attempt of one or more of its
+ * extent buffers, we could not do proper (and cheap) unaccounting
+ * of their reserved space, so don't warn on bytes_reserved > 0 in
+ * that case.
+ */
+ if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) ||
+ !BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
+ if (WARN_ON(space_info->bytes_reserved > 0))
+ btrfs_dump_space_info(info, space_info, 0, 0);
+ }
+
+ WARN_ON(space_info->reclaim_size > 0);
+ list_del(&space_info->list);
+ btrfs_sysfs_remove_space_info(space_info);
+ }
+ return 0;
+}
+
+void btrfs_freeze_block_group(struct btrfs_block_group *cache)
+{
+ atomic_inc(&cache->frozen);
+}
+
+void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ bool cleanup;
+
+ spin_lock(&block_group->lock);
+ cleanup = (atomic_dec_and_test(&block_group->frozen) &&
+ test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags));
+ spin_unlock(&block_group->lock);
+
+ if (cleanup) {
+ em_tree = &fs_info->mapping_tree;
+ write_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, block_group->start,
+ 1);
+ BUG_ON(!em); /* logic error, can't happen */
+ remove_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+
+ /* once for us and once for the tree */
+ free_extent_map(em);
+ free_extent_map(em);
+
+ /*
+ * We may have left one free space entry and other possible
+ * tasks trimming this block group have left 1 entry each one.
+ * Free them if any.
+ */
+ btrfs_remove_free_space_cache(block_group);
+ }
+}
+
+bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg)
+{
+ bool ret = true;
+
+ spin_lock(&bg->lock);
+ if (bg->ro)
+ ret = false;
+ else
+ bg->swap_extents++;
+ spin_unlock(&bg->lock);
+
+ return ret;
+}
+
+void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount)
+{
+ spin_lock(&bg->lock);
+ ASSERT(!bg->ro);
+ ASSERT(bg->swap_extents >= amount);
+ bg->swap_extents -= amount;
+ spin_unlock(&bg->lock);
+}
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
new file mode 100644
index 000000000..47a2dcbfe
--- /dev/null
+++ b/fs/btrfs/block-group.h
@@ -0,0 +1,359 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_BLOCK_GROUP_H
+#define BTRFS_BLOCK_GROUP_H
+
+#include "free-space-cache.h"
+
+enum btrfs_disk_cache_state {
+ BTRFS_DC_WRITTEN,
+ BTRFS_DC_ERROR,
+ BTRFS_DC_CLEAR,
+ BTRFS_DC_SETUP,
+};
+
+/*
+ * This describes the state of the block_group for async discard. This is due
+ * to the two pass nature of it where extent discarding is prioritized over
+ * bitmap discarding. BTRFS_DISCARD_RESET_CURSOR is set when we are resetting
+ * between lists to prevent contention for discard state variables
+ * (eg. discard_cursor).
+ */
+enum btrfs_discard_state {
+ BTRFS_DISCARD_EXTENTS,
+ BTRFS_DISCARD_BITMAPS,
+ BTRFS_DISCARD_RESET_CURSOR,
+};
+
+/*
+ * Control flags for do_chunk_alloc's force field CHUNK_ALLOC_NO_FORCE means to
+ * only allocate a chunk if we really need one.
+ *
+ * CHUNK_ALLOC_LIMITED means to only try and allocate one if we have very few
+ * chunks already allocated. This is used as part of the clustering code to
+ * help make sure we have a good pool of storage to cluster in, without filling
+ * the FS with empty chunks
+ *
+ * CHUNK_ALLOC_FORCE means it must try to allocate one
+ *
+ * CHUNK_ALLOC_FORCE_FOR_EXTENT like CHUNK_ALLOC_FORCE but called from
+ * find_free_extent() that also activaes the zone
+ */
+enum btrfs_chunk_alloc_enum {
+ CHUNK_ALLOC_NO_FORCE,
+ CHUNK_ALLOC_LIMITED,
+ CHUNK_ALLOC_FORCE,
+ CHUNK_ALLOC_FORCE_FOR_EXTENT,
+};
+
+/* Block group flags set at runtime */
+enum btrfs_block_group_flags {
+ BLOCK_GROUP_FLAG_IREF,
+ BLOCK_GROUP_FLAG_REMOVED,
+ BLOCK_GROUP_FLAG_TO_COPY,
+ BLOCK_GROUP_FLAG_RELOCATING_REPAIR,
+ BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
+ BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
+ /* Does the block group need to be added to the free space tree? */
+ BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
+ /* Indicate that the block group is placed on a sequential zone */
+ BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE,
+ /*
+ * Indicate that block group is in the list of new block groups of a
+ * transaction.
+ */
+ BLOCK_GROUP_FLAG_NEW,
+};
+
+enum btrfs_caching_type {
+ BTRFS_CACHE_NO,
+ BTRFS_CACHE_STARTED,
+ BTRFS_CACHE_FINISHED,
+ BTRFS_CACHE_ERROR,
+};
+
+struct btrfs_caching_control {
+ struct list_head list;
+ struct mutex mutex;
+ wait_queue_head_t wait;
+ struct btrfs_work work;
+ struct btrfs_block_group *block_group;
+ /* Track progress of caching during allocation. */
+ atomic_t progress;
+ refcount_t count;
+};
+
+/* Once caching_thread() finds this much free space, it will wake up waiters. */
+#define CACHING_CTL_WAKE_UP SZ_2M
+
+/*
+ * Tree to record all locked full stripes of a RAID5/6 block group
+ */
+struct btrfs_full_stripe_locks_tree {
+ struct rb_root root;
+ struct mutex lock;
+};
+
+struct btrfs_block_group {
+ struct btrfs_fs_info *fs_info;
+ struct inode *inode;
+ spinlock_t lock;
+ u64 start;
+ u64 length;
+ u64 pinned;
+ u64 reserved;
+ u64 used;
+ u64 delalloc_bytes;
+ u64 bytes_super;
+ u64 flags;
+ u64 cache_generation;
+ u64 global_root_id;
+
+ /*
+ * If the free space extent count exceeds this number, convert the block
+ * group to bitmaps.
+ */
+ u32 bitmap_high_thresh;
+
+ /*
+ * If the free space extent count drops below this number, convert the
+ * block group back to extents.
+ */
+ u32 bitmap_low_thresh;
+
+ /*
+ * It is just used for the delayed data space allocation because
+ * only the data space allocation and the relative metadata update
+ * can be done cross the transaction.
+ */
+ struct rw_semaphore data_rwsem;
+
+ /* For raid56, this is a full stripe, without parity */
+ unsigned long full_stripe_len;
+ unsigned long runtime_flags;
+
+ unsigned int ro;
+
+ int disk_cache_state;
+
+ /* Cache tracking stuff */
+ int cached;
+ struct btrfs_caching_control *caching_ctl;
+
+ struct btrfs_space_info *space_info;
+
+ /* Free space cache stuff */
+ struct btrfs_free_space_ctl *free_space_ctl;
+
+ /* Block group cache stuff */
+ struct rb_node cache_node;
+
+ /* For block groups in the same raid type */
+ struct list_head list;
+
+ refcount_t refs;
+
+ /*
+ * List of struct btrfs_free_clusters for this block group.
+ * Today it will only have one thing on it, but that may change
+ */
+ struct list_head cluster_list;
+
+ /* For delayed block group creation or deletion of empty block groups */
+ struct list_head bg_list;
+
+ /* For read-only block groups */
+ struct list_head ro_list;
+
+ /*
+ * When non-zero it means the block group's logical address and its
+ * device extents can not be reused for future block group allocations
+ * until the counter goes down to 0. This is to prevent them from being
+ * reused while some task is still using the block group after it was
+ * deleted - we want to make sure they can only be reused for new block
+ * groups after that task is done with the deleted block group.
+ */
+ atomic_t frozen;
+
+ /* For discard operations */
+ struct list_head discard_list;
+ int discard_index;
+ u64 discard_eligible_time;
+ u64 discard_cursor;
+ enum btrfs_discard_state discard_state;
+
+ /* For dirty block groups */
+ struct list_head dirty_list;
+ struct list_head io_list;
+
+ struct btrfs_io_ctl io_ctl;
+
+ /*
+ * Incremented when doing extent allocations and holding a read lock
+ * on the space_info's groups_sem semaphore.
+ * Decremented when an ordered extent that represents an IO against this
+ * block group's range is created (after it's added to its inode's
+ * root's list of ordered extents) or immediately after the allocation
+ * if it's a metadata extent or fallocate extent (for these cases we
+ * don't create ordered extents).
+ */
+ atomic_t reservations;
+
+ /*
+ * Incremented while holding the spinlock *lock* by a task checking if
+ * it can perform a nocow write (incremented if the value for the *ro*
+ * field is 0). Decremented by such tasks once they create an ordered
+ * extent or before that if some error happens before reaching that step.
+ * This is to prevent races between block group relocation and nocow
+ * writes through direct IO.
+ */
+ atomic_t nocow_writers;
+
+ /* Lock for free space tree operations. */
+ struct mutex free_space_lock;
+
+ /*
+ * Number of extents in this block group used for swap files.
+ * All accesses protected by the spinlock 'lock'.
+ */
+ int swap_extents;
+
+ /* Record locked full stripes for RAID5/6 block group */
+ struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
+
+ /*
+ * Allocation offset for the block group to implement sequential
+ * allocation. This is used only on a zoned filesystem.
+ */
+ u64 alloc_offset;
+ u64 zone_unusable;
+ u64 zone_capacity;
+ u64 meta_write_pointer;
+ struct map_lookup *physical_map;
+ struct list_head active_bg_list;
+ struct work_struct zone_finish_work;
+ struct extent_buffer *last_eb;
+};
+
+static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
+{
+ return (block_group->start + block_group->length);
+}
+
+static inline bool btrfs_is_block_group_data_only(
+ struct btrfs_block_group *block_group)
+{
+ /*
+ * In mixed mode the fragmentation is expected to be high, lowering the
+ * efficiency, so only proper data block groups are considered.
+ */
+ return (block_group->flags & BTRFS_BLOCK_GROUP_DATA) &&
+ !(block_group->flags & BTRFS_BLOCK_GROUP_METADATA);
+}
+
+#ifdef CONFIG_BTRFS_DEBUG
+static inline int btrfs_should_fragment_free_space(
+ struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+
+ return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) &&
+ block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
+ (btrfs_test_opt(fs_info, FRAGMENT_DATA) &&
+ block_group->flags & BTRFS_BLOCK_GROUP_DATA);
+}
+#endif
+
+struct btrfs_block_group *btrfs_lookup_first_block_group(
+ struct btrfs_fs_info *info, u64 bytenr);
+struct btrfs_block_group *btrfs_lookup_block_group(
+ struct btrfs_fs_info *info, u64 bytenr);
+struct btrfs_block_group *btrfs_next_block_group(
+ struct btrfs_block_group *cache);
+void btrfs_get_block_group(struct btrfs_block_group *cache);
+void btrfs_put_block_group(struct btrfs_block_group *cache);
+void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
+ const u64 start);
+void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg);
+struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
+ u64 bytenr);
+void btrfs_dec_nocow_writers(struct btrfs_block_group *bg);
+void btrfs_wait_nocow_writers(struct btrfs_block_group *bg);
+void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
+ u64 num_bytes);
+int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait);
+void btrfs_put_caching_control(struct btrfs_caching_control *ctl);
+struct btrfs_caching_control *btrfs_get_caching_control(
+ struct btrfs_block_group *cache);
+int add_new_free_space(struct btrfs_block_group *block_group,
+ u64 start, u64 end, u64 *total_added_ret);
+struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
+ struct btrfs_fs_info *fs_info,
+ const u64 chunk_offset);
+int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
+ u64 group_start, struct extent_map *em);
+void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
+void btrfs_mark_bg_unused(struct btrfs_block_group *bg);
+void btrfs_reclaim_bgs_work(struct work_struct *work);
+void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info);
+void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg);
+int btrfs_read_block_groups(struct btrfs_fs_info *info);
+struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
+ u64 bytes_used, u64 type,
+ u64 chunk_offset, u64 size);
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
+int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
+ bool do_chunk_alloc);
+void btrfs_dec_block_group_ro(struct btrfs_block_group *cache);
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans);
+int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans);
+int btrfs_setup_space_cache(struct btrfs_trans_handle *trans);
+int btrfs_update_block_group(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes, bool alloc);
+int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
+ u64 ram_bytes, u64 num_bytes, int delalloc);
+void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
+ u64 num_bytes, int delalloc);
+int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
+ enum btrfs_chunk_alloc_enum force);
+int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type);
+void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type);
+void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
+ bool is_item_insertion);
+u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags);
+void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
+int btrfs_free_block_groups(struct btrfs_fs_info *info);
+int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
+ struct block_device *bdev, u64 physical, u64 **logical,
+ int *naddrs, int *stripe_len);
+
+static inline u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
+{
+ return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
+}
+
+static inline u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
+{
+ return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+}
+
+static inline u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
+{
+ return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+}
+
+static inline int btrfs_block_group_done(struct btrfs_block_group *cache)
+{
+ smp_mb();
+ return cache->cached == BTRFS_CACHE_FINISHED ||
+ cache->cached == BTRFS_CACHE_ERROR;
+}
+
+void btrfs_freeze_block_group(struct btrfs_block_group *cache);
+void btrfs_unfreeze_block_group(struct btrfs_block_group *cache);
+
+bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg);
+void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount);
+
+#endif /* BTRFS_BLOCK_GROUP_H */
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
new file mode 100644
index 000000000..507b44d18
--- /dev/null
+++ b/fs/btrfs/block-rsv.c
@@ -0,0 +1,562 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "misc.h"
+#include "ctree.h"
+#include "block-rsv.h"
+#include "space-info.h"
+#include "transaction.h"
+#include "block-group.h"
+#include "disk-io.h"
+
+/*
+ * HOW DO BLOCK RESERVES WORK
+ *
+ * Think of block_rsv's as buckets for logically grouped metadata
+ * reservations. Each block_rsv has a ->size and a ->reserved. ->size is
+ * how large we want our block rsv to be, ->reserved is how much space is
+ * currently reserved for this block reserve.
+ *
+ * ->failfast exists for the truncate case, and is described below.
+ *
+ * NORMAL OPERATION
+ *
+ * -> Reserve
+ * Entrance: btrfs_block_rsv_add, btrfs_block_rsv_refill
+ *
+ * We call into btrfs_reserve_metadata_bytes() with our bytes, which is
+ * accounted for in space_info->bytes_may_use, and then add the bytes to
+ * ->reserved, and ->size in the case of btrfs_block_rsv_add.
+ *
+ * ->size is an over-estimation of how much we may use for a particular
+ * operation.
+ *
+ * -> Use
+ * Entrance: btrfs_use_block_rsv
+ *
+ * When we do a btrfs_alloc_tree_block() we call into btrfs_use_block_rsv()
+ * to determine the appropriate block_rsv to use, and then verify that
+ * ->reserved has enough space for our tree block allocation. Once
+ * successful we subtract fs_info->nodesize from ->reserved.
+ *
+ * -> Finish
+ * Entrance: btrfs_block_rsv_release
+ *
+ * We are finished with our operation, subtract our individual reservation
+ * from ->size, and then subtract ->size from ->reserved and free up the
+ * excess if there is any.
+ *
+ * There is some logic here to refill the delayed refs rsv or the global rsv
+ * as needed, otherwise the excess is subtracted from
+ * space_info->bytes_may_use.
+ *
+ * TYPES OF BLOCK RESERVES
+ *
+ * BLOCK_RSV_TRANS, BLOCK_RSV_DELOPS, BLOCK_RSV_CHUNK
+ * These behave normally, as described above, just within the confines of the
+ * lifetime of their particular operation (transaction for the whole trans
+ * handle lifetime, for example).
+ *
+ * BLOCK_RSV_GLOBAL
+ * It is impossible to properly account for all the space that may be required
+ * to make our extent tree updates. This block reserve acts as an overflow
+ * buffer in case our delayed refs reserve does not reserve enough space to
+ * update the extent tree.
+ *
+ * We can steal from this in some cases as well, notably on evict() or
+ * truncate() in order to help users recover from ENOSPC conditions.
+ *
+ * BLOCK_RSV_DELALLOC
+ * The individual item sizes are determined by the per-inode size
+ * calculations, which are described with the delalloc code. This is pretty
+ * straightforward, it's just the calculation of ->size encodes a lot of
+ * different items, and thus it gets used when updating inodes, inserting file
+ * extents, and inserting checksums.
+ *
+ * BLOCK_RSV_DELREFS
+ * We keep a running tally of how many delayed refs we have on the system.
+ * We assume each one of these delayed refs are going to use a full
+ * reservation. We use the transaction items and pre-reserve space for every
+ * operation, and use this reservation to refill any gap between ->size and
+ * ->reserved that may exist.
+ *
+ * From there it's straightforward, removing a delayed ref means we remove its
+ * count from ->size and free up reservations as necessary. Since this is
+ * the most dynamic block reserve in the system, we will try to refill this
+ * block reserve first with any excess returned by any other block reserve.
+ *
+ * BLOCK_RSV_EMPTY
+ * This is the fallback block reserve to make us try to reserve space if we
+ * don't have a specific bucket for this allocation. It is mostly used for
+ * updating the device tree and such, since that is a separate pool we're
+ * content to just reserve space from the space_info on demand.
+ *
+ * BLOCK_RSV_TEMP
+ * This is used by things like truncate and iput. We will temporarily
+ * allocate a block reserve, set it to some size, and then truncate bytes
+ * until we have no space left. With ->failfast set we'll simply return
+ * ENOSPC from btrfs_use_block_rsv() to signal that we need to unwind and try
+ * to make a new reservation. This is because these operations are
+ * unbounded, so we want to do as much work as we can, and then back off and
+ * re-reserve.
+ */
+
+static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv,
+ struct btrfs_block_rsv *dest, u64 num_bytes,
+ u64 *qgroup_to_release_ret)
+{
+ struct btrfs_space_info *space_info = block_rsv->space_info;
+ u64 qgroup_to_release = 0;
+ u64 ret;
+
+ spin_lock(&block_rsv->lock);
+ if (num_bytes == (u64)-1) {
+ num_bytes = block_rsv->size;
+ qgroup_to_release = block_rsv->qgroup_rsv_size;
+ }
+ block_rsv->size -= num_bytes;
+ if (block_rsv->reserved >= block_rsv->size) {
+ num_bytes = block_rsv->reserved - block_rsv->size;
+ block_rsv->reserved = block_rsv->size;
+ block_rsv->full = true;
+ } else {
+ num_bytes = 0;
+ }
+ if (qgroup_to_release_ret &&
+ block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
+ qgroup_to_release = block_rsv->qgroup_rsv_reserved -
+ block_rsv->qgroup_rsv_size;
+ block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
+ } else {
+ qgroup_to_release = 0;
+ }
+ spin_unlock(&block_rsv->lock);
+
+ ret = num_bytes;
+ if (num_bytes > 0) {
+ if (dest) {
+ spin_lock(&dest->lock);
+ if (!dest->full) {
+ u64 bytes_to_add;
+
+ bytes_to_add = dest->size - dest->reserved;
+ bytes_to_add = min(num_bytes, bytes_to_add);
+ dest->reserved += bytes_to_add;
+ if (dest->reserved >= dest->size)
+ dest->full = true;
+ num_bytes -= bytes_to_add;
+ }
+ spin_unlock(&dest->lock);
+ }
+ if (num_bytes)
+ btrfs_space_info_free_bytes_may_use(fs_info,
+ space_info,
+ num_bytes);
+ }
+ if (qgroup_to_release_ret)
+ *qgroup_to_release_ret = qgroup_to_release;
+ return ret;
+}
+
+int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
+ struct btrfs_block_rsv *dst, u64 num_bytes,
+ bool update_size)
+{
+ int ret;
+
+ ret = btrfs_block_rsv_use_bytes(src, num_bytes);
+ if (ret)
+ return ret;
+
+ btrfs_block_rsv_add_bytes(dst, num_bytes, update_size);
+ return 0;
+}
+
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, enum btrfs_rsv_type type)
+{
+ memset(rsv, 0, sizeof(*rsv));
+ spin_lock_init(&rsv->lock);
+ rsv->type = type;
+}
+
+void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *rsv,
+ enum btrfs_rsv_type type)
+{
+ btrfs_init_block_rsv(rsv, type);
+ rsv->space_info = btrfs_find_space_info(fs_info,
+ BTRFS_BLOCK_GROUP_METADATA);
+}
+
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
+ enum btrfs_rsv_type type)
+{
+ struct btrfs_block_rsv *block_rsv;
+
+ block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
+ if (!block_rsv)
+ return NULL;
+
+ btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
+ return block_rsv;
+}
+
+void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *rsv)
+{
+ if (!rsv)
+ return;
+ btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL);
+ kfree(rsv);
+}
+
+int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+ enum btrfs_reserve_flush_enum flush)
+{
+ int ret;
+
+ if (num_bytes == 0)
+ return 0;
+
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
+ if (!ret)
+ btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true);
+
+ return ret;
+}
+
+int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
+{
+ u64 num_bytes = 0;
+ int ret = -ENOSPC;
+
+ if (!block_rsv)
+ return 0;
+
+ spin_lock(&block_rsv->lock);
+ num_bytes = div_factor(block_rsv->size, min_factor);
+ if (block_rsv->reserved >= num_bytes)
+ ret = 0;
+ spin_unlock(&block_rsv->lock);
+
+ return ret;
+}
+
+int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+ enum btrfs_reserve_flush_enum flush)
+{
+ u64 num_bytes = 0;
+ int ret = -ENOSPC;
+
+ if (!block_rsv)
+ return 0;
+
+ spin_lock(&block_rsv->lock);
+ num_bytes = min_reserved;
+ if (block_rsv->reserved >= num_bytes)
+ ret = 0;
+ else
+ num_bytes -= block_rsv->reserved;
+ spin_unlock(&block_rsv->lock);
+
+ if (!ret)
+ return 0;
+
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
+ if (!ret) {
+ btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false);
+ return 0;
+ }
+
+ return ret;
+}
+
+u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+ u64 *qgroup_to_release)
+{
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+ struct btrfs_block_rsv *target = NULL;
+
+ /*
+ * If we are the delayed_rsv then push to the global rsv, otherwise dump
+ * into the delayed rsv if it is not full.
+ */
+ if (block_rsv == delayed_rsv)
+ target = global_rsv;
+ else if (block_rsv != global_rsv && !btrfs_block_rsv_full(delayed_rsv))
+ target = delayed_rsv;
+
+ if (target && block_rsv->space_info != target->space_info)
+ target = NULL;
+
+ return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes,
+ qgroup_to_release);
+}
+
+int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes)
+{
+ int ret = -ENOSPC;
+
+ spin_lock(&block_rsv->lock);
+ if (block_rsv->reserved >= num_bytes) {
+ block_rsv->reserved -= num_bytes;
+ if (block_rsv->reserved < block_rsv->size)
+ block_rsv->full = false;
+ ret = 0;
+ }
+ spin_unlock(&block_rsv->lock);
+ return ret;
+}
+
+void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes, bool update_size)
+{
+ spin_lock(&block_rsv->lock);
+ block_rsv->reserved += num_bytes;
+ if (update_size)
+ block_rsv->size += num_bytes;
+ else if (block_rsv->reserved >= block_rsv->size)
+ block_rsv->full = true;
+ spin_unlock(&block_rsv->lock);
+}
+
+int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *dest, u64 num_bytes,
+ int min_factor)
+{
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ u64 min_bytes;
+
+ if (global_rsv->space_info != dest->space_info)
+ return -ENOSPC;
+
+ spin_lock(&global_rsv->lock);
+ min_bytes = div_factor(global_rsv->size, min_factor);
+ if (global_rsv->reserved < min_bytes + num_bytes) {
+ spin_unlock(&global_rsv->lock);
+ return -ENOSPC;
+ }
+ global_rsv->reserved -= num_bytes;
+ if (global_rsv->reserved < global_rsv->size)
+ global_rsv->full = false;
+ spin_unlock(&global_rsv->lock);
+
+ btrfs_block_rsv_add_bytes(dest, num_bytes, true);
+ return 0;
+}
+
+void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
+ struct btrfs_space_info *sinfo = block_rsv->space_info;
+ struct btrfs_root *root, *tmp;
+ u64 num_bytes = btrfs_root_used(&fs_info->tree_root->root_item);
+ unsigned int min_items = 1;
+
+ /*
+ * The global block rsv is based on the size of the extent tree, the
+ * checksum tree and the root tree. If the fs is empty we want to set
+ * it to a minimal amount for safety.
+ *
+ * We also are going to need to modify the minimum of the tree root and
+ * any global roots we could touch.
+ */
+ read_lock(&fs_info->global_root_lock);
+ rbtree_postorder_for_each_entry_safe(root, tmp, &fs_info->global_root_tree,
+ rb_node) {
+ if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID ||
+ root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID ||
+ root->root_key.objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) {
+ num_bytes += btrfs_root_used(&root->root_item);
+ min_items++;
+ }
+ }
+ read_unlock(&fs_info->global_root_lock);
+
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) {
+ num_bytes += btrfs_root_used(&fs_info->block_group_root->root_item);
+ min_items++;
+ }
+
+ /*
+ * But we also want to reserve enough space so we can do the fallback
+ * global reserve for an unlink, which is an additional 5 items (see the
+ * comment in __unlink_start_trans for what we're modifying.)
+ *
+ * But we also need space for the delayed ref updates from the unlink,
+ * so its 10, 5 for the actual operation, and 5 for the delayed ref
+ * updates.
+ */
+ min_items += 10;
+
+ num_bytes = max_t(u64, num_bytes,
+ btrfs_calc_insert_metadata_size(fs_info, min_items));
+
+ spin_lock(&sinfo->lock);
+ spin_lock(&block_rsv->lock);
+
+ block_rsv->size = min_t(u64, num_bytes, SZ_512M);
+
+ if (block_rsv->reserved < block_rsv->size) {
+ num_bytes = block_rsv->size - block_rsv->reserved;
+ btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
+ num_bytes);
+ block_rsv->reserved = block_rsv->size;
+ } else if (block_rsv->reserved > block_rsv->size) {
+ num_bytes = block_rsv->reserved - block_rsv->size;
+ btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
+ -num_bytes);
+ block_rsv->reserved = block_rsv->size;
+ btrfs_try_granting_tickets(fs_info, sinfo);
+ }
+
+ block_rsv->full = (block_rsv->reserved == block_rsv->size);
+
+ if (block_rsv->size >= sinfo->total_bytes)
+ sinfo->force_alloc = CHUNK_ALLOC_FORCE;
+ spin_unlock(&block_rsv->lock);
+ spin_unlock(&sinfo->lock);
+}
+
+void btrfs_init_root_block_rsv(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ switch (root->root_key.objectid) {
+ case BTRFS_CSUM_TREE_OBJECTID:
+ case BTRFS_EXTENT_TREE_OBJECTID:
+ case BTRFS_FREE_SPACE_TREE_OBJECTID:
+ case BTRFS_BLOCK_GROUP_TREE_OBJECTID:
+ root->block_rsv = &fs_info->delayed_refs_rsv;
+ break;
+ case BTRFS_ROOT_TREE_OBJECTID:
+ case BTRFS_DEV_TREE_OBJECTID:
+ case BTRFS_QUOTA_TREE_OBJECTID:
+ root->block_rsv = &fs_info->global_block_rsv;
+ break;
+ case BTRFS_CHUNK_TREE_OBJECTID:
+ root->block_rsv = &fs_info->chunk_block_rsv;
+ break;
+ default:
+ root->block_rsv = NULL;
+ break;
+ }
+}
+
+void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_space_info *space_info;
+
+ space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+ fs_info->chunk_block_rsv.space_info = space_info;
+
+ space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+ fs_info->global_block_rsv.space_info = space_info;
+ fs_info->trans_block_rsv.space_info = space_info;
+ fs_info->empty_block_rsv.space_info = space_info;
+ fs_info->delayed_block_rsv.space_info = space_info;
+ fs_info->delayed_refs_rsv.space_info = space_info;
+
+ btrfs_update_global_block_rsv(fs_info);
+}
+
+void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+ btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1,
+ NULL);
+ WARN_ON(fs_info->trans_block_rsv.size > 0);
+ WARN_ON(fs_info->trans_block_rsv.reserved > 0);
+ WARN_ON(fs_info->chunk_block_rsv.size > 0);
+ WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
+ WARN_ON(fs_info->delayed_block_rsv.size > 0);
+ WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
+ WARN_ON(fs_info->delayed_refs_rsv.reserved > 0);
+ WARN_ON(fs_info->delayed_refs_rsv.size > 0);
+}
+
+static struct btrfs_block_rsv *get_block_rsv(
+ const struct btrfs_trans_handle *trans,
+ const struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_rsv *block_rsv = NULL;
+
+ if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
+ (root == fs_info->uuid_root) ||
+ (trans->adding_csums &&
+ root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID))
+ block_rsv = trans->block_rsv;
+
+ if (!block_rsv)
+ block_rsv = root->block_rsv;
+
+ if (!block_rsv)
+ block_rsv = &fs_info->empty_block_rsv;
+
+ return block_rsv;
+}
+
+struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u32 blocksize)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_rsv *block_rsv;
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ int ret;
+ bool global_updated = false;
+
+ block_rsv = get_block_rsv(trans, root);
+
+ if (unlikely(block_rsv->size == 0))
+ goto try_reserve;
+again:
+ ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize);
+ if (!ret)
+ return block_rsv;
+
+ if (block_rsv->failfast)
+ return ERR_PTR(ret);
+
+ if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
+ global_updated = true;
+ btrfs_update_global_block_rsv(fs_info);
+ goto again;
+ }
+
+ /*
+ * The global reserve still exists to save us from ourselves, so don't
+ * warn_on if we are short on our delayed refs reserve.
+ */
+ if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS &&
+ btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
+ static DEFINE_RATELIMIT_STATE(_rs,
+ DEFAULT_RATELIMIT_INTERVAL * 10,
+ /*DEFAULT_RATELIMIT_BURST*/ 1);
+ if (__ratelimit(&_rs))
+ WARN(1, KERN_DEBUG
+ "BTRFS: block rsv %d returned %d\n",
+ block_rsv->type, ret);
+ }
+try_reserve:
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize,
+ BTRFS_RESERVE_NO_FLUSH);
+ if (!ret)
+ return block_rsv;
+ /*
+ * If we couldn't reserve metadata bytes try and use some from
+ * the global reserve if its space type is the same as the global
+ * reservation.
+ */
+ if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
+ block_rsv->space_info == global_rsv->space_info) {
+ ret = btrfs_block_rsv_use_bytes(global_rsv, blocksize);
+ if (!ret)
+ return global_rsv;
+ }
+ return ERR_PTR(ret);
+}
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
new file mode 100644
index 000000000..578c3497a
--- /dev/null
+++ b/fs/btrfs/block-rsv.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_BLOCK_RSV_H
+#define BTRFS_BLOCK_RSV_H
+
+struct btrfs_trans_handle;
+enum btrfs_reserve_flush_enum;
+
+/*
+ * Types of block reserves
+ */
+enum btrfs_rsv_type {
+ BTRFS_BLOCK_RSV_GLOBAL,
+ BTRFS_BLOCK_RSV_DELALLOC,
+ BTRFS_BLOCK_RSV_TRANS,
+ BTRFS_BLOCK_RSV_CHUNK,
+ BTRFS_BLOCK_RSV_DELOPS,
+ BTRFS_BLOCK_RSV_DELREFS,
+ BTRFS_BLOCK_RSV_EMPTY,
+ BTRFS_BLOCK_RSV_TEMP,
+};
+
+struct btrfs_block_rsv {
+ u64 size;
+ u64 reserved;
+ struct btrfs_space_info *space_info;
+ spinlock_t lock;
+ bool full;
+ bool failfast;
+ /* Block reserve type, one of BTRFS_BLOCK_RSV_* */
+ enum btrfs_rsv_type type:8;
+
+ /*
+ * Qgroup equivalent for @size @reserved
+ *
+ * Unlike normal @size/@reserved for inode rsv, qgroup doesn't care
+ * about things like csum size nor how many tree blocks it will need to
+ * reserve.
+ *
+ * Qgroup cares more about net change of the extent usage.
+ *
+ * So for one newly inserted file extent, in worst case it will cause
+ * leaf split and level increase, nodesize for each file extent is
+ * already too much.
+ *
+ * In short, qgroup_size/reserved is the upper limit of possible needed
+ * qgroup metadata reservation.
+ */
+ u64 qgroup_rsv_size;
+ u64 qgroup_rsv_reserved;
+};
+
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, enum btrfs_rsv_type type);
+void btrfs_init_root_block_rsv(struct btrfs_root *root);
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
+ enum btrfs_rsv_type type);
+void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *rsv,
+ enum btrfs_rsv_type type);
+void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *rsv);
+int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+ enum btrfs_reserve_flush_enum flush);
+int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor);
+int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+ enum btrfs_reserve_flush_enum flush);
+int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
+ struct btrfs_block_rsv *dst_rsv, u64 num_bytes,
+ bool update_size);
+int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes);
+int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *dest, u64 num_bytes,
+ int min_factor);
+void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes, bool update_size);
+u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes, u64 *qgroup_to_release);
+void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info);
+void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info);
+void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info);
+struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u32 blocksize);
+static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv,
+ u32 blocksize)
+{
+ btrfs_block_rsv_add_bytes(block_rsv, blocksize, false);
+ btrfs_block_rsv_release(fs_info, block_rsv, 0, NULL);
+}
+
+/*
+ * Fast path to check if the reserve is full, may be carefully used outside of
+ * locks.
+ */
+static inline bool btrfs_block_rsv_full(const struct btrfs_block_rsv *rsv)
+{
+ return data_race(rsv->full);
+}
+
+#endif /* BTRFS_BLOCK_RSV_H */
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
new file mode 100644
index 000000000..54c2ccb36
--- /dev/null
+++ b/fs/btrfs/btrfs_inode.h
@@ -0,0 +1,439 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ */
+
+#ifndef BTRFS_INODE_H
+#define BTRFS_INODE_H
+
+#include <linux/hash.h>
+#include <linux/refcount.h>
+#include "extent_map.h"
+#include "extent_io.h"
+#include "ordered-data.h"
+#include "delayed-inode.h"
+
+/*
+ * Since we search a directory based on f_pos (struct dir_context::pos) we have
+ * to start at 2 since '.' and '..' have f_pos of 0 and 1 respectively, so
+ * everybody else has to start at 2 (see btrfs_real_readdir() and dir_emit_dots()).
+ */
+#define BTRFS_DIR_START_INDEX 2
+
+/*
+ * ordered_data_close is set by truncate when a file that used
+ * to have good data has been truncated to zero. When it is set
+ * the btrfs file release call will add this inode to the
+ * ordered operations list so that we make sure to flush out any
+ * new data the application may have written before commit.
+ */
+enum {
+ BTRFS_INODE_FLUSH_ON_CLOSE,
+ BTRFS_INODE_DUMMY,
+ BTRFS_INODE_IN_DEFRAG,
+ BTRFS_INODE_HAS_ASYNC_EXTENT,
+ /*
+ * Always set under the VFS' inode lock, otherwise it can cause races
+ * during fsync (we start as a fast fsync and then end up in a full
+ * fsync racing with ordered extent completion).
+ */
+ BTRFS_INODE_NEEDS_FULL_SYNC,
+ BTRFS_INODE_COPY_EVERYTHING,
+ BTRFS_INODE_IN_DELALLOC_LIST,
+ BTRFS_INODE_HAS_PROPS,
+ BTRFS_INODE_SNAPSHOT_FLUSH,
+ /*
+ * Set and used when logging an inode and it serves to signal that an
+ * inode does not have xattrs, so subsequent fsyncs can avoid searching
+ * for xattrs to log. This bit must be cleared whenever a xattr is added
+ * to an inode.
+ */
+ BTRFS_INODE_NO_XATTRS,
+ /*
+ * Set when we are in a context where we need to start a transaction and
+ * have dirty pages with the respective file range locked. This is to
+ * ensure that when reserving space for the transaction, if we are low
+ * on available space and need to flush delalloc, we will not flush
+ * delalloc for this inode, because that could result in a deadlock (on
+ * the file range, inode's io_tree).
+ */
+ BTRFS_INODE_NO_DELALLOC_FLUSH,
+ /*
+ * Set when we are working on enabling verity for a file. Computing and
+ * writing the whole Merkle tree can take a while so we want to prevent
+ * races where two separate tasks attempt to simultaneously start verity
+ * on the same file.
+ */
+ BTRFS_INODE_VERITY_IN_PROGRESS,
+ /* Set when this inode is a free space inode. */
+ BTRFS_INODE_FREE_SPACE_INODE,
+};
+
+/* in memory btrfs inode */
+struct btrfs_inode {
+ /* which subvolume this inode belongs to */
+ struct btrfs_root *root;
+
+ /* key used to find this inode on disk. This is used by the code
+ * to read in roots of subvolumes
+ */
+ struct btrfs_key location;
+
+ /*
+ * Lock for counters and all fields used to determine if the inode is in
+ * the log or not (last_trans, last_sub_trans, last_log_commit,
+ * logged_trans), to access/update new_delalloc_bytes and to update the
+ * VFS' inode number of bytes used.
+ */
+ spinlock_t lock;
+
+ /* the extent_tree has caches of all the extent mappings to disk */
+ struct extent_map_tree extent_tree;
+
+ /* the io_tree does range state (DIRTY, LOCKED etc) */
+ struct extent_io_tree io_tree;
+
+ /* special utility tree used to record which mirrors have already been
+ * tried when checksums fail for a given block
+ */
+ struct rb_root io_failure_tree;
+ spinlock_t io_failure_lock;
+
+ /*
+ * Keep track of where the inode has extent items mapped in order to
+ * make sure the i_size adjustments are accurate
+ */
+ struct extent_io_tree file_extent_tree;
+
+ /* held while logging the inode in tree-log.c */
+ struct mutex log_mutex;
+
+ /* used to order data wrt metadata */
+ struct btrfs_ordered_inode_tree ordered_tree;
+
+ /* list of all the delalloc inodes in the FS. There are times we need
+ * to write all the delalloc pages to disk, and this list is used
+ * to walk them all.
+ */
+ struct list_head delalloc_inodes;
+
+ /* node for the red-black tree that links inodes in subvolume root */
+ struct rb_node rb_node;
+
+ unsigned long runtime_flags;
+
+ /* Keep track of who's O_SYNC/fsyncing currently */
+ atomic_t sync_writers;
+
+ /* full 64 bit generation number, struct vfs_inode doesn't have a big
+ * enough field for this.
+ */
+ u64 generation;
+
+ /*
+ * transid of the trans_handle that last modified this inode
+ */
+ u64 last_trans;
+
+ /*
+ * transid that last logged this inode
+ */
+ u64 logged_trans;
+
+ /*
+ * log transid when this inode was last modified
+ */
+ int last_sub_trans;
+
+ /* a local copy of root's last_log_commit */
+ int last_log_commit;
+
+ /*
+ * Total number of bytes pending delalloc, used by stat to calculate the
+ * real block usage of the file. This is used only for files.
+ */
+ u64 delalloc_bytes;
+
+ union {
+ /*
+ * Total number of bytes pending delalloc that fall within a file
+ * range that is either a hole or beyond EOF (and no prealloc extent
+ * exists in the range). This is always <= delalloc_bytes and this
+ * is used only for files.
+ */
+ u64 new_delalloc_bytes;
+ /*
+ * The offset of the last dir index key that was logged.
+ * This is used only for directories.
+ */
+ u64 last_dir_index_offset;
+ };
+
+ /*
+ * total number of bytes pending defrag, used by stat to check whether
+ * it needs COW.
+ */
+ u64 defrag_bytes;
+
+ /*
+ * the size of the file stored in the metadata on disk. data=ordered
+ * means the in-memory i_size might be larger than the size on disk
+ * because not all the blocks are written yet.
+ */
+ u64 disk_i_size;
+
+ /*
+ * If this is a directory then index_cnt is the counter for the index
+ * number for new files that are created. For an empty directory, this
+ * must be initialized to BTRFS_DIR_START_INDEX.
+ */
+ u64 index_cnt;
+
+ /* Cache the directory index number to speed the dir/file remove */
+ u64 dir_index;
+
+ /* the fsync log has some corner cases that mean we have to check
+ * directories to see if any unlinks have been done before
+ * the directory was logged. See tree-log.c for all the
+ * details
+ */
+ u64 last_unlink_trans;
+
+ /*
+ * The id/generation of the last transaction where this inode was
+ * either the source or the destination of a clone/dedupe operation.
+ * Used when logging an inode to know if there are shared extents that
+ * need special care when logging checksum items, to avoid duplicate
+ * checksum items in a log (which can lead to a corruption where we end
+ * up with missing checksum ranges after log replay).
+ * Protected by the vfs inode lock.
+ */
+ u64 last_reflink_trans;
+
+ /*
+ * Number of bytes outstanding that are going to need csums. This is
+ * used in ENOSPC accounting.
+ */
+ u64 csum_bytes;
+
+ /* Backwards incompatible flags, lower half of inode_item::flags */
+ u32 flags;
+ /* Read-only compatibility flags, upper half of inode_item::flags */
+ u32 ro_flags;
+
+ /*
+ * Counters to keep track of the number of extent item's we may use due
+ * to delalloc and such. outstanding_extents is the number of extent
+ * items we think we'll end up using, and reserved_extents is the number
+ * of extent items we've reserved metadata for.
+ */
+ unsigned outstanding_extents;
+
+ struct btrfs_block_rsv block_rsv;
+
+ /*
+ * Cached values of inode properties
+ */
+ unsigned prop_compress; /* per-file compression algorithm */
+ /*
+ * Force compression on the file using the defrag ioctl, could be
+ * different from prop_compress and takes precedence if set
+ */
+ unsigned defrag_compress;
+
+ struct btrfs_delayed_node *delayed_node;
+
+ /* File creation time. */
+ struct timespec64 i_otime;
+
+ /* Hook into fs_info->delayed_iputs */
+ struct list_head delayed_iput;
+
+ struct rw_semaphore i_mmap_lock;
+ struct inode vfs_inode;
+};
+
+static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
+{
+ return container_of(inode, struct btrfs_inode, vfs_inode);
+}
+
+static inline unsigned long btrfs_inode_hash(u64 objectid,
+ const struct btrfs_root *root)
+{
+ u64 h = objectid ^ (root->root_key.objectid * GOLDEN_RATIO_PRIME);
+
+#if BITS_PER_LONG == 32
+ h = (h >> 32) ^ (h & 0xffffffff);
+#endif
+
+ return (unsigned long)h;
+}
+
+#if BITS_PER_LONG == 32
+
+/*
+ * On 32 bit systems the i_ino of struct inode is 32 bits (unsigned long), so
+ * we use the inode's location objectid which is a u64 to avoid truncation.
+ */
+static inline u64 btrfs_ino(const struct btrfs_inode *inode)
+{
+ u64 ino = inode->location.objectid;
+
+ /* type == BTRFS_ROOT_ITEM_KEY: subvol dir */
+ if (inode->location.type == BTRFS_ROOT_ITEM_KEY)
+ ino = inode->vfs_inode.i_ino;
+ return ino;
+}
+
+#else
+
+static inline u64 btrfs_ino(const struct btrfs_inode *inode)
+{
+ return inode->vfs_inode.i_ino;
+}
+
+#endif
+
+static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size)
+{
+ i_size_write(&inode->vfs_inode, size);
+ inode->disk_i_size = size;
+}
+
+static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode)
+{
+ return test_bit(BTRFS_INODE_FREE_SPACE_INODE, &inode->runtime_flags);
+}
+
+static inline bool is_data_inode(struct inode *inode)
+{
+ return btrfs_ino(BTRFS_I(inode)) != BTRFS_BTREE_INODE_OBJECTID;
+}
+
+static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode,
+ int mod)
+{
+ lockdep_assert_held(&inode->lock);
+ inode->outstanding_extents += mod;
+ if (btrfs_is_free_space_inode(inode))
+ return;
+ trace_btrfs_inode_mod_outstanding_extents(inode->root, btrfs_ino(inode),
+ mod);
+}
+
+/*
+ * Called every time after doing a buffered, direct IO or memory mapped write.
+ *
+ * This is to ensure that if we write to a file that was previously fsynced in
+ * the current transaction, then try to fsync it again in the same transaction,
+ * we will know that there were changes in the file and that it needs to be
+ * logged.
+ */
+static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode)
+{
+ spin_lock(&inode->lock);
+ inode->last_sub_trans = inode->root->log_transid;
+ spin_unlock(&inode->lock);
+}
+
+/*
+ * Should be called while holding the inode's VFS lock in exclusive mode or in a
+ * context where no one else can access the inode concurrently (during inode
+ * creation or when loading an inode from disk).
+ */
+static inline void btrfs_set_inode_full_sync(struct btrfs_inode *inode)
+{
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
+ /*
+ * The inode may have been part of a reflink operation in the last
+ * transaction that modified it, and then a fsync has reset the
+ * last_reflink_trans to avoid subsequent fsyncs in the same
+ * transaction to do unnecessary work. So update last_reflink_trans
+ * to the last_trans value (we have to be pessimistic and assume a
+ * reflink happened).
+ *
+ * The ->last_trans is protected by the inode's spinlock and we can
+ * have a concurrent ordered extent completion update it. Also set
+ * last_reflink_trans to ->last_trans only if the former is less than
+ * the later, because we can be called in a context where
+ * last_reflink_trans was set to the current transaction generation
+ * while ->last_trans was not yet updated in the current transaction,
+ * and therefore has a lower value.
+ */
+ spin_lock(&inode->lock);
+ if (inode->last_reflink_trans < inode->last_trans)
+ inode->last_reflink_trans = inode->last_trans;
+ spin_unlock(&inode->lock);
+}
+
+static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
+{
+ bool ret = false;
+
+ spin_lock(&inode->lock);
+ if (inode->logged_trans == generation &&
+ inode->last_sub_trans <= inode->last_log_commit &&
+ inode->last_sub_trans <= inode->root->last_log_commit)
+ ret = true;
+ spin_unlock(&inode->lock);
+ return ret;
+}
+
+/*
+ * Check if the inode has flags compatible with compression
+ */
+static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode)
+{
+ if (inode->flags & BTRFS_INODE_NODATACOW ||
+ inode->flags & BTRFS_INODE_NODATASUM)
+ return false;
+ return true;
+}
+
+/*
+ * btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two
+ * separate u32s. These two functions convert between the two representations.
+ */
+static inline u64 btrfs_inode_combine_flags(u32 flags, u32 ro_flags)
+{
+ return (flags | ((u64)ro_flags << 32));
+}
+
+static inline void btrfs_inode_split_flags(u64 inode_item_flags,
+ u32 *flags, u32 *ro_flags)
+{
+ *flags = (u32)inode_item_flags;
+ *ro_flags = (u32)(inode_item_flags >> 32);
+}
+
+/* Array of bytes with variable length, hexadecimal format 0x1234 */
+#define CSUM_FMT "0x%*phN"
+#define CSUM_FMT_VALUE(size, bytes) size, bytes
+
+static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode,
+ u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
+{
+ struct btrfs_root *root = inode->root;
+ const u32 csum_size = root->fs_info->csum_size;
+
+ /* Output minus objectid, which is more meaningful */
+ if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID)
+ btrfs_warn_rl(root->fs_info,
+"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
+ root->root_key.objectid, btrfs_ino(inode),
+ logical_start,
+ CSUM_FMT_VALUE(csum_size, csum),
+ CSUM_FMT_VALUE(csum_size, csum_expected),
+ mirror_num);
+ else
+ btrfs_warn_rl(root->fs_info,
+"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
+ root->root_key.objectid, btrfs_ino(inode),
+ logical_start,
+ CSUM_FMT_VALUE(csum_size, csum),
+ CSUM_FMT_VALUE(csum_size, csum_expected),
+ mirror_num);
+}
+
+#endif
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
new file mode 100644
index 000000000..98c6e5fea
--- /dev/null
+++ b/fs/btrfs/check-integrity.c
@@ -0,0 +1,2864 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) STRATO AG 2011. All rights reserved.
+ */
+
+/*
+ * This module can be used to catch cases when the btrfs kernel
+ * code executes write requests to the disk that bring the file
+ * system in an inconsistent state. In such a state, a power-loss
+ * or kernel panic event would cause that the data on disk is
+ * lost or at least damaged.
+ *
+ * Code is added that examines all block write requests during
+ * runtime (including writes of the super block). Three rules
+ * are verified and an error is printed on violation of the
+ * rules:
+ * 1. It is not allowed to write a disk block which is
+ * currently referenced by the super block (either directly
+ * or indirectly).
+ * 2. When a super block is written, it is verified that all
+ * referenced (directly or indirectly) blocks fulfill the
+ * following requirements:
+ * 2a. All referenced blocks have either been present when
+ * the file system was mounted, (i.e., they have been
+ * referenced by the super block) or they have been
+ * written since then and the write completion callback
+ * was called and no write error was indicated and a
+ * FLUSH request to the device where these blocks are
+ * located was received and completed.
+ * 2b. All referenced blocks need to have a generation
+ * number which is equal to the parent's number.
+ *
+ * One issue that was found using this module was that the log
+ * tree on disk became temporarily corrupted because disk blocks
+ * that had been in use for the log tree had been freed and
+ * reused too early, while being referenced by the written super
+ * block.
+ *
+ * The search term in the kernel log that can be used to filter
+ * on the existence of detected integrity issues is
+ * "btrfs: attempt".
+ *
+ * The integrity check is enabled via mount options. These
+ * mount options are only supported if the integrity check
+ * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY.
+ *
+ * Example #1, apply integrity checks to all metadata:
+ * mount /dev/sdb1 /mnt -o check_int
+ *
+ * Example #2, apply integrity checks to all metadata and
+ * to data extents:
+ * mount /dev/sdb1 /mnt -o check_int_data
+ *
+ * Example #3, apply integrity checks to all metadata and dump
+ * the tree that the super block references to kernel messages
+ * each time after a super block was written:
+ * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263
+ *
+ * If the integrity check tool is included and activated in
+ * the mount options, plenty of kernel memory is used, and
+ * plenty of additional CPU cycles are spent. Enabling this
+ * functionality is not intended for normal use. In most
+ * cases, unless you are a btrfs developer who needs to verify
+ * the integrity of (super)-block write requests, do not
+ * enable the config option BTRFS_FS_CHECK_INTEGRITY to
+ * include and compile the integrity check tool.
+ *
+ * Expect millions of lines of information in the kernel log with an
+ * enabled check_int_print_mask. Therefore set LOG_BUF_SHIFT in the
+ * kernel config to at least 26 (which is 64MB). Usually the value is
+ * limited to 21 (which is 2MB) in init/Kconfig. The file needs to be
+ * changed like this before LOG_BUF_SHIFT can be set to a high value:
+ * config LOG_BUF_SHIFT
+ * int "Kernel log buffer size (16 => 64KB, 17 => 128KB)"
+ * range 12 30
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/blkdev.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <crypto/hash.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "extent_io.h"
+#include "volumes.h"
+#include "print-tree.h"
+#include "locking.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+#include "compression.h"
+
+#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
+#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
+#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100
+#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051
+#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807
+#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530
+#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300
+#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters,
+ * excluding " [...]" */
+#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1)
+
+/*
+ * The definition of the bitmask fields for the print_mask.
+ * They are specified with the mount option check_integrity_print_mask.
+ */
+#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE 0x00000001
+#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION 0x00000002
+#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE 0x00000004
+#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE 0x00000008
+#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH 0x00000010
+#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH 0x00000020
+#define BTRFSIC_PRINT_MASK_VERBOSE 0x00000040
+#define BTRFSIC_PRINT_MASK_VERY_VERBOSE 0x00000080
+#define BTRFSIC_PRINT_MASK_INITIAL_TREE 0x00000100
+#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES 0x00000200
+#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400
+#define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800
+#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000
+#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE 0x00002000
+
+struct btrfsic_dev_state;
+struct btrfsic_state;
+
+struct btrfsic_block {
+ u32 magic_num; /* only used for debug purposes */
+ unsigned int is_metadata:1; /* if it is meta-data, not data-data */
+ unsigned int is_superblock:1; /* if it is one of the superblocks */
+ unsigned int is_iodone:1; /* if is done by lower subsystem */
+ unsigned int iodone_w_error:1; /* error was indicated to endio */
+ unsigned int never_written:1; /* block was added because it was
+ * referenced, not because it was
+ * written */
+ unsigned int mirror_num; /* large enough to hold
+ * BTRFS_SUPER_MIRROR_MAX */
+ struct btrfsic_dev_state *dev_state;
+ u64 dev_bytenr; /* key, physical byte num on disk */
+ u64 logical_bytenr; /* logical byte num on disk */
+ u64 generation;
+ struct btrfs_disk_key disk_key; /* extra info to print in case of
+ * issues, will not always be correct */
+ struct list_head collision_resolving_node; /* list node */
+ struct list_head all_blocks_node; /* list node */
+
+ /* the following two lists contain block_link items */
+ struct list_head ref_to_list; /* list */
+ struct list_head ref_from_list; /* list */
+ struct btrfsic_block *next_in_same_bio;
+ void *orig_bio_private;
+ bio_end_io_t *orig_bio_end_io;
+ blk_opf_t submit_bio_bh_rw;
+ u64 flush_gen; /* only valid if !never_written */
+};
+
+/*
+ * Elements of this type are allocated dynamically and required because
+ * each block object can refer to and can be ref from multiple blocks.
+ * The key to lookup them in the hashtable is the dev_bytenr of
+ * the block ref to plus the one from the block referred from.
+ * The fact that they are searchable via a hashtable and that a
+ * ref_cnt is maintained is not required for the btrfs integrity
+ * check algorithm itself, it is only used to make the output more
+ * beautiful in case that an error is detected (an error is defined
+ * as a write operation to a block while that block is still referenced).
+ */
+struct btrfsic_block_link {
+ u32 magic_num; /* only used for debug purposes */
+ u32 ref_cnt;
+ struct list_head node_ref_to; /* list node */
+ struct list_head node_ref_from; /* list node */
+ struct list_head collision_resolving_node; /* list node */
+ struct btrfsic_block *block_ref_to;
+ struct btrfsic_block *block_ref_from;
+ u64 parent_generation;
+};
+
+struct btrfsic_dev_state {
+ u32 magic_num; /* only used for debug purposes */
+ struct block_device *bdev;
+ struct btrfsic_state *state;
+ struct list_head collision_resolving_node; /* list node */
+ struct btrfsic_block dummy_block_for_bio_bh_flush;
+ u64 last_flush_gen;
+};
+
+struct btrfsic_block_hashtable {
+ struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE];
+};
+
+struct btrfsic_block_link_hashtable {
+ struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE];
+};
+
+struct btrfsic_dev_state_hashtable {
+ struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE];
+};
+
+struct btrfsic_block_data_ctx {
+ u64 start; /* virtual bytenr */
+ u64 dev_bytenr; /* physical bytenr on device */
+ u32 len;
+ struct btrfsic_dev_state *dev;
+ char **datav;
+ struct page **pagev;
+ void *mem_to_free;
+};
+
+/* This structure is used to implement recursion without occupying
+ * any stack space, refer to btrfsic_process_metablock() */
+struct btrfsic_stack_frame {
+ u32 magic;
+ u32 nr;
+ int error;
+ int i;
+ int limit_nesting;
+ int num_copies;
+ int mirror_num;
+ struct btrfsic_block *block;
+ struct btrfsic_block_data_ctx *block_ctx;
+ struct btrfsic_block *next_block;
+ struct btrfsic_block_data_ctx next_block_ctx;
+ struct btrfs_header *hdr;
+ struct btrfsic_stack_frame *prev;
+};
+
+/* Some state per mounted filesystem */
+struct btrfsic_state {
+ u32 print_mask;
+ int include_extent_data;
+ struct list_head all_blocks_list;
+ struct btrfsic_block_hashtable block_hashtable;
+ struct btrfsic_block_link_hashtable block_link_hashtable;
+ struct btrfs_fs_info *fs_info;
+ u64 max_superblock_generation;
+ struct btrfsic_block *latest_superblock;
+ u32 metablock_size;
+ u32 datablock_size;
+};
+
+static int btrfsic_process_metablock(struct btrfsic_state *state,
+ struct btrfsic_block *block,
+ struct btrfsic_block_data_ctx *block_ctx,
+ int limit_nesting, int force_iodone_flag);
+static void btrfsic_read_from_block_data(
+ struct btrfsic_block_data_ctx *block_ctx,
+ void *dst, u32 offset, size_t len);
+static int btrfsic_create_link_to_next_block(
+ struct btrfsic_state *state,
+ struct btrfsic_block *block,
+ struct btrfsic_block_data_ctx
+ *block_ctx, u64 next_bytenr,
+ int limit_nesting,
+ struct btrfsic_block_data_ctx *next_block_ctx,
+ struct btrfsic_block **next_blockp,
+ int force_iodone_flag,
+ int *num_copiesp, int *mirror_nump,
+ struct btrfs_disk_key *disk_key,
+ u64 parent_generation);
+static int btrfsic_handle_extent_data(struct btrfsic_state *state,
+ struct btrfsic_block *block,
+ struct btrfsic_block_data_ctx *block_ctx,
+ u32 item_offset, int force_iodone_flag);
+static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
+ struct btrfsic_block_data_ctx *block_ctx_out,
+ int mirror_num);
+static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
+static int btrfsic_read_block(struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *block_ctx);
+static int btrfsic_process_written_superblock(
+ struct btrfsic_state *state,
+ struct btrfsic_block *const block,
+ struct btrfs_super_block *const super_hdr);
+static void btrfsic_bio_end_io(struct bio *bp);
+static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state,
+ const struct btrfsic_block *block,
+ int recursion_level);
+static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
+ struct btrfsic_block *const block,
+ int recursion_level);
+static void btrfsic_print_add_link(const struct btrfsic_state *state,
+ const struct btrfsic_block_link *l);
+static void btrfsic_print_rem_link(const struct btrfsic_state *state,
+ const struct btrfsic_block_link *l);
+static char btrfsic_get_block_type(const struct btrfsic_state *state,
+ const struct btrfsic_block *block);
+static void btrfsic_dump_tree(const struct btrfsic_state *state);
+static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
+ const struct btrfsic_block *block,
+ int indent_level);
+static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
+ struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *next_block_ctx,
+ struct btrfsic_block *next_block,
+ struct btrfsic_block *from_block,
+ u64 parent_generation);
+static struct btrfsic_block *btrfsic_block_lookup_or_add(
+ struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *block_ctx,
+ const char *additional_string,
+ int is_metadata,
+ int is_iodone,
+ int never_written,
+ int mirror_num,
+ int *was_created);
+static int btrfsic_process_superblock_dev_mirror(
+ struct btrfsic_state *state,
+ struct btrfsic_dev_state *dev_state,
+ struct btrfs_device *device,
+ int superblock_mirror_num,
+ struct btrfsic_dev_state **selected_dev_state,
+ struct btrfs_super_block *selected_super);
+static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev);
+static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
+ u64 bytenr,
+ struct btrfsic_dev_state *dev_state,
+ u64 dev_bytenr);
+
+static struct mutex btrfsic_mutex;
+static int btrfsic_is_initialized;
+static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable;
+
+
+static void btrfsic_block_init(struct btrfsic_block *b)
+{
+ b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER;
+ b->dev_state = NULL;
+ b->dev_bytenr = 0;
+ b->logical_bytenr = 0;
+ b->generation = BTRFSIC_GENERATION_UNKNOWN;
+ b->disk_key.objectid = 0;
+ b->disk_key.type = 0;
+ b->disk_key.offset = 0;
+ b->is_metadata = 0;
+ b->is_superblock = 0;
+ b->is_iodone = 0;
+ b->iodone_w_error = 0;
+ b->never_written = 0;
+ b->mirror_num = 0;
+ b->next_in_same_bio = NULL;
+ b->orig_bio_private = NULL;
+ b->orig_bio_end_io = NULL;
+ INIT_LIST_HEAD(&b->collision_resolving_node);
+ INIT_LIST_HEAD(&b->all_blocks_node);
+ INIT_LIST_HEAD(&b->ref_to_list);
+ INIT_LIST_HEAD(&b->ref_from_list);
+ b->submit_bio_bh_rw = 0;
+ b->flush_gen = 0;
+}
+
+static struct btrfsic_block *btrfsic_block_alloc(void)
+{
+ struct btrfsic_block *b;
+
+ b = kzalloc(sizeof(*b), GFP_NOFS);
+ if (NULL != b)
+ btrfsic_block_init(b);
+
+ return b;
+}
+
+static void btrfsic_block_free(struct btrfsic_block *b)
+{
+ BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num));
+ kfree(b);
+}
+
+static void btrfsic_block_link_init(struct btrfsic_block_link *l)
+{
+ l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER;
+ l->ref_cnt = 1;
+ INIT_LIST_HEAD(&l->node_ref_to);
+ INIT_LIST_HEAD(&l->node_ref_from);
+ INIT_LIST_HEAD(&l->collision_resolving_node);
+ l->block_ref_to = NULL;
+ l->block_ref_from = NULL;
+}
+
+static struct btrfsic_block_link *btrfsic_block_link_alloc(void)
+{
+ struct btrfsic_block_link *l;
+
+ l = kzalloc(sizeof(*l), GFP_NOFS);
+ if (NULL != l)
+ btrfsic_block_link_init(l);
+
+ return l;
+}
+
+static void btrfsic_block_link_free(struct btrfsic_block_link *l)
+{
+ BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num));
+ kfree(l);
+}
+
+static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds)
+{
+ ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER;
+ ds->bdev = NULL;
+ ds->state = NULL;
+ INIT_LIST_HEAD(&ds->collision_resolving_node);
+ ds->last_flush_gen = 0;
+ btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush);
+ ds->dummy_block_for_bio_bh_flush.is_iodone = 1;
+ ds->dummy_block_for_bio_bh_flush.dev_state = ds;
+}
+
+static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void)
+{
+ struct btrfsic_dev_state *ds;
+
+ ds = kzalloc(sizeof(*ds), GFP_NOFS);
+ if (NULL != ds)
+ btrfsic_dev_state_init(ds);
+
+ return ds;
+}
+
+static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds)
+{
+ BUG_ON(!(NULL == ds ||
+ BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num));
+ kfree(ds);
+}
+
+static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h)
+{
+ int i;
+
+ for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++)
+ INIT_LIST_HEAD(h->table + i);
+}
+
+static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
+ struct btrfsic_block_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)(b->dev_bytenr >> 16)) ^
+ ((unsigned int)((uintptr_t)b->dev_state->bdev))) &
+ (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
+
+ list_add(&b->collision_resolving_node, h->table + hashval);
+}
+
+static void btrfsic_block_hashtable_remove(struct btrfsic_block *b)
+{
+ list_del(&b->collision_resolving_node);
+}
+
+static struct btrfsic_block *btrfsic_block_hashtable_lookup(
+ struct block_device *bdev,
+ u64 dev_bytenr,
+ struct btrfsic_block_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)(dev_bytenr >> 16)) ^
+ ((unsigned int)((uintptr_t)bdev))) &
+ (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
+ struct btrfsic_block *b;
+
+ list_for_each_entry(b, h->table + hashval, collision_resolving_node) {
+ if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr)
+ return b;
+ }
+
+ return NULL;
+}
+
+static void btrfsic_block_link_hashtable_init(
+ struct btrfsic_block_link_hashtable *h)
+{
+ int i;
+
+ for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++)
+ INIT_LIST_HEAD(h->table + i);
+}
+
+static void btrfsic_block_link_hashtable_add(
+ struct btrfsic_block_link *l,
+ struct btrfsic_block_link_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^
+ ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^
+ ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^
+ ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev)))
+ & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
+
+ BUG_ON(NULL == l->block_ref_to);
+ BUG_ON(NULL == l->block_ref_from);
+ list_add(&l->collision_resolving_node, h->table + hashval);
+}
+
+static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l)
+{
+ list_del(&l->collision_resolving_node);
+}
+
+static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
+ struct block_device *bdev_ref_to,
+ u64 dev_bytenr_ref_to,
+ struct block_device *bdev_ref_from,
+ u64 dev_bytenr_ref_from,
+ struct btrfsic_block_link_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)(dev_bytenr_ref_to >> 16)) ^
+ ((unsigned int)(dev_bytenr_ref_from >> 16)) ^
+ ((unsigned int)((uintptr_t)bdev_ref_to)) ^
+ ((unsigned int)((uintptr_t)bdev_ref_from))) &
+ (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
+ struct btrfsic_block_link *l;
+
+ list_for_each_entry(l, h->table + hashval, collision_resolving_node) {
+ BUG_ON(NULL == l->block_ref_to);
+ BUG_ON(NULL == l->block_ref_from);
+ if (l->block_ref_to->dev_state->bdev == bdev_ref_to &&
+ l->block_ref_to->dev_bytenr == dev_bytenr_ref_to &&
+ l->block_ref_from->dev_state->bdev == bdev_ref_from &&
+ l->block_ref_from->dev_bytenr == dev_bytenr_ref_from)
+ return l;
+ }
+
+ return NULL;
+}
+
+static void btrfsic_dev_state_hashtable_init(
+ struct btrfsic_dev_state_hashtable *h)
+{
+ int i;
+
+ for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++)
+ INIT_LIST_HEAD(h->table + i);
+}
+
+static void btrfsic_dev_state_hashtable_add(
+ struct btrfsic_dev_state *ds,
+ struct btrfsic_dev_state_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)((uintptr_t)ds->bdev->bd_dev)) &
+ (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
+
+ list_add(&ds->collision_resolving_node, h->table + hashval);
+}
+
+static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds)
+{
+ list_del(&ds->collision_resolving_node);
+}
+
+static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(dev_t dev,
+ struct btrfsic_dev_state_hashtable *h)
+{
+ const unsigned int hashval =
+ dev & (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1);
+ struct btrfsic_dev_state *ds;
+
+ list_for_each_entry(ds, h->table + hashval, collision_resolving_node) {
+ if (ds->bdev->bd_dev == dev)
+ return ds;
+ }
+
+ return NULL;
+}
+
+static int btrfsic_process_superblock(struct btrfsic_state *state,
+ struct btrfs_fs_devices *fs_devices)
+{
+ struct btrfs_super_block *selected_super;
+ struct list_head *dev_head = &fs_devices->devices;
+ struct btrfs_device *device;
+ struct btrfsic_dev_state *selected_dev_state = NULL;
+ int ret = 0;
+ int pass;
+
+ selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
+ if (!selected_super)
+ return -ENOMEM;
+
+ list_for_each_entry(device, dev_head, dev_list) {
+ int i;
+ struct btrfsic_dev_state *dev_state;
+
+ if (!device->bdev || !device->name)
+ continue;
+
+ dev_state = btrfsic_dev_state_lookup(device->bdev->bd_dev);
+ BUG_ON(NULL == dev_state);
+ for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+ ret = btrfsic_process_superblock_dev_mirror(
+ state, dev_state, device, i,
+ &selected_dev_state, selected_super);
+ if (0 != ret && 0 == i) {
+ kfree(selected_super);
+ return ret;
+ }
+ }
+ }
+
+ if (NULL == state->latest_superblock) {
+ pr_info("btrfsic: no superblock found!\n");
+ kfree(selected_super);
+ return -1;
+ }
+
+ for (pass = 0; pass < 3; pass++) {
+ int num_copies;
+ int mirror_num;
+ u64 next_bytenr;
+
+ switch (pass) {
+ case 0:
+ next_bytenr = btrfs_super_root(selected_super);
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ pr_info("root@%llu\n", next_bytenr);
+ break;
+ case 1:
+ next_bytenr = btrfs_super_chunk_root(selected_super);
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ pr_info("chunk@%llu\n", next_bytenr);
+ break;
+ case 2:
+ next_bytenr = btrfs_super_log_root(selected_super);
+ if (0 == next_bytenr)
+ continue;
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ pr_info("log@%llu\n", next_bytenr);
+ break;
+ }
+
+ num_copies = btrfs_num_copies(state->fs_info, next_bytenr,
+ state->metablock_size);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+ pr_info("num_copies(log_bytenr=%llu) = %d\n",
+ next_bytenr, num_copies);
+
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ struct btrfsic_block *next_block;
+ struct btrfsic_block_data_ctx tmp_next_block_ctx;
+ struct btrfsic_block_link *l;
+
+ ret = btrfsic_map_block(state, next_bytenr,
+ state->metablock_size,
+ &tmp_next_block_ctx,
+ mirror_num);
+ if (ret) {
+ pr_info("btrfsic: btrfsic_map_block(root @%llu, mirror %d) failed!\n",
+ next_bytenr, mirror_num);
+ kfree(selected_super);
+ return -1;
+ }
+
+ next_block = btrfsic_block_hashtable_lookup(
+ tmp_next_block_ctx.dev->bdev,
+ tmp_next_block_ctx.dev_bytenr,
+ &state->block_hashtable);
+ BUG_ON(NULL == next_block);
+
+ l = btrfsic_block_link_hashtable_lookup(
+ tmp_next_block_ctx.dev->bdev,
+ tmp_next_block_ctx.dev_bytenr,
+ state->latest_superblock->dev_state->
+ bdev,
+ state->latest_superblock->dev_bytenr,
+ &state->block_link_hashtable);
+ BUG_ON(NULL == l);
+
+ ret = btrfsic_read_block(state, &tmp_next_block_ctx);
+ if (ret < (int)PAGE_SIZE) {
+ pr_info("btrfsic: read @logical %llu failed!\n",
+ tmp_next_block_ctx.start);
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ kfree(selected_super);
+ return -1;
+ }
+
+ ret = btrfsic_process_metablock(state,
+ next_block,
+ &tmp_next_block_ctx,
+ BTRFS_MAX_LEVEL + 3, 1);
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ }
+ }
+
+ kfree(selected_super);
+ return ret;
+}
+
+static int btrfsic_process_superblock_dev_mirror(
+ struct btrfsic_state *state,
+ struct btrfsic_dev_state *dev_state,
+ struct btrfs_device *device,
+ int superblock_mirror_num,
+ struct btrfsic_dev_state **selected_dev_state,
+ struct btrfs_super_block *selected_super)
+{
+ struct btrfs_fs_info *fs_info = state->fs_info;
+ struct btrfs_super_block *super_tmp;
+ u64 dev_bytenr;
+ struct btrfsic_block *superblock_tmp;
+ int pass;
+ struct block_device *const superblock_bdev = device->bdev;
+ struct page *page;
+ struct address_space *mapping = superblock_bdev->bd_inode->i_mapping;
+ int ret = 0;
+
+ /* super block bytenr is always the unmapped device bytenr */
+ dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
+ if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes)
+ return -1;
+
+ page = read_cache_page_gfp(mapping, dev_bytenr >> PAGE_SHIFT, GFP_NOFS);
+ if (IS_ERR(page))
+ return -1;
+
+ super_tmp = page_address(page);
+
+ if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
+ btrfs_super_magic(super_tmp) != BTRFS_MAGIC ||
+ memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
+ btrfs_super_nodesize(super_tmp) != state->metablock_size ||
+ btrfs_super_sectorsize(super_tmp) != state->datablock_size) {
+ ret = 0;
+ goto out;
+ }
+
+ superblock_tmp =
+ btrfsic_block_hashtable_lookup(superblock_bdev,
+ dev_bytenr,
+ &state->block_hashtable);
+ if (NULL == superblock_tmp) {
+ superblock_tmp = btrfsic_block_alloc();
+ if (NULL == superblock_tmp) {
+ ret = -1;
+ goto out;
+ }
+ /* for superblock, only the dev_bytenr makes sense */
+ superblock_tmp->dev_bytenr = dev_bytenr;
+ superblock_tmp->dev_state = dev_state;
+ superblock_tmp->logical_bytenr = dev_bytenr;
+ superblock_tmp->generation = btrfs_super_generation(super_tmp);
+ superblock_tmp->is_metadata = 1;
+ superblock_tmp->is_superblock = 1;
+ superblock_tmp->is_iodone = 1;
+ superblock_tmp->never_written = 0;
+ superblock_tmp->mirror_num = 1 + superblock_mirror_num;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
+ btrfs_info_in_rcu(fs_info,
+ "new initial S-block (bdev %p, %s) @%llu (%pg/%llu/%d)",
+ superblock_bdev,
+ rcu_str_deref(device->name), dev_bytenr,
+ dev_state->bdev, dev_bytenr,
+ superblock_mirror_num);
+ list_add(&superblock_tmp->all_blocks_node,
+ &state->all_blocks_list);
+ btrfsic_block_hashtable_add(superblock_tmp,
+ &state->block_hashtable);
+ }
+
+ /* select the one with the highest generation field */
+ if (btrfs_super_generation(super_tmp) >
+ state->max_superblock_generation ||
+ 0 == state->max_superblock_generation) {
+ memcpy(selected_super, super_tmp, sizeof(*selected_super));
+ *selected_dev_state = dev_state;
+ state->max_superblock_generation =
+ btrfs_super_generation(super_tmp);
+ state->latest_superblock = superblock_tmp;
+ }
+
+ for (pass = 0; pass < 3; pass++) {
+ u64 next_bytenr;
+ int num_copies;
+ int mirror_num;
+ const char *additional_string = NULL;
+ struct btrfs_disk_key tmp_disk_key;
+
+ tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
+ tmp_disk_key.offset = 0;
+ switch (pass) {
+ case 0:
+ btrfs_set_disk_key_objectid(&tmp_disk_key,
+ BTRFS_ROOT_TREE_OBJECTID);
+ additional_string = "initial root ";
+ next_bytenr = btrfs_super_root(super_tmp);
+ break;
+ case 1:
+ btrfs_set_disk_key_objectid(&tmp_disk_key,
+ BTRFS_CHUNK_TREE_OBJECTID);
+ additional_string = "initial chunk ";
+ next_bytenr = btrfs_super_chunk_root(super_tmp);
+ break;
+ case 2:
+ btrfs_set_disk_key_objectid(&tmp_disk_key,
+ BTRFS_TREE_LOG_OBJECTID);
+ additional_string = "initial log ";
+ next_bytenr = btrfs_super_log_root(super_tmp);
+ if (0 == next_bytenr)
+ continue;
+ break;
+ }
+
+ num_copies = btrfs_num_copies(fs_info, next_bytenr,
+ state->metablock_size);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+ pr_info("num_copies(log_bytenr=%llu) = %d\n",
+ next_bytenr, num_copies);
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ struct btrfsic_block *next_block;
+ struct btrfsic_block_data_ctx tmp_next_block_ctx;
+ struct btrfsic_block_link *l;
+
+ if (btrfsic_map_block(state, next_bytenr,
+ state->metablock_size,
+ &tmp_next_block_ctx,
+ mirror_num)) {
+ pr_info("btrfsic: btrfsic_map_block(bytenr @%llu, mirror %d) failed!\n",
+ next_bytenr, mirror_num);
+ ret = -1;
+ goto out;
+ }
+
+ next_block = btrfsic_block_lookup_or_add(
+ state, &tmp_next_block_ctx,
+ additional_string, 1, 1, 0,
+ mirror_num, NULL);
+ if (NULL == next_block) {
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ ret = -1;
+ goto out;
+ }
+
+ next_block->disk_key = tmp_disk_key;
+ next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
+ l = btrfsic_block_link_lookup_or_add(
+ state, &tmp_next_block_ctx,
+ next_block, superblock_tmp,
+ BTRFSIC_GENERATION_UNKNOWN);
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ if (NULL == l) {
+ ret = -1;
+ goto out;
+ }
+ }
+ }
+ if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES)
+ btrfsic_dump_tree_sub(state, superblock_tmp, 0);
+
+out:
+ put_page(page);
+ return ret;
+}
+
+static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void)
+{
+ struct btrfsic_stack_frame *sf;
+
+ sf = kzalloc(sizeof(*sf), GFP_NOFS);
+ if (sf)
+ sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER;
+ return sf;
+}
+
+static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf)
+{
+ BUG_ON(!(NULL == sf ||
+ BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic));
+ kfree(sf);
+}
+
+static noinline_for_stack int btrfsic_process_metablock(
+ struct btrfsic_state *state,
+ struct btrfsic_block *const first_block,
+ struct btrfsic_block_data_ctx *const first_block_ctx,
+ int first_limit_nesting, int force_iodone_flag)
+{
+ struct btrfsic_stack_frame initial_stack_frame = { 0 };
+ struct btrfsic_stack_frame *sf;
+ struct btrfsic_stack_frame *next_stack;
+ struct btrfs_header *const first_hdr =
+ (struct btrfs_header *)first_block_ctx->datav[0];
+
+ BUG_ON(!first_hdr);
+ sf = &initial_stack_frame;
+ sf->error = 0;
+ sf->i = -1;
+ sf->limit_nesting = first_limit_nesting;
+ sf->block = first_block;
+ sf->block_ctx = first_block_ctx;
+ sf->next_block = NULL;
+ sf->hdr = first_hdr;
+ sf->prev = NULL;
+
+continue_with_new_stack_frame:
+ sf->block->generation = btrfs_stack_header_generation(sf->hdr);
+ if (0 == sf->hdr->level) {
+ struct btrfs_leaf *const leafhdr =
+ (struct btrfs_leaf *)sf->hdr;
+
+ if (-1 == sf->i) {
+ sf->nr = btrfs_stack_header_nritems(&leafhdr->header);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ pr_info("leaf %llu items %d generation %llu owner %llu\n",
+ sf->block_ctx->start, sf->nr,
+ btrfs_stack_header_generation(
+ &leafhdr->header),
+ btrfs_stack_header_owner(
+ &leafhdr->header));
+ }
+
+continue_with_current_leaf_stack_frame:
+ if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
+ sf->i++;
+ sf->num_copies = 0;
+ }
+
+ if (sf->i < sf->nr) {
+ struct btrfs_item disk_item;
+ u32 disk_item_offset =
+ (uintptr_t)(leafhdr->items + sf->i) -
+ (uintptr_t)leafhdr;
+ struct btrfs_disk_key *disk_key;
+ u8 type;
+ u32 item_offset;
+ u32 item_size;
+
+ if (disk_item_offset + sizeof(struct btrfs_item) >
+ sf->block_ctx->len) {
+leaf_item_out_of_bounce_error:
+ pr_info(
+ "btrfsic: leaf item out of bounce at logical %llu, dev %pg\n",
+ sf->block_ctx->start,
+ sf->block_ctx->dev->bdev);
+ goto one_stack_frame_backwards;
+ }
+ btrfsic_read_from_block_data(sf->block_ctx,
+ &disk_item,
+ disk_item_offset,
+ sizeof(struct btrfs_item));
+ item_offset = btrfs_stack_item_offset(&disk_item);
+ item_size = btrfs_stack_item_size(&disk_item);
+ disk_key = &disk_item.key;
+ type = btrfs_disk_key_type(disk_key);
+
+ if (BTRFS_ROOT_ITEM_KEY == type) {
+ struct btrfs_root_item root_item;
+ u32 root_item_offset;
+ u64 next_bytenr;
+
+ root_item_offset = item_offset +
+ offsetof(struct btrfs_leaf, items);
+ if (root_item_offset + item_size >
+ sf->block_ctx->len)
+ goto leaf_item_out_of_bounce_error;
+ btrfsic_read_from_block_data(
+ sf->block_ctx, &root_item,
+ root_item_offset,
+ item_size);
+ next_bytenr = btrfs_root_bytenr(&root_item);
+
+ sf->error =
+ btrfsic_create_link_to_next_block(
+ state,
+ sf->block,
+ sf->block_ctx,
+ next_bytenr,
+ sf->limit_nesting,
+ &sf->next_block_ctx,
+ &sf->next_block,
+ force_iodone_flag,
+ &sf->num_copies,
+ &sf->mirror_num,
+ disk_key,
+ btrfs_root_generation(
+ &root_item));
+ if (sf->error)
+ goto one_stack_frame_backwards;
+
+ if (NULL != sf->next_block) {
+ struct btrfs_header *const next_hdr =
+ (struct btrfs_header *)
+ sf->next_block_ctx.datav[0];
+
+ next_stack =
+ btrfsic_stack_frame_alloc();
+ if (NULL == next_stack) {
+ sf->error = -1;
+ btrfsic_release_block_ctx(
+ &sf->
+ next_block_ctx);
+ goto one_stack_frame_backwards;
+ }
+
+ next_stack->i = -1;
+ next_stack->block = sf->next_block;
+ next_stack->block_ctx =
+ &sf->next_block_ctx;
+ next_stack->next_block = NULL;
+ next_stack->hdr = next_hdr;
+ next_stack->limit_nesting =
+ sf->limit_nesting - 1;
+ next_stack->prev = sf;
+ sf = next_stack;
+ goto continue_with_new_stack_frame;
+ }
+ } else if (BTRFS_EXTENT_DATA_KEY == type &&
+ state->include_extent_data) {
+ sf->error = btrfsic_handle_extent_data(
+ state,
+ sf->block,
+ sf->block_ctx,
+ item_offset,
+ force_iodone_flag);
+ if (sf->error)
+ goto one_stack_frame_backwards;
+ }
+
+ goto continue_with_current_leaf_stack_frame;
+ }
+ } else {
+ struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr;
+
+ if (-1 == sf->i) {
+ sf->nr = btrfs_stack_header_nritems(&nodehdr->header);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ pr_info("node %llu level %d items %d generation %llu owner %llu\n",
+ sf->block_ctx->start,
+ nodehdr->header.level, sf->nr,
+ btrfs_stack_header_generation(
+ &nodehdr->header),
+ btrfs_stack_header_owner(
+ &nodehdr->header));
+ }
+
+continue_with_current_node_stack_frame:
+ if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
+ sf->i++;
+ sf->num_copies = 0;
+ }
+
+ if (sf->i < sf->nr) {
+ struct btrfs_key_ptr key_ptr;
+ u32 key_ptr_offset;
+ u64 next_bytenr;
+
+ key_ptr_offset = (uintptr_t)(nodehdr->ptrs + sf->i) -
+ (uintptr_t)nodehdr;
+ if (key_ptr_offset + sizeof(struct btrfs_key_ptr) >
+ sf->block_ctx->len) {
+ pr_info(
+ "btrfsic: node item out of bounce at logical %llu, dev %pg\n",
+ sf->block_ctx->start,
+ sf->block_ctx->dev->bdev);
+ goto one_stack_frame_backwards;
+ }
+ btrfsic_read_from_block_data(
+ sf->block_ctx, &key_ptr, key_ptr_offset,
+ sizeof(struct btrfs_key_ptr));
+ next_bytenr = btrfs_stack_key_blockptr(&key_ptr);
+
+ sf->error = btrfsic_create_link_to_next_block(
+ state,
+ sf->block,
+ sf->block_ctx,
+ next_bytenr,
+ sf->limit_nesting,
+ &sf->next_block_ctx,
+ &sf->next_block,
+ force_iodone_flag,
+ &sf->num_copies,
+ &sf->mirror_num,
+ &key_ptr.key,
+ btrfs_stack_key_generation(&key_ptr));
+ if (sf->error)
+ goto one_stack_frame_backwards;
+
+ if (NULL != sf->next_block) {
+ struct btrfs_header *const next_hdr =
+ (struct btrfs_header *)
+ sf->next_block_ctx.datav[0];
+
+ next_stack = btrfsic_stack_frame_alloc();
+ if (NULL == next_stack) {
+ sf->error = -1;
+ goto one_stack_frame_backwards;
+ }
+
+ next_stack->i = -1;
+ next_stack->block = sf->next_block;
+ next_stack->block_ctx = &sf->next_block_ctx;
+ next_stack->next_block = NULL;
+ next_stack->hdr = next_hdr;
+ next_stack->limit_nesting =
+ sf->limit_nesting - 1;
+ next_stack->prev = sf;
+ sf = next_stack;
+ goto continue_with_new_stack_frame;
+ }
+
+ goto continue_with_current_node_stack_frame;
+ }
+ }
+
+one_stack_frame_backwards:
+ if (NULL != sf->prev) {
+ struct btrfsic_stack_frame *const prev = sf->prev;
+
+ /* the one for the initial block is freed in the caller */
+ btrfsic_release_block_ctx(sf->block_ctx);
+
+ if (sf->error) {
+ prev->error = sf->error;
+ btrfsic_stack_frame_free(sf);
+ sf = prev;
+ goto one_stack_frame_backwards;
+ }
+
+ btrfsic_stack_frame_free(sf);
+ sf = prev;
+ goto continue_with_new_stack_frame;
+ } else {
+ BUG_ON(&initial_stack_frame != sf);
+ }
+
+ return sf->error;
+}
+
+static void btrfsic_read_from_block_data(
+ struct btrfsic_block_data_ctx *block_ctx,
+ void *dstv, u32 offset, size_t len)
+{
+ size_t cur;
+ size_t pgoff;
+ char *kaddr;
+ char *dst = (char *)dstv;
+ size_t start_offset = offset_in_page(block_ctx->start);
+ unsigned long i = (start_offset + offset) >> PAGE_SHIFT;
+
+ WARN_ON(offset + len > block_ctx->len);
+ pgoff = offset_in_page(start_offset + offset);
+
+ while (len > 0) {
+ cur = min(len, ((size_t)PAGE_SIZE - pgoff));
+ BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_SIZE));
+ kaddr = block_ctx->datav[i];
+ memcpy(dst, kaddr + pgoff, cur);
+
+ dst += cur;
+ len -= cur;
+ pgoff = 0;
+ i++;
+ }
+}
+
+static int btrfsic_create_link_to_next_block(
+ struct btrfsic_state *state,
+ struct btrfsic_block *block,
+ struct btrfsic_block_data_ctx *block_ctx,
+ u64 next_bytenr,
+ int limit_nesting,
+ struct btrfsic_block_data_ctx *next_block_ctx,
+ struct btrfsic_block **next_blockp,
+ int force_iodone_flag,
+ int *num_copiesp, int *mirror_nump,
+ struct btrfs_disk_key *disk_key,
+ u64 parent_generation)
+{
+ struct btrfs_fs_info *fs_info = state->fs_info;
+ struct btrfsic_block *next_block = NULL;
+ int ret;
+ struct btrfsic_block_link *l;
+ int did_alloc_block_link;
+ int block_was_created;
+
+ *next_blockp = NULL;
+ if (0 == *num_copiesp) {
+ *num_copiesp = btrfs_num_copies(fs_info, next_bytenr,
+ state->metablock_size);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+ pr_info("num_copies(log_bytenr=%llu) = %d\n",
+ next_bytenr, *num_copiesp);
+ *mirror_nump = 1;
+ }
+
+ if (*mirror_nump > *num_copiesp)
+ return 0;
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ pr_info("btrfsic_create_link_to_next_block(mirror_num=%d)\n",
+ *mirror_nump);
+ ret = btrfsic_map_block(state, next_bytenr,
+ state->metablock_size,
+ next_block_ctx, *mirror_nump);
+ if (ret) {
+ pr_info("btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
+ next_bytenr, *mirror_nump);
+ btrfsic_release_block_ctx(next_block_ctx);
+ *next_blockp = NULL;
+ return -1;
+ }
+
+ next_block = btrfsic_block_lookup_or_add(state,
+ next_block_ctx, "referenced ",
+ 1, force_iodone_flag,
+ !force_iodone_flag,
+ *mirror_nump,
+ &block_was_created);
+ if (NULL == next_block) {
+ btrfsic_release_block_ctx(next_block_ctx);
+ *next_blockp = NULL;
+ return -1;
+ }
+ if (block_was_created) {
+ l = NULL;
+ next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
+ } else {
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
+ if (next_block->logical_bytenr != next_bytenr &&
+ !(!next_block->is_metadata &&
+ 0 == next_block->logical_bytenr))
+ pr_info(
+"referenced block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n",
+ next_bytenr, next_block_ctx->dev->bdev,
+ next_block_ctx->dev_bytenr, *mirror_nump,
+ btrfsic_get_block_type(state,
+ next_block),
+ next_block->logical_bytenr);
+ else
+ pr_info(
+ "referenced block @%llu (%pg/%llu/%d) found in hash table, %c\n",
+ next_bytenr, next_block_ctx->dev->bdev,
+ next_block_ctx->dev_bytenr, *mirror_nump,
+ btrfsic_get_block_type(state,
+ next_block));
+ }
+ next_block->logical_bytenr = next_bytenr;
+
+ next_block->mirror_num = *mirror_nump;
+ l = btrfsic_block_link_hashtable_lookup(
+ next_block_ctx->dev->bdev,
+ next_block_ctx->dev_bytenr,
+ block_ctx->dev->bdev,
+ block_ctx->dev_bytenr,
+ &state->block_link_hashtable);
+ }
+
+ next_block->disk_key = *disk_key;
+ if (NULL == l) {
+ l = btrfsic_block_link_alloc();
+ if (NULL == l) {
+ btrfsic_release_block_ctx(next_block_ctx);
+ *next_blockp = NULL;
+ return -1;
+ }
+
+ did_alloc_block_link = 1;
+ l->block_ref_to = next_block;
+ l->block_ref_from = block;
+ l->ref_cnt = 1;
+ l->parent_generation = parent_generation;
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_add_link(state, l);
+
+ list_add(&l->node_ref_to, &block->ref_to_list);
+ list_add(&l->node_ref_from, &next_block->ref_from_list);
+
+ btrfsic_block_link_hashtable_add(l,
+ &state->block_link_hashtable);
+ } else {
+ did_alloc_block_link = 0;
+ if (0 == limit_nesting) {
+ l->ref_cnt++;
+ l->parent_generation = parent_generation;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_add_link(state, l);
+ }
+ }
+
+ if (limit_nesting > 0 && did_alloc_block_link) {
+ ret = btrfsic_read_block(state, next_block_ctx);
+ if (ret < (int)next_block_ctx->len) {
+ pr_info("btrfsic: read block @logical %llu failed!\n",
+ next_bytenr);
+ btrfsic_release_block_ctx(next_block_ctx);
+ *next_blockp = NULL;
+ return -1;
+ }
+
+ *next_blockp = next_block;
+ } else {
+ *next_blockp = NULL;
+ }
+ (*mirror_nump)++;
+
+ return 0;
+}
+
+static int btrfsic_handle_extent_data(
+ struct btrfsic_state *state,
+ struct btrfsic_block *block,
+ struct btrfsic_block_data_ctx *block_ctx,
+ u32 item_offset, int force_iodone_flag)
+{
+ struct btrfs_fs_info *fs_info = state->fs_info;
+ struct btrfs_file_extent_item file_extent_item;
+ u64 file_extent_item_offset;
+ u64 next_bytenr;
+ u64 num_bytes;
+ u64 generation;
+ struct btrfsic_block_link *l;
+ int ret;
+
+ file_extent_item_offset = offsetof(struct btrfs_leaf, items) +
+ item_offset;
+ if (file_extent_item_offset +
+ offsetof(struct btrfs_file_extent_item, disk_num_bytes) >
+ block_ctx->len) {
+ pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n",
+ block_ctx->start, block_ctx->dev->bdev);
+ return -1;
+ }
+
+ btrfsic_read_from_block_data(block_ctx, &file_extent_item,
+ file_extent_item_offset,
+ offsetof(struct btrfs_file_extent_item, disk_num_bytes));
+ if (BTRFS_FILE_EXTENT_REG != file_extent_item.type ||
+ btrfs_stack_file_extent_disk_bytenr(&file_extent_item) == 0) {
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
+ pr_info("extent_data: type %u, disk_bytenr = %llu\n",
+ file_extent_item.type,
+ btrfs_stack_file_extent_disk_bytenr(
+ &file_extent_item));
+ return 0;
+ }
+
+ if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) >
+ block_ctx->len) {
+ pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n",
+ block_ctx->start, block_ctx->dev->bdev);
+ return -1;
+ }
+ btrfsic_read_from_block_data(block_ctx, &file_extent_item,
+ file_extent_item_offset,
+ sizeof(struct btrfs_file_extent_item));
+ next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item);
+ if (btrfs_stack_file_extent_compression(&file_extent_item) ==
+ BTRFS_COMPRESS_NONE) {
+ next_bytenr += btrfs_stack_file_extent_offset(&file_extent_item);
+ num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item);
+ } else {
+ num_bytes = btrfs_stack_file_extent_disk_num_bytes(&file_extent_item);
+ }
+ generation = btrfs_stack_file_extent_generation(&file_extent_item);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
+ pr_info("extent_data: type %u, disk_bytenr = %llu, offset = %llu, num_bytes = %llu\n",
+ file_extent_item.type,
+ btrfs_stack_file_extent_disk_bytenr(&file_extent_item),
+ btrfs_stack_file_extent_offset(&file_extent_item),
+ num_bytes);
+ while (num_bytes > 0) {
+ u32 chunk_len;
+ int num_copies;
+ int mirror_num;
+
+ if (num_bytes > state->datablock_size)
+ chunk_len = state->datablock_size;
+ else
+ chunk_len = num_bytes;
+
+ num_copies = btrfs_num_copies(fs_info, next_bytenr,
+ state->datablock_size);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+ pr_info("num_copies(log_bytenr=%llu) = %d\n",
+ next_bytenr, num_copies);
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ struct btrfsic_block_data_ctx next_block_ctx;
+ struct btrfsic_block *next_block;
+ int block_was_created;
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ pr_info("btrfsic_handle_extent_data(mirror_num=%d)\n",
+ mirror_num);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
+ pr_info("\tdisk_bytenr = %llu, num_bytes %u\n",
+ next_bytenr, chunk_len);
+ ret = btrfsic_map_block(state, next_bytenr,
+ chunk_len, &next_block_ctx,
+ mirror_num);
+ if (ret) {
+ pr_info("btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
+ next_bytenr, mirror_num);
+ return -1;
+ }
+
+ next_block = btrfsic_block_lookup_or_add(
+ state,
+ &next_block_ctx,
+ "referenced ",
+ 0,
+ force_iodone_flag,
+ !force_iodone_flag,
+ mirror_num,
+ &block_was_created);
+ if (NULL == next_block) {
+ btrfsic_release_block_ctx(&next_block_ctx);
+ return -1;
+ }
+ if (!block_was_created) {
+ if ((state->print_mask &
+ BTRFSIC_PRINT_MASK_VERBOSE) &&
+ next_block->logical_bytenr != next_bytenr &&
+ !(!next_block->is_metadata &&
+ 0 == next_block->logical_bytenr)) {
+ pr_info(
+"referenced block @%llu (%pg/%llu/%d) found in hash table, D, bytenr mismatch (!= stored %llu)\n",
+ next_bytenr,
+ next_block_ctx.dev->bdev,
+ next_block_ctx.dev_bytenr,
+ mirror_num,
+ next_block->logical_bytenr);
+ }
+ next_block->logical_bytenr = next_bytenr;
+ next_block->mirror_num = mirror_num;
+ }
+
+ l = btrfsic_block_link_lookup_or_add(state,
+ &next_block_ctx,
+ next_block, block,
+ generation);
+ btrfsic_release_block_ctx(&next_block_ctx);
+ if (NULL == l)
+ return -1;
+ }
+
+ next_bytenr += chunk_len;
+ num_bytes -= chunk_len;
+ }
+
+ return 0;
+}
+
+static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
+ struct btrfsic_block_data_ctx *block_ctx_out,
+ int mirror_num)
+{
+ struct btrfs_fs_info *fs_info = state->fs_info;
+ int ret;
+ u64 length;
+ struct btrfs_io_context *multi = NULL;
+ struct btrfs_device *device;
+
+ length = len;
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
+ bytenr, &length, &multi, mirror_num);
+
+ if (ret) {
+ block_ctx_out->start = 0;
+ block_ctx_out->dev_bytenr = 0;
+ block_ctx_out->len = 0;
+ block_ctx_out->dev = NULL;
+ block_ctx_out->datav = NULL;
+ block_ctx_out->pagev = NULL;
+ block_ctx_out->mem_to_free = NULL;
+
+ return ret;
+ }
+
+ device = multi->stripes[0].dev;
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) ||
+ !device->bdev || !device->name)
+ block_ctx_out->dev = NULL;
+ else
+ block_ctx_out->dev = btrfsic_dev_state_lookup(
+ device->bdev->bd_dev);
+ block_ctx_out->dev_bytenr = multi->stripes[0].physical;
+ block_ctx_out->start = bytenr;
+ block_ctx_out->len = len;
+ block_ctx_out->datav = NULL;
+ block_ctx_out->pagev = NULL;
+ block_ctx_out->mem_to_free = NULL;
+
+ kfree(multi);
+ if (NULL == block_ctx_out->dev) {
+ ret = -ENXIO;
+ pr_info("btrfsic: error, cannot lookup dev (#1)!\n");
+ }
+
+ return ret;
+}
+
+static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
+{
+ if (block_ctx->mem_to_free) {
+ unsigned int num_pages;
+
+ BUG_ON(!block_ctx->datav);
+ BUG_ON(!block_ctx->pagev);
+ num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >>
+ PAGE_SHIFT;
+ /* Pages must be unmapped in reverse order */
+ while (num_pages > 0) {
+ num_pages--;
+ if (block_ctx->datav[num_pages])
+ block_ctx->datav[num_pages] = NULL;
+ if (block_ctx->pagev[num_pages]) {
+ __free_page(block_ctx->pagev[num_pages]);
+ block_ctx->pagev[num_pages] = NULL;
+ }
+ }
+
+ kfree(block_ctx->mem_to_free);
+ block_ctx->mem_to_free = NULL;
+ block_ctx->pagev = NULL;
+ block_ctx->datav = NULL;
+ }
+}
+
+static int btrfsic_read_block(struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *block_ctx)
+{
+ unsigned int num_pages;
+ unsigned int i;
+ size_t size;
+ u64 dev_bytenr;
+ int ret;
+
+ BUG_ON(block_ctx->datav);
+ BUG_ON(block_ctx->pagev);
+ BUG_ON(block_ctx->mem_to_free);
+ if (!PAGE_ALIGNED(block_ctx->dev_bytenr)) {
+ pr_info("btrfsic: read_block() with unaligned bytenr %llu\n",
+ block_ctx->dev_bytenr);
+ return -1;
+ }
+
+ num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >>
+ PAGE_SHIFT;
+ size = sizeof(*block_ctx->datav) + sizeof(*block_ctx->pagev);
+ block_ctx->mem_to_free = kcalloc(num_pages, size, GFP_NOFS);
+ if (!block_ctx->mem_to_free)
+ return -ENOMEM;
+ block_ctx->datav = block_ctx->mem_to_free;
+ block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages);
+ ret = btrfs_alloc_page_array(num_pages, block_ctx->pagev);
+ if (ret)
+ return ret;
+
+ dev_bytenr = block_ctx->dev_bytenr;
+ for (i = 0; i < num_pages;) {
+ struct bio *bio;
+ unsigned int j;
+
+ bio = bio_alloc(block_ctx->dev->bdev, num_pages - i,
+ REQ_OP_READ, GFP_NOFS);
+ bio->bi_iter.bi_sector = dev_bytenr >> 9;
+
+ for (j = i; j < num_pages; j++) {
+ ret = bio_add_page(bio, block_ctx->pagev[j],
+ PAGE_SIZE, 0);
+ if (PAGE_SIZE != ret)
+ break;
+ }
+ if (j == i) {
+ pr_info("btrfsic: error, failed to add a single page!\n");
+ return -1;
+ }
+ if (submit_bio_wait(bio)) {
+ pr_info("btrfsic: read error at logical %llu dev %pg!\n",
+ block_ctx->start, block_ctx->dev->bdev);
+ bio_put(bio);
+ return -1;
+ }
+ bio_put(bio);
+ dev_bytenr += (j - i) * PAGE_SIZE;
+ i = j;
+ }
+ for (i = 0; i < num_pages; i++)
+ block_ctx->datav[i] = page_address(block_ctx->pagev[i]);
+
+ return block_ctx->len;
+}
+
+static void btrfsic_dump_database(struct btrfsic_state *state)
+{
+ const struct btrfsic_block *b_all;
+
+ BUG_ON(NULL == state);
+
+ pr_info("all_blocks_list:\n");
+ list_for_each_entry(b_all, &state->all_blocks_list, all_blocks_node) {
+ const struct btrfsic_block_link *l;
+
+ pr_info("%c-block @%llu (%pg/%llu/%d)\n",
+ btrfsic_get_block_type(state, b_all),
+ b_all->logical_bytenr, b_all->dev_state->bdev,
+ b_all->dev_bytenr, b_all->mirror_num);
+
+ list_for_each_entry(l, &b_all->ref_to_list, node_ref_to) {
+ pr_info(
+ " %c @%llu (%pg/%llu/%d) refers %u* to %c @%llu (%pg/%llu/%d)\n",
+ btrfsic_get_block_type(state, b_all),
+ b_all->logical_bytenr, b_all->dev_state->bdev,
+ b_all->dev_bytenr, b_all->mirror_num,
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_to),
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->bdev,
+ l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+ }
+
+ list_for_each_entry(l, &b_all->ref_from_list, node_ref_from) {
+ pr_info(
+ " %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n",
+ btrfsic_get_block_type(state, b_all),
+ b_all->logical_bytenr, b_all->dev_state->bdev,
+ b_all->dev_bytenr, b_all->mirror_num,
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_from),
+ l->block_ref_from->logical_bytenr,
+ l->block_ref_from->dev_state->bdev,
+ l->block_ref_from->dev_bytenr,
+ l->block_ref_from->mirror_num);
+ }
+
+ pr_info("\n");
+ }
+}
+
+/*
+ * Test whether the disk block contains a tree block (leaf or node)
+ * (note that this test fails for the super block)
+ */
+static noinline_for_stack int btrfsic_test_for_metadata(
+ struct btrfsic_state *state,
+ char **datav, unsigned int num_pages)
+{
+ struct btrfs_fs_info *fs_info = state->fs_info;
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
+ struct btrfs_header *h;
+ u8 csum[BTRFS_CSUM_SIZE];
+ unsigned int i;
+
+ if (num_pages * PAGE_SIZE < state->metablock_size)
+ return 1; /* not metadata */
+ num_pages = state->metablock_size >> PAGE_SHIFT;
+ h = (struct btrfs_header *)datav[0];
+
+ if (memcmp(h->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE))
+ return 1;
+
+ shash->tfm = fs_info->csum_shash;
+ crypto_shash_init(shash);
+
+ for (i = 0; i < num_pages; i++) {
+ u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE);
+ size_t sublen = i ? PAGE_SIZE :
+ (PAGE_SIZE - BTRFS_CSUM_SIZE);
+
+ crypto_shash_update(shash, data, sublen);
+ }
+ crypto_shash_final(shash, csum);
+ if (memcmp(csum, h->csum, fs_info->csum_size))
+ return 1;
+
+ return 0; /* is metadata */
+}
+
+static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
+ u64 dev_bytenr, char **mapped_datav,
+ unsigned int num_pages,
+ struct bio *bio, int *bio_is_patched,
+ blk_opf_t submit_bio_bh_rw)
+{
+ int is_metadata;
+ struct btrfsic_block *block;
+ struct btrfsic_block_data_ctx block_ctx;
+ int ret;
+ struct btrfsic_state *state = dev_state->state;
+ struct block_device *bdev = dev_state->bdev;
+ unsigned int processed_len;
+
+ if (NULL != bio_is_patched)
+ *bio_is_patched = 0;
+
+again:
+ if (num_pages == 0)
+ return;
+
+ processed_len = 0;
+ is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_datav,
+ num_pages));
+
+ block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr,
+ &state->block_hashtable);
+ if (NULL != block) {
+ u64 bytenr = 0;
+ struct btrfsic_block_link *l, *tmp;
+
+ if (block->is_superblock) {
+ bytenr = btrfs_super_bytenr((struct btrfs_super_block *)
+ mapped_datav[0]);
+ if (num_pages * PAGE_SIZE <
+ BTRFS_SUPER_INFO_SIZE) {
+ pr_info("btrfsic: cannot work with too short bios!\n");
+ return;
+ }
+ is_metadata = 1;
+ BUG_ON(!PAGE_ALIGNED(BTRFS_SUPER_INFO_SIZE));
+ processed_len = BTRFS_SUPER_INFO_SIZE;
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
+ pr_info("[before new superblock is written]:\n");
+ btrfsic_dump_tree_sub(state, block, 0);
+ }
+ }
+ if (is_metadata) {
+ if (!block->is_superblock) {
+ if (num_pages * PAGE_SIZE <
+ state->metablock_size) {
+ pr_info("btrfsic: cannot work with too short bios!\n");
+ return;
+ }
+ processed_len = state->metablock_size;
+ bytenr = btrfs_stack_header_bytenr(
+ (struct btrfs_header *)
+ mapped_datav[0]);
+ btrfsic_cmp_log_and_dev_bytenr(state, bytenr,
+ dev_state,
+ dev_bytenr);
+ }
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
+ if (block->logical_bytenr != bytenr &&
+ !(!block->is_metadata &&
+ block->logical_bytenr == 0))
+ pr_info(
+"written block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n",
+ bytenr, dev_state->bdev,
+ dev_bytenr,
+ block->mirror_num,
+ btrfsic_get_block_type(state,
+ block),
+ block->logical_bytenr);
+ else
+ pr_info(
+ "written block @%llu (%pg/%llu/%d) found in hash table, %c\n",
+ bytenr, dev_state->bdev,
+ dev_bytenr, block->mirror_num,
+ btrfsic_get_block_type(state,
+ block));
+ }
+ block->logical_bytenr = bytenr;
+ } else {
+ if (num_pages * PAGE_SIZE <
+ state->datablock_size) {
+ pr_info("btrfsic: cannot work with too short bios!\n");
+ return;
+ }
+ processed_len = state->datablock_size;
+ bytenr = block->logical_bytenr;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ pr_info(
+ "written block @%llu (%pg/%llu/%d) found in hash table, %c\n",
+ bytenr, dev_state->bdev, dev_bytenr,
+ block->mirror_num,
+ btrfsic_get_block_type(state, block));
+ }
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ pr_info("ref_to_list: %cE, ref_from_list: %cE\n",
+ list_empty(&block->ref_to_list) ? ' ' : '!',
+ list_empty(&block->ref_from_list) ? ' ' : '!');
+ if (btrfsic_is_block_ref_by_superblock(state, block, 0)) {
+ pr_info(
+"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), old(gen=%llu, objectid=%llu, type=%d, offset=%llu), new(gen=%llu), which is referenced by most recent superblock (superblockgen=%llu)!\n",
+ btrfsic_get_block_type(state, block), bytenr,
+ dev_state->bdev, dev_bytenr, block->mirror_num,
+ block->generation,
+ btrfs_disk_key_objectid(&block->disk_key),
+ block->disk_key.type,
+ btrfs_disk_key_offset(&block->disk_key),
+ btrfs_stack_header_generation(
+ (struct btrfs_header *) mapped_datav[0]),
+ state->max_superblock_generation);
+ btrfsic_dump_tree(state);
+ }
+
+ if (!block->is_iodone && !block->never_written) {
+ pr_info(
+"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), oldgen=%llu, newgen=%llu, which is not yet iodone!\n",
+ btrfsic_get_block_type(state, block), bytenr,
+ dev_state->bdev, dev_bytenr, block->mirror_num,
+ block->generation,
+ btrfs_stack_header_generation(
+ (struct btrfs_header *)
+ mapped_datav[0]));
+ /* it would not be safe to go on */
+ btrfsic_dump_tree(state);
+ goto continue_loop;
+ }
+
+ /*
+ * Clear all references of this block. Do not free
+ * the block itself even if is not referenced anymore
+ * because it still carries valuable information
+ * like whether it was ever written and IO completed.
+ */
+ list_for_each_entry_safe(l, tmp, &block->ref_to_list,
+ node_ref_to) {
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_rem_link(state, l);
+ l->ref_cnt--;
+ if (0 == l->ref_cnt) {
+ list_del(&l->node_ref_to);
+ list_del(&l->node_ref_from);
+ btrfsic_block_link_hashtable_remove(l);
+ btrfsic_block_link_free(l);
+ }
+ }
+
+ block_ctx.dev = dev_state;
+ block_ctx.dev_bytenr = dev_bytenr;
+ block_ctx.start = bytenr;
+ block_ctx.len = processed_len;
+ block_ctx.pagev = NULL;
+ block_ctx.mem_to_free = NULL;
+ block_ctx.datav = mapped_datav;
+
+ if (is_metadata || state->include_extent_data) {
+ block->never_written = 0;
+ block->iodone_w_error = 0;
+ if (NULL != bio) {
+ block->is_iodone = 0;
+ BUG_ON(NULL == bio_is_patched);
+ if (!*bio_is_patched) {
+ block->orig_bio_private =
+ bio->bi_private;
+ block->orig_bio_end_io =
+ bio->bi_end_io;
+ block->next_in_same_bio = NULL;
+ bio->bi_private = block;
+ bio->bi_end_io = btrfsic_bio_end_io;
+ *bio_is_patched = 1;
+ } else {
+ struct btrfsic_block *chained_block =
+ (struct btrfsic_block *)
+ bio->bi_private;
+
+ BUG_ON(NULL == chained_block);
+ block->orig_bio_private =
+ chained_block->orig_bio_private;
+ block->orig_bio_end_io =
+ chained_block->orig_bio_end_io;
+ block->next_in_same_bio = chained_block;
+ bio->bi_private = block;
+ }
+ } else {
+ block->is_iodone = 1;
+ block->orig_bio_private = NULL;
+ block->orig_bio_end_io = NULL;
+ block->next_in_same_bio = NULL;
+ }
+ }
+
+ block->flush_gen = dev_state->last_flush_gen + 1;
+ block->submit_bio_bh_rw = submit_bio_bh_rw;
+ if (is_metadata) {
+ block->logical_bytenr = bytenr;
+ block->is_metadata = 1;
+ if (block->is_superblock) {
+ BUG_ON(PAGE_SIZE !=
+ BTRFS_SUPER_INFO_SIZE);
+ ret = btrfsic_process_written_superblock(
+ state,
+ block,
+ (struct btrfs_super_block *)
+ mapped_datav[0]);
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) {
+ pr_info("[after new superblock is written]:\n");
+ btrfsic_dump_tree_sub(state, block, 0);
+ }
+ } else {
+ block->mirror_num = 0; /* unknown */
+ ret = btrfsic_process_metablock(
+ state,
+ block,
+ &block_ctx,
+ 0, 0);
+ }
+ if (ret)
+ pr_info("btrfsic: btrfsic_process_metablock(root @%llu) failed!\n",
+ dev_bytenr);
+ } else {
+ block->is_metadata = 0;
+ block->mirror_num = 0; /* unknown */
+ block->generation = BTRFSIC_GENERATION_UNKNOWN;
+ if (!state->include_extent_data
+ && list_empty(&block->ref_from_list)) {
+ /*
+ * disk block is overwritten with extent
+ * data (not meta data) and we are configured
+ * to not include extent data: take the
+ * chance and free the block's memory
+ */
+ btrfsic_block_hashtable_remove(block);
+ list_del(&block->all_blocks_node);
+ btrfsic_block_free(block);
+ }
+ }
+ btrfsic_release_block_ctx(&block_ctx);
+ } else {
+ /* block has not been found in hash table */
+ u64 bytenr;
+
+ if (!is_metadata) {
+ processed_len = state->datablock_size;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ pr_info(
+ "written block (%pg/%llu/?) !found in hash table, D\n",
+ dev_state->bdev, dev_bytenr);
+ if (!state->include_extent_data) {
+ /* ignore that written D block */
+ goto continue_loop;
+ }
+
+ /* this is getting ugly for the
+ * include_extent_data case... */
+ bytenr = 0; /* unknown */
+ } else {
+ processed_len = state->metablock_size;
+ bytenr = btrfs_stack_header_bytenr(
+ (struct btrfs_header *)
+ mapped_datav[0]);
+ btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
+ dev_bytenr);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ pr_info(
+ "written block @%llu (%pg/%llu/?) !found in hash table, M\n",
+ bytenr, dev_state->bdev, dev_bytenr);
+ }
+
+ block_ctx.dev = dev_state;
+ block_ctx.dev_bytenr = dev_bytenr;
+ block_ctx.start = bytenr;
+ block_ctx.len = processed_len;
+ block_ctx.pagev = NULL;
+ block_ctx.mem_to_free = NULL;
+ block_ctx.datav = mapped_datav;
+
+ block = btrfsic_block_alloc();
+ if (NULL == block) {
+ btrfsic_release_block_ctx(&block_ctx);
+ goto continue_loop;
+ }
+ block->dev_state = dev_state;
+ block->dev_bytenr = dev_bytenr;
+ block->logical_bytenr = bytenr;
+ block->is_metadata = is_metadata;
+ block->never_written = 0;
+ block->iodone_w_error = 0;
+ block->mirror_num = 0; /* unknown */
+ block->flush_gen = dev_state->last_flush_gen + 1;
+ block->submit_bio_bh_rw = submit_bio_bh_rw;
+ if (NULL != bio) {
+ block->is_iodone = 0;
+ BUG_ON(NULL == bio_is_patched);
+ if (!*bio_is_patched) {
+ block->orig_bio_private = bio->bi_private;
+ block->orig_bio_end_io = bio->bi_end_io;
+ block->next_in_same_bio = NULL;
+ bio->bi_private = block;
+ bio->bi_end_io = btrfsic_bio_end_io;
+ *bio_is_patched = 1;
+ } else {
+ struct btrfsic_block *chained_block =
+ (struct btrfsic_block *)
+ bio->bi_private;
+
+ BUG_ON(NULL == chained_block);
+ block->orig_bio_private =
+ chained_block->orig_bio_private;
+ block->orig_bio_end_io =
+ chained_block->orig_bio_end_io;
+ block->next_in_same_bio = chained_block;
+ bio->bi_private = block;
+ }
+ } else {
+ block->is_iodone = 1;
+ block->orig_bio_private = NULL;
+ block->orig_bio_end_io = NULL;
+ block->next_in_same_bio = NULL;
+ }
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ pr_info("new written %c-block @%llu (%pg/%llu/%d)\n",
+ is_metadata ? 'M' : 'D',
+ block->logical_bytenr, block->dev_state->bdev,
+ block->dev_bytenr, block->mirror_num);
+ list_add(&block->all_blocks_node, &state->all_blocks_list);
+ btrfsic_block_hashtable_add(block, &state->block_hashtable);
+
+ if (is_metadata) {
+ ret = btrfsic_process_metablock(state, block,
+ &block_ctx, 0, 0);
+ if (ret)
+ pr_info("btrfsic: process_metablock(root @%llu) failed!\n",
+ dev_bytenr);
+ }
+ btrfsic_release_block_ctx(&block_ctx);
+ }
+
+continue_loop:
+ BUG_ON(!processed_len);
+ dev_bytenr += processed_len;
+ mapped_datav += processed_len >> PAGE_SHIFT;
+ num_pages -= processed_len >> PAGE_SHIFT;
+ goto again;
+}
+
+static void btrfsic_bio_end_io(struct bio *bp)
+{
+ struct btrfsic_block *block = bp->bi_private;
+ int iodone_w_error;
+
+ /* mutex is not held! This is not save if IO is not yet completed
+ * on umount */
+ iodone_w_error = 0;
+ if (bp->bi_status)
+ iodone_w_error = 1;
+
+ BUG_ON(NULL == block);
+ bp->bi_private = block->orig_bio_private;
+ bp->bi_end_io = block->orig_bio_end_io;
+
+ do {
+ struct btrfsic_block *next_block;
+ struct btrfsic_dev_state *const dev_state = block->dev_state;
+
+ if ((dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+ pr_info("bio_end_io(err=%d) for %c @%llu (%pg/%llu/%d)\n",
+ bp->bi_status,
+ btrfsic_get_block_type(dev_state->state, block),
+ block->logical_bytenr, dev_state->bdev,
+ block->dev_bytenr, block->mirror_num);
+ next_block = block->next_in_same_bio;
+ block->iodone_w_error = iodone_w_error;
+ if (block->submit_bio_bh_rw & REQ_PREFLUSH) {
+ dev_state->last_flush_gen++;
+ if ((dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+ pr_info("bio_end_io() new %pg flush_gen=%llu\n",
+ dev_state->bdev,
+ dev_state->last_flush_gen);
+ }
+ if (block->submit_bio_bh_rw & REQ_FUA)
+ block->flush_gen = 0; /* FUA completed means block is
+ * on disk */
+ block->is_iodone = 1; /* for FLUSH, this releases the block */
+ block = next_block;
+ } while (NULL != block);
+
+ bp->bi_end_io(bp);
+}
+
+static int btrfsic_process_written_superblock(
+ struct btrfsic_state *state,
+ struct btrfsic_block *const superblock,
+ struct btrfs_super_block *const super_hdr)
+{
+ struct btrfs_fs_info *fs_info = state->fs_info;
+ int pass;
+
+ superblock->generation = btrfs_super_generation(super_hdr);
+ if (!(superblock->generation > state->max_superblock_generation ||
+ 0 == state->max_superblock_generation)) {
+ if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
+ pr_info(
+ "btrfsic: superblock @%llu (%pg/%llu/%d) with old gen %llu <= %llu\n",
+ superblock->logical_bytenr,
+ superblock->dev_state->bdev,
+ superblock->dev_bytenr, superblock->mirror_num,
+ btrfs_super_generation(super_hdr),
+ state->max_superblock_generation);
+ } else {
+ if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
+ pr_info(
+ "btrfsic: got new superblock @%llu (%pg/%llu/%d) with new gen %llu > %llu\n",
+ superblock->logical_bytenr,
+ superblock->dev_state->bdev,
+ superblock->dev_bytenr, superblock->mirror_num,
+ btrfs_super_generation(super_hdr),
+ state->max_superblock_generation);
+
+ state->max_superblock_generation =
+ btrfs_super_generation(super_hdr);
+ state->latest_superblock = superblock;
+ }
+
+ for (pass = 0; pass < 3; pass++) {
+ int ret;
+ u64 next_bytenr;
+ struct btrfsic_block *next_block;
+ struct btrfsic_block_data_ctx tmp_next_block_ctx;
+ struct btrfsic_block_link *l;
+ int num_copies;
+ int mirror_num;
+ const char *additional_string = NULL;
+ struct btrfs_disk_key tmp_disk_key = {0};
+
+ btrfs_set_disk_key_objectid(&tmp_disk_key,
+ BTRFS_ROOT_ITEM_KEY);
+ btrfs_set_disk_key_objectid(&tmp_disk_key, 0);
+
+ switch (pass) {
+ case 0:
+ btrfs_set_disk_key_objectid(&tmp_disk_key,
+ BTRFS_ROOT_TREE_OBJECTID);
+ additional_string = "root ";
+ next_bytenr = btrfs_super_root(super_hdr);
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ pr_info("root@%llu\n", next_bytenr);
+ break;
+ case 1:
+ btrfs_set_disk_key_objectid(&tmp_disk_key,
+ BTRFS_CHUNK_TREE_OBJECTID);
+ additional_string = "chunk ";
+ next_bytenr = btrfs_super_chunk_root(super_hdr);
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ pr_info("chunk@%llu\n", next_bytenr);
+ break;
+ case 2:
+ btrfs_set_disk_key_objectid(&tmp_disk_key,
+ BTRFS_TREE_LOG_OBJECTID);
+ additional_string = "log ";
+ next_bytenr = btrfs_super_log_root(super_hdr);
+ if (0 == next_bytenr)
+ continue;
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ pr_info("log@%llu\n", next_bytenr);
+ break;
+ }
+
+ num_copies = btrfs_num_copies(fs_info, next_bytenr,
+ BTRFS_SUPER_INFO_SIZE);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+ pr_info("num_copies(log_bytenr=%llu) = %d\n",
+ next_bytenr, num_copies);
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ int was_created;
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ pr_info("btrfsic_process_written_superblock(mirror_num=%d)\n", mirror_num);
+ ret = btrfsic_map_block(state, next_bytenr,
+ BTRFS_SUPER_INFO_SIZE,
+ &tmp_next_block_ctx,
+ mirror_num);
+ if (ret) {
+ pr_info("btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
+ next_bytenr, mirror_num);
+ return -1;
+ }
+
+ next_block = btrfsic_block_lookup_or_add(
+ state,
+ &tmp_next_block_ctx,
+ additional_string,
+ 1, 0, 1,
+ mirror_num,
+ &was_created);
+ if (NULL == next_block) {
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ return -1;
+ }
+
+ next_block->disk_key = tmp_disk_key;
+ if (was_created)
+ next_block->generation =
+ BTRFSIC_GENERATION_UNKNOWN;
+ l = btrfsic_block_link_lookup_or_add(
+ state,
+ &tmp_next_block_ctx,
+ next_block,
+ superblock,
+ BTRFSIC_GENERATION_UNKNOWN);
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ if (NULL == l)
+ return -1;
+ }
+ }
+
+ if (WARN_ON(-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)))
+ btrfsic_dump_tree(state);
+
+ return 0;
+}
+
+static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
+ struct btrfsic_block *const block,
+ int recursion_level)
+{
+ const struct btrfsic_block_link *l;
+ int ret = 0;
+
+ if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
+ /*
+ * Note that this situation can happen and does not
+ * indicate an error in regular cases. It happens
+ * when disk blocks are freed and later reused.
+ * The check-integrity module is not aware of any
+ * block free operations, it just recognizes block
+ * write operations. Therefore it keeps the linkage
+ * information for a block until a block is
+ * rewritten. This can temporarily cause incorrect
+ * and even circular linkage information. This
+ * causes no harm unless such blocks are referenced
+ * by the most recent super block.
+ */
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ pr_info("btrfsic: abort cyclic linkage (case 1).\n");
+
+ return ret;
+ }
+
+ /*
+ * This algorithm is recursive because the amount of used stack
+ * space is very small and the max recursion depth is limited.
+ */
+ list_for_each_entry(l, &block->ref_to_list, node_ref_to) {
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ pr_info(
+ "rl=%d, %c @%llu (%pg/%llu/%d) %u* refers to %c @%llu (%pg/%llu/%d)\n",
+ recursion_level,
+ btrfsic_get_block_type(state, block),
+ block->logical_bytenr, block->dev_state->bdev,
+ block->dev_bytenr, block->mirror_num,
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_to),
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->bdev,
+ l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+ if (l->block_ref_to->never_written) {
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is never written!\n",
+ btrfsic_get_block_type(state, l->block_ref_to),
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->bdev,
+ l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+ ret = -1;
+ } else if (!l->block_ref_to->is_iodone) {
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not yet iodone!\n",
+ btrfsic_get_block_type(state, l->block_ref_to),
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->bdev,
+ l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+ ret = -1;
+ } else if (l->block_ref_to->iodone_w_error) {
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which has write error!\n",
+ btrfsic_get_block_type(state, l->block_ref_to),
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->bdev,
+ l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+ ret = -1;
+ } else if (l->parent_generation !=
+ l->block_ref_to->generation &&
+ BTRFSIC_GENERATION_UNKNOWN !=
+ l->parent_generation &&
+ BTRFSIC_GENERATION_UNKNOWN !=
+ l->block_ref_to->generation) {
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) with generation %llu != parent generation %llu!\n",
+ btrfsic_get_block_type(state, l->block_ref_to),
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->bdev,
+ l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num,
+ l->block_ref_to->generation,
+ l->parent_generation);
+ ret = -1;
+ } else if (l->block_ref_to->flush_gen >
+ l->block_ref_to->dev_state->last_flush_gen) {
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not flushed out of disk's write cache (block flush_gen=%llu, dev->flush_gen=%llu)!\n",
+ btrfsic_get_block_type(state, l->block_ref_to),
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->bdev,
+ l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num, block->flush_gen,
+ l->block_ref_to->dev_state->last_flush_gen);
+ ret = -1;
+ } else if (-1 == btrfsic_check_all_ref_blocks(state,
+ l->block_ref_to,
+ recursion_level +
+ 1)) {
+ ret = -1;
+ }
+ }
+
+ return ret;
+}
+
+static int btrfsic_is_block_ref_by_superblock(
+ const struct btrfsic_state *state,
+ const struct btrfsic_block *block,
+ int recursion_level)
+{
+ const struct btrfsic_block_link *l;
+
+ if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
+ /* refer to comment at "abort cyclic linkage (case 1)" */
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ pr_info("btrfsic: abort cyclic linkage (case 2).\n");
+
+ return 0;
+ }
+
+ /*
+ * This algorithm is recursive because the amount of used stack space
+ * is very small and the max recursion depth is limited.
+ */
+ list_for_each_entry(l, &block->ref_from_list, node_ref_from) {
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ pr_info(
+ "rl=%d, %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n",
+ recursion_level,
+ btrfsic_get_block_type(state, block),
+ block->logical_bytenr, block->dev_state->bdev,
+ block->dev_bytenr, block->mirror_num,
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_from),
+ l->block_ref_from->logical_bytenr,
+ l->block_ref_from->dev_state->bdev,
+ l->block_ref_from->dev_bytenr,
+ l->block_ref_from->mirror_num);
+ if (l->block_ref_from->is_superblock &&
+ state->latest_superblock->dev_bytenr ==
+ l->block_ref_from->dev_bytenr &&
+ state->latest_superblock->dev_state->bdev ==
+ l->block_ref_from->dev_state->bdev)
+ return 1;
+ else if (btrfsic_is_block_ref_by_superblock(state,
+ l->block_ref_from,
+ recursion_level +
+ 1))
+ return 1;
+ }
+
+ return 0;
+}
+
+static void btrfsic_print_add_link(const struct btrfsic_state *state,
+ const struct btrfsic_block_link *l)
+{
+ pr_info("add %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n",
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_from),
+ l->block_ref_from->logical_bytenr,
+ l->block_ref_from->dev_state->bdev,
+ l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
+ btrfsic_get_block_type(state, l->block_ref_to),
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+}
+
+static void btrfsic_print_rem_link(const struct btrfsic_state *state,
+ const struct btrfsic_block_link *l)
+{
+ pr_info("rem %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n",
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_from),
+ l->block_ref_from->logical_bytenr,
+ l->block_ref_from->dev_state->bdev,
+ l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
+ btrfsic_get_block_type(state, l->block_ref_to),
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+}
+
+static char btrfsic_get_block_type(const struct btrfsic_state *state,
+ const struct btrfsic_block *block)
+{
+ if (block->is_superblock &&
+ state->latest_superblock->dev_bytenr == block->dev_bytenr &&
+ state->latest_superblock->dev_state->bdev == block->dev_state->bdev)
+ return 'S';
+ else if (block->is_superblock)
+ return 's';
+ else if (block->is_metadata)
+ return 'M';
+ else
+ return 'D';
+}
+
+static void btrfsic_dump_tree(const struct btrfsic_state *state)
+{
+ btrfsic_dump_tree_sub(state, state->latest_superblock, 0);
+}
+
+static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
+ const struct btrfsic_block *block,
+ int indent_level)
+{
+ const struct btrfsic_block_link *l;
+ int indent_add;
+ static char buf[80];
+ int cursor_position;
+
+ /*
+ * Should better fill an on-stack buffer with a complete line and
+ * dump it at once when it is time to print a newline character.
+ */
+
+ /*
+ * This algorithm is recursive because the amount of used stack space
+ * is very small and the max recursion depth is limited.
+ */
+ indent_add = sprintf(buf, "%c-%llu(%pg/%llu/%u)",
+ btrfsic_get_block_type(state, block),
+ block->logical_bytenr, block->dev_state->bdev,
+ block->dev_bytenr, block->mirror_num);
+ if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
+ printk("[...]\n");
+ return;
+ }
+ printk(buf);
+ indent_level += indent_add;
+ if (list_empty(&block->ref_to_list)) {
+ printk("\n");
+ return;
+ }
+ if (block->mirror_num > 1 &&
+ !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) {
+ printk(" [...]\n");
+ return;
+ }
+
+ cursor_position = indent_level;
+ list_for_each_entry(l, &block->ref_to_list, node_ref_to) {
+ while (cursor_position < indent_level) {
+ printk(" ");
+ cursor_position++;
+ }
+ if (l->ref_cnt > 1)
+ indent_add = sprintf(buf, " %d*--> ", l->ref_cnt);
+ else
+ indent_add = sprintf(buf, " --> ");
+ if (indent_level + indent_add >
+ BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
+ printk("[...]\n");
+ cursor_position = 0;
+ continue;
+ }
+
+ printk(buf);
+
+ btrfsic_dump_tree_sub(state, l->block_ref_to,
+ indent_level + indent_add);
+ cursor_position = 0;
+ }
+}
+
+static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
+ struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *next_block_ctx,
+ struct btrfsic_block *next_block,
+ struct btrfsic_block *from_block,
+ u64 parent_generation)
+{
+ struct btrfsic_block_link *l;
+
+ l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev,
+ next_block_ctx->dev_bytenr,
+ from_block->dev_state->bdev,
+ from_block->dev_bytenr,
+ &state->block_link_hashtable);
+ if (NULL == l) {
+ l = btrfsic_block_link_alloc();
+ if (!l)
+ return NULL;
+
+ l->block_ref_to = next_block;
+ l->block_ref_from = from_block;
+ l->ref_cnt = 1;
+ l->parent_generation = parent_generation;
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_add_link(state, l);
+
+ list_add(&l->node_ref_to, &from_block->ref_to_list);
+ list_add(&l->node_ref_from, &next_block->ref_from_list);
+
+ btrfsic_block_link_hashtable_add(l,
+ &state->block_link_hashtable);
+ } else {
+ l->ref_cnt++;
+ l->parent_generation = parent_generation;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_add_link(state, l);
+ }
+
+ return l;
+}
+
+static struct btrfsic_block *btrfsic_block_lookup_or_add(
+ struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *block_ctx,
+ const char *additional_string,
+ int is_metadata,
+ int is_iodone,
+ int never_written,
+ int mirror_num,
+ int *was_created)
+{
+ struct btrfsic_block *block;
+
+ block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev,
+ block_ctx->dev_bytenr,
+ &state->block_hashtable);
+ if (NULL == block) {
+ struct btrfsic_dev_state *dev_state;
+
+ block = btrfsic_block_alloc();
+ if (!block)
+ return NULL;
+
+ dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev->bd_dev);
+ if (NULL == dev_state) {
+ pr_info("btrfsic: error, lookup dev_state failed!\n");
+ btrfsic_block_free(block);
+ return NULL;
+ }
+ block->dev_state = dev_state;
+ block->dev_bytenr = block_ctx->dev_bytenr;
+ block->logical_bytenr = block_ctx->start;
+ block->is_metadata = is_metadata;
+ block->is_iodone = is_iodone;
+ block->never_written = never_written;
+ block->mirror_num = mirror_num;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ pr_info("New %s%c-block @%llu (%pg/%llu/%d)\n",
+ additional_string,
+ btrfsic_get_block_type(state, block),
+ block->logical_bytenr, dev_state->bdev,
+ block->dev_bytenr, mirror_num);
+ list_add(&block->all_blocks_node, &state->all_blocks_list);
+ btrfsic_block_hashtable_add(block, &state->block_hashtable);
+ if (NULL != was_created)
+ *was_created = 1;
+ } else {
+ if (NULL != was_created)
+ *was_created = 0;
+ }
+
+ return block;
+}
+
+static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
+ u64 bytenr,
+ struct btrfsic_dev_state *dev_state,
+ u64 dev_bytenr)
+{
+ struct btrfs_fs_info *fs_info = state->fs_info;
+ struct btrfsic_block_data_ctx block_ctx;
+ int num_copies;
+ int mirror_num;
+ int match = 0;
+ int ret;
+
+ num_copies = btrfs_num_copies(fs_info, bytenr, state->metablock_size);
+
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ ret = btrfsic_map_block(state, bytenr, state->metablock_size,
+ &block_ctx, mirror_num);
+ if (ret) {
+ pr_info("btrfsic: btrfsic_map_block(logical @%llu, mirror %d) failed!\n",
+ bytenr, mirror_num);
+ continue;
+ }
+
+ if (dev_state->bdev == block_ctx.dev->bdev &&
+ dev_bytenr == block_ctx.dev_bytenr) {
+ match++;
+ btrfsic_release_block_ctx(&block_ctx);
+ break;
+ }
+ btrfsic_release_block_ctx(&block_ctx);
+ }
+
+ if (WARN_ON(!match)) {
+ pr_info(
+"btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio, buffer->log_bytenr=%llu, submit_bio(bdev=%pg, phys_bytenr=%llu)!\n",
+ bytenr, dev_state->bdev, dev_bytenr);
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ ret = btrfsic_map_block(state, bytenr,
+ state->metablock_size,
+ &block_ctx, mirror_num);
+ if (ret)
+ continue;
+
+ pr_info("read logical bytenr @%llu maps to (%pg/%llu/%d)\n",
+ bytenr, block_ctx.dev->bdev,
+ block_ctx.dev_bytenr, mirror_num);
+ }
+ }
+}
+
+static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev)
+{
+ return btrfsic_dev_state_hashtable_lookup(dev,
+ &btrfsic_dev_state_hashtable);
+}
+
+static void btrfsic_check_write_bio(struct bio *bio, struct btrfsic_dev_state *dev_state)
+{
+ unsigned int segs = bio_segments(bio);
+ u64 dev_bytenr = 512 * bio->bi_iter.bi_sector;
+ u64 cur_bytenr = dev_bytenr;
+ struct bvec_iter iter;
+ struct bio_vec bvec;
+ char **mapped_datav;
+ int bio_is_patched = 0;
+ int i = 0;
+
+ if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ pr_info(
+"submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
+ bio_op(bio), bio->bi_opf, segs,
+ bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev);
+
+ mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS);
+ if (!mapped_datav)
+ return;
+
+ bio_for_each_segment(bvec, bio, iter) {
+ BUG_ON(bvec.bv_len != PAGE_SIZE);
+ mapped_datav[i] = page_address(bvec.bv_page);
+ i++;
+
+ if (dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE)
+ pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n",
+ i, cur_bytenr, bvec.bv_len, bvec.bv_offset);
+ cur_bytenr += bvec.bv_len;
+ }
+
+ btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, segs,
+ bio, &bio_is_patched, bio->bi_opf);
+ kfree(mapped_datav);
+}
+
+static void btrfsic_check_flush_bio(struct bio *bio, struct btrfsic_dev_state *dev_state)
+{
+ if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n",
+ bio_op(bio), bio->bi_opf, bio->bi_bdev);
+
+ if (dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
+ struct btrfsic_block *const block =
+ &dev_state->dummy_block_for_bio_bh_flush;
+
+ block->is_iodone = 0;
+ block->never_written = 0;
+ block->iodone_w_error = 0;
+ block->flush_gen = dev_state->last_flush_gen + 1;
+ block->submit_bio_bh_rw = bio->bi_opf;
+ block->orig_bio_private = bio->bi_private;
+ block->orig_bio_end_io = bio->bi_end_io;
+ block->next_in_same_bio = NULL;
+ bio->bi_private = block;
+ bio->bi_end_io = btrfsic_bio_end_io;
+ } else if ((dev_state->state->print_mask &
+ (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+ BTRFSIC_PRINT_MASK_VERBOSE))) {
+ pr_info(
+"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n",
+ dev_state->bdev);
+ }
+}
+
+void btrfsic_check_bio(struct bio *bio)
+{
+ struct btrfsic_dev_state *dev_state;
+
+ if (!btrfsic_is_initialized)
+ return;
+
+ /*
+ * We can be called before btrfsic_mount, so there might not be a
+ * dev_state.
+ */
+ dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev);
+ mutex_lock(&btrfsic_mutex);
+ if (dev_state) {
+ if (bio_op(bio) == REQ_OP_WRITE && bio_has_data(bio))
+ btrfsic_check_write_bio(bio, dev_state);
+ else if (bio->bi_opf & REQ_PREFLUSH)
+ btrfsic_check_flush_bio(bio, dev_state);
+ }
+ mutex_unlock(&btrfsic_mutex);
+}
+
+int btrfsic_mount(struct btrfs_fs_info *fs_info,
+ struct btrfs_fs_devices *fs_devices,
+ int including_extent_data, u32 print_mask)
+{
+ int ret;
+ struct btrfsic_state *state;
+ struct list_head *dev_head = &fs_devices->devices;
+ struct btrfs_device *device;
+
+ if (!PAGE_ALIGNED(fs_info->nodesize)) {
+ pr_info("btrfsic: cannot handle nodesize %d not being a multiple of PAGE_SIZE %ld!\n",
+ fs_info->nodesize, PAGE_SIZE);
+ return -1;
+ }
+ if (!PAGE_ALIGNED(fs_info->sectorsize)) {
+ pr_info("btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_SIZE %ld!\n",
+ fs_info->sectorsize, PAGE_SIZE);
+ return -1;
+ }
+ state = kvzalloc(sizeof(*state), GFP_KERNEL);
+ if (!state)
+ return -ENOMEM;
+
+ if (!btrfsic_is_initialized) {
+ mutex_init(&btrfsic_mutex);
+ btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable);
+ btrfsic_is_initialized = 1;
+ }
+ mutex_lock(&btrfsic_mutex);
+ state->fs_info = fs_info;
+ state->print_mask = print_mask;
+ state->include_extent_data = including_extent_data;
+ state->metablock_size = fs_info->nodesize;
+ state->datablock_size = fs_info->sectorsize;
+ INIT_LIST_HEAD(&state->all_blocks_list);
+ btrfsic_block_hashtable_init(&state->block_hashtable);
+ btrfsic_block_link_hashtable_init(&state->block_link_hashtable);
+ state->max_superblock_generation = 0;
+ state->latest_superblock = NULL;
+
+ list_for_each_entry(device, dev_head, dev_list) {
+ struct btrfsic_dev_state *ds;
+
+ if (!device->bdev || !device->name)
+ continue;
+
+ ds = btrfsic_dev_state_alloc();
+ if (NULL == ds) {
+ mutex_unlock(&btrfsic_mutex);
+ return -ENOMEM;
+ }
+ ds->bdev = device->bdev;
+ ds->state = state;
+ btrfsic_dev_state_hashtable_add(ds,
+ &btrfsic_dev_state_hashtable);
+ }
+
+ ret = btrfsic_process_superblock(state, fs_devices);
+ if (0 != ret) {
+ mutex_unlock(&btrfsic_mutex);
+ btrfsic_unmount(fs_devices);
+ return ret;
+ }
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE)
+ btrfsic_dump_database(state);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE)
+ btrfsic_dump_tree(state);
+
+ mutex_unlock(&btrfsic_mutex);
+ return 0;
+}
+
+void btrfsic_unmount(struct btrfs_fs_devices *fs_devices)
+{
+ struct btrfsic_block *b_all, *tmp_all;
+ struct btrfsic_state *state;
+ struct list_head *dev_head = &fs_devices->devices;
+ struct btrfs_device *device;
+
+ if (!btrfsic_is_initialized)
+ return;
+
+ mutex_lock(&btrfsic_mutex);
+
+ state = NULL;
+ list_for_each_entry(device, dev_head, dev_list) {
+ struct btrfsic_dev_state *ds;
+
+ if (!device->bdev || !device->name)
+ continue;
+
+ ds = btrfsic_dev_state_hashtable_lookup(
+ device->bdev->bd_dev,
+ &btrfsic_dev_state_hashtable);
+ if (NULL != ds) {
+ state = ds->state;
+ btrfsic_dev_state_hashtable_remove(ds);
+ btrfsic_dev_state_free(ds);
+ }
+ }
+
+ if (NULL == state) {
+ pr_info("btrfsic: error, cannot find state information on umount!\n");
+ mutex_unlock(&btrfsic_mutex);
+ return;
+ }
+
+ /*
+ * Don't care about keeping the lists' state up to date,
+ * just free all memory that was allocated dynamically.
+ * Free the blocks and the block_links.
+ */
+ list_for_each_entry_safe(b_all, tmp_all, &state->all_blocks_list,
+ all_blocks_node) {
+ struct btrfsic_block_link *l, *tmp;
+
+ list_for_each_entry_safe(l, tmp, &b_all->ref_to_list,
+ node_ref_to) {
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_rem_link(state, l);
+
+ l->ref_cnt--;
+ if (0 == l->ref_cnt)
+ btrfsic_block_link_free(l);
+ }
+
+ if (b_all->is_iodone || b_all->never_written)
+ btrfsic_block_free(b_all);
+ else
+ pr_info(
+"btrfs: attempt to free %c-block @%llu (%pg/%llu/%d) on umount which is not yet iodone!\n",
+ btrfsic_get_block_type(state, b_all),
+ b_all->logical_bytenr, b_all->dev_state->bdev,
+ b_all->dev_bytenr, b_all->mirror_num);
+ }
+
+ mutex_unlock(&btrfsic_mutex);
+
+ kvfree(state);
+}
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
new file mode 100644
index 000000000..e4c8aed79
--- /dev/null
+++ b/fs/btrfs/check-integrity.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) STRATO AG 2011. All rights reserved.
+ */
+
+#ifndef BTRFS_CHECK_INTEGRITY_H
+#define BTRFS_CHECK_INTEGRITY_H
+
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+void btrfsic_check_bio(struct bio *bio);
+#else
+static inline void btrfsic_check_bio(struct bio *bio) { }
+#endif
+
+int btrfsic_mount(struct btrfs_fs_info *fs_info,
+ struct btrfs_fs_devices *fs_devices,
+ int including_extent_data, u32 print_mask);
+void btrfsic_unmount(struct btrfs_fs_devices *fs_devices);
+
+#endif
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
new file mode 100644
index 000000000..e6635fe70
--- /dev/null
+++ b/fs/btrfs/compression.c
@@ -0,0 +1,1747 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ */
+
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/highmem.h>
+#include <linux/kthread.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/backing-dev.h>
+#include <linux/writeback.h>
+#include <linux/psi.h>
+#include <linux/slab.h>
+#include <linux/sched/mm.h>
+#include <linux/log2.h>
+#include <crypto/hash.h>
+#include "misc.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "volumes.h"
+#include "ordered-data.h"
+#include "compression.h"
+#include "extent_io.h"
+#include "extent_map.h"
+#include "subpage.h"
+#include "zoned.h"
+
+static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
+
+const char* btrfs_compress_type2str(enum btrfs_compression_type type)
+{
+ switch (type) {
+ case BTRFS_COMPRESS_ZLIB:
+ case BTRFS_COMPRESS_LZO:
+ case BTRFS_COMPRESS_ZSTD:
+ case BTRFS_COMPRESS_NONE:
+ return btrfs_compress_types[type];
+ default:
+ break;
+ }
+
+ return NULL;
+}
+
+bool btrfs_compress_is_valid_type(const char *str, size_t len)
+{
+ int i;
+
+ for (i = 1; i < ARRAY_SIZE(btrfs_compress_types); i++) {
+ size_t comp_len = strlen(btrfs_compress_types[i]);
+
+ if (len < comp_len)
+ continue;
+
+ if (!strncmp(btrfs_compress_types[i], str, comp_len))
+ return true;
+ }
+ return false;
+}
+
+static int compression_compress_pages(int type, struct list_head *ws,
+ struct address_space *mapping, u64 start, struct page **pages,
+ unsigned long *out_pages, unsigned long *total_in,
+ unsigned long *total_out)
+{
+ switch (type) {
+ case BTRFS_COMPRESS_ZLIB:
+ return zlib_compress_pages(ws, mapping, start, pages,
+ out_pages, total_in, total_out);
+ case BTRFS_COMPRESS_LZO:
+ return lzo_compress_pages(ws, mapping, start, pages,
+ out_pages, total_in, total_out);
+ case BTRFS_COMPRESS_ZSTD:
+ return zstd_compress_pages(ws, mapping, start, pages,
+ out_pages, total_in, total_out);
+ case BTRFS_COMPRESS_NONE:
+ default:
+ /*
+ * This can happen when compression races with remount setting
+ * it to 'no compress', while caller doesn't call
+ * inode_need_compress() to check if we really need to
+ * compress.
+ *
+ * Not a big deal, just need to inform caller that we
+ * haven't allocated any pages yet.
+ */
+ *out_pages = 0;
+ return -E2BIG;
+ }
+}
+
+static int compression_decompress_bio(struct list_head *ws,
+ struct compressed_bio *cb)
+{
+ switch (cb->compress_type) {
+ case BTRFS_COMPRESS_ZLIB: return zlib_decompress_bio(ws, cb);
+ case BTRFS_COMPRESS_LZO: return lzo_decompress_bio(ws, cb);
+ case BTRFS_COMPRESS_ZSTD: return zstd_decompress_bio(ws, cb);
+ case BTRFS_COMPRESS_NONE:
+ default:
+ /*
+ * This can't happen, the type is validated several times
+ * before we get here.
+ */
+ BUG();
+ }
+}
+
+static int compression_decompress(int type, struct list_head *ws,
+ unsigned char *data_in, struct page *dest_page,
+ unsigned long start_byte, size_t srclen, size_t destlen)
+{
+ switch (type) {
+ case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page,
+ start_byte, srclen, destlen);
+ case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page,
+ start_byte, srclen, destlen);
+ case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page,
+ start_byte, srclen, destlen);
+ case BTRFS_COMPRESS_NONE:
+ default:
+ /*
+ * This can't happen, the type is validated several times
+ * before we get here.
+ */
+ BUG();
+ }
+}
+
+static int btrfs_decompress_bio(struct compressed_bio *cb);
+
+static void finish_compressed_bio_read(struct compressed_bio *cb)
+{
+ unsigned int index;
+ struct page *page;
+
+ if (cb->status == BLK_STS_OK)
+ cb->status = errno_to_blk_status(btrfs_decompress_bio(cb));
+
+ /* Release the compressed pages */
+ for (index = 0; index < cb->nr_pages; index++) {
+ page = cb->compressed_pages[index];
+ page->mapping = NULL;
+ put_page(page);
+ }
+
+ /* Do io completion on the original bio */
+ btrfs_bio_end_io(btrfs_bio(cb->orig_bio), cb->status);
+
+ /* Finally free the cb struct */
+ kfree(cb->compressed_pages);
+ kfree(cb);
+}
+
+/*
+ * Verify the checksums and kick off repair if needed on the uncompressed data
+ * before decompressing it into the original bio and freeing the uncompressed
+ * pages.
+ */
+static void end_compressed_bio_read(struct btrfs_bio *bbio)
+{
+ struct compressed_bio *cb = bbio->private;
+ struct inode *inode = cb->inode;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_inode *bi = BTRFS_I(inode);
+ bool csum = !(bi->flags & BTRFS_INODE_NODATASUM) &&
+ !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
+ blk_status_t status = bbio->bio.bi_status;
+ struct bvec_iter iter;
+ struct bio_vec bv;
+ u32 offset;
+
+ btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) {
+ u64 start = bbio->file_offset + offset;
+
+ if (!status &&
+ (!csum || !btrfs_check_data_csum(inode, bbio, offset,
+ bv.bv_page, bv.bv_offset))) {
+ btrfs_clean_io_failure(bi, start, bv.bv_page,
+ bv.bv_offset);
+ } else {
+ int ret;
+
+ refcount_inc(&cb->pending_ios);
+ ret = btrfs_repair_one_sector(inode, bbio, offset,
+ bv.bv_page, bv.bv_offset,
+ btrfs_submit_data_read_bio);
+ if (ret) {
+ refcount_dec(&cb->pending_ios);
+ status = errno_to_blk_status(ret);
+ }
+ }
+ }
+
+ if (status)
+ cb->status = status;
+
+ if (refcount_dec_and_test(&cb->pending_ios))
+ finish_compressed_bio_read(cb);
+ btrfs_bio_free_csum(bbio);
+ bio_put(&bbio->bio);
+}
+
+/*
+ * Clear the writeback bits on all of the file
+ * pages for a compressed write
+ */
+static noinline void end_compressed_writeback(struct inode *inode,
+ const struct compressed_bio *cb)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ unsigned long index = cb->start >> PAGE_SHIFT;
+ unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
+ struct folio_batch fbatch;
+ const int errno = blk_status_to_errno(cb->status);
+ int i;
+ int ret;
+
+ if (errno)
+ mapping_set_error(inode->i_mapping, errno);
+
+ folio_batch_init(&fbatch);
+ while (index <= end_index) {
+ ret = filemap_get_folios(inode->i_mapping, &index, end_index,
+ &fbatch);
+
+ if (ret == 0)
+ return;
+
+ for (i = 0; i < ret; i++) {
+ struct folio *folio = fbatch.folios[i];
+
+ if (errno)
+ folio_set_error(folio);
+ btrfs_page_clamp_clear_writeback(fs_info, &folio->page,
+ cb->start, cb->len);
+ }
+ folio_batch_release(&fbatch);
+ }
+ /* the inode may be gone now */
+}
+
+static void finish_compressed_bio_write(struct compressed_bio *cb)
+{
+ struct inode *inode = cb->inode;
+ unsigned int index;
+
+ /*
+ * Ok, we're the last bio for this extent, step one is to call back
+ * into the FS and do all the end_io operations.
+ */
+ btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL,
+ cb->start, cb->start + cb->len - 1,
+ cb->status == BLK_STS_OK);
+
+ if (cb->writeback)
+ end_compressed_writeback(inode, cb);
+ /* Note, our inode could be gone now */
+
+ /*
+ * Release the compressed pages, these came from alloc_page and
+ * are not attached to the inode at all
+ */
+ for (index = 0; index < cb->nr_pages; index++) {
+ struct page *page = cb->compressed_pages[index];
+
+ page->mapping = NULL;
+ put_page(page);
+ }
+
+ /* Finally free the cb struct */
+ kfree(cb->compressed_pages);
+ kfree(cb);
+}
+
+static void btrfs_finish_compressed_write_work(struct work_struct *work)
+{
+ struct compressed_bio *cb =
+ container_of(work, struct compressed_bio, write_end_work);
+
+ finish_compressed_bio_write(cb);
+}
+
+/*
+ * Do the cleanup once all the compressed pages hit the disk. This will clear
+ * writeback on the file pages and free the compressed pages.
+ *
+ * This also calls the writeback end hooks for the file pages so that metadata
+ * and checksums can be updated in the file.
+ */
+static void end_compressed_bio_write(struct btrfs_bio *bbio)
+{
+ struct compressed_bio *cb = bbio->private;
+
+ if (bbio->bio.bi_status)
+ cb->status = bbio->bio.bi_status;
+
+ if (refcount_dec_and_test(&cb->pending_ios)) {
+ struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
+
+ btrfs_record_physical_zoned(cb->inode, cb->start, &bbio->bio);
+ queue_work(fs_info->compressed_write_workers, &cb->write_end_work);
+ }
+ bio_put(&bbio->bio);
+}
+
+/*
+ * Allocate a compressed_bio, which will be used to read/write on-disk
+ * (aka, compressed) * data.
+ *
+ * @cb: The compressed_bio structure, which records all the needed
+ * information to bind the compressed data to the uncompressed
+ * page cache.
+ * @disk_byten: The logical bytenr where the compressed data will be read
+ * from or written to.
+ * @endio_func: The endio function to call after the IO for compressed data
+ * is finished.
+ * @next_stripe_start: Return value of logical bytenr of where next stripe starts.
+ * Let the caller know to only fill the bio up to the stripe
+ * boundary.
+ */
+
+
+static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr,
+ blk_opf_t opf,
+ btrfs_bio_end_io_t endio_func,
+ u64 *next_stripe_start)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
+ struct btrfs_io_geometry geom;
+ struct extent_map *em;
+ struct bio *bio;
+ int ret;
+
+ bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, endio_func, cb);
+ bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
+
+ em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize);
+ if (IS_ERR(em)) {
+ bio_put(bio);
+ return ERR_CAST(em);
+ }
+
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND)
+ bio_set_dev(bio, em->map_lookup->stripes[0].dev->bdev);
+
+ ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), disk_bytenr, &geom);
+ free_extent_map(em);
+ if (ret < 0) {
+ bio_put(bio);
+ return ERR_PTR(ret);
+ }
+ *next_stripe_start = disk_bytenr + geom.len;
+ refcount_inc(&cb->pending_ios);
+ return bio;
+}
+
+/*
+ * worker function to build and submit bios for previously compressed pages.
+ * The corresponding pages in the inode should be marked for writeback
+ * and the compressed pages should have a reference on them for dropping
+ * when the IO is complete.
+ *
+ * This also checksums the file bytes and gets things ready for
+ * the end io hooks.
+ */
+blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
+ unsigned int len, u64 disk_start,
+ unsigned int compressed_len,
+ struct page **compressed_pages,
+ unsigned int nr_pages,
+ blk_opf_t write_flags,
+ struct cgroup_subsys_state *blkcg_css,
+ bool writeback)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct bio *bio = NULL;
+ struct compressed_bio *cb;
+ u64 cur_disk_bytenr = disk_start;
+ u64 next_stripe_start;
+ blk_status_t ret = BLK_STS_OK;
+ int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;
+ const bool use_append = btrfs_use_zone_append(inode, disk_start);
+ const enum req_op bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE;
+
+ ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
+ IS_ALIGNED(len, fs_info->sectorsize));
+ cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS);
+ if (!cb)
+ return BLK_STS_RESOURCE;
+ refcount_set(&cb->pending_ios, 1);
+ cb->status = BLK_STS_OK;
+ cb->inode = &inode->vfs_inode;
+ cb->start = start;
+ cb->len = len;
+ cb->compressed_pages = compressed_pages;
+ cb->compressed_len = compressed_len;
+ cb->writeback = writeback;
+ INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work);
+ cb->nr_pages = nr_pages;
+
+ if (blkcg_css)
+ kthread_associate_blkcg(blkcg_css);
+
+ while (cur_disk_bytenr < disk_start + compressed_len) {
+ u64 offset = cur_disk_bytenr - disk_start;
+ unsigned int index = offset >> PAGE_SHIFT;
+ unsigned int real_size;
+ unsigned int added;
+ struct page *page = compressed_pages[index];
+ bool submit = false;
+
+ /* Allocate new bio if submitted or not yet allocated */
+ if (!bio) {
+ bio = alloc_compressed_bio(cb, cur_disk_bytenr,
+ bio_op | write_flags, end_compressed_bio_write,
+ &next_stripe_start);
+ if (IS_ERR(bio)) {
+ ret = errno_to_blk_status(PTR_ERR(bio));
+ break;
+ }
+ if (blkcg_css)
+ bio->bi_opf |= REQ_CGROUP_PUNT;
+ }
+ /*
+ * We should never reach next_stripe_start start as we will
+ * submit comp_bio when reach the boundary immediately.
+ */
+ ASSERT(cur_disk_bytenr != next_stripe_start);
+
+ /*
+ * We have various limits on the real read size:
+ * - stripe boundary
+ * - page boundary
+ * - compressed length boundary
+ */
+ real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_bytenr);
+ real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset));
+ real_size = min_t(u64, real_size, compressed_len - offset);
+ ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));
+
+ if (use_append)
+ added = bio_add_zone_append_page(bio, page, real_size,
+ offset_in_page(offset));
+ else
+ added = bio_add_page(bio, page, real_size,
+ offset_in_page(offset));
+ /* Reached zoned boundary */
+ if (added == 0)
+ submit = true;
+
+ cur_disk_bytenr += added;
+ /* Reached stripe boundary */
+ if (cur_disk_bytenr == next_stripe_start)
+ submit = true;
+
+ /* Finished the range */
+ if (cur_disk_bytenr == disk_start + compressed_len)
+ submit = true;
+
+ if (submit) {
+ if (!skip_sum) {
+ ret = btrfs_csum_one_bio(inode, bio, start, true);
+ if (ret) {
+ btrfs_bio_end_io(btrfs_bio(bio), ret);
+ break;
+ }
+ }
+
+ ASSERT(bio->bi_iter.bi_size);
+ btrfs_submit_bio(fs_info, bio, 0);
+ bio = NULL;
+ }
+ cond_resched();
+ }
+
+ if (blkcg_css)
+ kthread_associate_blkcg(NULL);
+
+ if (refcount_dec_and_test(&cb->pending_ios))
+ finish_compressed_bio_write(cb);
+ return ret;
+}
+
+static u64 bio_end_offset(struct bio *bio)
+{
+ struct bio_vec *last = bio_last_bvec_all(bio);
+
+ return page_offset(last->bv_page) + last->bv_len + last->bv_offset;
+}
+
+/*
+ * Add extra pages in the same compressed file extent so that we don't need to
+ * re-read the same extent again and again.
+ *
+ * NOTE: this won't work well for subpage, as for subpage read, we lock the
+ * full page then submit bio for each compressed/regular extents.
+ *
+ * This means, if we have several sectors in the same page points to the same
+ * on-disk compressed data, we will re-read the same extent many times and
+ * this function can only help for the next page.
+ */
+static noinline int add_ra_bio_pages(struct inode *inode,
+ u64 compressed_end,
+ struct compressed_bio *cb,
+ int *memstall, unsigned long *pflags)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ unsigned long end_index;
+ u64 cur = bio_end_offset(cb->orig_bio);
+ u64 isize = i_size_read(inode);
+ int ret;
+ struct page *page;
+ struct extent_map *em;
+ struct address_space *mapping = inode->i_mapping;
+ struct extent_map_tree *em_tree;
+ struct extent_io_tree *tree;
+ int sectors_missed = 0;
+
+ em_tree = &BTRFS_I(inode)->extent_tree;
+ tree = &BTRFS_I(inode)->io_tree;
+
+ if (isize == 0)
+ return 0;
+
+ /*
+ * For current subpage support, we only support 64K page size,
+ * which means maximum compressed extent size (128K) is just 2x page
+ * size.
+ * This makes readahead less effective, so here disable readahead for
+ * subpage for now, until full compressed write is supported.
+ */
+ if (btrfs_sb(inode->i_sb)->sectorsize < PAGE_SIZE)
+ return 0;
+
+ end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
+
+ while (cur < compressed_end) {
+ u64 page_end;
+ u64 pg_index = cur >> PAGE_SHIFT;
+ u32 add_size;
+
+ if (pg_index > end_index)
+ break;
+
+ page = xa_load(&mapping->i_pages, pg_index);
+ if (page && !xa_is_value(page)) {
+ sectors_missed += (PAGE_SIZE - offset_in_page(cur)) >>
+ fs_info->sectorsize_bits;
+
+ /* Beyond threshold, no need to continue */
+ if (sectors_missed > 4)
+ break;
+
+ /*
+ * Jump to next page start as we already have page for
+ * current offset.
+ */
+ cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
+ continue;
+ }
+
+ page = __page_cache_alloc(mapping_gfp_constraint(mapping,
+ ~__GFP_FS));
+ if (!page)
+ break;
+
+ if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) {
+ put_page(page);
+ /* There is already a page, skip to page end */
+ cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
+ continue;
+ }
+
+ if (!*memstall && PageWorkingset(page)) {
+ psi_memstall_enter(pflags);
+ *memstall = 1;
+ }
+
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ unlock_page(page);
+ put_page(page);
+ break;
+ }
+
+ page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1;
+ lock_extent(tree, cur, page_end, NULL);
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
+ read_unlock(&em_tree->lock);
+
+ /*
+ * At this point, we have a locked page in the page cache for
+ * these bytes in the file. But, we have to make sure they map
+ * to this compressed extent on disk.
+ */
+ if (!em || cur < em->start ||
+ (cur + fs_info->sectorsize > extent_map_end(em)) ||
+ (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
+ free_extent_map(em);
+ unlock_extent(tree, cur, page_end, NULL);
+ unlock_page(page);
+ put_page(page);
+ break;
+ }
+ free_extent_map(em);
+
+ if (page->index == end_index) {
+ size_t zero_offset = offset_in_page(isize);
+
+ if (zero_offset) {
+ int zeros;
+ zeros = PAGE_SIZE - zero_offset;
+ memzero_page(page, zero_offset, zeros);
+ }
+ }
+
+ add_size = min(em->start + em->len, page_end + 1) - cur;
+ ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur));
+ if (ret != add_size) {
+ unlock_extent(tree, cur, page_end, NULL);
+ unlock_page(page);
+ put_page(page);
+ break;
+ }
+ /*
+ * If it's subpage, we also need to increase its
+ * subpage::readers number, as at endio we will decrease
+ * subpage::readers and to unlock the page.
+ */
+ if (fs_info->sectorsize < PAGE_SIZE)
+ btrfs_subpage_start_reader(fs_info, page, cur, add_size);
+ put_page(page);
+ cur += add_size;
+ }
+ return 0;
+}
+
+/*
+ * for a compressed read, the bio we get passed has all the inode pages
+ * in it. We don't actually do IO on those pages but allocate new ones
+ * to hold the compressed pages on disk.
+ *
+ * bio->bi_iter.bi_sector points to the compressed extent on disk
+ * bio->bi_io_vec points to all of the inode pages
+ *
+ * After the compressed pages are read, we copy the bytes into the
+ * bio we were passed and then call the bio end_io calls
+ */
+void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+ int mirror_num)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct extent_map_tree *em_tree;
+ struct compressed_bio *cb;
+ unsigned int compressed_len;
+ struct bio *comp_bio = NULL;
+ const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ u64 cur_disk_byte = disk_bytenr;
+ u64 next_stripe_start;
+ u64 file_offset;
+ u64 em_len;
+ u64 em_start;
+ struct extent_map *em;
+ unsigned long pflags;
+ int memstall = 0;
+ blk_status_t ret;
+ int ret2;
+ int i;
+
+ em_tree = &BTRFS_I(inode)->extent_tree;
+
+ file_offset = bio_first_bvec_all(bio)->bv_offset +
+ page_offset(bio_first_page_all(bio));
+
+ /* we need the actual starting offset of this extent in the file */
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize);
+ read_unlock(&em_tree->lock);
+ if (!em) {
+ ret = BLK_STS_IOERR;
+ goto out;
+ }
+
+ ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
+ compressed_len = em->block_len;
+ cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS);
+ if (!cb) {
+ ret = BLK_STS_RESOURCE;
+ goto out;
+ }
+
+ refcount_set(&cb->pending_ios, 1);
+ cb->status = BLK_STS_OK;
+ cb->inode = inode;
+
+ cb->start = em->orig_start;
+ em_len = em->len;
+ em_start = em->start;
+
+ cb->len = bio->bi_iter.bi_size;
+ cb->compressed_len = compressed_len;
+ cb->compress_type = em->compress_type;
+ cb->orig_bio = bio;
+
+ free_extent_map(em);
+ em = NULL;
+
+ cb->nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
+ cb->compressed_pages = kcalloc(cb->nr_pages, sizeof(struct page *), GFP_NOFS);
+ if (!cb->compressed_pages) {
+ ret = BLK_STS_RESOURCE;
+ goto fail;
+ }
+
+ ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages);
+ if (ret2) {
+ ret = BLK_STS_RESOURCE;
+ goto fail;
+ }
+
+ add_ra_bio_pages(inode, em_start + em_len, cb, &memstall, &pflags);
+
+ /* include any pages we added in add_ra-bio_pages */
+ cb->len = bio->bi_iter.bi_size;
+
+ while (cur_disk_byte < disk_bytenr + compressed_len) {
+ u64 offset = cur_disk_byte - disk_bytenr;
+ unsigned int index = offset >> PAGE_SHIFT;
+ unsigned int real_size;
+ unsigned int added;
+ struct page *page = cb->compressed_pages[index];
+ bool submit = false;
+
+ /* Allocate new bio if submitted or not yet allocated */
+ if (!comp_bio) {
+ comp_bio = alloc_compressed_bio(cb, cur_disk_byte,
+ REQ_OP_READ, end_compressed_bio_read,
+ &next_stripe_start);
+ if (IS_ERR(comp_bio)) {
+ cb->status = errno_to_blk_status(PTR_ERR(comp_bio));
+ break;
+ }
+ }
+ /*
+ * We should never reach next_stripe_start start as we will
+ * submit comp_bio when reach the boundary immediately.
+ */
+ ASSERT(cur_disk_byte != next_stripe_start);
+ /*
+ * We have various limit on the real read size:
+ * - stripe boundary
+ * - page boundary
+ * - compressed length boundary
+ */
+ real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_byte);
+ real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset));
+ real_size = min_t(u64, real_size, compressed_len - offset);
+ ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));
+
+ added = bio_add_page(comp_bio, page, real_size, offset_in_page(offset));
+ /*
+ * Maximum compressed extent is smaller than bio size limit,
+ * thus bio_add_page() should always success.
+ */
+ ASSERT(added == real_size);
+ cur_disk_byte += added;
+
+ /* Reached stripe boundary, need to submit */
+ if (cur_disk_byte == next_stripe_start)
+ submit = true;
+
+ /* Has finished the range, need to submit */
+ if (cur_disk_byte == disk_bytenr + compressed_len)
+ submit = true;
+
+ if (submit) {
+ /* Save the original iter for read repair */
+ if (bio_op(comp_bio) == REQ_OP_READ)
+ btrfs_bio(comp_bio)->iter = comp_bio->bi_iter;
+
+ /*
+ * Save the initial offset of this chunk, as there
+ * is no direct correlation between compressed pages and
+ * the original file offset. The field is only used for
+ * priting error messages.
+ */
+ btrfs_bio(comp_bio)->file_offset = file_offset;
+
+ ret = btrfs_lookup_bio_sums(inode, comp_bio, NULL);
+ if (ret) {
+ btrfs_bio_end_io(btrfs_bio(comp_bio), ret);
+ break;
+ }
+
+ ASSERT(comp_bio->bi_iter.bi_size);
+ btrfs_submit_bio(fs_info, comp_bio, mirror_num);
+ comp_bio = NULL;
+ }
+ }
+
+ if (memstall)
+ psi_memstall_leave(&pflags);
+
+ if (refcount_dec_and_test(&cb->pending_ios))
+ finish_compressed_bio_read(cb);
+ return;
+
+fail:
+ if (cb->compressed_pages) {
+ for (i = 0; i < cb->nr_pages; i++) {
+ if (cb->compressed_pages[i])
+ __free_page(cb->compressed_pages[i]);
+ }
+ }
+
+ kfree(cb->compressed_pages);
+ kfree(cb);
+out:
+ free_extent_map(em);
+ btrfs_bio_end_io(btrfs_bio(bio), ret);
+ return;
+}
+
+/*
+ * Heuristic uses systematic sampling to collect data from the input data
+ * range, the logic can be tuned by the following constants:
+ *
+ * @SAMPLING_READ_SIZE - how many bytes will be copied from for each sample
+ * @SAMPLING_INTERVAL - range from which the sampled data can be collected
+ */
+#define SAMPLING_READ_SIZE (16)
+#define SAMPLING_INTERVAL (256)
+
+/*
+ * For statistical analysis of the input data we consider bytes that form a
+ * Galois Field of 256 objects. Each object has an attribute count, ie. how
+ * many times the object appeared in the sample.
+ */
+#define BUCKET_SIZE (256)
+
+/*
+ * The size of the sample is based on a statistical sampling rule of thumb.
+ * The common way is to perform sampling tests as long as the number of
+ * elements in each cell is at least 5.
+ *
+ * Instead of 5, we choose 32 to obtain more accurate results.
+ * If the data contain the maximum number of symbols, which is 256, we obtain a
+ * sample size bound by 8192.
+ *
+ * For a sample of at most 8KB of data per data range: 16 consecutive bytes
+ * from up to 512 locations.
+ */
+#define MAX_SAMPLE_SIZE (BTRFS_MAX_UNCOMPRESSED * \
+ SAMPLING_READ_SIZE / SAMPLING_INTERVAL)
+
+struct bucket_item {
+ u32 count;
+};
+
+struct heuristic_ws {
+ /* Partial copy of input data */
+ u8 *sample;
+ u32 sample_size;
+ /* Buckets store counters for each byte value */
+ struct bucket_item *bucket;
+ /* Sorting buffer */
+ struct bucket_item *bucket_b;
+ struct list_head list;
+};
+
+static struct workspace_manager heuristic_wsm;
+
+static void free_heuristic_ws(struct list_head *ws)
+{
+ struct heuristic_ws *workspace;
+
+ workspace = list_entry(ws, struct heuristic_ws, list);
+
+ kvfree(workspace->sample);
+ kfree(workspace->bucket);
+ kfree(workspace->bucket_b);
+ kfree(workspace);
+}
+
+static struct list_head *alloc_heuristic_ws(unsigned int level)
+{
+ struct heuristic_ws *ws;
+
+ ws = kzalloc(sizeof(*ws), GFP_KERNEL);
+ if (!ws)
+ return ERR_PTR(-ENOMEM);
+
+ ws->sample = kvmalloc(MAX_SAMPLE_SIZE, GFP_KERNEL);
+ if (!ws->sample)
+ goto fail;
+
+ ws->bucket = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket), GFP_KERNEL);
+ if (!ws->bucket)
+ goto fail;
+
+ ws->bucket_b = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket_b), GFP_KERNEL);
+ if (!ws->bucket_b)
+ goto fail;
+
+ INIT_LIST_HEAD(&ws->list);
+ return &ws->list;
+fail:
+ free_heuristic_ws(&ws->list);
+ return ERR_PTR(-ENOMEM);
+}
+
+const struct btrfs_compress_op btrfs_heuristic_compress = {
+ .workspace_manager = &heuristic_wsm,
+};
+
+static const struct btrfs_compress_op * const btrfs_compress_op[] = {
+ /* The heuristic is represented as compression type 0 */
+ &btrfs_heuristic_compress,
+ &btrfs_zlib_compress,
+ &btrfs_lzo_compress,
+ &btrfs_zstd_compress,
+};
+
+static struct list_head *alloc_workspace(int type, unsigned int level)
+{
+ switch (type) {
+ case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(level);
+ case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level);
+ case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(level);
+ case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level);
+ default:
+ /*
+ * This can't happen, the type is validated several times
+ * before we get here.
+ */
+ BUG();
+ }
+}
+
+static void free_workspace(int type, struct list_head *ws)
+{
+ switch (type) {
+ case BTRFS_COMPRESS_NONE: return free_heuristic_ws(ws);
+ case BTRFS_COMPRESS_ZLIB: return zlib_free_workspace(ws);
+ case BTRFS_COMPRESS_LZO: return lzo_free_workspace(ws);
+ case BTRFS_COMPRESS_ZSTD: return zstd_free_workspace(ws);
+ default:
+ /*
+ * This can't happen, the type is validated several times
+ * before we get here.
+ */
+ BUG();
+ }
+}
+
+static void btrfs_init_workspace_manager(int type)
+{
+ struct workspace_manager *wsm;
+ struct list_head *workspace;
+
+ wsm = btrfs_compress_op[type]->workspace_manager;
+ INIT_LIST_HEAD(&wsm->idle_ws);
+ spin_lock_init(&wsm->ws_lock);
+ atomic_set(&wsm->total_ws, 0);
+ init_waitqueue_head(&wsm->ws_wait);
+
+ /*
+ * Preallocate one workspace for each compression type so we can
+ * guarantee forward progress in the worst case
+ */
+ workspace = alloc_workspace(type, 0);
+ if (IS_ERR(workspace)) {
+ pr_warn(
+ "BTRFS: cannot preallocate compression workspace, will try later\n");
+ } else {
+ atomic_set(&wsm->total_ws, 1);
+ wsm->free_ws = 1;
+ list_add(workspace, &wsm->idle_ws);
+ }
+}
+
+static void btrfs_cleanup_workspace_manager(int type)
+{
+ struct workspace_manager *wsman;
+ struct list_head *ws;
+
+ wsman = btrfs_compress_op[type]->workspace_manager;
+ while (!list_empty(&wsman->idle_ws)) {
+ ws = wsman->idle_ws.next;
+ list_del(ws);
+ free_workspace(type, ws);
+ atomic_dec(&wsman->total_ws);
+ }
+}
+
+/*
+ * This finds an available workspace or allocates a new one.
+ * If it's not possible to allocate a new one, waits until there's one.
+ * Preallocation makes a forward progress guarantees and we do not return
+ * errors.
+ */
+struct list_head *btrfs_get_workspace(int type, unsigned int level)
+{
+ struct workspace_manager *wsm;
+ struct list_head *workspace;
+ int cpus = num_online_cpus();
+ unsigned nofs_flag;
+ struct list_head *idle_ws;
+ spinlock_t *ws_lock;
+ atomic_t *total_ws;
+ wait_queue_head_t *ws_wait;
+ int *free_ws;
+
+ wsm = btrfs_compress_op[type]->workspace_manager;
+ idle_ws = &wsm->idle_ws;
+ ws_lock = &wsm->ws_lock;
+ total_ws = &wsm->total_ws;
+ ws_wait = &wsm->ws_wait;
+ free_ws = &wsm->free_ws;
+
+again:
+ spin_lock(ws_lock);
+ if (!list_empty(idle_ws)) {
+ workspace = idle_ws->next;
+ list_del(workspace);
+ (*free_ws)--;
+ spin_unlock(ws_lock);
+ return workspace;
+
+ }
+ if (atomic_read(total_ws) > cpus) {
+ DEFINE_WAIT(wait);
+
+ spin_unlock(ws_lock);
+ prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE);
+ if (atomic_read(total_ws) > cpus && !*free_ws)
+ schedule();
+ finish_wait(ws_wait, &wait);
+ goto again;
+ }
+ atomic_inc(total_ws);
+ spin_unlock(ws_lock);
+
+ /*
+ * Allocation helpers call vmalloc that can't use GFP_NOFS, so we have
+ * to turn it off here because we might get called from the restricted
+ * context of btrfs_compress_bio/btrfs_compress_pages
+ */
+ nofs_flag = memalloc_nofs_save();
+ workspace = alloc_workspace(type, level);
+ memalloc_nofs_restore(nofs_flag);
+
+ if (IS_ERR(workspace)) {
+ atomic_dec(total_ws);
+ wake_up(ws_wait);
+
+ /*
+ * Do not return the error but go back to waiting. There's a
+ * workspace preallocated for each type and the compression
+ * time is bounded so we get to a workspace eventually. This
+ * makes our caller's life easier.
+ *
+ * To prevent silent and low-probability deadlocks (when the
+ * initial preallocation fails), check if there are any
+ * workspaces at all.
+ */
+ if (atomic_read(total_ws) == 0) {
+ static DEFINE_RATELIMIT_STATE(_rs,
+ /* once per minute */ 60 * HZ,
+ /* no burst */ 1);
+
+ if (__ratelimit(&_rs)) {
+ pr_warn("BTRFS: no compression workspaces, low memory, retrying\n");
+ }
+ }
+ goto again;
+ }
+ return workspace;
+}
+
+static struct list_head *get_workspace(int type, int level)
+{
+ switch (type) {
+ case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(type, level);
+ case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(level);
+ case BTRFS_COMPRESS_LZO: return btrfs_get_workspace(type, level);
+ case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(level);
+ default:
+ /*
+ * This can't happen, the type is validated several times
+ * before we get here.
+ */
+ BUG();
+ }
+}
+
+/*
+ * put a workspace struct back on the list or free it if we have enough
+ * idle ones sitting around
+ */
+void btrfs_put_workspace(int type, struct list_head *ws)
+{
+ struct workspace_manager *wsm;
+ struct list_head *idle_ws;
+ spinlock_t *ws_lock;
+ atomic_t *total_ws;
+ wait_queue_head_t *ws_wait;
+ int *free_ws;
+
+ wsm = btrfs_compress_op[type]->workspace_manager;
+ idle_ws = &wsm->idle_ws;
+ ws_lock = &wsm->ws_lock;
+ total_ws = &wsm->total_ws;
+ ws_wait = &wsm->ws_wait;
+ free_ws = &wsm->free_ws;
+
+ spin_lock(ws_lock);
+ if (*free_ws <= num_online_cpus()) {
+ list_add(ws, idle_ws);
+ (*free_ws)++;
+ spin_unlock(ws_lock);
+ goto wake;
+ }
+ spin_unlock(ws_lock);
+
+ free_workspace(type, ws);
+ atomic_dec(total_ws);
+wake:
+ cond_wake_up(ws_wait);
+}
+
+static void put_workspace(int type, struct list_head *ws)
+{
+ switch (type) {
+ case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(type, ws);
+ case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(type, ws);
+ case BTRFS_COMPRESS_LZO: return btrfs_put_workspace(type, ws);
+ case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(ws);
+ default:
+ /*
+ * This can't happen, the type is validated several times
+ * before we get here.
+ */
+ BUG();
+ }
+}
+
+/*
+ * Adjust @level according to the limits of the compression algorithm or
+ * fallback to default
+ */
+static unsigned int btrfs_compress_set_level(int type, unsigned level)
+{
+ const struct btrfs_compress_op *ops = btrfs_compress_op[type];
+
+ if (level == 0)
+ level = ops->default_level;
+ else
+ level = min(level, ops->max_level);
+
+ return level;
+}
+
+/*
+ * Given an address space and start and length, compress the bytes into @pages
+ * that are allocated on demand.
+ *
+ * @type_level is encoded algorithm and level, where level 0 means whatever
+ * default the algorithm chooses and is opaque here;
+ * - compression algo are 0-3
+ * - the level are bits 4-7
+ *
+ * @out_pages is an in/out parameter, holds maximum number of pages to allocate
+ * and returns number of actually allocated pages
+ *
+ * @total_in is used to return the number of bytes actually read. It
+ * may be smaller than the input length if we had to exit early because we
+ * ran out of room in the pages array or because we cross the
+ * max_out threshold.
+ *
+ * @total_out is an in/out parameter, must be set to the input length and will
+ * be also used to return the total number of compressed bytes
+ */
+int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
+ u64 start, struct page **pages,
+ unsigned long *out_pages,
+ unsigned long *total_in,
+ unsigned long *total_out)
+{
+ int type = btrfs_compress_type(type_level);
+ int level = btrfs_compress_level(type_level);
+ struct list_head *workspace;
+ int ret;
+
+ level = btrfs_compress_set_level(type, level);
+ workspace = get_workspace(type, level);
+ ret = compression_compress_pages(type, workspace, mapping, start, pages,
+ out_pages, total_in, total_out);
+ put_workspace(type, workspace);
+ return ret;
+}
+
+static int btrfs_decompress_bio(struct compressed_bio *cb)
+{
+ struct list_head *workspace;
+ int ret;
+ int type = cb->compress_type;
+
+ workspace = get_workspace(type, 0);
+ ret = compression_decompress_bio(workspace, cb);
+ put_workspace(type, workspace);
+
+ return ret;
+}
+
+/*
+ * a less complex decompression routine. Our compressed data fits in a
+ * single page, and we want to read a single page out of it.
+ * start_byte tells us the offset into the compressed data we're interested in
+ */
+int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
+ unsigned long start_byte, size_t srclen, size_t destlen)
+{
+ struct list_head *workspace;
+ int ret;
+
+ workspace = get_workspace(type, 0);
+ ret = compression_decompress(type, workspace, data_in, dest_page,
+ start_byte, srclen, destlen);
+ put_workspace(type, workspace);
+
+ return ret;
+}
+
+void __init btrfs_init_compress(void)
+{
+ btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE);
+ btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB);
+ btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO);
+ zstd_init_workspace_manager();
+}
+
+void __cold btrfs_exit_compress(void)
+{
+ btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE);
+ btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB);
+ btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO);
+ zstd_cleanup_workspace_manager();
+}
+
+/*
+ * Copy decompressed data from working buffer to pages.
+ *
+ * @buf: The decompressed data buffer
+ * @buf_len: The decompressed data length
+ * @decompressed: Number of bytes that are already decompressed inside the
+ * compressed extent
+ * @cb: The compressed extent descriptor
+ * @orig_bio: The original bio that the caller wants to read for
+ *
+ * An easier to understand graph is like below:
+ *
+ * |<- orig_bio ->| |<- orig_bio->|
+ * |<------- full decompressed extent ----->|
+ * |<----------- @cb range ---->|
+ * | |<-- @buf_len -->|
+ * |<--- @decompressed --->|
+ *
+ * Note that, @cb can be a subpage of the full decompressed extent, but
+ * @cb->start always has the same as the orig_file_offset value of the full
+ * decompressed extent.
+ *
+ * When reading compressed extent, we have to read the full compressed extent,
+ * while @orig_bio may only want part of the range.
+ * Thus this function will ensure only data covered by @orig_bio will be copied
+ * to.
+ *
+ * Return 0 if we have copied all needed contents for @orig_bio.
+ * Return >0 if we need continue decompress.
+ */
+int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
+ struct compressed_bio *cb, u32 decompressed)
+{
+ struct bio *orig_bio = cb->orig_bio;
+ /* Offset inside the full decompressed extent */
+ u32 cur_offset;
+
+ cur_offset = decompressed;
+ /* The main loop to do the copy */
+ while (cur_offset < decompressed + buf_len) {
+ struct bio_vec bvec;
+ size_t copy_len;
+ u32 copy_start;
+ /* Offset inside the full decompressed extent */
+ u32 bvec_offset;
+
+ bvec = bio_iter_iovec(orig_bio, orig_bio->bi_iter);
+ /*
+ * cb->start may underflow, but subtracting that value can still
+ * give us correct offset inside the full decompressed extent.
+ */
+ bvec_offset = page_offset(bvec.bv_page) + bvec.bv_offset - cb->start;
+
+ /* Haven't reached the bvec range, exit */
+ if (decompressed + buf_len <= bvec_offset)
+ return 1;
+
+ copy_start = max(cur_offset, bvec_offset);
+ copy_len = min(bvec_offset + bvec.bv_len,
+ decompressed + buf_len) - copy_start;
+ ASSERT(copy_len);
+
+ /*
+ * Extra range check to ensure we didn't go beyond
+ * @buf + @buf_len.
+ */
+ ASSERT(copy_start - decompressed < buf_len);
+ memcpy_to_page(bvec.bv_page, bvec.bv_offset,
+ buf + copy_start - decompressed, copy_len);
+ cur_offset += copy_len;
+
+ bio_advance(orig_bio, copy_len);
+ /* Finished the bio */
+ if (!orig_bio->bi_iter.bi_size)
+ return 0;
+ }
+ return 1;
+}
+
+/*
+ * Shannon Entropy calculation
+ *
+ * Pure byte distribution analysis fails to determine compressibility of data.
+ * Try calculating entropy to estimate the average minimum number of bits
+ * needed to encode the sampled data.
+ *
+ * For convenience, return the percentage of needed bits, instead of amount of
+ * bits directly.
+ *
+ * @ENTROPY_LVL_ACEPTABLE - below that threshold, sample has low byte entropy
+ * and can be compressible with high probability
+ *
+ * @ENTROPY_LVL_HIGH - data are not compressible with high probability
+ *
+ * Use of ilog2() decreases precision, we lower the LVL to 5 to compensate.
+ */
+#define ENTROPY_LVL_ACEPTABLE (65)
+#define ENTROPY_LVL_HIGH (80)
+
+/*
+ * For increasead precision in shannon_entropy calculation,
+ * let's do pow(n, M) to save more digits after comma:
+ *
+ * - maximum int bit length is 64
+ * - ilog2(MAX_SAMPLE_SIZE) -> 13
+ * - 13 * 4 = 52 < 64 -> M = 4
+ *
+ * So use pow(n, 4).
+ */
+static inline u32 ilog2_w(u64 n)
+{
+ return ilog2(n * n * n * n);
+}
+
+static u32 shannon_entropy(struct heuristic_ws *ws)
+{
+ const u32 entropy_max = 8 * ilog2_w(2);
+ u32 entropy_sum = 0;
+ u32 p, p_base, sz_base;
+ u32 i;
+
+ sz_base = ilog2_w(ws->sample_size);
+ for (i = 0; i < BUCKET_SIZE && ws->bucket[i].count > 0; i++) {
+ p = ws->bucket[i].count;
+ p_base = ilog2_w(p);
+ entropy_sum += p * (sz_base - p_base);
+ }
+
+ entropy_sum /= ws->sample_size;
+ return entropy_sum * 100 / entropy_max;
+}
+
+#define RADIX_BASE 4U
+#define COUNTERS_SIZE (1U << RADIX_BASE)
+
+static u8 get4bits(u64 num, int shift) {
+ u8 low4bits;
+
+ num >>= shift;
+ /* Reverse order */
+ low4bits = (COUNTERS_SIZE - 1) - (num % COUNTERS_SIZE);
+ return low4bits;
+}
+
+/*
+ * Use 4 bits as radix base
+ * Use 16 u32 counters for calculating new position in buf array
+ *
+ * @array - array that will be sorted
+ * @array_buf - buffer array to store sorting results
+ * must be equal in size to @array
+ * @num - array size
+ */
+static void radix_sort(struct bucket_item *array, struct bucket_item *array_buf,
+ int num)
+{
+ u64 max_num;
+ u64 buf_num;
+ u32 counters[COUNTERS_SIZE];
+ u32 new_addr;
+ u32 addr;
+ int bitlen;
+ int shift;
+ int i;
+
+ /*
+ * Try avoid useless loop iterations for small numbers stored in big
+ * counters. Example: 48 33 4 ... in 64bit array
+ */
+ max_num = array[0].count;
+ for (i = 1; i < num; i++) {
+ buf_num = array[i].count;
+ if (buf_num > max_num)
+ max_num = buf_num;
+ }
+
+ buf_num = ilog2(max_num);
+ bitlen = ALIGN(buf_num, RADIX_BASE * 2);
+
+ shift = 0;
+ while (shift < bitlen) {
+ memset(counters, 0, sizeof(counters));
+
+ for (i = 0; i < num; i++) {
+ buf_num = array[i].count;
+ addr = get4bits(buf_num, shift);
+ counters[addr]++;
+ }
+
+ for (i = 1; i < COUNTERS_SIZE; i++)
+ counters[i] += counters[i - 1];
+
+ for (i = num - 1; i >= 0; i--) {
+ buf_num = array[i].count;
+ addr = get4bits(buf_num, shift);
+ counters[addr]--;
+ new_addr = counters[addr];
+ array_buf[new_addr] = array[i];
+ }
+
+ shift += RADIX_BASE;
+
+ /*
+ * Normal radix expects to move data from a temporary array, to
+ * the main one. But that requires some CPU time. Avoid that
+ * by doing another sort iteration to original array instead of
+ * memcpy()
+ */
+ memset(counters, 0, sizeof(counters));
+
+ for (i = 0; i < num; i ++) {
+ buf_num = array_buf[i].count;
+ addr = get4bits(buf_num, shift);
+ counters[addr]++;
+ }
+
+ for (i = 1; i < COUNTERS_SIZE; i++)
+ counters[i] += counters[i - 1];
+
+ for (i = num - 1; i >= 0; i--) {
+ buf_num = array_buf[i].count;
+ addr = get4bits(buf_num, shift);
+ counters[addr]--;
+ new_addr = counters[addr];
+ array[new_addr] = array_buf[i];
+ }
+
+ shift += RADIX_BASE;
+ }
+}
+
+/*
+ * Size of the core byte set - how many bytes cover 90% of the sample
+ *
+ * There are several types of structured binary data that use nearly all byte
+ * values. The distribution can be uniform and counts in all buckets will be
+ * nearly the same (eg. encrypted data). Unlikely to be compressible.
+ *
+ * Other possibility is normal (Gaussian) distribution, where the data could
+ * be potentially compressible, but we have to take a few more steps to decide
+ * how much.
+ *
+ * @BYTE_CORE_SET_LOW - main part of byte values repeated frequently,
+ * compression algo can easy fix that
+ * @BYTE_CORE_SET_HIGH - data have uniform distribution and with high
+ * probability is not compressible
+ */
+#define BYTE_CORE_SET_LOW (64)
+#define BYTE_CORE_SET_HIGH (200)
+
+static int byte_core_set_size(struct heuristic_ws *ws)
+{
+ u32 i;
+ u32 coreset_sum = 0;
+ const u32 core_set_threshold = ws->sample_size * 90 / 100;
+ struct bucket_item *bucket = ws->bucket;
+
+ /* Sort in reverse order */
+ radix_sort(ws->bucket, ws->bucket_b, BUCKET_SIZE);
+
+ for (i = 0; i < BYTE_CORE_SET_LOW; i++)
+ coreset_sum += bucket[i].count;
+
+ if (coreset_sum > core_set_threshold)
+ return i;
+
+ for (; i < BYTE_CORE_SET_HIGH && bucket[i].count > 0; i++) {
+ coreset_sum += bucket[i].count;
+ if (coreset_sum > core_set_threshold)
+ break;
+ }
+
+ return i;
+}
+
+/*
+ * Count byte values in buckets.
+ * This heuristic can detect textual data (configs, xml, json, html, etc).
+ * Because in most text-like data byte set is restricted to limited number of
+ * possible characters, and that restriction in most cases makes data easy to
+ * compress.
+ *
+ * @BYTE_SET_THRESHOLD - consider all data within this byte set size:
+ * less - compressible
+ * more - need additional analysis
+ */
+#define BYTE_SET_THRESHOLD (64)
+
+static u32 byte_set_size(const struct heuristic_ws *ws)
+{
+ u32 i;
+ u32 byte_set_size = 0;
+
+ for (i = 0; i < BYTE_SET_THRESHOLD; i++) {
+ if (ws->bucket[i].count > 0)
+ byte_set_size++;
+ }
+
+ /*
+ * Continue collecting count of byte values in buckets. If the byte
+ * set size is bigger then the threshold, it's pointless to continue,
+ * the detection technique would fail for this type of data.
+ */
+ for (; i < BUCKET_SIZE; i++) {
+ if (ws->bucket[i].count > 0) {
+ byte_set_size++;
+ if (byte_set_size > BYTE_SET_THRESHOLD)
+ return byte_set_size;
+ }
+ }
+
+ return byte_set_size;
+}
+
+static bool sample_repeated_patterns(struct heuristic_ws *ws)
+{
+ const u32 half_of_sample = ws->sample_size / 2;
+ const u8 *data = ws->sample;
+
+ return memcmp(&data[0], &data[half_of_sample], half_of_sample) == 0;
+}
+
+static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
+ struct heuristic_ws *ws)
+{
+ struct page *page;
+ u64 index, index_end;
+ u32 i, curr_sample_pos;
+ u8 *in_data;
+
+ /*
+ * Compression handles the input data by chunks of 128KiB
+ * (defined by BTRFS_MAX_UNCOMPRESSED)
+ *
+ * We do the same for the heuristic and loop over the whole range.
+ *
+ * MAX_SAMPLE_SIZE - calculated under assumption that heuristic will
+ * process no more than BTRFS_MAX_UNCOMPRESSED at a time.
+ */
+ if (end - start > BTRFS_MAX_UNCOMPRESSED)
+ end = start + BTRFS_MAX_UNCOMPRESSED;
+
+ index = start >> PAGE_SHIFT;
+ index_end = end >> PAGE_SHIFT;
+
+ /* Don't miss unaligned end */
+ if (!IS_ALIGNED(end, PAGE_SIZE))
+ index_end++;
+
+ curr_sample_pos = 0;
+ while (index < index_end) {
+ page = find_get_page(inode->i_mapping, index);
+ in_data = kmap_local_page(page);
+ /* Handle case where the start is not aligned to PAGE_SIZE */
+ i = start % PAGE_SIZE;
+ while (i < PAGE_SIZE - SAMPLING_READ_SIZE) {
+ /* Don't sample any garbage from the last page */
+ if (start > end - SAMPLING_READ_SIZE)
+ break;
+ memcpy(&ws->sample[curr_sample_pos], &in_data[i],
+ SAMPLING_READ_SIZE);
+ i += SAMPLING_INTERVAL;
+ start += SAMPLING_INTERVAL;
+ curr_sample_pos += SAMPLING_READ_SIZE;
+ }
+ kunmap_local(in_data);
+ put_page(page);
+
+ index++;
+ }
+
+ ws->sample_size = curr_sample_pos;
+}
+
+/*
+ * Compression heuristic.
+ *
+ * For now is's a naive and optimistic 'return true', we'll extend the logic to
+ * quickly (compared to direct compression) detect data characteristics
+ * (compressible/uncompressible) to avoid wasting CPU time on uncompressible
+ * data.
+ *
+ * The following types of analysis can be performed:
+ * - detect mostly zero data
+ * - detect data with low "byte set" size (text, etc)
+ * - detect data with low/high "core byte" set
+ *
+ * Return non-zero if the compression should be done, 0 otherwise.
+ */
+int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
+{
+ struct list_head *ws_list = get_workspace(0, 0);
+ struct heuristic_ws *ws;
+ u32 i;
+ u8 byte;
+ int ret = 0;
+
+ ws = list_entry(ws_list, struct heuristic_ws, list);
+
+ heuristic_collect_sample(inode, start, end, ws);
+
+ if (sample_repeated_patterns(ws)) {
+ ret = 1;
+ goto out;
+ }
+
+ memset(ws->bucket, 0, sizeof(*ws->bucket)*BUCKET_SIZE);
+
+ for (i = 0; i < ws->sample_size; i++) {
+ byte = ws->sample[i];
+ ws->bucket[byte].count++;
+ }
+
+ i = byte_set_size(ws);
+ if (i < BYTE_SET_THRESHOLD) {
+ ret = 2;
+ goto out;
+ }
+
+ i = byte_core_set_size(ws);
+ if (i <= BYTE_CORE_SET_LOW) {
+ ret = 3;
+ goto out;
+ }
+
+ if (i >= BYTE_CORE_SET_HIGH) {
+ ret = 0;
+ goto out;
+ }
+
+ i = shannon_entropy(ws);
+ if (i <= ENTROPY_LVL_ACEPTABLE) {
+ ret = 4;
+ goto out;
+ }
+
+ /*
+ * For the levels below ENTROPY_LVL_HIGH, additional analysis would be
+ * needed to give green light to compression.
+ *
+ * For now just assume that compression at that level is not worth the
+ * resources because:
+ *
+ * 1. it is possible to defrag the data later
+ *
+ * 2. the data would turn out to be hardly compressible, eg. 150 byte
+ * values, every bucket has counter at level ~54. The heuristic would
+ * be confused. This can happen when data have some internal repeated
+ * patterns like "abbacbbc...". This can be detected by analyzing
+ * pairs of bytes, which is too costly.
+ */
+ if (i < ENTROPY_LVL_HIGH) {
+ ret = 5;
+ goto out;
+ } else {
+ ret = 0;
+ goto out;
+ }
+
+out:
+ put_workspace(0, ws_list);
+ return ret;
+}
+
+/*
+ * Convert the compression suffix (eg. after "zlib" starting with ":") to
+ * level, unrecognized string will set the default level
+ */
+unsigned int btrfs_compress_str2level(unsigned int type, const char *str)
+{
+ unsigned int level = 0;
+ int ret;
+
+ if (!type)
+ return 0;
+
+ if (str[0] == ':') {
+ ret = kstrtouint(str + 1, 10, &level);
+ if (ret)
+ level = 0;
+ }
+
+ level = btrfs_compress_set_level(type, level);
+
+ return level;
+}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
new file mode 100644
index 000000000..1aa02903d
--- /dev/null
+++ b/fs/btrfs/compression.h
@@ -0,0 +1,183 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ */
+
+#ifndef BTRFS_COMPRESSION_H
+#define BTRFS_COMPRESSION_H
+
+#include <linux/sizes.h>
+
+struct btrfs_inode;
+
+/*
+ * We want to make sure that amount of RAM required to uncompress an extent is
+ * reasonable, so we limit the total size in ram of a compressed extent to
+ * 128k. This is a crucial number because it also controls how easily we can
+ * spread reads across cpus for decompression.
+ *
+ * We also want to make sure the amount of IO required to do a random read is
+ * reasonably small, so we limit the size of a compressed extent to 128k.
+ */
+
+/* Maximum length of compressed data stored on disk */
+#define BTRFS_MAX_COMPRESSED (SZ_128K)
+static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0);
+
+/* Maximum size of data before compression */
+#define BTRFS_MAX_UNCOMPRESSED (SZ_128K)
+
+#define BTRFS_ZLIB_DEFAULT_LEVEL 3
+
+struct compressed_bio {
+ /* Number of outstanding bios */
+ refcount_t pending_ios;
+
+ /* Number of compressed pages in the array */
+ unsigned int nr_pages;
+
+ /* the pages with the compressed data on them */
+ struct page **compressed_pages;
+
+ /* inode that owns this data */
+ struct inode *inode;
+
+ /* starting offset in the inode for our pages */
+ u64 start;
+
+ /* Number of bytes in the inode we're working on */
+ unsigned int len;
+
+ /* Number of bytes on disk */
+ unsigned int compressed_len;
+
+ /* The compression algorithm for this bio */
+ u8 compress_type;
+
+ /* Whether this is a write for writeback. */
+ bool writeback;
+
+ /* IO errors */
+ blk_status_t status;
+
+ union {
+ /* For reads, this is the bio we are copying the data into */
+ struct bio *orig_bio;
+ struct work_struct write_end_work;
+ };
+};
+
+static inline unsigned int btrfs_compress_type(unsigned int type_level)
+{
+ return (type_level & 0xF);
+}
+
+static inline unsigned int btrfs_compress_level(unsigned int type_level)
+{
+ return ((type_level & 0xF0) >> 4);
+}
+
+void __init btrfs_init_compress(void);
+void __cold btrfs_exit_compress(void);
+
+int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
+ u64 start, struct page **pages,
+ unsigned long *out_pages,
+ unsigned long *total_in,
+ unsigned long *total_out);
+int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
+ unsigned long start_byte, size_t srclen, size_t destlen);
+int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
+ struct compressed_bio *cb, u32 decompressed);
+
+blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
+ unsigned int len, u64 disk_start,
+ unsigned int compressed_len,
+ struct page **compressed_pages,
+ unsigned int nr_pages,
+ blk_opf_t write_flags,
+ struct cgroup_subsys_state *blkcg_css,
+ bool writeback);
+void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+ int mirror_num);
+
+unsigned int btrfs_compress_str2level(unsigned int type, const char *str);
+
+enum btrfs_compression_type {
+ BTRFS_COMPRESS_NONE = 0,
+ BTRFS_COMPRESS_ZLIB = 1,
+ BTRFS_COMPRESS_LZO = 2,
+ BTRFS_COMPRESS_ZSTD = 3,
+ BTRFS_NR_COMPRESS_TYPES = 4,
+};
+
+struct workspace_manager {
+ struct list_head idle_ws;
+ spinlock_t ws_lock;
+ /* Number of free workspaces */
+ int free_ws;
+ /* Total number of allocated workspaces */
+ atomic_t total_ws;
+ /* Waiters for a free workspace */
+ wait_queue_head_t ws_wait;
+};
+
+struct list_head *btrfs_get_workspace(int type, unsigned int level);
+void btrfs_put_workspace(int type, struct list_head *ws);
+
+struct btrfs_compress_op {
+ struct workspace_manager *workspace_manager;
+ /* Maximum level supported by the compression algorithm */
+ unsigned int max_level;
+ unsigned int default_level;
+};
+
+/* The heuristic workspaces are managed via the 0th workspace manager */
+#define BTRFS_NR_WORKSPACE_MANAGERS BTRFS_NR_COMPRESS_TYPES
+
+extern const struct btrfs_compress_op btrfs_heuristic_compress;
+extern const struct btrfs_compress_op btrfs_zlib_compress;
+extern const struct btrfs_compress_op btrfs_lzo_compress;
+extern const struct btrfs_compress_op btrfs_zstd_compress;
+
+const char* btrfs_compress_type2str(enum btrfs_compression_type type);
+bool btrfs_compress_is_valid_type(const char *str, size_t len);
+
+int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end);
+
+int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out);
+int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
+int zlib_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen);
+struct list_head *zlib_alloc_workspace(unsigned int level);
+void zlib_free_workspace(struct list_head *ws);
+struct list_head *zlib_get_workspace(unsigned int level);
+
+int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out);
+int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
+int lzo_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen);
+struct list_head *lzo_alloc_workspace(unsigned int level);
+void lzo_free_workspace(struct list_head *ws);
+
+int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out);
+int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
+int zstd_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen);
+void zstd_init_workspace_manager(void);
+void zstd_cleanup_workspace_manager(void);
+struct list_head *zstd_alloc_workspace(unsigned int level);
+void zstd_free_workspace(struct list_head *ws);
+struct list_head *zstd_get_workspace(unsigned int level);
+void zstd_put_workspace(struct list_head *ws);
+
+#endif
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
new file mode 100644
index 000000000..e08688844
--- /dev/null
+++ b/fs/btrfs/ctree.c
@@ -0,0 +1,5011 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2007,2008 Oracle. All rights reserved.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+#include <linux/mm.h>
+#include <linux/error-injection.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+#include "locking.h"
+#include "volumes.h"
+#include "qgroup.h"
+#include "tree-mod-log.h"
+#include "tree-checker.h"
+
+static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_path *path, int level);
+static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ const struct btrfs_key *ins_key, struct btrfs_path *path,
+ int data_size, int extend);
+static int push_node_left(struct btrfs_trans_handle *trans,
+ struct extent_buffer *dst,
+ struct extent_buffer *src, int empty);
+static int balance_node_right(struct btrfs_trans_handle *trans,
+ struct extent_buffer *dst_buf,
+ struct extent_buffer *src_buf);
+static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
+ int level, int slot);
+
+static const struct btrfs_csums {
+ u16 size;
+ const char name[10];
+ const char driver[12];
+} btrfs_csums[] = {
+ [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
+ [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
+ [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
+ [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
+ .driver = "blake2b-256" },
+};
+
+int btrfs_super_csum_size(const struct btrfs_super_block *s)
+{
+ u16 t = btrfs_super_csum_type(s);
+ /*
+ * csum type is validated at mount time
+ */
+ return btrfs_csums[t].size;
+}
+
+const char *btrfs_super_csum_name(u16 csum_type)
+{
+ /* csum type is validated at mount time */
+ return btrfs_csums[csum_type].name;
+}
+
+/*
+ * Return driver name if defined, otherwise the name that's also a valid driver
+ * name
+ */
+const char *btrfs_super_csum_driver(u16 csum_type)
+{
+ /* csum type is validated at mount time */
+ return btrfs_csums[csum_type].driver[0] ?
+ btrfs_csums[csum_type].driver :
+ btrfs_csums[csum_type].name;
+}
+
+size_t __attribute_const__ btrfs_get_num_csums(void)
+{
+ return ARRAY_SIZE(btrfs_csums);
+}
+
+struct btrfs_path *btrfs_alloc_path(void)
+{
+ return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
+}
+
+/* this also releases the path */
+void btrfs_free_path(struct btrfs_path *p)
+{
+ if (!p)
+ return;
+ btrfs_release_path(p);
+ kmem_cache_free(btrfs_path_cachep, p);
+}
+
+/*
+ * path release drops references on the extent buffers in the path
+ * and it drops any locks held by this path
+ *
+ * It is safe to call this on paths that no locks or extent buffers held.
+ */
+noinline void btrfs_release_path(struct btrfs_path *p)
+{
+ int i;
+
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+ p->slots[i] = 0;
+ if (!p->nodes[i])
+ continue;
+ if (p->locks[i]) {
+ btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
+ p->locks[i] = 0;
+ }
+ free_extent_buffer(p->nodes[i]);
+ p->nodes[i] = NULL;
+ }
+}
+
+/*
+ * We want the transaction abort to print stack trace only for errors where the
+ * cause could be a bug, eg. due to ENOSPC, and not for common errors that are
+ * caused by external factors.
+ */
+bool __cold abort_should_print_stack(int errno)
+{
+ switch (errno) {
+ case -EIO:
+ case -EROFS:
+ case -ENOMEM:
+ return false;
+ }
+ return true;
+}
+
+/*
+ * safely gets a reference on the root node of a tree. A lock
+ * is not taken, so a concurrent writer may put a different node
+ * at the root of the tree. See btrfs_lock_root_node for the
+ * looping required.
+ *
+ * The extent buffer returned by this has a reference taken, so
+ * it won't disappear. It may stop being the root of the tree
+ * at any time because there are no locks held.
+ */
+struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
+{
+ struct extent_buffer *eb;
+
+ while (1) {
+ rcu_read_lock();
+ eb = rcu_dereference(root->node);
+
+ /*
+ * RCU really hurts here, we could free up the root node because
+ * it was COWed but we may not get the new root node yet so do
+ * the inc_not_zero dance and if it doesn't work then
+ * synchronize_rcu and try again.
+ */
+ if (atomic_inc_not_zero(&eb->refs)) {
+ rcu_read_unlock();
+ break;
+ }
+ rcu_read_unlock();
+ synchronize_rcu();
+ }
+ return eb;
+}
+
+/*
+ * Cowonly root (not-shareable trees, everything not subvolume or reloc roots),
+ * just get put onto a simple dirty list. Transaction walks this list to make
+ * sure they get properly updated on disk.
+ */
+static void add_root_to_dirty_list(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
+ !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
+ return;
+
+ spin_lock(&fs_info->trans_lock);
+ if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
+ /* Want the extent tree to be the last on the list */
+ if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ list_move_tail(&root->dirty_list,
+ &fs_info->dirty_cowonly_roots);
+ else
+ list_move(&root->dirty_list,
+ &fs_info->dirty_cowonly_roots);
+ }
+ spin_unlock(&fs_info->trans_lock);
+}
+
+/*
+ * used by snapshot creation to make a copy of a root for a tree with
+ * a given objectid. The buffer with the new root node is returned in
+ * cow_ret, and this func returns zero on success or a negative error code.
+ */
+int btrfs_copy_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ struct extent_buffer **cow_ret, u64 new_root_objectid)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_buffer *cow;
+ int ret = 0;
+ int level;
+ struct btrfs_disk_key disk_key;
+
+ WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
+ trans->transid != fs_info->running_transaction->transid);
+ WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
+ trans->transid != root->last_trans);
+
+ level = btrfs_header_level(buf);
+ if (level == 0)
+ btrfs_item_key(buf, &disk_key, 0);
+ else
+ btrfs_node_key(buf, &disk_key, 0);
+
+ cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
+ &disk_key, level, buf->start, 0,
+ BTRFS_NESTING_NEW_ROOT);
+ if (IS_ERR(cow))
+ return PTR_ERR(cow);
+
+ copy_extent_buffer_full(cow, buf);
+ btrfs_set_header_bytenr(cow, cow->start);
+ btrfs_set_header_generation(cow, trans->transid);
+ btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
+ btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
+ BTRFS_HEADER_FLAG_RELOC);
+ if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
+ btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
+ else
+ btrfs_set_header_owner(cow, new_root_objectid);
+
+ write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
+
+ WARN_ON(btrfs_header_generation(buf) > trans->transid);
+ if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
+ ret = btrfs_inc_ref(trans, root, cow, 1);
+ else
+ ret = btrfs_inc_ref(trans, root, cow, 0);
+ if (ret) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+
+ btrfs_mark_buffer_dirty(cow);
+ *cow_ret = cow;
+ return 0;
+}
+
+/*
+ * check if the tree block can be shared by multiple trees
+ */
+int btrfs_block_can_be_shared(struct btrfs_root *root,
+ struct extent_buffer *buf)
+{
+ /*
+ * Tree blocks not in shareable trees and tree roots are never shared.
+ * If a block was allocated after the last snapshot and the block was
+ * not allocated by tree relocation, we know the block is not shared.
+ */
+ if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
+ buf != root->node && buf != root->commit_root &&
+ (btrfs_header_generation(buf) <=
+ btrfs_root_last_snapshot(&root->root_item) ||
+ btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
+ return 1;
+
+ return 0;
+}
+
+static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ struct extent_buffer *cow,
+ int *last_ref)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 refs;
+ u64 owner;
+ u64 flags;
+ u64 new_flags = 0;
+ int ret;
+
+ /*
+ * Backrefs update rules:
+ *
+ * Always use full backrefs for extent pointers in tree block
+ * allocated by tree relocation.
+ *
+ * If a shared tree block is no longer referenced by its owner
+ * tree (btrfs_header_owner(buf) == root->root_key.objectid),
+ * use full backrefs for extent pointers in tree block.
+ *
+ * If a tree block is been relocating
+ * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
+ * use full backrefs for extent pointers in tree block.
+ * The reason for this is some operations (such as drop tree)
+ * are only allowed for blocks use full backrefs.
+ */
+
+ if (btrfs_block_can_be_shared(root, buf)) {
+ ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
+ btrfs_header_level(buf), 1,
+ &refs, &flags);
+ if (ret)
+ return ret;
+ if (refs == 0) {
+ ret = -EROFS;
+ btrfs_handle_fs_error(fs_info, ret, NULL);
+ return ret;
+ }
+ } else {
+ refs = 1;
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
+ btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
+ flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+ else
+ flags = 0;
+ }
+
+ owner = btrfs_header_owner(buf);
+ BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
+ !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
+
+ if (refs > 1) {
+ if ((owner == root->root_key.objectid ||
+ root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
+ !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
+ ret = btrfs_inc_ref(trans, root, buf, 1);
+ if (ret)
+ return ret;
+
+ if (root->root_key.objectid ==
+ BTRFS_TREE_RELOC_OBJECTID) {
+ ret = btrfs_dec_ref(trans, root, buf, 0);
+ if (ret)
+ return ret;
+ ret = btrfs_inc_ref(trans, root, cow, 1);
+ if (ret)
+ return ret;
+ }
+ new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
+ } else {
+
+ if (root->root_key.objectid ==
+ BTRFS_TREE_RELOC_OBJECTID)
+ ret = btrfs_inc_ref(trans, root, cow, 1);
+ else
+ ret = btrfs_inc_ref(trans, root, cow, 0);
+ if (ret)
+ return ret;
+ }
+ if (new_flags != 0) {
+ int level = btrfs_header_level(buf);
+
+ ret = btrfs_set_disk_extent_flags(trans, buf,
+ new_flags, level);
+ if (ret)
+ return ret;
+ }
+ } else {
+ if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
+ if (root->root_key.objectid ==
+ BTRFS_TREE_RELOC_OBJECTID)
+ ret = btrfs_inc_ref(trans, root, cow, 1);
+ else
+ ret = btrfs_inc_ref(trans, root, cow, 0);
+ if (ret)
+ return ret;
+ ret = btrfs_dec_ref(trans, root, buf, 1);
+ if (ret)
+ return ret;
+ }
+ btrfs_clean_tree_block(buf);
+ *last_ref = 1;
+ }
+ return 0;
+}
+
+/*
+ * does the dirty work in cow of a single block. The parent block (if
+ * supplied) is updated to point to the new cow copy. The new buffer is marked
+ * dirty and returned locked. If you modify the block it needs to be marked
+ * dirty again.
+ *
+ * search_start -- an allocation hint for the new block
+ *
+ * empty_size -- a hint that you plan on doing more cow. This is the size in
+ * bytes the allocator should try to find free next to the block it returns.
+ * This is just a hint and may be ignored by the allocator.
+ */
+static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ struct extent_buffer *parent, int parent_slot,
+ struct extent_buffer **cow_ret,
+ u64 search_start, u64 empty_size,
+ enum btrfs_lock_nesting nest)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_disk_key disk_key;
+ struct extent_buffer *cow;
+ int level, ret;
+ int last_ref = 0;
+ int unlock_orig = 0;
+ u64 parent_start = 0;
+
+ if (*cow_ret == buf)
+ unlock_orig = 1;
+
+ btrfs_assert_tree_write_locked(buf);
+
+ WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
+ trans->transid != fs_info->running_transaction->transid);
+ WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
+ trans->transid != root->last_trans);
+
+ level = btrfs_header_level(buf);
+
+ if (level == 0)
+ btrfs_item_key(buf, &disk_key, 0);
+ else
+ btrfs_node_key(buf, &disk_key, 0);
+
+ if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
+ parent_start = parent->start;
+
+ cow = btrfs_alloc_tree_block(trans, root, parent_start,
+ root->root_key.objectid, &disk_key, level,
+ search_start, empty_size, nest);
+ if (IS_ERR(cow))
+ return PTR_ERR(cow);
+
+ /* cow is set to blocking by btrfs_init_new_buffer */
+
+ copy_extent_buffer_full(cow, buf);
+ btrfs_set_header_bytenr(cow, cow->start);
+ btrfs_set_header_generation(cow, trans->transid);
+ btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
+ btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
+ BTRFS_HEADER_FLAG_RELOC);
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+ btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
+ else
+ btrfs_set_header_owner(cow, root->root_key.objectid);
+
+ write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
+
+ ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
+ if (ret) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+
+ if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
+ ret = btrfs_reloc_cow_block(trans, root, buf, cow);
+ if (ret) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+ }
+
+ if (buf == root->node) {
+ WARN_ON(parent && parent != buf);
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
+ btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
+ parent_start = buf->start;
+
+ ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
+ if (ret < 0) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+ atomic_inc(&cow->refs);
+ rcu_assign_pointer(root->node, cow);
+
+ btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
+ parent_start, last_ref);
+ free_extent_buffer(buf);
+ add_root_to_dirty_list(root);
+ } else {
+ WARN_ON(trans->transid != btrfs_header_generation(parent));
+ btrfs_tree_mod_log_insert_key(parent, parent_slot,
+ BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ btrfs_set_node_blockptr(parent, parent_slot,
+ cow->start);
+ btrfs_set_node_ptr_generation(parent, parent_slot,
+ trans->transid);
+ btrfs_mark_buffer_dirty(parent);
+ if (last_ref) {
+ ret = btrfs_tree_mod_log_free_eb(buf);
+ if (ret) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+ }
+ btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
+ parent_start, last_ref);
+ }
+ if (unlock_orig)
+ btrfs_tree_unlock(buf);
+ free_extent_buffer_stale(buf);
+ btrfs_mark_buffer_dirty(cow);
+ *cow_ret = cow;
+ return 0;
+}
+
+static inline int should_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf)
+{
+ if (btrfs_is_testing(root->fs_info))
+ return 0;
+
+ /* Ensure we can see the FORCE_COW bit */
+ smp_mb__before_atomic();
+
+ /*
+ * We do not need to cow a block if
+ * 1) this block is not created or changed in this transaction;
+ * 2) this block does not belong to TREE_RELOC tree;
+ * 3) the root is not forced COW.
+ *
+ * What is forced COW:
+ * when we create snapshot during committing the transaction,
+ * after we've finished copying src root, we must COW the shared
+ * block to ensure the metadata consistency.
+ */
+ if (btrfs_header_generation(buf) == trans->transid &&
+ !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
+ !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
+ btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
+ !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
+ return 0;
+ return 1;
+}
+
+/*
+ * cows a single block, see __btrfs_cow_block for the real work.
+ * This version of it has extra checks so that a block isn't COWed more than
+ * once per transaction, as long as it hasn't been written yet
+ */
+noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *buf,
+ struct extent_buffer *parent, int parent_slot,
+ struct extent_buffer **cow_ret,
+ enum btrfs_lock_nesting nest)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 search_start;
+ int ret;
+
+ if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) {
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ btrfs_crit(fs_info,
+ "attempt to COW block %llu on root %llu that is being deleted",
+ buf->start, btrfs_root_id(root));
+ return -EUCLEAN;
+ }
+
+ /*
+ * COWing must happen through a running transaction, which always
+ * matches the current fs generation (it's a transaction with a state
+ * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
+ * into error state to prevent the commit of any transaction.
+ */
+ if (unlikely(trans->transaction != fs_info->running_transaction ||
+ trans->transid != fs_info->generation)) {
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ btrfs_crit(fs_info,
+"unexpected transaction when attempting to COW block %llu on root %llu, transaction %llu running transaction %llu fs generation %llu",
+ buf->start, btrfs_root_id(root), trans->transid,
+ fs_info->running_transaction->transid,
+ fs_info->generation);
+ return -EUCLEAN;
+ }
+
+ if (!should_cow_block(trans, root, buf)) {
+ *cow_ret = buf;
+ return 0;
+ }
+
+ search_start = buf->start & ~((u64)SZ_1G - 1);
+
+ /*
+ * Before CoWing this block for later modification, check if it's
+ * the subtree root and do the delayed subtree trace if needed.
+ *
+ * Also We don't care about the error, as it's handled internally.
+ */
+ btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
+ ret = __btrfs_cow_block(trans, root, buf, parent,
+ parent_slot, cow_ret, search_start, 0, nest);
+
+ trace_btrfs_cow_block(root, buf, *cow_ret);
+
+ return ret;
+}
+ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
+
+/*
+ * helper function for defrag to decide if two blocks pointed to by a
+ * node are actually close by
+ */
+static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
+{
+ if (blocknr < other && other - (blocknr + blocksize) < 32768)
+ return 1;
+ if (blocknr > other && blocknr - (other + blocksize) < 32768)
+ return 1;
+ return 0;
+}
+
+#ifdef __LITTLE_ENDIAN
+
+/*
+ * Compare two keys, on little-endian the disk order is same as CPU order and
+ * we can avoid the conversion.
+ */
+static int comp_keys(const struct btrfs_disk_key *disk_key,
+ const struct btrfs_key *k2)
+{
+ const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
+
+ return btrfs_comp_cpu_keys(k1, k2);
+}
+
+#else
+
+/*
+ * compare two keys in a memcmp fashion
+ */
+static int comp_keys(const struct btrfs_disk_key *disk,
+ const struct btrfs_key *k2)
+{
+ struct btrfs_key k1;
+
+ btrfs_disk_key_to_cpu(&k1, disk);
+
+ return btrfs_comp_cpu_keys(&k1, k2);
+}
+#endif
+
+/*
+ * same as comp_keys only with two btrfs_key's
+ */
+int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
+{
+ if (k1->objectid > k2->objectid)
+ return 1;
+ if (k1->objectid < k2->objectid)
+ return -1;
+ if (k1->type > k2->type)
+ return 1;
+ if (k1->type < k2->type)
+ return -1;
+ if (k1->offset > k2->offset)
+ return 1;
+ if (k1->offset < k2->offset)
+ return -1;
+ return 0;
+}
+
+/*
+ * this is used by the defrag code to go through all the
+ * leaves pointed to by a node and reallocate them so that
+ * disk order is close to key order
+ */
+int btrfs_realloc_node(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *parent,
+ int start_slot, u64 *last_ret,
+ struct btrfs_key *progress)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_buffer *cur;
+ u64 blocknr;
+ u64 search_start = *last_ret;
+ u64 last_block = 0;
+ u64 other;
+ u32 parent_nritems;
+ int end_slot;
+ int i;
+ int err = 0;
+ u32 blocksize;
+ int progress_passed = 0;
+ struct btrfs_disk_key disk_key;
+
+ /*
+ * COWing must happen through a running transaction, which always
+ * matches the current fs generation (it's a transaction with a state
+ * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
+ * into error state to prevent the commit of any transaction.
+ */
+ if (unlikely(trans->transaction != fs_info->running_transaction ||
+ trans->transid != fs_info->generation)) {
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ btrfs_crit(fs_info,
+"unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu",
+ parent->start, btrfs_root_id(root), trans->transid,
+ fs_info->running_transaction->transid,
+ fs_info->generation);
+ return -EUCLEAN;
+ }
+
+ parent_nritems = btrfs_header_nritems(parent);
+ blocksize = fs_info->nodesize;
+ end_slot = parent_nritems - 1;
+
+ if (parent_nritems <= 1)
+ return 0;
+
+ for (i = start_slot; i <= end_slot; i++) {
+ int close = 1;
+
+ btrfs_node_key(parent, &disk_key, i);
+ if (!progress_passed && comp_keys(&disk_key, progress) < 0)
+ continue;
+
+ progress_passed = 1;
+ blocknr = btrfs_node_blockptr(parent, i);
+ if (last_block == 0)
+ last_block = blocknr;
+
+ if (i > 0) {
+ other = btrfs_node_blockptr(parent, i - 1);
+ close = close_blocks(blocknr, other, blocksize);
+ }
+ if (!close && i < end_slot) {
+ other = btrfs_node_blockptr(parent, i + 1);
+ close = close_blocks(blocknr, other, blocksize);
+ }
+ if (close) {
+ last_block = blocknr;
+ continue;
+ }
+
+ cur = btrfs_read_node_slot(parent, i);
+ if (IS_ERR(cur))
+ return PTR_ERR(cur);
+ if (search_start == 0)
+ search_start = last_block;
+
+ btrfs_tree_lock(cur);
+ err = __btrfs_cow_block(trans, root, cur, parent, i,
+ &cur, search_start,
+ min(16 * blocksize,
+ (end_slot - i) * blocksize),
+ BTRFS_NESTING_COW);
+ if (err) {
+ btrfs_tree_unlock(cur);
+ free_extent_buffer(cur);
+ break;
+ }
+ search_start = cur->start;
+ last_block = cur->start;
+ *last_ret = search_start;
+ btrfs_tree_unlock(cur);
+ free_extent_buffer(cur);
+ }
+ return err;
+}
+
+/*
+ * Search for a key in the given extent_buffer.
+ *
+ * The lower boundary for the search is specified by the slot number @low. Use a
+ * value of 0 to search over the whole extent buffer.
+ *
+ * The slot in the extent buffer is returned via @slot. If the key exists in the
+ * extent buffer, then @slot will point to the slot where the key is, otherwise
+ * it points to the slot where you would insert the key.
+ *
+ * Slot may point to the total number of items (i.e. one position beyond the last
+ * key) if the key is bigger than the last key in the extent buffer.
+ */
+static noinline int generic_bin_search(struct extent_buffer *eb, int low,
+ const struct btrfs_key *key, int *slot)
+{
+ unsigned long p;
+ int item_size;
+ int high = btrfs_header_nritems(eb);
+ int ret;
+ const int key_size = sizeof(struct btrfs_disk_key);
+
+ if (low > high) {
+ btrfs_err(eb->fs_info,
+ "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
+ __func__, low, high, eb->start,
+ btrfs_header_owner(eb), btrfs_header_level(eb));
+ return -EINVAL;
+ }
+
+ if (btrfs_header_level(eb) == 0) {
+ p = offsetof(struct btrfs_leaf, items);
+ item_size = sizeof(struct btrfs_item);
+ } else {
+ p = offsetof(struct btrfs_node, ptrs);
+ item_size = sizeof(struct btrfs_key_ptr);
+ }
+
+ while (low < high) {
+ unsigned long oip;
+ unsigned long offset;
+ struct btrfs_disk_key *tmp;
+ struct btrfs_disk_key unaligned;
+ int mid;
+
+ mid = (low + high) / 2;
+ offset = p + mid * item_size;
+ oip = offset_in_page(offset);
+
+ if (oip + key_size <= PAGE_SIZE) {
+ const unsigned long idx = get_eb_page_index(offset);
+ char *kaddr = page_address(eb->pages[idx]);
+
+ oip = get_eb_offset_in_page(eb, offset);
+ tmp = (struct btrfs_disk_key *)(kaddr + oip);
+ } else {
+ read_extent_buffer(eb, &unaligned, offset, key_size);
+ tmp = &unaligned;
+ }
+
+ ret = comp_keys(tmp, key);
+
+ if (ret < 0)
+ low = mid + 1;
+ else if (ret > 0)
+ high = mid;
+ else {
+ *slot = mid;
+ return 0;
+ }
+ }
+ *slot = low;
+ return 1;
+}
+
+/*
+ * Simple binary search on an extent buffer. Works for both leaves and nodes, and
+ * always searches over the whole range of keys (slot 0 to slot 'nritems - 1').
+ */
+int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
+ int *slot)
+{
+ return generic_bin_search(eb, 0, key, slot);
+}
+
+static void root_add_used(struct btrfs_root *root, u32 size)
+{
+ spin_lock(&root->accounting_lock);
+ btrfs_set_root_used(&root->root_item,
+ btrfs_root_used(&root->root_item) + size);
+ spin_unlock(&root->accounting_lock);
+}
+
+static void root_sub_used(struct btrfs_root *root, u32 size)
+{
+ spin_lock(&root->accounting_lock);
+ btrfs_set_root_used(&root->root_item,
+ btrfs_root_used(&root->root_item) - size);
+ spin_unlock(&root->accounting_lock);
+}
+
+/* given a node and slot number, this reads the blocks it points to. The
+ * extent buffer is returned with a reference taken (but unlocked).
+ */
+struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
+ int slot)
+{
+ int level = btrfs_header_level(parent);
+ struct extent_buffer *eb;
+ struct btrfs_key first_key;
+
+ if (slot < 0 || slot >= btrfs_header_nritems(parent))
+ return ERR_PTR(-ENOENT);
+
+ BUG_ON(level == 0);
+
+ btrfs_node_key_to_cpu(parent, &first_key, slot);
+ eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
+ btrfs_header_owner(parent),
+ btrfs_node_ptr_generation(parent, slot),
+ level - 1, &first_key);
+ if (IS_ERR(eb))
+ return eb;
+ if (!extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ return ERR_PTR(-EIO);
+ }
+
+ return eb;
+}
+
+/*
+ * node level balancing, used to make sure nodes are in proper order for
+ * item deletion. We balance from the top down, so we have to make sure
+ * that a deletion won't leave an node completely empty later on.
+ */
+static noinline int balance_level(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int level)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_buffer *right = NULL;
+ struct extent_buffer *mid;
+ struct extent_buffer *left = NULL;
+ struct extent_buffer *parent = NULL;
+ int ret = 0;
+ int wret;
+ int pslot;
+ int orig_slot = path->slots[level];
+ u64 orig_ptr;
+
+ ASSERT(level > 0);
+
+ mid = path->nodes[level];
+
+ WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
+ WARN_ON(btrfs_header_generation(mid) != trans->transid);
+
+ orig_ptr = btrfs_node_blockptr(mid, orig_slot);
+
+ if (level < BTRFS_MAX_LEVEL - 1) {
+ parent = path->nodes[level + 1];
+ pslot = path->slots[level + 1];
+ }
+
+ /*
+ * deal with the case where there is only one pointer in the root
+ * by promoting the node below to a root
+ */
+ if (!parent) {
+ struct extent_buffer *child;
+
+ if (btrfs_header_nritems(mid) != 1)
+ return 0;
+
+ /* promote the child to a root */
+ child = btrfs_read_node_slot(mid, 0);
+ if (IS_ERR(child)) {
+ ret = PTR_ERR(child);
+ btrfs_handle_fs_error(fs_info, ret, NULL);
+ goto enospc;
+ }
+
+ btrfs_tree_lock(child);
+ ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
+ BTRFS_NESTING_COW);
+ if (ret) {
+ btrfs_tree_unlock(child);
+ free_extent_buffer(child);
+ goto enospc;
+ }
+
+ ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
+ if (ret < 0) {
+ btrfs_tree_unlock(child);
+ free_extent_buffer(child);
+ btrfs_abort_transaction(trans, ret);
+ goto enospc;
+ }
+ rcu_assign_pointer(root->node, child);
+
+ add_root_to_dirty_list(root);
+ btrfs_tree_unlock(child);
+
+ path->locks[level] = 0;
+ path->nodes[level] = NULL;
+ btrfs_clean_tree_block(mid);
+ btrfs_tree_unlock(mid);
+ /* once for the path */
+ free_extent_buffer(mid);
+
+ root_sub_used(root, mid->len);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
+ /* once for the root ptr */
+ free_extent_buffer_stale(mid);
+ return 0;
+ }
+ if (btrfs_header_nritems(mid) >
+ BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
+ return 0;
+
+ left = btrfs_read_node_slot(parent, pslot - 1);
+ if (IS_ERR(left))
+ left = NULL;
+
+ if (left) {
+ __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
+ wret = btrfs_cow_block(trans, root, left,
+ parent, pslot - 1, &left,
+ BTRFS_NESTING_LEFT_COW);
+ if (wret) {
+ ret = wret;
+ goto enospc;
+ }
+ }
+
+ right = btrfs_read_node_slot(parent, pslot + 1);
+ if (IS_ERR(right))
+ right = NULL;
+
+ if (right) {
+ __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
+ wret = btrfs_cow_block(trans, root, right,
+ parent, pslot + 1, &right,
+ BTRFS_NESTING_RIGHT_COW);
+ if (wret) {
+ ret = wret;
+ goto enospc;
+ }
+ }
+
+ /* first, try to make some room in the middle buffer */
+ if (left) {
+ orig_slot += btrfs_header_nritems(left);
+ wret = push_node_left(trans, left, mid, 1);
+ if (wret < 0)
+ ret = wret;
+ }
+
+ /*
+ * then try to empty the right most buffer into the middle
+ */
+ if (right) {
+ wret = push_node_left(trans, mid, right, 1);
+ if (wret < 0 && wret != -ENOSPC)
+ ret = wret;
+ if (btrfs_header_nritems(right) == 0) {
+ btrfs_clean_tree_block(right);
+ btrfs_tree_unlock(right);
+ del_ptr(root, path, level + 1, pslot + 1);
+ root_sub_used(root, right->len);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), right,
+ 0, 1);
+ free_extent_buffer_stale(right);
+ right = NULL;
+ } else {
+ struct btrfs_disk_key right_key;
+ btrfs_node_key(right, &right_key, 0);
+ ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
+ BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto enospc;
+ }
+ btrfs_set_node_key(parent, &right_key, pslot + 1);
+ btrfs_mark_buffer_dirty(parent);
+ }
+ }
+ if (btrfs_header_nritems(mid) == 1) {
+ /*
+ * we're not allowed to leave a node with one item in the
+ * tree during a delete. A deletion from lower in the tree
+ * could try to delete the only pointer in this node.
+ * So, pull some keys from the left.
+ * There has to be a left pointer at this point because
+ * otherwise we would have pulled some pointers from the
+ * right
+ */
+ if (!left) {
+ ret = -EROFS;
+ btrfs_handle_fs_error(fs_info, ret, NULL);
+ goto enospc;
+ }
+ wret = balance_node_right(trans, mid, left);
+ if (wret < 0) {
+ ret = wret;
+ goto enospc;
+ }
+ if (wret == 1) {
+ wret = push_node_left(trans, left, mid, 1);
+ if (wret < 0)
+ ret = wret;
+ }
+ BUG_ON(wret == 1);
+ }
+ if (btrfs_header_nritems(mid) == 0) {
+ btrfs_clean_tree_block(mid);
+ btrfs_tree_unlock(mid);
+ del_ptr(root, path, level + 1, pslot);
+ root_sub_used(root, mid->len);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
+ free_extent_buffer_stale(mid);
+ mid = NULL;
+ } else {
+ /* update the parent key to reflect our changes */
+ struct btrfs_disk_key mid_key;
+ btrfs_node_key(mid, &mid_key, 0);
+ ret = btrfs_tree_mod_log_insert_key(parent, pslot,
+ BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto enospc;
+ }
+ btrfs_set_node_key(parent, &mid_key, pslot);
+ btrfs_mark_buffer_dirty(parent);
+ }
+
+ /* update the path */
+ if (left) {
+ if (btrfs_header_nritems(left) > orig_slot) {
+ atomic_inc(&left->refs);
+ /* left was locked after cow */
+ path->nodes[level] = left;
+ path->slots[level + 1] -= 1;
+ path->slots[level] = orig_slot;
+ if (mid) {
+ btrfs_tree_unlock(mid);
+ free_extent_buffer(mid);
+ }
+ } else {
+ orig_slot -= btrfs_header_nritems(left);
+ path->slots[level] = orig_slot;
+ }
+ }
+ /* double check we haven't messed things up */
+ if (orig_ptr !=
+ btrfs_node_blockptr(path->nodes[level], path->slots[level]))
+ BUG();
+enospc:
+ if (right) {
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ }
+ if (left) {
+ if (path->nodes[level] != left)
+ btrfs_tree_unlock(left);
+ free_extent_buffer(left);
+ }
+ return ret;
+}
+
+/* Node balancing for insertion. Here we only split or push nodes around
+ * when they are completely full. This is also done top down, so we
+ * have to be pessimistic.
+ */
+static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int level)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_buffer *right = NULL;
+ struct extent_buffer *mid;
+ struct extent_buffer *left = NULL;
+ struct extent_buffer *parent = NULL;
+ int ret = 0;
+ int wret;
+ int pslot;
+ int orig_slot = path->slots[level];
+
+ if (level == 0)
+ return 1;
+
+ mid = path->nodes[level];
+ WARN_ON(btrfs_header_generation(mid) != trans->transid);
+
+ if (level < BTRFS_MAX_LEVEL - 1) {
+ parent = path->nodes[level + 1];
+ pslot = path->slots[level + 1];
+ }
+
+ if (!parent)
+ return 1;
+
+ left = btrfs_read_node_slot(parent, pslot - 1);
+ if (IS_ERR(left))
+ left = NULL;
+
+ /* first, try to make some room in the middle buffer */
+ if (left) {
+ u32 left_nr;
+
+ __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
+
+ left_nr = btrfs_header_nritems(left);
+ if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
+ wret = 1;
+ } else {
+ ret = btrfs_cow_block(trans, root, left, parent,
+ pslot - 1, &left,
+ BTRFS_NESTING_LEFT_COW);
+ if (ret)
+ wret = 1;
+ else {
+ wret = push_node_left(trans, left, mid, 0);
+ }
+ }
+ if (wret < 0)
+ ret = wret;
+ if (wret == 0) {
+ struct btrfs_disk_key disk_key;
+ orig_slot += left_nr;
+ btrfs_node_key(mid, &disk_key, 0);
+ ret = btrfs_tree_mod_log_insert_key(parent, pslot,
+ BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ BUG_ON(ret < 0);
+ btrfs_set_node_key(parent, &disk_key, pslot);
+ btrfs_mark_buffer_dirty(parent);
+ if (btrfs_header_nritems(left) > orig_slot) {
+ path->nodes[level] = left;
+ path->slots[level + 1] -= 1;
+ path->slots[level] = orig_slot;
+ btrfs_tree_unlock(mid);
+ free_extent_buffer(mid);
+ } else {
+ orig_slot -=
+ btrfs_header_nritems(left);
+ path->slots[level] = orig_slot;
+ btrfs_tree_unlock(left);
+ free_extent_buffer(left);
+ }
+ return 0;
+ }
+ btrfs_tree_unlock(left);
+ free_extent_buffer(left);
+ }
+ right = btrfs_read_node_slot(parent, pslot + 1);
+ if (IS_ERR(right))
+ right = NULL;
+
+ /*
+ * then try to empty the right most buffer into the middle
+ */
+ if (right) {
+ u32 right_nr;
+
+ __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
+
+ right_nr = btrfs_header_nritems(right);
+ if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
+ wret = 1;
+ } else {
+ ret = btrfs_cow_block(trans, root, right,
+ parent, pslot + 1,
+ &right, BTRFS_NESTING_RIGHT_COW);
+ if (ret)
+ wret = 1;
+ else {
+ wret = balance_node_right(trans, right, mid);
+ }
+ }
+ if (wret < 0)
+ ret = wret;
+ if (wret == 0) {
+ struct btrfs_disk_key disk_key;
+
+ btrfs_node_key(right, &disk_key, 0);
+ ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
+ BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ BUG_ON(ret < 0);
+ btrfs_set_node_key(parent, &disk_key, pslot + 1);
+ btrfs_mark_buffer_dirty(parent);
+
+ if (btrfs_header_nritems(mid) <= orig_slot) {
+ path->nodes[level] = right;
+ path->slots[level + 1] += 1;
+ path->slots[level] = orig_slot -
+ btrfs_header_nritems(mid);
+ btrfs_tree_unlock(mid);
+ free_extent_buffer(mid);
+ } else {
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ }
+ return 0;
+ }
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ }
+ return 1;
+}
+
+/*
+ * readahead one full node of leaves, finding things that are close
+ * to the block in 'slot', and triggering ra on them.
+ */
+static void reada_for_search(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ int level, int slot, u64 objectid)
+{
+ struct extent_buffer *node;
+ struct btrfs_disk_key disk_key;
+ u32 nritems;
+ u64 search;
+ u64 target;
+ u64 nread = 0;
+ u64 nread_max;
+ u32 nr;
+ u32 blocksize;
+ u32 nscan = 0;
+
+ if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
+ return;
+
+ if (!path->nodes[level])
+ return;
+
+ node = path->nodes[level];
+
+ /*
+ * Since the time between visiting leaves is much shorter than the time
+ * between visiting nodes, limit read ahead of nodes to 1, to avoid too
+ * much IO at once (possibly random).
+ */
+ if (path->reada == READA_FORWARD_ALWAYS) {
+ if (level > 1)
+ nread_max = node->fs_info->nodesize;
+ else
+ nread_max = SZ_128K;
+ } else {
+ nread_max = SZ_64K;
+ }
+
+ search = btrfs_node_blockptr(node, slot);
+ blocksize = fs_info->nodesize;
+ if (path->reada != READA_FORWARD_ALWAYS) {
+ struct extent_buffer *eb;
+
+ eb = find_extent_buffer(fs_info, search);
+ if (eb) {
+ free_extent_buffer(eb);
+ return;
+ }
+ }
+
+ target = search;
+
+ nritems = btrfs_header_nritems(node);
+ nr = slot;
+
+ while (1) {
+ if (path->reada == READA_BACK) {
+ if (nr == 0)
+ break;
+ nr--;
+ } else if (path->reada == READA_FORWARD ||
+ path->reada == READA_FORWARD_ALWAYS) {
+ nr++;
+ if (nr >= nritems)
+ break;
+ }
+ if (path->reada == READA_BACK && objectid) {
+ btrfs_node_key(node, &disk_key, nr);
+ if (btrfs_disk_key_objectid(&disk_key) != objectid)
+ break;
+ }
+ search = btrfs_node_blockptr(node, nr);
+ if (path->reada == READA_FORWARD_ALWAYS ||
+ (search <= target && target - search <= 65536) ||
+ (search > target && search - target <= 65536)) {
+ btrfs_readahead_node_child(node, nr);
+ nread += blocksize;
+ }
+ nscan++;
+ if (nread > nread_max || nscan > 32)
+ break;
+ }
+}
+
+static noinline void reada_for_balance(struct btrfs_path *path, int level)
+{
+ struct extent_buffer *parent;
+ int slot;
+ int nritems;
+
+ parent = path->nodes[level + 1];
+ if (!parent)
+ return;
+
+ nritems = btrfs_header_nritems(parent);
+ slot = path->slots[level + 1];
+
+ if (slot > 0)
+ btrfs_readahead_node_child(parent, slot - 1);
+ if (slot + 1 < nritems)
+ btrfs_readahead_node_child(parent, slot + 1);
+}
+
+
+/*
+ * when we walk down the tree, it is usually safe to unlock the higher layers
+ * in the tree. The exceptions are when our path goes through slot 0, because
+ * operations on the tree might require changing key pointers higher up in the
+ * tree.
+ *
+ * callers might also have set path->keep_locks, which tells this code to keep
+ * the lock if the path points to the last slot in the block. This is part of
+ * walking through the tree, and selecting the next slot in the higher block.
+ *
+ * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so
+ * if lowest_unlock is 1, level 0 won't be unlocked
+ */
+static noinline void unlock_up(struct btrfs_path *path, int level,
+ int lowest_unlock, int min_write_lock_level,
+ int *write_lock_level)
+{
+ int i;
+ int skip_level = level;
+ bool check_skip = true;
+
+ for (i = level; i < BTRFS_MAX_LEVEL; i++) {
+ if (!path->nodes[i])
+ break;
+ if (!path->locks[i])
+ break;
+
+ if (check_skip) {
+ if (path->slots[i] == 0) {
+ skip_level = i + 1;
+ continue;
+ }
+
+ if (path->keep_locks) {
+ u32 nritems;
+
+ nritems = btrfs_header_nritems(path->nodes[i]);
+ if (nritems < 1 || path->slots[i] >= nritems - 1) {
+ skip_level = i + 1;
+ continue;
+ }
+ }
+ }
+
+ if (i >= lowest_unlock && i > skip_level) {
+ check_skip = false;
+ btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
+ path->locks[i] = 0;
+ if (write_lock_level &&
+ i > min_write_lock_level &&
+ i <= *write_lock_level) {
+ *write_lock_level = i - 1;
+ }
+ }
+ }
+}
+
+/*
+ * Helper function for btrfs_search_slot() and other functions that do a search
+ * on a btree. The goal is to find a tree block in the cache (the radix tree at
+ * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
+ * its pages from disk.
+ *
+ * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
+ * whole btree search, starting again from the current root node.
+ */
+static int
+read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
+ struct extent_buffer **eb_ret, int level, int slot,
+ const struct btrfs_key *key)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 blocknr;
+ u64 gen;
+ struct extent_buffer *tmp;
+ struct btrfs_key first_key;
+ int ret;
+ int parent_level;
+ bool unlock_up;
+
+ unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
+ blocknr = btrfs_node_blockptr(*eb_ret, slot);
+ gen = btrfs_node_ptr_generation(*eb_ret, slot);
+ parent_level = btrfs_header_level(*eb_ret);
+ btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);
+
+ /*
+ * If we need to read an extent buffer from disk and we are holding locks
+ * on upper level nodes, we unlock all the upper nodes before reading the
+ * extent buffer, and then return -EAGAIN to the caller as it needs to
+ * restart the search. We don't release the lock on the current level
+ * because we need to walk this node to figure out which blocks to read.
+ */
+ tmp = find_extent_buffer(fs_info, blocknr);
+ if (tmp) {
+ if (p->reada == READA_FORWARD_ALWAYS)
+ reada_for_search(fs_info, p, level, slot, key->objectid);
+
+ /* first we do an atomic uptodate check */
+ if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
+ /*
+ * Do extra check for first_key, eb can be stale due to
+ * being cached, read from scrub, or have multiple
+ * parents (shared tree blocks).
+ */
+ if (btrfs_verify_level_key(tmp,
+ parent_level - 1, &first_key, gen)) {
+ free_extent_buffer(tmp);
+ return -EUCLEAN;
+ }
+ *eb_ret = tmp;
+ return 0;
+ }
+
+ if (p->nowait) {
+ free_extent_buffer(tmp);
+ return -EAGAIN;
+ }
+
+ if (unlock_up)
+ btrfs_unlock_up_safe(p, level + 1);
+
+ /* now we're allowed to do a blocking uptodate check */
+ ret = btrfs_read_extent_buffer(tmp, gen, parent_level - 1, &first_key);
+ if (ret) {
+ free_extent_buffer(tmp);
+ btrfs_release_path(p);
+ return -EIO;
+ }
+ if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) {
+ free_extent_buffer(tmp);
+ btrfs_release_path(p);
+ return -EUCLEAN;
+ }
+
+ if (unlock_up)
+ ret = -EAGAIN;
+
+ goto out;
+ } else if (p->nowait) {
+ return -EAGAIN;
+ }
+
+ if (unlock_up) {
+ btrfs_unlock_up_safe(p, level + 1);
+ ret = -EAGAIN;
+ } else {
+ ret = 0;
+ }
+
+ if (p->reada != READA_NONE)
+ reada_for_search(fs_info, p, level, slot, key->objectid);
+
+ tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid,
+ gen, parent_level - 1, &first_key);
+ if (IS_ERR(tmp)) {
+ btrfs_release_path(p);
+ return PTR_ERR(tmp);
+ }
+ /*
+ * If the read above didn't mark this buffer up to date,
+ * it will never end up being up to date. Set ret to EIO now
+ * and give up so that our caller doesn't loop forever
+ * on our EAGAINs.
+ */
+ if (!extent_buffer_uptodate(tmp))
+ ret = -EIO;
+
+out:
+ if (ret == 0) {
+ *eb_ret = tmp;
+ } else {
+ free_extent_buffer(tmp);
+ btrfs_release_path(p);
+ }
+
+ return ret;
+}
+
+/*
+ * helper function for btrfs_search_slot. This does all of the checks
+ * for node-level blocks and does any balancing required based on
+ * the ins_len.
+ *
+ * If no extra work was required, zero is returned. If we had to
+ * drop the path, -EAGAIN is returned and btrfs_search_slot must
+ * start over
+ */
+static int
+setup_nodes_for_search(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *p,
+ struct extent_buffer *b, int level, int ins_len,
+ int *write_lock_level)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret = 0;
+
+ if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
+ BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
+
+ if (*write_lock_level < level + 1) {
+ *write_lock_level = level + 1;
+ btrfs_release_path(p);
+ return -EAGAIN;
+ }
+
+ reada_for_balance(p, level);
+ ret = split_node(trans, root, p, level);
+
+ b = p->nodes[level];
+ } else if (ins_len < 0 && btrfs_header_nritems(b) <
+ BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
+
+ if (*write_lock_level < level + 1) {
+ *write_lock_level = level + 1;
+ btrfs_release_path(p);
+ return -EAGAIN;
+ }
+
+ reada_for_balance(p, level);
+ ret = balance_level(trans, root, p, level);
+ if (ret)
+ return ret;
+
+ b = p->nodes[level];
+ if (!b) {
+ btrfs_release_path(p);
+ return -EAGAIN;
+ }
+ BUG_ON(btrfs_header_nritems(b) == 1);
+ }
+ return ret;
+}
+
+int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
+ u64 iobjectid, u64 ioff, u8 key_type,
+ struct btrfs_key *found_key)
+{
+ int ret;
+ struct btrfs_key key;
+ struct extent_buffer *eb;
+
+ ASSERT(path);
+ ASSERT(found_key);
+
+ key.type = key_type;
+ key.objectid = iobjectid;
+ key.offset = ioff;
+
+ ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ eb = path->nodes[0];
+ if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(fs_root, path);
+ if (ret)
+ return ret;
+ eb = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
+ if (found_key->type != key.type ||
+ found_key->objectid != key.objectid)
+ return 1;
+
+ return 0;
+}
+
+static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
+ struct btrfs_path *p,
+ int write_lock_level)
+{
+ struct extent_buffer *b;
+ int root_lock = 0;
+ int level = 0;
+
+ if (p->search_commit_root) {
+ b = root->commit_root;
+ atomic_inc(&b->refs);
+ level = btrfs_header_level(b);
+ /*
+ * Ensure that all callers have set skip_locking when
+ * p->search_commit_root = 1.
+ */
+ ASSERT(p->skip_locking == 1);
+
+ goto out;
+ }
+
+ if (p->skip_locking) {
+ b = btrfs_root_node(root);
+ level = btrfs_header_level(b);
+ goto out;
+ }
+
+ /* We try very hard to do read locks on the root */
+ root_lock = BTRFS_READ_LOCK;
+
+ /*
+ * If the level is set to maximum, we can skip trying to get the read
+ * lock.
+ */
+ if (write_lock_level < BTRFS_MAX_LEVEL) {
+ /*
+ * We don't know the level of the root node until we actually
+ * have it read locked
+ */
+ if (p->nowait) {
+ b = btrfs_try_read_lock_root_node(root);
+ if (IS_ERR(b))
+ return b;
+ } else {
+ b = btrfs_read_lock_root_node(root);
+ }
+ level = btrfs_header_level(b);
+ if (level > write_lock_level)
+ goto out;
+
+ /* Whoops, must trade for write lock */
+ btrfs_tree_read_unlock(b);
+ free_extent_buffer(b);
+ }
+
+ b = btrfs_lock_root_node(root);
+ root_lock = BTRFS_WRITE_LOCK;
+
+ /* The level might have changed, check again */
+ level = btrfs_header_level(b);
+
+out:
+ /*
+ * The root may have failed to write out at some point, and thus is no
+ * longer valid, return an error in this case.
+ */
+ if (!extent_buffer_uptodate(b)) {
+ if (root_lock)
+ btrfs_tree_unlock_rw(b, root_lock);
+ free_extent_buffer(b);
+ return ERR_PTR(-EIO);
+ }
+
+ p->nodes[level] = b;
+ if (!p->skip_locking)
+ p->locks[level] = root_lock;
+ /*
+ * Callers are responsible for dropping b's references.
+ */
+ return b;
+}
+
+/*
+ * Replace the extent buffer at the lowest level of the path with a cloned
+ * version. The purpose is to be able to use it safely, after releasing the
+ * commit root semaphore, even if relocation is happening in parallel, the
+ * transaction used for relocation is committed and the extent buffer is
+ * reallocated in the next transaction.
+ *
+ * This is used in a context where the caller does not prevent transaction
+ * commits from happening, either by holding a transaction handle or holding
+ * some lock, while it's doing searches through a commit root.
+ * At the moment it's only used for send operations.
+ */
+static int finish_need_commit_sem_search(struct btrfs_path *path)
+{
+ const int i = path->lowest_level;
+ const int slot = path->slots[i];
+ struct extent_buffer *lowest = path->nodes[i];
+ struct extent_buffer *clone;
+
+ ASSERT(path->need_commit_sem);
+
+ if (!lowest)
+ return 0;
+
+ lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);
+
+ clone = btrfs_clone_extent_buffer(lowest);
+ if (!clone)
+ return -ENOMEM;
+
+ btrfs_release_path(path);
+ path->nodes[i] = clone;
+ path->slots[i] = slot;
+
+ return 0;
+}
+
+static inline int search_for_key_slot(struct extent_buffer *eb,
+ int search_low_slot,
+ const struct btrfs_key *key,
+ int prev_cmp,
+ int *slot)
+{
+ /*
+ * If a previous call to btrfs_bin_search() on a parent node returned an
+ * exact match (prev_cmp == 0), we can safely assume the target key will
+ * always be at slot 0 on lower levels, since each key pointer
+ * (struct btrfs_key_ptr) refers to the lowest key accessible from the
+ * subtree it points to. Thus we can skip searching lower levels.
+ */
+ if (prev_cmp == 0) {
+ *slot = 0;
+ return 0;
+ }
+
+ return generic_bin_search(eb, search_low_slot, key, slot);
+}
+
+static int search_leaf(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const struct btrfs_key *key,
+ struct btrfs_path *path,
+ int ins_len,
+ int prev_cmp)
+{
+ struct extent_buffer *leaf = path->nodes[0];
+ int leaf_free_space = -1;
+ int search_low_slot = 0;
+ int ret;
+ bool do_bin_search = true;
+
+ /*
+ * If we are doing an insertion, the leaf has enough free space and the
+ * destination slot for the key is not slot 0, then we can unlock our
+ * write lock on the parent, and any other upper nodes, before doing the
+ * binary search on the leaf (with search_for_key_slot()), allowing other
+ * tasks to lock the parent and any other upper nodes.
+ */
+ if (ins_len > 0) {
+ /*
+ * Cache the leaf free space, since we will need it later and it
+ * will not change until then.
+ */
+ leaf_free_space = btrfs_leaf_free_space(leaf);
+
+ /*
+ * !path->locks[1] means we have a single node tree, the leaf is
+ * the root of the tree.
+ */
+ if (path->locks[1] && leaf_free_space >= ins_len) {
+ struct btrfs_disk_key first_key;
+
+ ASSERT(btrfs_header_nritems(leaf) > 0);
+ btrfs_item_key(leaf, &first_key, 0);
+
+ /*
+ * Doing the extra comparison with the first key is cheap,
+ * taking into account that the first key is very likely
+ * already in a cache line because it immediately follows
+ * the extent buffer's header and we have recently accessed
+ * the header's level field.
+ */
+ ret = comp_keys(&first_key, key);
+ if (ret < 0) {
+ /*
+ * The first key is smaller than the key we want
+ * to insert, so we are safe to unlock all upper
+ * nodes and we have to do the binary search.
+ *
+ * We do use btrfs_unlock_up_safe() and not
+ * unlock_up() because the later does not unlock
+ * nodes with a slot of 0 - we can safely unlock
+ * any node even if its slot is 0 since in this
+ * case the key does not end up at slot 0 of the
+ * leaf and there's no need to split the leaf.
+ */
+ btrfs_unlock_up_safe(path, 1);
+ search_low_slot = 1;
+ } else {
+ /*
+ * The first key is >= then the key we want to
+ * insert, so we can skip the binary search as
+ * the target key will be at slot 0.
+ *
+ * We can not unlock upper nodes when the key is
+ * less than the first key, because we will need
+ * to update the key at slot 0 of the parent node
+ * and possibly of other upper nodes too.
+ * If the key matches the first key, then we can
+ * unlock all the upper nodes, using
+ * btrfs_unlock_up_safe() instead of unlock_up()
+ * as stated above.
+ */
+ if (ret == 0)
+ btrfs_unlock_up_safe(path, 1);
+ /*
+ * ret is already 0 or 1, matching the result of
+ * a btrfs_bin_search() call, so there is no need
+ * to adjust it.
+ */
+ do_bin_search = false;
+ path->slots[0] = 0;
+ }
+ }
+ }
+
+ if (do_bin_search) {
+ ret = search_for_key_slot(leaf, search_low_slot, key,
+ prev_cmp, &path->slots[0]);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (ins_len > 0) {
+ /*
+ * Item key already exists. In this case, if we are allowed to
+ * insert the item (for example, in dir_item case, item key
+ * collision is allowed), it will be merged with the original
+ * item. Only the item size grows, no new btrfs item will be
+ * added. If search_for_extension is not set, ins_len already
+ * accounts the size btrfs_item, deduct it here so leaf space
+ * check will be correct.
+ */
+ if (ret == 0 && !path->search_for_extension) {
+ ASSERT(ins_len >= sizeof(struct btrfs_item));
+ ins_len -= sizeof(struct btrfs_item);
+ }
+
+ ASSERT(leaf_free_space >= 0);
+
+ if (leaf_free_space < ins_len) {
+ int err;
+
+ err = split_leaf(trans, root, key, path, ins_len,
+ (ret == 0));
+ ASSERT(err <= 0);
+ if (WARN_ON(err > 0))
+ err = -EUCLEAN;
+ if (err)
+ ret = err;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * btrfs_search_slot - look for a key in a tree and perform necessary
+ * modifications to preserve tree invariants.
+ *
+ * @trans: Handle of transaction, used when modifying the tree
+ * @p: Holds all btree nodes along the search path
+ * @root: The root node of the tree
+ * @key: The key we are looking for
+ * @ins_len: Indicates purpose of search:
+ * >0 for inserts it's size of item inserted (*)
+ * <0 for deletions
+ * 0 for plain searches, not modifying the tree
+ *
+ * (*) If size of item inserted doesn't include
+ * sizeof(struct btrfs_item), then p->search_for_extension must
+ * be set.
+ * @cow: boolean should CoW operations be performed. Must always be 1
+ * when modifying the tree.
+ *
+ * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
+ * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
+ *
+ * If @key is found, 0 is returned and you can find the item in the leaf level
+ * of the path (level 0)
+ *
+ * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
+ * points to the slot where it should be inserted
+ *
+ * If an error is encountered while searching the tree a negative error number
+ * is returned
+ */
+int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ const struct btrfs_key *key, struct btrfs_path *p,
+ int ins_len, int cow)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_buffer *b;
+ int slot;
+ int ret;
+ int err;
+ int level;
+ int lowest_unlock = 1;
+ /* everything at write_lock_level or lower must be write locked */
+ int write_lock_level = 0;
+ u8 lowest_level = 0;
+ int min_write_lock_level;
+ int prev_cmp;
+
+ lowest_level = p->lowest_level;
+ WARN_ON(lowest_level && ins_len > 0);
+ WARN_ON(p->nodes[0] != NULL);
+ BUG_ON(!cow && ins_len);
+
+ /*
+ * For now only allow nowait for read only operations. There's no
+ * strict reason why we can't, we just only need it for reads so it's
+ * only implemented for reads.
+ */
+ ASSERT(!p->nowait || !cow);
+
+ if (ins_len < 0) {
+ lowest_unlock = 2;
+
+ /* when we are removing items, we might have to go up to level
+ * two as we update tree pointers Make sure we keep write
+ * for those levels as well
+ */
+ write_lock_level = 2;
+ } else if (ins_len > 0) {
+ /*
+ * for inserting items, make sure we have a write lock on
+ * level 1 so we can update keys
+ */
+ write_lock_level = 1;
+ }
+
+ if (!cow)
+ write_lock_level = -1;
+
+ if (cow && (p->keep_locks || p->lowest_level))
+ write_lock_level = BTRFS_MAX_LEVEL;
+
+ min_write_lock_level = write_lock_level;
+
+ if (p->need_commit_sem) {
+ ASSERT(p->search_commit_root);
+ if (p->nowait) {
+ if (!down_read_trylock(&fs_info->commit_root_sem))
+ return -EAGAIN;
+ } else {
+ down_read(&fs_info->commit_root_sem);
+ }
+ }
+
+again:
+ prev_cmp = -1;
+ b = btrfs_search_slot_get_root(root, p, write_lock_level);
+ if (IS_ERR(b)) {
+ ret = PTR_ERR(b);
+ goto done;
+ }
+
+ while (b) {
+ int dec = 0;
+
+ level = btrfs_header_level(b);
+
+ if (cow) {
+ bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
+
+ /*
+ * if we don't really need to cow this block
+ * then we don't want to set the path blocking,
+ * so we test it here
+ */
+ if (!should_cow_block(trans, root, b))
+ goto cow_done;
+
+ /*
+ * must have write locks on this node and the
+ * parent
+ */
+ if (level > write_lock_level ||
+ (level + 1 > write_lock_level &&
+ level + 1 < BTRFS_MAX_LEVEL &&
+ p->nodes[level + 1])) {
+ write_lock_level = level + 1;
+ btrfs_release_path(p);
+ goto again;
+ }
+
+ if (last_level)
+ err = btrfs_cow_block(trans, root, b, NULL, 0,
+ &b,
+ BTRFS_NESTING_COW);
+ else
+ err = btrfs_cow_block(trans, root, b,
+ p->nodes[level + 1],
+ p->slots[level + 1], &b,
+ BTRFS_NESTING_COW);
+ if (err) {
+ ret = err;
+ goto done;
+ }
+ }
+cow_done:
+ p->nodes[level] = b;
+
+ /*
+ * we have a lock on b and as long as we aren't changing
+ * the tree, there is no way to for the items in b to change.
+ * It is safe to drop the lock on our parent before we
+ * go through the expensive btree search on b.
+ *
+ * If we're inserting or deleting (ins_len != 0), then we might
+ * be changing slot zero, which may require changing the parent.
+ * So, we can't drop the lock until after we know which slot
+ * we're operating on.
+ */
+ if (!ins_len && !p->keep_locks) {
+ int u = level + 1;
+
+ if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
+ btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
+ p->locks[u] = 0;
+ }
+ }
+
+ if (level == 0) {
+ if (ins_len > 0)
+ ASSERT(write_lock_level >= 1);
+
+ ret = search_leaf(trans, root, key, p, ins_len, prev_cmp);
+ if (!p->search_for_split)
+ unlock_up(p, level, lowest_unlock,
+ min_write_lock_level, NULL);
+ goto done;
+ }
+
+ ret = search_for_key_slot(b, 0, key, prev_cmp, &slot);
+ if (ret < 0)
+ goto done;
+ prev_cmp = ret;
+
+ if (ret && slot > 0) {
+ dec = 1;
+ slot--;
+ }
+ p->slots[level] = slot;
+ err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
+ &write_lock_level);
+ if (err == -EAGAIN)
+ goto again;
+ if (err) {
+ ret = err;
+ goto done;
+ }
+ b = p->nodes[level];
+ slot = p->slots[level];
+
+ /*
+ * Slot 0 is special, if we change the key we have to update
+ * the parent pointer which means we must have a write lock on
+ * the parent
+ */
+ if (slot == 0 && ins_len && write_lock_level < level + 1) {
+ write_lock_level = level + 1;
+ btrfs_release_path(p);
+ goto again;
+ }
+
+ unlock_up(p, level, lowest_unlock, min_write_lock_level,
+ &write_lock_level);
+
+ if (level == lowest_level) {
+ if (dec)
+ p->slots[level]++;
+ goto done;
+ }
+
+ err = read_block_for_search(root, p, &b, level, slot, key);
+ if (err == -EAGAIN)
+ goto again;
+ if (err) {
+ ret = err;
+ goto done;
+ }
+
+ if (!p->skip_locking) {
+ level = btrfs_header_level(b);
+
+ btrfs_maybe_reset_lockdep_class(root, b);
+
+ if (level <= write_lock_level) {
+ btrfs_tree_lock(b);
+ p->locks[level] = BTRFS_WRITE_LOCK;
+ } else {
+ if (p->nowait) {
+ if (!btrfs_try_tree_read_lock(b)) {
+ free_extent_buffer(b);
+ ret = -EAGAIN;
+ goto done;
+ }
+ } else {
+ btrfs_tree_read_lock(b);
+ }
+ p->locks[level] = BTRFS_READ_LOCK;
+ }
+ p->nodes[level] = b;
+ }
+ }
+ ret = 1;
+done:
+ if (ret < 0 && !p->skip_release_on_error)
+ btrfs_release_path(p);
+
+ if (p->need_commit_sem) {
+ int ret2;
+
+ ret2 = finish_need_commit_sem_search(p);
+ up_read(&fs_info->commit_root_sem);
+ if (ret2)
+ ret = ret2;
+ }
+
+ return ret;
+}
+ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
+
+/*
+ * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
+ * current state of the tree together with the operations recorded in the tree
+ * modification log to search for the key in a previous version of this tree, as
+ * denoted by the time_seq parameter.
+ *
+ * Naturally, there is no support for insert, delete or cow operations.
+ *
+ * The resulting path and return value will be set up as if we called
+ * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
+ */
+int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
+ struct btrfs_path *p, u64 time_seq)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_buffer *b;
+ int slot;
+ int ret;
+ int err;
+ int level;
+ int lowest_unlock = 1;
+ u8 lowest_level = 0;
+
+ lowest_level = p->lowest_level;
+ WARN_ON(p->nodes[0] != NULL);
+ ASSERT(!p->nowait);
+
+ if (p->search_commit_root) {
+ BUG_ON(time_seq);
+ return btrfs_search_slot(NULL, root, key, p, 0, 0);
+ }
+
+again:
+ b = btrfs_get_old_root(root, time_seq);
+ if (!b) {
+ ret = -EIO;
+ goto done;
+ }
+ level = btrfs_header_level(b);
+ p->locks[level] = BTRFS_READ_LOCK;
+
+ while (b) {
+ int dec = 0;
+
+ level = btrfs_header_level(b);
+ p->nodes[level] = b;
+
+ /*
+ * we have a lock on b and as long as we aren't changing
+ * the tree, there is no way to for the items in b to change.
+ * It is safe to drop the lock on our parent before we
+ * go through the expensive btree search on b.
+ */
+ btrfs_unlock_up_safe(p, level + 1);
+
+ ret = btrfs_bin_search(b, key, &slot);
+ if (ret < 0)
+ goto done;
+
+ if (level == 0) {
+ p->slots[level] = slot;
+ unlock_up(p, level, lowest_unlock, 0, NULL);
+ goto done;
+ }
+
+ if (ret && slot > 0) {
+ dec = 1;
+ slot--;
+ }
+ p->slots[level] = slot;
+ unlock_up(p, level, lowest_unlock, 0, NULL);
+
+ if (level == lowest_level) {
+ if (dec)
+ p->slots[level]++;
+ goto done;
+ }
+
+ err = read_block_for_search(root, p, &b, level, slot, key);
+ if (err == -EAGAIN)
+ goto again;
+ if (err) {
+ ret = err;
+ goto done;
+ }
+
+ level = btrfs_header_level(b);
+ btrfs_tree_read_lock(b);
+ b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
+ if (!b) {
+ ret = -ENOMEM;
+ goto done;
+ }
+ p->locks[level] = BTRFS_READ_LOCK;
+ p->nodes[level] = b;
+ }
+ ret = 1;
+done:
+ if (ret < 0)
+ btrfs_release_path(p);
+
+ return ret;
+}
+
+/*
+ * helper to use instead of search slot if no exact match is needed but
+ * instead the next or previous item should be returned.
+ * When find_higher is true, the next higher item is returned, the next lower
+ * otherwise.
+ * When return_any and find_higher are both true, and no higher item is found,
+ * return the next lower instead.
+ * When return_any is true and find_higher is false, and no lower item is found,
+ * return the next higher instead.
+ * It returns 0 if any item is found, 1 if none is found (tree empty), and
+ * < 0 on error
+ */
+int btrfs_search_slot_for_read(struct btrfs_root *root,
+ const struct btrfs_key *key,
+ struct btrfs_path *p, int find_higher,
+ int return_any)
+{
+ int ret;
+ struct extent_buffer *leaf;
+
+again:
+ ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
+ if (ret <= 0)
+ return ret;
+ /*
+ * a return value of 1 means the path is at the position where the
+ * item should be inserted. Normally this is the next bigger item,
+ * but in case the previous item is the last in a leaf, path points
+ * to the first free slot in the previous leaf, i.e. at an invalid
+ * item.
+ */
+ leaf = p->nodes[0];
+
+ if (find_higher) {
+ if (p->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, p);
+ if (ret <= 0)
+ return ret;
+ if (!return_any)
+ return 1;
+ /*
+ * no higher item found, return the next
+ * lower instead
+ */
+ return_any = 0;
+ find_higher = 0;
+ btrfs_release_path(p);
+ goto again;
+ }
+ } else {
+ if (p->slots[0] == 0) {
+ ret = btrfs_prev_leaf(root, p);
+ if (ret < 0)
+ return ret;
+ if (!ret) {
+ leaf = p->nodes[0];
+ if (p->slots[0] == btrfs_header_nritems(leaf))
+ p->slots[0]--;
+ return 0;
+ }
+ if (!return_any)
+ return 1;
+ /*
+ * no lower item found, return the next
+ * higher instead
+ */
+ return_any = 0;
+ find_higher = 1;
+ btrfs_release_path(p);
+ goto again;
+ } else {
+ --p->slots[0];
+ }
+ }
+ return 0;
+}
+
+/*
+ * Execute search and call btrfs_previous_item to traverse backwards if the item
+ * was not found.
+ *
+ * Return 0 if found, 1 if not found and < 0 if error.
+ */
+int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
+ struct btrfs_path *path)
+{
+ int ret;
+
+ ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ if (ret > 0)
+ ret = btrfs_previous_item(root, path, key->objectid, key->type);
+
+ if (ret == 0)
+ btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
+
+ return ret;
+}
+
+/**
+ * Search for a valid slot for the given path.
+ *
+ * @root: The root node of the tree.
+ * @key: Will contain a valid item if found.
+ * @path: The starting point to validate the slot.
+ *
+ * Return: 0 if the item is valid
+ * 1 if not found
+ * <0 if error.
+ */
+int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
+ struct btrfs_path *path)
+{
+ while (1) {
+ int ret;
+ const int slot = path->slots[0];
+ const struct extent_buffer *leaf = path->nodes[0];
+
+ /* This is where we start walking the path. */
+ if (slot >= btrfs_header_nritems(leaf)) {
+ /*
+ * If we've reached the last slot in this leaf we need
+ * to go to the next leaf and reset the path.
+ */
+ ret = btrfs_next_leaf(root, path);
+ if (ret)
+ return ret;
+ continue;
+ }
+ /* Store the found, valid item in @key. */
+ btrfs_item_key_to_cpu(leaf, key, slot);
+ break;
+ }
+ return 0;
+}
+
+/*
+ * adjust the pointers going up the tree, starting at level
+ * making sure the right key of each node is points to 'key'.
+ * This is used after shifting pointers to the left, so it stops
+ * fixing up pointers when a given leaf/node is not in slot 0 of the
+ * higher levels
+ *
+ */
+static void fixup_low_keys(struct btrfs_path *path,
+ struct btrfs_disk_key *key, int level)
+{
+ int i;
+ struct extent_buffer *t;
+ int ret;
+
+ for (i = level; i < BTRFS_MAX_LEVEL; i++) {
+ int tslot = path->slots[i];
+
+ if (!path->nodes[i])
+ break;
+ t = path->nodes[i];
+ ret = btrfs_tree_mod_log_insert_key(t, tslot,
+ BTRFS_MOD_LOG_KEY_REPLACE, GFP_ATOMIC);
+ BUG_ON(ret < 0);
+ btrfs_set_node_key(t, key, tslot);
+ btrfs_mark_buffer_dirty(path->nodes[i]);
+ if (tslot != 0)
+ break;
+ }
+}
+
+/*
+ * update item key.
+ *
+ * This function isn't completely safe. It's the caller's responsibility
+ * that the new key won't break the order
+ */
+void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ const struct btrfs_key *new_key)
+{
+ struct btrfs_disk_key disk_key;
+ struct extent_buffer *eb;
+ int slot;
+
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ if (slot > 0) {
+ btrfs_item_key(eb, &disk_key, slot - 1);
+ if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
+ btrfs_crit(fs_info,
+ "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
+ slot, btrfs_disk_key_objectid(&disk_key),
+ btrfs_disk_key_type(&disk_key),
+ btrfs_disk_key_offset(&disk_key),
+ new_key->objectid, new_key->type,
+ new_key->offset);
+ btrfs_print_leaf(eb);
+ BUG();
+ }
+ }
+ if (slot < btrfs_header_nritems(eb) - 1) {
+ btrfs_item_key(eb, &disk_key, slot + 1);
+ if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
+ btrfs_crit(fs_info,
+ "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
+ slot, btrfs_disk_key_objectid(&disk_key),
+ btrfs_disk_key_type(&disk_key),
+ btrfs_disk_key_offset(&disk_key),
+ new_key->objectid, new_key->type,
+ new_key->offset);
+ btrfs_print_leaf(eb);
+ BUG();
+ }
+ }
+
+ btrfs_cpu_key_to_disk(&disk_key, new_key);
+ btrfs_set_item_key(eb, &disk_key, slot);
+ btrfs_mark_buffer_dirty(eb);
+ if (slot == 0)
+ fixup_low_keys(path, &disk_key, 1);
+}
+
+/*
+ * Check key order of two sibling extent buffers.
+ *
+ * Return true if something is wrong.
+ * Return false if everything is fine.
+ *
+ * Tree-checker only works inside one tree block, thus the following
+ * corruption can not be detected by tree-checker:
+ *
+ * Leaf @left | Leaf @right
+ * --------------------------------------------------------------
+ * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 |
+ *
+ * Key f6 in leaf @left itself is valid, but not valid when the next
+ * key in leaf @right is 7.
+ * This can only be checked at tree block merge time.
+ * And since tree checker has ensured all key order in each tree block
+ * is correct, we only need to bother the last key of @left and the first
+ * key of @right.
+ */
+static bool check_sibling_keys(struct extent_buffer *left,
+ struct extent_buffer *right)
+{
+ struct btrfs_key left_last;
+ struct btrfs_key right_first;
+ int level = btrfs_header_level(left);
+ int nr_left = btrfs_header_nritems(left);
+ int nr_right = btrfs_header_nritems(right);
+
+ /* No key to check in one of the tree blocks */
+ if (!nr_left || !nr_right)
+ return false;
+
+ if (level) {
+ btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
+ btrfs_node_key_to_cpu(right, &right_first, 0);
+ } else {
+ btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
+ btrfs_item_key_to_cpu(right, &right_first, 0);
+ }
+
+ if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
+ btrfs_crit(left->fs_info,
+"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
+ left_last.objectid, left_last.type,
+ left_last.offset, right_first.objectid,
+ right_first.type, right_first.offset);
+ return true;
+ }
+ return false;
+}
+
+/*
+ * try to push data from one node into the next node left in the
+ * tree.
+ *
+ * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
+ * error, and > 0 if there was no room in the left hand block.
+ */
+static int push_node_left(struct btrfs_trans_handle *trans,
+ struct extent_buffer *dst,
+ struct extent_buffer *src, int empty)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ int push_items = 0;
+ int src_nritems;
+ int dst_nritems;
+ int ret = 0;
+
+ src_nritems = btrfs_header_nritems(src);
+ dst_nritems = btrfs_header_nritems(dst);
+ push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
+ WARN_ON(btrfs_header_generation(src) != trans->transid);
+ WARN_ON(btrfs_header_generation(dst) != trans->transid);
+
+ if (!empty && src_nritems <= 8)
+ return 1;
+
+ if (push_items <= 0)
+ return 1;
+
+ if (empty) {
+ push_items = min(src_nritems, push_items);
+ if (push_items < src_nritems) {
+ /* leave at least 8 pointers in the node if
+ * we aren't going to empty it
+ */
+ if (src_nritems - push_items < 8) {
+ if (push_items <= 8)
+ return 1;
+ push_items -= 8;
+ }
+ }
+ } else
+ push_items = min(src_nritems - 8, push_items);
+
+ /* dst is the left eb, src is the middle eb */
+ if (check_sibling_keys(dst, src)) {
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+ ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+ copy_extent_buffer(dst, src,
+ btrfs_node_key_ptr_offset(dst_nritems),
+ btrfs_node_key_ptr_offset(0),
+ push_items * sizeof(struct btrfs_key_ptr));
+
+ if (push_items < src_nritems) {
+ /*
+ * Don't call btrfs_tree_mod_log_insert_move() here, key removal
+ * was already fully logged by btrfs_tree_mod_log_eb_copy() above.
+ */
+ memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
+ btrfs_node_key_ptr_offset(push_items),
+ (src_nritems - push_items) *
+ sizeof(struct btrfs_key_ptr));
+ }
+ btrfs_set_header_nritems(src, src_nritems - push_items);
+ btrfs_set_header_nritems(dst, dst_nritems + push_items);
+ btrfs_mark_buffer_dirty(src);
+ btrfs_mark_buffer_dirty(dst);
+
+ return ret;
+}
+
+/*
+ * try to push data from one node into the next node right in the
+ * tree.
+ *
+ * returns 0 if some ptrs were pushed, < 0 if there was some horrible
+ * error, and > 0 if there was no room in the right hand block.
+ *
+ * this will only push up to 1/2 the contents of the left node over
+ */
+static int balance_node_right(struct btrfs_trans_handle *trans,
+ struct extent_buffer *dst,
+ struct extent_buffer *src)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ int push_items = 0;
+ int max_push;
+ int src_nritems;
+ int dst_nritems;
+ int ret = 0;
+
+ WARN_ON(btrfs_header_generation(src) != trans->transid);
+ WARN_ON(btrfs_header_generation(dst) != trans->transid);
+
+ src_nritems = btrfs_header_nritems(src);
+ dst_nritems = btrfs_header_nritems(dst);
+ push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
+ if (push_items <= 0)
+ return 1;
+
+ if (src_nritems < 4)
+ return 1;
+
+ max_push = src_nritems / 2 + 1;
+ /* don't try to empty the node */
+ if (max_push >= src_nritems)
+ return 1;
+
+ if (max_push < push_items)
+ push_items = max_push;
+
+ /* dst is the right eb, src is the middle eb */
+ if (check_sibling_keys(src, dst)) {
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+ ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
+ BUG_ON(ret < 0);
+ memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
+ btrfs_node_key_ptr_offset(0),
+ (dst_nritems) *
+ sizeof(struct btrfs_key_ptr));
+
+ ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
+ push_items);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+ copy_extent_buffer(dst, src,
+ btrfs_node_key_ptr_offset(0),
+ btrfs_node_key_ptr_offset(src_nritems - push_items),
+ push_items * sizeof(struct btrfs_key_ptr));
+
+ btrfs_set_header_nritems(src, src_nritems - push_items);
+ btrfs_set_header_nritems(dst, dst_nritems + push_items);
+
+ btrfs_mark_buffer_dirty(src);
+ btrfs_mark_buffer_dirty(dst);
+
+ return ret;
+}
+
+/*
+ * helper function to insert a new root level in the tree.
+ * A new node is allocated, and a single item is inserted to
+ * point to the existing root
+ *
+ * returns zero on success or < 0 on failure.
+ */
+static noinline int insert_new_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int level)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 lower_gen;
+ struct extent_buffer *lower;
+ struct extent_buffer *c;
+ struct extent_buffer *old;
+ struct btrfs_disk_key lower_key;
+ int ret;
+
+ BUG_ON(path->nodes[level]);
+ BUG_ON(path->nodes[level-1] != root->node);
+
+ lower = path->nodes[level-1];
+ if (level == 1)
+ btrfs_item_key(lower, &lower_key, 0);
+ else
+ btrfs_node_key(lower, &lower_key, 0);
+
+ c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ &lower_key, level, root->node->start, 0,
+ BTRFS_NESTING_NEW_ROOT);
+ if (IS_ERR(c))
+ return PTR_ERR(c);
+
+ root_add_used(root, fs_info->nodesize);
+
+ btrfs_set_header_nritems(c, 1);
+ btrfs_set_node_key(c, &lower_key, 0);
+ btrfs_set_node_blockptr(c, 0, lower->start);
+ lower_gen = btrfs_header_generation(lower);
+ WARN_ON(lower_gen != trans->transid);
+
+ btrfs_set_node_ptr_generation(c, 0, lower_gen);
+
+ btrfs_mark_buffer_dirty(c);
+
+ old = root->node;
+ ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
+ BUG_ON(ret < 0);
+ rcu_assign_pointer(root->node, c);
+
+ /* the super has an extra ref to root->node */
+ free_extent_buffer(old);
+
+ add_root_to_dirty_list(root);
+ atomic_inc(&c->refs);
+ path->nodes[level] = c;
+ path->locks[level] = BTRFS_WRITE_LOCK;
+ path->slots[level] = 0;
+ return 0;
+}
+
+/*
+ * worker function to insert a single pointer in a node.
+ * the node should have enough room for the pointer already
+ *
+ * slot and level indicate where you want the key to go, and
+ * blocknr is the block the key points to.
+ */
+static void insert_ptr(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_disk_key *key, u64 bytenr,
+ int slot, int level)
+{
+ struct extent_buffer *lower;
+ int nritems;
+ int ret;
+
+ BUG_ON(!path->nodes[level]);
+ btrfs_assert_tree_write_locked(path->nodes[level]);
+ lower = path->nodes[level];
+ nritems = btrfs_header_nritems(lower);
+ BUG_ON(slot > nritems);
+ BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
+ if (slot != nritems) {
+ if (level) {
+ ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
+ slot, nritems - slot);
+ BUG_ON(ret < 0);
+ }
+ memmove_extent_buffer(lower,
+ btrfs_node_key_ptr_offset(slot + 1),
+ btrfs_node_key_ptr_offset(slot),
+ (nritems - slot) * sizeof(struct btrfs_key_ptr));
+ }
+ if (level) {
+ ret = btrfs_tree_mod_log_insert_key(lower, slot,
+ BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
+ BUG_ON(ret < 0);
+ }
+ btrfs_set_node_key(lower, key, slot);
+ btrfs_set_node_blockptr(lower, slot, bytenr);
+ WARN_ON(trans->transid == 0);
+ btrfs_set_node_ptr_generation(lower, slot, trans->transid);
+ btrfs_set_header_nritems(lower, nritems + 1);
+ btrfs_mark_buffer_dirty(lower);
+}
+
+/*
+ * split the node at the specified level in path in two.
+ * The path is corrected to point to the appropriate node after the split
+ *
+ * Before splitting this tries to make some room in the node by pushing
+ * left and right, if either one works, it returns right away.
+ *
+ * returns 0 on success and < 0 on failure
+ */
+static noinline int split_node(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int level)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_buffer *c;
+ struct extent_buffer *split;
+ struct btrfs_disk_key disk_key;
+ int mid;
+ int ret;
+ u32 c_nritems;
+
+ c = path->nodes[level];
+ WARN_ON(btrfs_header_generation(c) != trans->transid);
+ if (c == root->node) {
+ /*
+ * trying to split the root, lets make a new one
+ *
+ * tree mod log: We don't log_removal old root in
+ * insert_new_root, because that root buffer will be kept as a
+ * normal node. We are going to log removal of half of the
+ * elements below with btrfs_tree_mod_log_eb_copy(). We're
+ * holding a tree lock on the buffer, which is why we cannot
+ * race with other tree_mod_log users.
+ */
+ ret = insert_new_root(trans, root, path, level + 1);
+ if (ret)
+ return ret;
+ } else {
+ ret = push_nodes_for_insert(trans, root, path, level);
+ c = path->nodes[level];
+ if (!ret && btrfs_header_nritems(c) <
+ BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
+ return 0;
+ if (ret < 0)
+ return ret;
+ }
+
+ c_nritems = btrfs_header_nritems(c);
+ mid = (c_nritems + 1) / 2;
+ btrfs_node_key(c, &disk_key, mid);
+
+ split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ &disk_key, level, c->start, 0,
+ BTRFS_NESTING_SPLIT);
+ if (IS_ERR(split))
+ return PTR_ERR(split);
+
+ root_add_used(root, fs_info->nodesize);
+ ASSERT(btrfs_header_level(c) == level);
+
+ ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
+ if (ret) {
+ btrfs_tree_unlock(split);
+ free_extent_buffer(split);
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+ copy_extent_buffer(split, c,
+ btrfs_node_key_ptr_offset(0),
+ btrfs_node_key_ptr_offset(mid),
+ (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
+ btrfs_set_header_nritems(split, c_nritems - mid);
+ btrfs_set_header_nritems(c, mid);
+
+ btrfs_mark_buffer_dirty(c);
+ btrfs_mark_buffer_dirty(split);
+
+ insert_ptr(trans, path, &disk_key, split->start,
+ path->slots[level + 1] + 1, level + 1);
+
+ if (path->slots[level] >= mid) {
+ path->slots[level] -= mid;
+ btrfs_tree_unlock(c);
+ free_extent_buffer(c);
+ path->nodes[level] = split;
+ path->slots[level + 1] += 1;
+ } else {
+ btrfs_tree_unlock(split);
+ free_extent_buffer(split);
+ }
+ return 0;
+}
+
+/*
+ * how many bytes are required to store the items in a leaf. start
+ * and nr indicate which items in the leaf to check. This totals up the
+ * space used both by the item structs and the item data
+ */
+static int leaf_space_used(struct extent_buffer *l, int start, int nr)
+{
+ int data_len;
+ int nritems = btrfs_header_nritems(l);
+ int end = min(nritems, start + nr) - 1;
+
+ if (!nr)
+ return 0;
+ data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start);
+ data_len = data_len - btrfs_item_offset(l, end);
+ data_len += sizeof(struct btrfs_item) * nr;
+ WARN_ON(data_len < 0);
+ return data_len;
+}
+
+/*
+ * The space between the end of the leaf items and
+ * the start of the leaf data. IOW, how much room
+ * the leaf has left for both items and data
+ */
+noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
+{
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
+ int nritems = btrfs_header_nritems(leaf);
+ int ret;
+
+ ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
+ if (ret < 0) {
+ btrfs_crit(fs_info,
+ "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
+ ret,
+ (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
+ leaf_space_used(leaf, 0, nritems), nritems);
+ }
+ return ret;
+}
+
+/*
+ * min slot controls the lowest index we're willing to push to the
+ * right. We'll push up to and including min_slot, but no lower
+ */
+static noinline int __push_leaf_right(struct btrfs_path *path,
+ int data_size, int empty,
+ struct extent_buffer *right,
+ int free_space, u32 left_nritems,
+ u32 min_slot)
+{
+ struct btrfs_fs_info *fs_info = right->fs_info;
+ struct extent_buffer *left = path->nodes[0];
+ struct extent_buffer *upper = path->nodes[1];
+ struct btrfs_map_token token;
+ struct btrfs_disk_key disk_key;
+ int slot;
+ u32 i;
+ int push_space = 0;
+ int push_items = 0;
+ u32 nr;
+ u32 right_nritems;
+ u32 data_end;
+ u32 this_item_size;
+
+ if (empty)
+ nr = 0;
+ else
+ nr = max_t(u32, 1, min_slot);
+
+ if (path->slots[0] >= left_nritems)
+ push_space += data_size;
+
+ slot = path->slots[1];
+ i = left_nritems - 1;
+ while (i >= nr) {
+ if (!empty && push_items > 0) {
+ if (path->slots[0] > i)
+ break;
+ if (path->slots[0] == i) {
+ int space = btrfs_leaf_free_space(left);
+
+ if (space + push_space * 2 > free_space)
+ break;
+ }
+ }
+
+ if (path->slots[0] == i)
+ push_space += data_size;
+
+ this_item_size = btrfs_item_size(left, i);
+ if (this_item_size + sizeof(struct btrfs_item) +
+ push_space > free_space)
+ break;
+
+ push_items++;
+ push_space += this_item_size + sizeof(struct btrfs_item);
+ if (i == 0)
+ break;
+ i--;
+ }
+
+ if (push_items == 0)
+ goto out_unlock;
+
+ WARN_ON(!empty && push_items == left_nritems);
+
+ /* push left to right */
+ right_nritems = btrfs_header_nritems(right);
+
+ push_space = btrfs_item_data_end(left, left_nritems - push_items);
+ push_space -= leaf_data_end(left);
+
+ /* make room in the right data area */
+ data_end = leaf_data_end(right);
+ memmove_extent_buffer(right,
+ BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
+ BTRFS_LEAF_DATA_OFFSET + data_end,
+ BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
+
+ /* copy from the left data area */
+ copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
+ BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
+ BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
+ push_space);
+
+ memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
+ btrfs_item_nr_offset(0),
+ right_nritems * sizeof(struct btrfs_item));
+
+ /* copy the items from left to right */
+ copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
+ btrfs_item_nr_offset(left_nritems - push_items),
+ push_items * sizeof(struct btrfs_item));
+
+ /* update the item pointers */
+ btrfs_init_map_token(&token, right);
+ right_nritems += push_items;
+ btrfs_set_header_nritems(right, right_nritems);
+ push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
+ for (i = 0; i < right_nritems; i++) {
+ push_space -= btrfs_token_item_size(&token, i);
+ btrfs_set_token_item_offset(&token, i, push_space);
+ }
+
+ left_nritems -= push_items;
+ btrfs_set_header_nritems(left, left_nritems);
+
+ if (left_nritems)
+ btrfs_mark_buffer_dirty(left);
+ else
+ btrfs_clean_tree_block(left);
+
+ btrfs_mark_buffer_dirty(right);
+
+ btrfs_item_key(right, &disk_key, 0);
+ btrfs_set_node_key(upper, &disk_key, slot + 1);
+ btrfs_mark_buffer_dirty(upper);
+
+ /* then fixup the leaf pointer in the path */
+ if (path->slots[0] >= left_nritems) {
+ path->slots[0] -= left_nritems;
+ if (btrfs_header_nritems(path->nodes[0]) == 0)
+ btrfs_clean_tree_block(path->nodes[0]);
+ btrfs_tree_unlock(path->nodes[0]);
+ free_extent_buffer(path->nodes[0]);
+ path->nodes[0] = right;
+ path->slots[1] += 1;
+ } else {
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ }
+ return 0;
+
+out_unlock:
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ return 1;
+}
+
+/*
+ * push some data in the path leaf to the right, trying to free up at
+ * least data_size bytes. returns zero if the push worked, nonzero otherwise
+ *
+ * returns 1 if the push failed because the other node didn't have enough
+ * room, 0 if everything worked out and < 0 if there were major errors.
+ *
+ * this will push starting from min_slot to the end of the leaf. It won't
+ * push any slot lower than min_slot
+ */
+static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_path *path,
+ int min_data_size, int data_size,
+ int empty, u32 min_slot)
+{
+ struct extent_buffer *left = path->nodes[0];
+ struct extent_buffer *right;
+ struct extent_buffer *upper;
+ int slot;
+ int free_space;
+ u32 left_nritems;
+ int ret;
+
+ if (!path->nodes[1])
+ return 1;
+
+ slot = path->slots[1];
+ upper = path->nodes[1];
+ if (slot >= btrfs_header_nritems(upper) - 1)
+ return 1;
+
+ btrfs_assert_tree_write_locked(path->nodes[1]);
+
+ right = btrfs_read_node_slot(upper, slot + 1);
+ /*
+ * slot + 1 is not valid or we fail to read the right node,
+ * no big deal, just return.
+ */
+ if (IS_ERR(right))
+ return 1;
+
+ __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
+
+ free_space = btrfs_leaf_free_space(right);
+ if (free_space < data_size)
+ goto out_unlock;
+
+ ret = btrfs_cow_block(trans, root, right, upper,
+ slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
+ if (ret)
+ goto out_unlock;
+
+ left_nritems = btrfs_header_nritems(left);
+ if (left_nritems == 0)
+ goto out_unlock;
+
+ if (check_sibling_keys(left, right)) {
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ return ret;
+ }
+ if (path->slots[0] == left_nritems && !empty) {
+ /* Key greater than all keys in the leaf, right neighbor has
+ * enough room for it and we're not emptying our leaf to delete
+ * it, therefore use right neighbor to insert the new item and
+ * no need to touch/dirty our left leaf. */
+ btrfs_tree_unlock(left);
+ free_extent_buffer(left);
+ path->nodes[0] = right;
+ path->slots[0] = 0;
+ path->slots[1]++;
+ return 0;
+ }
+
+ return __push_leaf_right(path, min_data_size, empty,
+ right, free_space, left_nritems, min_slot);
+out_unlock:
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ return 1;
+}
+
+/*
+ * push some data in the path leaf to the left, trying to free up at
+ * least data_size bytes. returns zero if the push worked, nonzero otherwise
+ *
+ * max_slot can put a limit on how far into the leaf we'll push items. The
+ * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the
+ * items
+ */
+static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
+ int empty, struct extent_buffer *left,
+ int free_space, u32 right_nritems,
+ u32 max_slot)
+{
+ struct btrfs_fs_info *fs_info = left->fs_info;
+ struct btrfs_disk_key disk_key;
+ struct extent_buffer *right = path->nodes[0];
+ int i;
+ int push_space = 0;
+ int push_items = 0;
+ u32 old_left_nritems;
+ u32 nr;
+ int ret = 0;
+ u32 this_item_size;
+ u32 old_left_item_size;
+ struct btrfs_map_token token;
+
+ if (empty)
+ nr = min(right_nritems, max_slot);
+ else
+ nr = min(right_nritems - 1, max_slot);
+
+ for (i = 0; i < nr; i++) {
+ if (!empty && push_items > 0) {
+ if (path->slots[0] < i)
+ break;
+ if (path->slots[0] == i) {
+ int space = btrfs_leaf_free_space(right);
+
+ if (space + push_space * 2 > free_space)
+ break;
+ }
+ }
+
+ if (path->slots[0] == i)
+ push_space += data_size;
+
+ this_item_size = btrfs_item_size(right, i);
+ if (this_item_size + sizeof(struct btrfs_item) + push_space >
+ free_space)
+ break;
+
+ push_items++;
+ push_space += this_item_size + sizeof(struct btrfs_item);
+ }
+
+ if (push_items == 0) {
+ ret = 1;
+ goto out;
+ }
+ WARN_ON(!empty && push_items == btrfs_header_nritems(right));
+
+ /* push data from right to left */
+ copy_extent_buffer(left, right,
+ btrfs_item_nr_offset(btrfs_header_nritems(left)),
+ btrfs_item_nr_offset(0),
+ push_items * sizeof(struct btrfs_item));
+
+ push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
+ btrfs_item_offset(right, push_items - 1);
+
+ copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
+ leaf_data_end(left) - push_space,
+ BTRFS_LEAF_DATA_OFFSET +
+ btrfs_item_offset(right, push_items - 1),
+ push_space);
+ old_left_nritems = btrfs_header_nritems(left);
+ BUG_ON(old_left_nritems <= 0);
+
+ btrfs_init_map_token(&token, left);
+ old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1);
+ for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
+ u32 ioff;
+
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i,
+ ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
+ }
+ btrfs_set_header_nritems(left, old_left_nritems + push_items);
+
+ /* fixup right node */
+ if (push_items > right_nritems)
+ WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
+ right_nritems);
+
+ if (push_items < right_nritems) {
+ push_space = btrfs_item_offset(right, push_items - 1) -
+ leaf_data_end(right);
+ memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
+ BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
+ BTRFS_LEAF_DATA_OFFSET +
+ leaf_data_end(right), push_space);
+
+ memmove_extent_buffer(right, btrfs_item_nr_offset(0),
+ btrfs_item_nr_offset(push_items),
+ (btrfs_header_nritems(right) - push_items) *
+ sizeof(struct btrfs_item));
+ }
+
+ btrfs_init_map_token(&token, right);
+ right_nritems -= push_items;
+ btrfs_set_header_nritems(right, right_nritems);
+ push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
+ for (i = 0; i < right_nritems; i++) {
+ push_space = push_space - btrfs_token_item_size(&token, i);
+ btrfs_set_token_item_offset(&token, i, push_space);
+ }
+
+ btrfs_mark_buffer_dirty(left);
+ if (right_nritems)
+ btrfs_mark_buffer_dirty(right);
+ else
+ btrfs_clean_tree_block(right);
+
+ btrfs_item_key(right, &disk_key, 0);
+ fixup_low_keys(path, &disk_key, 1);
+
+ /* then fixup the leaf pointer in the path */
+ if (path->slots[0] < push_items) {
+ path->slots[0] += old_left_nritems;
+ btrfs_tree_unlock(path->nodes[0]);
+ free_extent_buffer(path->nodes[0]);
+ path->nodes[0] = left;
+ path->slots[1] -= 1;
+ } else {
+ btrfs_tree_unlock(left);
+ free_extent_buffer(left);
+ path->slots[0] -= push_items;
+ }
+ BUG_ON(path->slots[0] < 0);
+ return ret;
+out:
+ btrfs_tree_unlock(left);
+ free_extent_buffer(left);
+ return ret;
+}
+
+/*
+ * push some data in the path leaf to the left, trying to free up at
+ * least data_size bytes. returns zero if the push worked, nonzero otherwise
+ *
+ * max_slot can put a limit on how far into the leaf we'll push items. The
+ * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
+ * items
+ */
+static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_path *path, int min_data_size,
+ int data_size, int empty, u32 max_slot)
+{
+ struct extent_buffer *right = path->nodes[0];
+ struct extent_buffer *left;
+ int slot;
+ int free_space;
+ u32 right_nritems;
+ int ret = 0;
+
+ slot = path->slots[1];
+ if (slot == 0)
+ return 1;
+ if (!path->nodes[1])
+ return 1;
+
+ right_nritems = btrfs_header_nritems(right);
+ if (right_nritems == 0)
+ return 1;
+
+ btrfs_assert_tree_write_locked(path->nodes[1]);
+
+ left = btrfs_read_node_slot(path->nodes[1], slot - 1);
+ /*
+ * slot - 1 is not valid or we fail to read the left node,
+ * no big deal, just return.
+ */
+ if (IS_ERR(left))
+ return 1;
+
+ __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
+
+ free_space = btrfs_leaf_free_space(left);
+ if (free_space < data_size) {
+ ret = 1;
+ goto out;
+ }
+
+ ret = btrfs_cow_block(trans, root, left,
+ path->nodes[1], slot - 1, &left,
+ BTRFS_NESTING_LEFT_COW);
+ if (ret) {
+ /* we hit -ENOSPC, but it isn't fatal here */
+ if (ret == -ENOSPC)
+ ret = 1;
+ goto out;
+ }
+
+ if (check_sibling_keys(left, right)) {
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ return __push_leaf_left(path, min_data_size,
+ empty, left, free_space, right_nritems,
+ max_slot);
+out:
+ btrfs_tree_unlock(left);
+ free_extent_buffer(left);
+ return ret;
+}
+
+/*
+ * split the path's leaf in two, making sure there is at least data_size
+ * available for the resulting leaf level of the path.
+ */
+static noinline void copy_for_split(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct extent_buffer *l,
+ struct extent_buffer *right,
+ int slot, int mid, int nritems)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ int data_copy_size;
+ int rt_data_off;
+ int i;
+ struct btrfs_disk_key disk_key;
+ struct btrfs_map_token token;
+
+ nritems = nritems - mid;
+ btrfs_set_header_nritems(right, nritems);
+ data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l);
+
+ copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
+ btrfs_item_nr_offset(mid),
+ nritems * sizeof(struct btrfs_item));
+
+ copy_extent_buffer(right, l,
+ BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
+ data_copy_size, BTRFS_LEAF_DATA_OFFSET +
+ leaf_data_end(l), data_copy_size);
+
+ rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);
+
+ btrfs_init_map_token(&token, right);
+ for (i = 0; i < nritems; i++) {
+ u32 ioff;
+
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i, ioff + rt_data_off);
+ }
+
+ btrfs_set_header_nritems(l, mid);
+ btrfs_item_key(right, &disk_key, 0);
+ insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
+
+ btrfs_mark_buffer_dirty(right);
+ btrfs_mark_buffer_dirty(l);
+ BUG_ON(path->slots[0] != slot);
+
+ if (mid <= slot) {
+ btrfs_tree_unlock(path->nodes[0]);
+ free_extent_buffer(path->nodes[0]);
+ path->nodes[0] = right;
+ path->slots[0] -= mid;
+ path->slots[1] += 1;
+ } else {
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ }
+
+ BUG_ON(path->slots[0] < 0);
+}
+
+/*
+ * double splits happen when we need to insert a big item in the middle
+ * of a leaf. A double split can leave us with 3 mostly empty leaves:
+ * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
+ * A B C
+ *
+ * We avoid this by trying to push the items on either side of our target
+ * into the adjacent leaves. If all goes well we can avoid the double split
+ * completely.
+ */
+static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ int data_size)
+{
+ int ret;
+ int progress = 0;
+ int slot;
+ u32 nritems;
+ int space_needed = data_size;
+
+ slot = path->slots[0];
+ if (slot < btrfs_header_nritems(path->nodes[0]))
+ space_needed -= btrfs_leaf_free_space(path->nodes[0]);
+
+ /*
+ * try to push all the items after our slot into the
+ * right leaf
+ */
+ ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
+ if (ret < 0)
+ return ret;
+
+ if (ret == 0)
+ progress++;
+
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ /*
+ * our goal is to get our slot at the start or end of a leaf. If
+ * we've done so we're done
+ */
+ if (path->slots[0] == 0 || path->slots[0] == nritems)
+ return 0;
+
+ if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
+ return 0;
+
+ /* try to push all the items before our slot into the next leaf */
+ slot = path->slots[0];
+ space_needed = data_size;
+ if (slot > 0)
+ space_needed -= btrfs_leaf_free_space(path->nodes[0]);
+ ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
+ if (ret < 0)
+ return ret;
+
+ if (ret == 0)
+ progress++;
+
+ if (progress)
+ return 0;
+ return 1;
+}
+
+/*
+ * split the path's leaf in two, making sure there is at least data_size
+ * available for the resulting leaf level of the path.
+ *
+ * returns 0 if all went well and < 0 on failure.
+ */
+static noinline int split_leaf(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const struct btrfs_key *ins_key,
+ struct btrfs_path *path, int data_size,
+ int extend)
+{
+ struct btrfs_disk_key disk_key;
+ struct extent_buffer *l;
+ u32 nritems;
+ int mid;
+ int slot;
+ struct extent_buffer *right;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret = 0;
+ int wret;
+ int split;
+ int num_doubles = 0;
+ int tried_avoid_double = 0;
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+ if (extend && data_size + btrfs_item_size(l, slot) +
+ sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
+ return -EOVERFLOW;
+
+ /* first try to make some room by pushing left and right */
+ if (data_size && path->nodes[1]) {
+ int space_needed = data_size;
+
+ if (slot < btrfs_header_nritems(l))
+ space_needed -= btrfs_leaf_free_space(l);
+
+ wret = push_leaf_right(trans, root, path, space_needed,
+ space_needed, 0, 0);
+ if (wret < 0)
+ return wret;
+ if (wret) {
+ space_needed = data_size;
+ if (slot > 0)
+ space_needed -= btrfs_leaf_free_space(l);
+ wret = push_leaf_left(trans, root, path, space_needed,
+ space_needed, 0, (u32)-1);
+ if (wret < 0)
+ return wret;
+ }
+ l = path->nodes[0];
+
+ /* did the pushes work? */
+ if (btrfs_leaf_free_space(l) >= data_size)
+ return 0;
+ }
+
+ if (!path->nodes[1]) {
+ ret = insert_new_root(trans, root, path, 1);
+ if (ret)
+ return ret;
+ }
+again:
+ split = 1;
+ l = path->nodes[0];
+ slot = path->slots[0];
+ nritems = btrfs_header_nritems(l);
+ mid = (nritems + 1) / 2;
+
+ if (mid <= slot) {
+ if (nritems == 1 ||
+ leaf_space_used(l, mid, nritems - mid) + data_size >
+ BTRFS_LEAF_DATA_SIZE(fs_info)) {
+ if (slot >= nritems) {
+ split = 0;
+ } else {
+ mid = slot;
+ if (mid != nritems &&
+ leaf_space_used(l, mid, nritems - mid) +
+ data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
+ if (data_size && !tried_avoid_double)
+ goto push_for_double;
+ split = 2;
+ }
+ }
+ }
+ } else {
+ if (leaf_space_used(l, 0, mid) + data_size >
+ BTRFS_LEAF_DATA_SIZE(fs_info)) {
+ if (!extend && data_size && slot == 0) {
+ split = 0;
+ } else if ((extend || !data_size) && slot == 0) {
+ mid = 1;
+ } else {
+ mid = slot;
+ if (mid != nritems &&
+ leaf_space_used(l, mid, nritems - mid) +
+ data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
+ if (data_size && !tried_avoid_double)
+ goto push_for_double;
+ split = 2;
+ }
+ }
+ }
+ }
+
+ if (split == 0)
+ btrfs_cpu_key_to_disk(&disk_key, ins_key);
+ else
+ btrfs_item_key(l, &disk_key, mid);
+
+ /*
+ * We have to about BTRFS_NESTING_NEW_ROOT here if we've done a double
+ * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
+ * subclasses, which is 8 at the time of this patch, and we've maxed it
+ * out. In the future we could add a
+ * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
+ * use BTRFS_NESTING_NEW_ROOT.
+ */
+ right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ &disk_key, 0, l->start, 0,
+ num_doubles ? BTRFS_NESTING_NEW_ROOT :
+ BTRFS_NESTING_SPLIT);
+ if (IS_ERR(right))
+ return PTR_ERR(right);
+
+ root_add_used(root, fs_info->nodesize);
+
+ if (split == 0) {
+ if (mid <= slot) {
+ btrfs_set_header_nritems(right, 0);
+ insert_ptr(trans, path, &disk_key,
+ right->start, path->slots[1] + 1, 1);
+ btrfs_tree_unlock(path->nodes[0]);
+ free_extent_buffer(path->nodes[0]);
+ path->nodes[0] = right;
+ path->slots[0] = 0;
+ path->slots[1] += 1;
+ } else {
+ btrfs_set_header_nritems(right, 0);
+ insert_ptr(trans, path, &disk_key,
+ right->start, path->slots[1], 1);
+ btrfs_tree_unlock(path->nodes[0]);
+ free_extent_buffer(path->nodes[0]);
+ path->nodes[0] = right;
+ path->slots[0] = 0;
+ if (path->slots[1] == 0)
+ fixup_low_keys(path, &disk_key, 1);
+ }
+ /*
+ * We create a new leaf 'right' for the required ins_len and
+ * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
+ * the content of ins_len to 'right'.
+ */
+ return ret;
+ }
+
+ copy_for_split(trans, path, l, right, slot, mid, nritems);
+
+ if (split == 2) {
+ BUG_ON(num_doubles != 0);
+ num_doubles++;
+ goto again;
+ }
+
+ return 0;
+
+push_for_double:
+ push_for_double_split(trans, root, path, data_size);
+ tried_avoid_double = 1;
+ if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
+ return 0;
+ goto again;
+}
+
+static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int ins_len)
+{
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *fi;
+ u64 extent_len = 0;
+ u32 item_size;
+ int ret;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+ BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
+ key.type != BTRFS_EXTENT_CSUM_KEY);
+
+ if (btrfs_leaf_free_space(leaf) >= ins_len)
+ return 0;
+
+ item_size = btrfs_item_size(leaf, path->slots[0]);
+ if (key.type == BTRFS_EXTENT_DATA_KEY) {
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ extent_len = btrfs_file_extent_num_bytes(leaf, fi);
+ }
+ btrfs_release_path(path);
+
+ path->keep_locks = 1;
+ path->search_for_split = 1;
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ path->search_for_split = 0;
+ if (ret > 0)
+ ret = -EAGAIN;
+ if (ret < 0)
+ goto err;
+
+ ret = -EAGAIN;
+ leaf = path->nodes[0];
+ /* if our item isn't there, return now */
+ if (item_size != btrfs_item_size(leaf, path->slots[0]))
+ goto err;
+
+ /* the leaf has changed, it now has room. return now */
+ if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
+ goto err;
+
+ if (key.type == BTRFS_EXTENT_DATA_KEY) {
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
+ goto err;
+ }
+
+ ret = split_leaf(trans, root, &key, path, ins_len, 1);
+ if (ret)
+ goto err;
+
+ path->keep_locks = 0;
+ btrfs_unlock_up_safe(path, 1);
+ return 0;
+err:
+ path->keep_locks = 0;
+ return ret;
+}
+
+static noinline int split_item(struct btrfs_path *path,
+ const struct btrfs_key *new_key,
+ unsigned long split_offset)
+{
+ struct extent_buffer *leaf;
+ int orig_slot, slot;
+ char *buf;
+ u32 nritems;
+ u32 item_size;
+ u32 orig_offset;
+ struct btrfs_disk_key disk_key;
+
+ leaf = path->nodes[0];
+ BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
+
+ orig_slot = path->slots[0];
+ orig_offset = btrfs_item_offset(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
+
+ buf = kmalloc(item_size, GFP_NOFS);
+ if (!buf)
+ return -ENOMEM;
+
+ read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
+ path->slots[0]), item_size);
+
+ slot = path->slots[0] + 1;
+ nritems = btrfs_header_nritems(leaf);
+ if (slot != nritems) {
+ /* shift the items */
+ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
+ btrfs_item_nr_offset(slot),
+ (nritems - slot) * sizeof(struct btrfs_item));
+ }
+
+ btrfs_cpu_key_to_disk(&disk_key, new_key);
+ btrfs_set_item_key(leaf, &disk_key, slot);
+
+ btrfs_set_item_offset(leaf, slot, orig_offset);
+ btrfs_set_item_size(leaf, slot, item_size - split_offset);
+
+ btrfs_set_item_offset(leaf, orig_slot,
+ orig_offset + item_size - split_offset);
+ btrfs_set_item_size(leaf, orig_slot, split_offset);
+
+ btrfs_set_header_nritems(leaf, nritems + 1);
+
+ /* write the data for the start of the original item */
+ write_extent_buffer(leaf, buf,
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
+ split_offset);
+
+ /* write the data for the new item */
+ write_extent_buffer(leaf, buf + split_offset,
+ btrfs_item_ptr_offset(leaf, slot),
+ item_size - split_offset);
+ btrfs_mark_buffer_dirty(leaf);
+
+ BUG_ON(btrfs_leaf_free_space(leaf) < 0);
+ kfree(buf);
+ return 0;
+}
+
+/*
+ * This function splits a single item into two items,
+ * giving 'new_key' to the new item and splitting the
+ * old one at split_offset (from the start of the item).
+ *
+ * The path may be released by this operation. After
+ * the split, the path is pointing to the old item. The
+ * new item is going to be in the same node as the old one.
+ *
+ * Note, the item being split must be smaller enough to live alone on
+ * a tree block with room for one extra struct btrfs_item
+ *
+ * This allows us to split the item in place, keeping a lock on the
+ * leaf the entire time.
+ */
+int btrfs_split_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct btrfs_key *new_key,
+ unsigned long split_offset)
+{
+ int ret;
+ ret = setup_leaf_for_split(trans, root, path,
+ sizeof(struct btrfs_item));
+ if (ret)
+ return ret;
+
+ ret = split_item(path, new_key, split_offset);
+ return ret;
+}
+
+/*
+ * make the item pointed to by the path smaller. new_size indicates
+ * how small to make it, and from_end tells us if we just chop bytes
+ * off the end of the item or if we shift the item to chop bytes off
+ * the front.
+ */
+void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
+{
+ int slot;
+ struct extent_buffer *leaf;
+ u32 nritems;
+ unsigned int data_end;
+ unsigned int old_data_start;
+ unsigned int old_size;
+ unsigned int size_diff;
+ int i;
+ struct btrfs_map_token token;
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ old_size = btrfs_item_size(leaf, slot);
+ if (old_size == new_size)
+ return;
+
+ nritems = btrfs_header_nritems(leaf);
+ data_end = leaf_data_end(leaf);
+
+ old_data_start = btrfs_item_offset(leaf, slot);
+
+ size_diff = old_size - new_size;
+
+ BUG_ON(slot < 0);
+ BUG_ON(slot >= nritems);
+
+ /*
+ * item0..itemN ... dataN.offset..dataN.size .. data0.size
+ */
+ /* first correct the data pointers */
+ btrfs_init_map_token(&token, leaf);
+ for (i = slot; i < nritems; i++) {
+ u32 ioff;
+
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i, ioff + size_diff);
+ }
+
+ /* shift the data */
+ if (from_end) {
+ memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
+ data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
+ data_end, old_data_start + new_size - data_end);
+ } else {
+ struct btrfs_disk_key disk_key;
+ u64 offset;
+
+ btrfs_item_key(leaf, &disk_key, slot);
+
+ if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
+ unsigned long ptr;
+ struct btrfs_file_extent_item *fi;
+
+ fi = btrfs_item_ptr(leaf, slot,
+ struct btrfs_file_extent_item);
+ fi = (struct btrfs_file_extent_item *)(
+ (unsigned long)fi - size_diff);
+
+ if (btrfs_file_extent_type(leaf, fi) ==
+ BTRFS_FILE_EXTENT_INLINE) {
+ ptr = btrfs_item_ptr_offset(leaf, slot);
+ memmove_extent_buffer(leaf, ptr,
+ (unsigned long)fi,
+ BTRFS_FILE_EXTENT_INLINE_DATA_START);
+ }
+ }
+
+ memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
+ data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
+ data_end, old_data_start - data_end);
+
+ offset = btrfs_disk_key_offset(&disk_key);
+ btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
+ btrfs_set_item_key(leaf, &disk_key, slot);
+ if (slot == 0)
+ fixup_low_keys(path, &disk_key, 1);
+ }
+
+ btrfs_set_item_size(leaf, slot, new_size);
+ btrfs_mark_buffer_dirty(leaf);
+
+ if (btrfs_leaf_free_space(leaf) < 0) {
+ btrfs_print_leaf(leaf);
+ BUG();
+ }
+}
+
+/*
+ * make the item pointed to by the path bigger, data_size is the added size.
+ */
+void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
+{
+ int slot;
+ struct extent_buffer *leaf;
+ u32 nritems;
+ unsigned int data_end;
+ unsigned int old_data;
+ unsigned int old_size;
+ int i;
+ struct btrfs_map_token token;
+
+ leaf = path->nodes[0];
+
+ nritems = btrfs_header_nritems(leaf);
+ data_end = leaf_data_end(leaf);
+
+ if (btrfs_leaf_free_space(leaf) < data_size) {
+ btrfs_print_leaf(leaf);
+ BUG();
+ }
+ slot = path->slots[0];
+ old_data = btrfs_item_data_end(leaf, slot);
+
+ BUG_ON(slot < 0);
+ if (slot >= nritems) {
+ btrfs_print_leaf(leaf);
+ btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
+ slot, nritems);
+ BUG();
+ }
+
+ /*
+ * item0..itemN ... dataN.offset..dataN.size .. data0.size
+ */
+ /* first correct the data pointers */
+ btrfs_init_map_token(&token, leaf);
+ for (i = slot; i < nritems; i++) {
+ u32 ioff;
+
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i, ioff - data_size);
+ }
+
+ /* shift the data */
+ memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
+ data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
+ data_end, old_data - data_end);
+
+ data_end = old_data;
+ old_size = btrfs_item_size(leaf, slot);
+ btrfs_set_item_size(leaf, slot, old_size + data_size);
+ btrfs_mark_buffer_dirty(leaf);
+
+ if (btrfs_leaf_free_space(leaf) < 0) {
+ btrfs_print_leaf(leaf);
+ BUG();
+ }
+}
+
+/**
+ * setup_items_for_insert - Helper called before inserting one or more items
+ * to a leaf. Main purpose is to save stack depth by doing the bulk of the work
+ * in a function that doesn't call btrfs_search_slot
+ *
+ * @root: root we are inserting items to
+ * @path: points to the leaf/slot where we are going to insert new items
+ * @batch: information about the batch of items to insert
+ */
+static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
+ const struct btrfs_item_batch *batch)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int i;
+ u32 nritems;
+ unsigned int data_end;
+ struct btrfs_disk_key disk_key;
+ struct extent_buffer *leaf;
+ int slot;
+ struct btrfs_map_token token;
+ u32 total_size;
+
+ /*
+ * Before anything else, update keys in the parent and other ancestors
+ * if needed, then release the write locks on them, so that other tasks
+ * can use them while we modify the leaf.
+ */
+ if (path->slots[0] == 0) {
+ btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
+ fixup_low_keys(path, &disk_key, 1);
+ }
+ btrfs_unlock_up_safe(path, 1);
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ nritems = btrfs_header_nritems(leaf);
+ data_end = leaf_data_end(leaf);
+ total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
+
+ if (btrfs_leaf_free_space(leaf) < total_size) {
+ btrfs_print_leaf(leaf);
+ btrfs_crit(fs_info, "not enough freespace need %u have %d",
+ total_size, btrfs_leaf_free_space(leaf));
+ BUG();
+ }
+
+ btrfs_init_map_token(&token, leaf);
+ if (slot != nritems) {
+ unsigned int old_data = btrfs_item_data_end(leaf, slot);
+
+ if (old_data < data_end) {
+ btrfs_print_leaf(leaf);
+ btrfs_crit(fs_info,
+ "item at slot %d with data offset %u beyond data end of leaf %u",
+ slot, old_data, data_end);
+ BUG();
+ }
+ /*
+ * item0..itemN ... dataN.offset..dataN.size .. data0.size
+ */
+ /* first correct the data pointers */
+ for (i = slot; i < nritems; i++) {
+ u32 ioff;
+
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i,
+ ioff - batch->total_data_size);
+ }
+ /* shift the items */
+ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr),
+ btrfs_item_nr_offset(slot),
+ (nritems - slot) * sizeof(struct btrfs_item));
+
+ /* shift the data */
+ memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
+ data_end - batch->total_data_size,
+ BTRFS_LEAF_DATA_OFFSET + data_end,
+ old_data - data_end);
+ data_end = old_data;
+ }
+
+ /* setup the item for the new data */
+ for (i = 0; i < batch->nr; i++) {
+ btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
+ btrfs_set_item_key(leaf, &disk_key, slot + i);
+ data_end -= batch->data_sizes[i];
+ btrfs_set_token_item_offset(&token, slot + i, data_end);
+ btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]);
+ }
+
+ btrfs_set_header_nritems(leaf, nritems + batch->nr);
+ btrfs_mark_buffer_dirty(leaf);
+
+ if (btrfs_leaf_free_space(leaf) < 0) {
+ btrfs_print_leaf(leaf);
+ BUG();
+ }
+}
+
+/*
+ * Insert a new item into a leaf.
+ *
+ * @root: The root of the btree.
+ * @path: A path pointing to the target leaf and slot.
+ * @key: The key of the new item.
+ * @data_size: The size of the data associated with the new key.
+ */
+void btrfs_setup_item_for_insert(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct btrfs_key *key,
+ u32 data_size)
+{
+ struct btrfs_item_batch batch;
+
+ batch.keys = key;
+ batch.data_sizes = &data_size;
+ batch.total_data_size = data_size;
+ batch.nr = 1;
+
+ setup_items_for_insert(root, path, &batch);
+}
+
+/*
+ * Given a key and some data, insert items into the tree.
+ * This does all the path init required, making room in the tree if needed.
+ */
+int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct btrfs_item_batch *batch)
+{
+ int ret = 0;
+ int slot;
+ u32 total_size;
+
+ total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
+ ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1);
+ if (ret == 0)
+ return -EEXIST;
+ if (ret < 0)
+ return ret;
+
+ slot = path->slots[0];
+ BUG_ON(slot < 0);
+
+ setup_items_for_insert(root, path, batch);
+ return 0;
+}
+
+/*
+ * Given a key and some data, insert an item into the tree.
+ * This does all the path init required, making room in the tree if needed.
+ */
+int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ const struct btrfs_key *cpu_key, void *data,
+ u32 data_size)
+{
+ int ret = 0;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ unsigned long ptr;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
+ if (!ret) {
+ leaf = path->nodes[0];
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ write_extent_buffer(leaf, data, ptr, data_size);
+ btrfs_mark_buffer_dirty(leaf);
+ }
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * This function duplicates an item, giving 'new_key' to the new item.
+ * It guarantees both items live in the same tree leaf and the new item is
+ * contiguous with the original item.
+ *
+ * This allows us to split a file extent in place, keeping a lock on the leaf
+ * the entire time.
+ */
+int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct btrfs_key *new_key)
+{
+ struct extent_buffer *leaf;
+ int ret;
+ u32 item_size;
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size(leaf, path->slots[0]);
+ ret = setup_leaf_for_split(trans, root, path,
+ item_size + sizeof(struct btrfs_item));
+ if (ret)
+ return ret;
+
+ path->slots[0]++;
+ btrfs_setup_item_for_insert(root, path, new_key, item_size);
+ leaf = path->nodes[0];
+ memcpy_extent_buffer(leaf,
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
+ btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
+ item_size);
+ return 0;
+}
+
+/*
+ * delete the pointer from a given node.
+ *
+ * the tree should have been previously balanced so the deletion does not
+ * empty a node.
+ */
+static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
+ int level, int slot)
+{
+ struct extent_buffer *parent = path->nodes[level];
+ u32 nritems;
+ int ret;
+
+ nritems = btrfs_header_nritems(parent);
+ if (slot != nritems - 1) {
+ if (level) {
+ ret = btrfs_tree_mod_log_insert_move(parent, slot,
+ slot + 1, nritems - slot - 1);
+ BUG_ON(ret < 0);
+ }
+ memmove_extent_buffer(parent,
+ btrfs_node_key_ptr_offset(slot),
+ btrfs_node_key_ptr_offset(slot + 1),
+ sizeof(struct btrfs_key_ptr) *
+ (nritems - slot - 1));
+ } else if (level) {
+ ret = btrfs_tree_mod_log_insert_key(parent, slot,
+ BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
+ BUG_ON(ret < 0);
+ }
+
+ nritems--;
+ btrfs_set_header_nritems(parent, nritems);
+ if (nritems == 0 && parent == root->node) {
+ BUG_ON(btrfs_header_level(root->node) != 1);
+ /* just turn the root into a leaf and break */
+ btrfs_set_header_level(root->node, 0);
+ } else if (slot == 0) {
+ struct btrfs_disk_key disk_key;
+
+ btrfs_node_key(parent, &disk_key, 0);
+ fixup_low_keys(path, &disk_key, level + 1);
+ }
+ btrfs_mark_buffer_dirty(parent);
+}
+
+/*
+ * a helper function to delete the leaf pointed to by path->slots[1] and
+ * path->nodes[1].
+ *
+ * This deletes the pointer in path->nodes[1] and frees the leaf
+ * block extent. zero is returned if it all worked out, < 0 otherwise.
+ *
+ * The path must have already been setup for deleting the leaf, including
+ * all the proper balancing. path->nodes[1] must be locked.
+ */
+static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *leaf)
+{
+ WARN_ON(btrfs_header_generation(leaf) != trans->transid);
+ del_ptr(root, path, 1, path->slots[1]);
+
+ /*
+ * btrfs_free_extent is expensive, we want to make sure we
+ * aren't holding any locks when we call it
+ */
+ btrfs_unlock_up_safe(path, 0);
+
+ root_sub_used(root, leaf->len);
+
+ atomic_inc(&leaf->refs);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
+ free_extent_buffer_stale(leaf);
+}
+/*
+ * delete the item at the leaf level in path. If that empties
+ * the leaf, remove it from the tree
+ */
+int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_path *path, int slot, int nr)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_buffer *leaf;
+ int ret = 0;
+ int wret;
+ u32 nritems;
+
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+
+ if (slot + nr != nritems) {
+ const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
+ const int data_end = leaf_data_end(leaf);
+ struct btrfs_map_token token;
+ u32 dsize = 0;
+ int i;
+
+ for (i = 0; i < nr; i++)
+ dsize += btrfs_item_size(leaf, slot + i);
+
+ memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
+ data_end + dsize,
+ BTRFS_LEAF_DATA_OFFSET + data_end,
+ last_off - data_end);
+
+ btrfs_init_map_token(&token, leaf);
+ for (i = slot + nr; i < nritems; i++) {
+ u32 ioff;
+
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i, ioff + dsize);
+ }
+
+ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
+ btrfs_item_nr_offset(slot + nr),
+ sizeof(struct btrfs_item) *
+ (nritems - slot - nr));
+ }
+ btrfs_set_header_nritems(leaf, nritems - nr);
+ nritems -= nr;
+
+ /* delete the leaf if we've emptied it */
+ if (nritems == 0) {
+ if (leaf == root->node) {
+ btrfs_set_header_level(leaf, 0);
+ } else {
+ btrfs_clean_tree_block(leaf);
+ btrfs_del_leaf(trans, root, path, leaf);
+ }
+ } else {
+ int used = leaf_space_used(leaf, 0, nritems);
+ if (slot == 0) {
+ struct btrfs_disk_key disk_key;
+
+ btrfs_item_key(leaf, &disk_key, 0);
+ fixup_low_keys(path, &disk_key, 1);
+ }
+
+ /*
+ * Try to delete the leaf if it is mostly empty. We do this by
+ * trying to move all its items into its left and right neighbours.
+ * If we can't move all the items, then we don't delete it - it's
+ * not ideal, but future insertions might fill the leaf with more
+ * items, or items from other leaves might be moved later into our
+ * leaf due to deletions on those leaves.
+ */
+ if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
+ u32 min_push_space;
+
+ /* push_leaf_left fixes the path.
+ * make sure the path still points to our leaf
+ * for possible call to del_ptr below
+ */
+ slot = path->slots[1];
+ atomic_inc(&leaf->refs);
+ /*
+ * We want to be able to at least push one item to the
+ * left neighbour leaf, and that's the first item.
+ */
+ min_push_space = sizeof(struct btrfs_item) +
+ btrfs_item_size(leaf, 0);
+ wret = push_leaf_left(trans, root, path, 0,
+ min_push_space, 1, (u32)-1);
+ if (wret < 0 && wret != -ENOSPC)
+ ret = wret;
+
+ if (path->nodes[0] == leaf &&
+ btrfs_header_nritems(leaf)) {
+ /*
+ * If we were not able to push all items from our
+ * leaf to its left neighbour, then attempt to
+ * either push all the remaining items to the
+ * right neighbour or none. There's no advantage
+ * in pushing only some items, instead of all, as
+ * it's pointless to end up with a leaf having
+ * too few items while the neighbours can be full
+ * or nearly full.
+ */
+ nritems = btrfs_header_nritems(leaf);
+ min_push_space = leaf_space_used(leaf, 0, nritems);
+ wret = push_leaf_right(trans, root, path, 0,
+ min_push_space, 1, 0);
+ if (wret < 0 && wret != -ENOSPC)
+ ret = wret;
+ }
+
+ if (btrfs_header_nritems(leaf) == 0) {
+ path->slots[1] = slot;
+ btrfs_del_leaf(trans, root, path, leaf);
+ free_extent_buffer(leaf);
+ ret = 0;
+ } else {
+ /* if we're still in the path, make sure
+ * we're dirty. Otherwise, one of the
+ * push_leaf functions must have already
+ * dirtied this buffer
+ */
+ if (path->nodes[0] == leaf)
+ btrfs_mark_buffer_dirty(leaf);
+ free_extent_buffer(leaf);
+ }
+ } else {
+ btrfs_mark_buffer_dirty(leaf);
+ }
+ }
+ return ret;
+}
+
+/*
+ * search the tree again to find a leaf with lesser keys
+ * returns 0 if it found something or 1 if there are no lesser leaves.
+ * returns < 0 on io errors.
+ *
+ * This may release the path, and so you may lose any locks held at the
+ * time you call it.
+ */
+int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
+{
+ struct btrfs_key key;
+ struct btrfs_key orig_key;
+ struct btrfs_disk_key found_key;
+ int ret;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
+ orig_key = key;
+
+ if (key.offset > 0) {
+ key.offset--;
+ } else if (key.type > 0) {
+ key.type--;
+ key.offset = (u64)-1;
+ } else if (key.objectid > 0) {
+ key.objectid--;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+ } else {
+ return 1;
+ }
+
+ btrfs_release_path(path);
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret <= 0)
+ return ret;
+
+ /*
+ * Previous key not found. Even if we were at slot 0 of the leaf we had
+ * before releasing the path and calling btrfs_search_slot(), we now may
+ * be in a slot pointing to the same original key - this can happen if
+ * after we released the path, one of more items were moved from a
+ * sibling leaf into the front of the leaf we had due to an insertion
+ * (see push_leaf_right()).
+ * If we hit this case and our slot is > 0 and just decrement the slot
+ * so that the caller does not process the same key again, which may or
+ * may not break the caller, depending on its logic.
+ */
+ if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
+ btrfs_item_key(path->nodes[0], &found_key, path->slots[0]);
+ ret = comp_keys(&found_key, &orig_key);
+ if (ret == 0) {
+ if (path->slots[0] > 0) {
+ path->slots[0]--;
+ return 0;
+ }
+ /*
+ * At slot 0, same key as before, it means orig_key is
+ * the lowest, leftmost, key in the tree. We're done.
+ */
+ return 1;
+ }
+ }
+
+ btrfs_item_key(path->nodes[0], &found_key, 0);
+ ret = comp_keys(&found_key, &key);
+ /*
+ * We might have had an item with the previous key in the tree right
+ * before we released our path. And after we released our path, that
+ * item might have been pushed to the first slot (0) of the leaf we
+ * were holding due to a tree balance. Alternatively, an item with the
+ * previous key can exist as the only element of a leaf (big fat item).
+ * Therefore account for these 2 cases, so that our callers (like
+ * btrfs_previous_item) don't miss an existing item with a key matching
+ * the previous key we computed above.
+ */
+ if (ret <= 0)
+ return 0;
+ return 1;
+}
+
+/*
+ * A helper function to walk down the tree starting at min_key, and looking
+ * for nodes or leaves that are have a minimum transaction id.
+ * This is used by the btree defrag code, and tree logging
+ *
+ * This does not cow, but it does stuff the starting key it finds back
+ * into min_key, so you can call btrfs_search_slot with cow=1 on the
+ * key and get a writable path.
+ *
+ * This honors path->lowest_level to prevent descent past a given level
+ * of the tree.
+ *
+ * min_trans indicates the oldest transaction that you are interested
+ * in walking through. Any nodes or leaves older than min_trans are
+ * skipped over (without reading them).
+ *
+ * returns zero if something useful was found, < 0 on error and 1 if there
+ * was nothing in the tree that matched the search criteria.
+ */
+int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
+ struct btrfs_path *path,
+ u64 min_trans)
+{
+ struct extent_buffer *cur;
+ struct btrfs_key found_key;
+ int slot;
+ int sret;
+ u32 nritems;
+ int level;
+ int ret = 1;
+ int keep_locks = path->keep_locks;
+
+ ASSERT(!path->nowait);
+ path->keep_locks = 1;
+again:
+ cur = btrfs_read_lock_root_node(root);
+ level = btrfs_header_level(cur);
+ WARN_ON(path->nodes[level]);
+ path->nodes[level] = cur;
+ path->locks[level] = BTRFS_READ_LOCK;
+
+ if (btrfs_header_generation(cur) < min_trans) {
+ ret = 1;
+ goto out;
+ }
+ while (1) {
+ nritems = btrfs_header_nritems(cur);
+ level = btrfs_header_level(cur);
+ sret = btrfs_bin_search(cur, min_key, &slot);
+ if (sret < 0) {
+ ret = sret;
+ goto out;
+ }
+
+ /* at the lowest level, we're done, setup the path and exit */
+ if (level == path->lowest_level) {
+ if (slot >= nritems)
+ goto find_next_key;
+ ret = 0;
+ path->slots[level] = slot;
+ btrfs_item_key_to_cpu(cur, &found_key, slot);
+ goto out;
+ }
+ if (sret && slot > 0)
+ slot--;
+ /*
+ * check this node pointer against the min_trans parameters.
+ * If it is too old, skip to the next one.
+ */
+ while (slot < nritems) {
+ u64 gen;
+
+ gen = btrfs_node_ptr_generation(cur, slot);
+ if (gen < min_trans) {
+ slot++;
+ continue;
+ }
+ break;
+ }
+find_next_key:
+ /*
+ * we didn't find a candidate key in this node, walk forward
+ * and find another one
+ */
+ if (slot >= nritems) {
+ path->slots[level] = slot;
+ sret = btrfs_find_next_key(root, path, min_key, level,
+ min_trans);
+ if (sret == 0) {
+ btrfs_release_path(path);
+ goto again;
+ } else {
+ goto out;
+ }
+ }
+ /* save our key for returning back */
+ btrfs_node_key_to_cpu(cur, &found_key, slot);
+ path->slots[level] = slot;
+ if (level == path->lowest_level) {
+ ret = 0;
+ goto out;
+ }
+ cur = btrfs_read_node_slot(cur, slot);
+ if (IS_ERR(cur)) {
+ ret = PTR_ERR(cur);
+ goto out;
+ }
+
+ btrfs_tree_read_lock(cur);
+
+ path->locks[level - 1] = BTRFS_READ_LOCK;
+ path->nodes[level - 1] = cur;
+ unlock_up(path, level, 1, 0, NULL);
+ }
+out:
+ path->keep_locks = keep_locks;
+ if (ret == 0) {
+ btrfs_unlock_up_safe(path, path->lowest_level + 1);
+ memcpy(min_key, &found_key, sizeof(found_key));
+ }
+ return ret;
+}
+
+/*
+ * this is similar to btrfs_next_leaf, but does not try to preserve
+ * and fixup the path. It looks for and returns the next key in the
+ * tree based on the current path and the min_trans parameters.
+ *
+ * 0 is returned if another key is found, < 0 if there are any errors
+ * and 1 is returned if there are no higher keys in the tree
+ *
+ * path->keep_locks should be set to 1 on the search made before
+ * calling this function.
+ */
+int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *key, int level, u64 min_trans)
+{
+ int slot;
+ struct extent_buffer *c;
+
+ WARN_ON(!path->keep_locks && !path->skip_locking);
+ while (level < BTRFS_MAX_LEVEL) {
+ if (!path->nodes[level])
+ return 1;
+
+ slot = path->slots[level] + 1;
+ c = path->nodes[level];
+next:
+ if (slot >= btrfs_header_nritems(c)) {
+ int ret;
+ int orig_lowest;
+ struct btrfs_key cur_key;
+ if (level + 1 >= BTRFS_MAX_LEVEL ||
+ !path->nodes[level + 1])
+ return 1;
+
+ if (path->locks[level + 1] || path->skip_locking) {
+ level++;
+ continue;
+ }
+
+ slot = btrfs_header_nritems(c) - 1;
+ if (level == 0)
+ btrfs_item_key_to_cpu(c, &cur_key, slot);
+ else
+ btrfs_node_key_to_cpu(c, &cur_key, slot);
+
+ orig_lowest = path->lowest_level;
+ btrfs_release_path(path);
+ path->lowest_level = level;
+ ret = btrfs_search_slot(NULL, root, &cur_key, path,
+ 0, 0);
+ path->lowest_level = orig_lowest;
+ if (ret < 0)
+ return ret;
+
+ c = path->nodes[level];
+ slot = path->slots[level];
+ if (ret == 0)
+ slot++;
+ goto next;
+ }
+
+ if (level == 0)
+ btrfs_item_key_to_cpu(c, key, slot);
+ else {
+ u64 gen = btrfs_node_ptr_generation(c, slot);
+
+ if (gen < min_trans) {
+ slot++;
+ goto next;
+ }
+ btrfs_node_key_to_cpu(c, key, slot);
+ }
+ return 0;
+ }
+ return 1;
+}
+
+int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
+ u64 time_seq)
+{
+ int slot;
+ int level;
+ struct extent_buffer *c;
+ struct extent_buffer *next;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_key key;
+ bool need_commit_sem = false;
+ u32 nritems;
+ int ret;
+ int i;
+
+ /*
+ * The nowait semantics are used only for write paths, where we don't
+ * use the tree mod log and sequence numbers.
+ */
+ if (time_seq)
+ ASSERT(!path->nowait);
+
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ if (nritems == 0)
+ return 1;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
+again:
+ level = 1;
+ next = NULL;
+ btrfs_release_path(path);
+
+ path->keep_locks = 1;
+
+ if (time_seq) {
+ ret = btrfs_search_old_slot(root, &key, path, time_seq);
+ } else {
+ if (path->need_commit_sem) {
+ path->need_commit_sem = 0;
+ need_commit_sem = true;
+ if (path->nowait) {
+ if (!down_read_trylock(&fs_info->commit_root_sem)) {
+ ret = -EAGAIN;
+ goto done;
+ }
+ } else {
+ down_read(&fs_info->commit_root_sem);
+ }
+ }
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ }
+ path->keep_locks = 0;
+
+ if (ret < 0)
+ goto done;
+
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ /*
+ * by releasing the path above we dropped all our locks. A balance
+ * could have added more items next to the key that used to be
+ * at the very end of the block. So, check again here and
+ * advance the path if there are now more items available.
+ */
+ if (nritems > 0 && path->slots[0] < nritems - 1) {
+ if (ret == 0)
+ path->slots[0]++;
+ ret = 0;
+ goto done;
+ }
+ /*
+ * So the above check misses one case:
+ * - after releasing the path above, someone has removed the item that
+ * used to be at the very end of the block, and balance between leafs
+ * gets another one with bigger key.offset to replace it.
+ *
+ * This one should be returned as well, or we can get leaf corruption
+ * later(esp. in __btrfs_drop_extents()).
+ *
+ * And a bit more explanation about this check,
+ * with ret > 0, the key isn't found, the path points to the slot
+ * where it should be inserted, so the path->slots[0] item must be the
+ * bigger one.
+ */
+ if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
+ ret = 0;
+ goto done;
+ }
+
+ while (level < BTRFS_MAX_LEVEL) {
+ if (!path->nodes[level]) {
+ ret = 1;
+ goto done;
+ }
+
+ slot = path->slots[level] + 1;
+ c = path->nodes[level];
+ if (slot >= btrfs_header_nritems(c)) {
+ level++;
+ if (level == BTRFS_MAX_LEVEL) {
+ ret = 1;
+ goto done;
+ }
+ continue;
+ }
+
+
+ /*
+ * Our current level is where we're going to start from, and to
+ * make sure lockdep doesn't complain we need to drop our locks
+ * and nodes from 0 to our current level.
+ */
+ for (i = 0; i < level; i++) {
+ if (path->locks[level]) {
+ btrfs_tree_read_unlock(path->nodes[i]);
+ path->locks[i] = 0;
+ }
+ free_extent_buffer(path->nodes[i]);
+ path->nodes[i] = NULL;
+ }
+
+ next = c;
+ ret = read_block_for_search(root, path, &next, level,
+ slot, &key);
+ if (ret == -EAGAIN && !path->nowait)
+ goto again;
+
+ if (ret < 0) {
+ btrfs_release_path(path);
+ goto done;
+ }
+
+ if (!path->skip_locking) {
+ ret = btrfs_try_tree_read_lock(next);
+ if (!ret && path->nowait) {
+ ret = -EAGAIN;
+ goto done;
+ }
+ if (!ret && time_seq) {
+ /*
+ * If we don't get the lock, we may be racing
+ * with push_leaf_left, holding that lock while
+ * itself waiting for the leaf we've currently
+ * locked. To solve this situation, we give up
+ * on our lock and cycle.
+ */
+ free_extent_buffer(next);
+ btrfs_release_path(path);
+ cond_resched();
+ goto again;
+ }
+ if (!ret)
+ btrfs_tree_read_lock(next);
+ }
+ break;
+ }
+ path->slots[level] = slot;
+ while (1) {
+ level--;
+ path->nodes[level] = next;
+ path->slots[level] = 0;
+ if (!path->skip_locking)
+ path->locks[level] = BTRFS_READ_LOCK;
+ if (!level)
+ break;
+
+ ret = read_block_for_search(root, path, &next, level,
+ 0, &key);
+ if (ret == -EAGAIN && !path->nowait)
+ goto again;
+
+ if (ret < 0) {
+ btrfs_release_path(path);
+ goto done;
+ }
+
+ if (!path->skip_locking) {
+ if (path->nowait) {
+ if (!btrfs_try_tree_read_lock(next)) {
+ ret = -EAGAIN;
+ goto done;
+ }
+ } else {
+ btrfs_tree_read_lock(next);
+ }
+ }
+ }
+ ret = 0;
+done:
+ unlock_up(path, 0, 1, 0, NULL);
+ if (need_commit_sem) {
+ int ret2;
+
+ path->need_commit_sem = 1;
+ ret2 = finish_need_commit_sem_search(path);
+ up_read(&fs_info->commit_root_sem);
+ if (ret2)
+ ret = ret2;
+ }
+
+ return ret;
+}
+
+/*
+ * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
+ * searching until it gets past min_objectid or finds an item of 'type'
+ *
+ * returns 0 if something is found, 1 if nothing was found and < 0 on error
+ */
+int btrfs_previous_item(struct btrfs_root *root,
+ struct btrfs_path *path, u64 min_objectid,
+ int type)
+{
+ struct btrfs_key found_key;
+ struct extent_buffer *leaf;
+ u32 nritems;
+ int ret;
+
+ while (1) {
+ if (path->slots[0] == 0) {
+ ret = btrfs_prev_leaf(root, path);
+ if (ret != 0)
+ return ret;
+ } else {
+ path->slots[0]--;
+ }
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ if (nritems == 0)
+ return 1;
+ if (path->slots[0] == nritems)
+ path->slots[0]--;
+
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.objectid < min_objectid)
+ break;
+ if (found_key.type == type)
+ return 0;
+ if (found_key.objectid == min_objectid &&
+ found_key.type < type)
+ break;
+ }
+ return 1;
+}
+
+/*
+ * search in extent tree to find a previous Metadata/Data extent item with
+ * min objecitd.
+ *
+ * returns 0 if something is found, 1 if nothing was found and < 0 on error
+ */
+int btrfs_previous_extent_item(struct btrfs_root *root,
+ struct btrfs_path *path, u64 min_objectid)
+{
+ struct btrfs_key found_key;
+ struct extent_buffer *leaf;
+ u32 nritems;
+ int ret;
+
+ while (1) {
+ if (path->slots[0] == 0) {
+ ret = btrfs_prev_leaf(root, path);
+ if (ret != 0)
+ return ret;
+ } else {
+ path->slots[0]--;
+ }
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ if (nritems == 0)
+ return 1;
+ if (path->slots[0] == nritems)
+ path->slots[0]--;
+
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.objectid < min_objectid)
+ break;
+ if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
+ found_key.type == BTRFS_METADATA_ITEM_KEY)
+ return 0;
+ if (found_key.objectid == min_objectid &&
+ found_key.type < BTRFS_EXTENT_ITEM_KEY)
+ break;
+ }
+ return 1;
+}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
new file mode 100644
index 000000000..cca1acf2e
--- /dev/null
+++ b/fs/btrfs/ctree.h
@@ -0,0 +1,4132 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ */
+
+#ifndef BTRFS_CTREE_H
+#define BTRFS_CTREE_H
+
+#include <linux/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/highmem.h>
+#include <linux/fs.h>
+#include <linux/rwsem.h>
+#include <linux/semaphore.h>
+#include <linux/completion.h>
+#include <linux/backing-dev.h>
+#include <linux/wait.h>
+#include <linux/slab.h>
+#include <trace/events/btrfs.h>
+#include <asm/unaligned.h>
+#include <linux/pagemap.h>
+#include <linux/btrfs.h>
+#include <linux/btrfs_tree.h>
+#include <linux/workqueue.h>
+#include <linux/security.h>
+#include <linux/sizes.h>
+#include <linux/dynamic_debug.h>
+#include <linux/refcount.h>
+#include <linux/crc32c.h>
+#include <linux/iomap.h>
+#include <linux/fscrypt.h>
+#include "extent-io-tree.h"
+#include "extent_io.h"
+#include "extent_map.h"
+#include "async-thread.h"
+#include "block-rsv.h"
+#include "locking.h"
+
+struct btrfs_trans_handle;
+struct btrfs_transaction;
+struct btrfs_pending_snapshot;
+struct btrfs_delayed_ref_root;
+struct btrfs_space_info;
+struct btrfs_block_group;
+extern struct kmem_cache *btrfs_trans_handle_cachep;
+extern struct kmem_cache *btrfs_path_cachep;
+extern struct kmem_cache *btrfs_free_space_cachep;
+extern struct kmem_cache *btrfs_free_space_bitmap_cachep;
+struct btrfs_ordered_sum;
+struct btrfs_ref;
+struct btrfs_bio;
+struct btrfs_ioctl_encoded_io_args;
+struct btrfs_device;
+struct btrfs_fs_devices;
+struct btrfs_balance_control;
+struct btrfs_delayed_root;
+struct reloc_control;
+
+#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
+
+/*
+ * Maximum number of mirrors that can be available for all profiles counting
+ * the target device of dev-replace as one. During an active device replace
+ * procedure, the target device of the copy operation is a mirror for the
+ * filesystem data as well that can be used to read data in order to repair
+ * read errors on other disks.
+ *
+ * Current value is derived from RAID1C4 with 4 copies.
+ */
+#define BTRFS_MAX_MIRRORS (4 + 1)
+
+#define BTRFS_MAX_LEVEL 8
+
+#define BTRFS_OLDEST_GENERATION 0ULL
+
+/*
+ * we can actually store much bigger names, but lets not confuse the rest
+ * of linux
+ */
+#define BTRFS_NAME_LEN 255
+
+/*
+ * Theoretical limit is larger, but we keep this down to a sane
+ * value. That should limit greatly the possibility of collisions on
+ * inode ref items.
+ */
+#define BTRFS_LINK_MAX 65535U
+
+#define BTRFS_EMPTY_DIR_SIZE 0
+
+/* ioprio of readahead is set to idle */
+#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
+
+#define BTRFS_DIRTY_METADATA_THRESH SZ_32M
+
+/*
+ * Use large batch size to reduce overhead of metadata updates. On the reader
+ * side, we only read it when we are close to ENOSPC and the read overhead is
+ * mostly related to the number of CPUs, so it is OK to use arbitrary large
+ * value here.
+ */
+#define BTRFS_TOTAL_BYTES_PINNED_BATCH SZ_128M
+
+#define BTRFS_MAX_EXTENT_SIZE SZ_128M
+
+/*
+ * Deltas are an effective way to populate global statistics. Give macro names
+ * to make it clear what we're doing. An example is discard_extents in
+ * btrfs_free_space_ctl.
+ */
+#define BTRFS_STAT_NR_ENTRIES 2
+#define BTRFS_STAT_CURR 0
+#define BTRFS_STAT_PREV 1
+
+static inline unsigned long btrfs_chunk_item_size(int num_stripes)
+{
+ BUG_ON(num_stripes == 0);
+ return sizeof(struct btrfs_chunk) +
+ sizeof(struct btrfs_stripe) * (num_stripes - 1);
+}
+
+/*
+ * Runtime (in-memory) states of filesystem
+ */
+enum {
+ /* Global indicator of serious filesystem errors */
+ BTRFS_FS_STATE_ERROR,
+ /*
+ * Filesystem is being remounted, allow to skip some operations, like
+ * defrag
+ */
+ BTRFS_FS_STATE_REMOUNTING,
+ /* Filesystem in RO mode */
+ BTRFS_FS_STATE_RO,
+ /* Track if a transaction abort has been reported on this filesystem */
+ BTRFS_FS_STATE_TRANS_ABORTED,
+ /*
+ * Bio operations should be blocked on this filesystem because a source
+ * or target device is being destroyed as part of a device replace
+ */
+ BTRFS_FS_STATE_DEV_REPLACING,
+ /* The btrfs_fs_info created for self-tests */
+ BTRFS_FS_STATE_DUMMY_FS_INFO,
+
+ BTRFS_FS_STATE_NO_CSUMS,
+
+ /* Indicates there was an error cleaning up a log tree. */
+ BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
+
+ BTRFS_FS_STATE_COUNT
+};
+
+#define BTRFS_BACKREF_REV_MAX 256
+#define BTRFS_BACKREF_REV_SHIFT 56
+#define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \
+ BTRFS_BACKREF_REV_SHIFT)
+
+#define BTRFS_OLD_BACKREF_REV 0
+#define BTRFS_MIXED_BACKREF_REV 1
+
+/*
+ * every tree block (leaf or node) starts with this header.
+ */
+struct btrfs_header {
+ /* these first four must match the super block */
+ u8 csum[BTRFS_CSUM_SIZE];
+ u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
+ __le64 bytenr; /* which block this node is supposed to live in */
+ __le64 flags;
+
+ /* allowed to be different from the super from here on down */
+ u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
+ __le64 generation;
+ __le64 owner;
+ __le32 nritems;
+ u8 level;
+} __attribute__ ((__packed__));
+
+/*
+ * this is a very generous portion of the super block, giving us
+ * room to translate 14 chunks with 3 stripes each.
+ */
+#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
+
+/*
+ * just in case we somehow lose the roots and are not able to mount,
+ * we store an array of the roots from previous transactions
+ * in the super.
+ */
+#define BTRFS_NUM_BACKUP_ROOTS 4
+struct btrfs_root_backup {
+ __le64 tree_root;
+ __le64 tree_root_gen;
+
+ __le64 chunk_root;
+ __le64 chunk_root_gen;
+
+ __le64 extent_root;
+ __le64 extent_root_gen;
+
+ __le64 fs_root;
+ __le64 fs_root_gen;
+
+ __le64 dev_root;
+ __le64 dev_root_gen;
+
+ __le64 csum_root;
+ __le64 csum_root_gen;
+
+ __le64 total_bytes;
+ __le64 bytes_used;
+ __le64 num_devices;
+ /* future */
+ __le64 unused_64[4];
+
+ u8 tree_root_level;
+ u8 chunk_root_level;
+ u8 extent_root_level;
+ u8 fs_root_level;
+ u8 dev_root_level;
+ u8 csum_root_level;
+ /* future and to align */
+ u8 unused_8[10];
+} __attribute__ ((__packed__));
+
+#define BTRFS_SUPER_INFO_OFFSET SZ_64K
+#define BTRFS_SUPER_INFO_SIZE 4096
+
+/*
+ * The reserved space at the beginning of each device.
+ * It covers the primary super block and leaves space for potential use by other
+ * tools like bootloaders or to lower potential damage of accidental overwrite.
+ */
+#define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M)
+
+/*
+ * the super block basically lists the main trees of the FS
+ * it currently lacks any block count etc etc
+ */
+struct btrfs_super_block {
+ /* the first 4 fields must match struct btrfs_header */
+ u8 csum[BTRFS_CSUM_SIZE];
+ /* FS specific UUID, visible to user */
+ u8 fsid[BTRFS_FSID_SIZE];
+ __le64 bytenr; /* this block number */
+ __le64 flags;
+
+ /* allowed to be different from the btrfs_header from here own down */
+ __le64 magic;
+ __le64 generation;
+ __le64 root;
+ __le64 chunk_root;
+ __le64 log_root;
+
+ /*
+ * This member has never been utilized since the very beginning, thus
+ * it's always 0 regardless of kernel version. We always use
+ * generation + 1 to read log tree root. So here we mark it deprecated.
+ */
+ __le64 __unused_log_root_transid;
+ __le64 total_bytes;
+ __le64 bytes_used;
+ __le64 root_dir_objectid;
+ __le64 num_devices;
+ __le32 sectorsize;
+ __le32 nodesize;
+ __le32 __unused_leafsize;
+ __le32 stripesize;
+ __le32 sys_chunk_array_size;
+ __le64 chunk_root_generation;
+ __le64 compat_flags;
+ __le64 compat_ro_flags;
+ __le64 incompat_flags;
+ __le16 csum_type;
+ u8 root_level;
+ u8 chunk_root_level;
+ u8 log_root_level;
+ struct btrfs_dev_item dev_item;
+
+ char label[BTRFS_LABEL_SIZE];
+
+ __le64 cache_generation;
+ __le64 uuid_tree_generation;
+
+ /* the UUID written into btree blocks */
+ u8 metadata_uuid[BTRFS_FSID_SIZE];
+
+ /* future expansion */
+ u8 reserved8[8];
+ __le64 reserved[27];
+ u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
+ struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
+
+ /* Padded to 4096 bytes */
+ u8 padding[565];
+} __attribute__ ((__packed__));
+static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);
+
+/*
+ * Compat flags that we support. If any incompat flags are set other than the
+ * ones specified below then we will fail to mount
+ */
+#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
+#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL
+#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL
+
+#define BTRFS_FEATURE_COMPAT_RO_SUPP \
+ (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \
+ BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \
+ BTRFS_FEATURE_COMPAT_RO_VERITY | \
+ BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE)
+
+#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL
+#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL
+
+#ifdef CONFIG_BTRFS_DEBUG
+/*
+ * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG
+ */
+#define BTRFS_FEATURE_INCOMPAT_SUPP \
+ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
+ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
+ BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
+ BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
+ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
+ BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \
+ BTRFS_FEATURE_INCOMPAT_RAID56 | \
+ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \
+ BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
+ BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
+ BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
+ BTRFS_FEATURE_INCOMPAT_ZONED | \
+ BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2)
+#else
+#define BTRFS_FEATURE_INCOMPAT_SUPP \
+ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
+ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
+ BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
+ BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
+ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
+ BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \
+ BTRFS_FEATURE_INCOMPAT_RAID56 | \
+ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \
+ BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
+ BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
+ BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
+ BTRFS_FEATURE_INCOMPAT_ZONED)
+#endif
+
+#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \
+ (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
+#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL
+
+/*
+ * A leaf is full of items. offset and size tell us where to find
+ * the item in the leaf (relative to the start of the data area)
+ */
+struct btrfs_item {
+ struct btrfs_disk_key key;
+ __le32 offset;
+ __le32 size;
+} __attribute__ ((__packed__));
+
+/*
+ * leaves have an item area and a data area:
+ * [item0, item1....itemN] [free space] [dataN...data1, data0]
+ *
+ * The data is separate from the items to get the keys closer together
+ * during searches.
+ */
+struct btrfs_leaf {
+ struct btrfs_header header;
+ struct btrfs_item items[];
+} __attribute__ ((__packed__));
+
+/*
+ * all non-leaf blocks are nodes, they hold only keys and pointers to
+ * other blocks
+ */
+struct btrfs_key_ptr {
+ struct btrfs_disk_key key;
+ __le64 blockptr;
+ __le64 generation;
+} __attribute__ ((__packed__));
+
+struct btrfs_node {
+ struct btrfs_header header;
+ struct btrfs_key_ptr ptrs[];
+} __attribute__ ((__packed__));
+
+/* Read ahead values for struct btrfs_path.reada */
+enum {
+ READA_NONE,
+ READA_BACK,
+ READA_FORWARD,
+ /*
+ * Similar to READA_FORWARD but unlike it:
+ *
+ * 1) It will trigger readahead even for leaves that are not close to
+ * each other on disk;
+ * 2) It also triggers readahead for nodes;
+ * 3) During a search, even when a node or leaf is already in memory, it
+ * will still trigger readahead for other nodes and leaves that follow
+ * it.
+ *
+ * This is meant to be used only when we know we are iterating over the
+ * entire tree or a very large part of it.
+ */
+ READA_FORWARD_ALWAYS,
+};
+
+/*
+ * btrfs_paths remember the path taken from the root down to the leaf.
+ * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point
+ * to any other levels that are present.
+ *
+ * The slots array records the index of the item or block pointer
+ * used while walking the tree.
+ */
+struct btrfs_path {
+ struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
+ int slots[BTRFS_MAX_LEVEL];
+ /* if there is real range locking, this locks field will change */
+ u8 locks[BTRFS_MAX_LEVEL];
+ u8 reada;
+ /* keep some upper locks as we walk down */
+ u8 lowest_level;
+
+ /*
+ * set by btrfs_split_item, tells search_slot to keep all locks
+ * and to force calls to keep space in the nodes
+ */
+ unsigned int search_for_split:1;
+ unsigned int keep_locks:1;
+ unsigned int skip_locking:1;
+ unsigned int search_commit_root:1;
+ unsigned int need_commit_sem:1;
+ unsigned int skip_release_on_error:1;
+ /*
+ * Indicate that new item (btrfs_search_slot) is extending already
+ * existing item and ins_len contains only the data size and not item
+ * header (ie. sizeof(struct btrfs_item) is not included).
+ */
+ unsigned int search_for_extension:1;
+ /* Stop search if any locks need to be taken (for read) */
+ unsigned int nowait:1;
+};
+
+struct btrfs_dev_replace {
+ u64 replace_state; /* see #define above */
+ time64_t time_started; /* seconds since 1-Jan-1970 */
+ time64_t time_stopped; /* seconds since 1-Jan-1970 */
+ atomic64_t num_write_errors;
+ atomic64_t num_uncorrectable_read_errors;
+
+ u64 cursor_left;
+ u64 committed_cursor_left;
+ u64 cursor_left_last_write_of_item;
+ u64 cursor_right;
+
+ u64 cont_reading_from_srcdev_mode; /* see #define above */
+
+ int is_valid;
+ int item_needs_writeback;
+ struct btrfs_device *srcdev;
+ struct btrfs_device *tgtdev;
+
+ struct mutex lock_finishing_cancel_unmount;
+ struct rw_semaphore rwsem;
+
+ struct btrfs_scrub_progress scrub_progress;
+
+ struct percpu_counter bio_counter;
+ wait_queue_head_t replace_wait;
+};
+
+/*
+ * free clusters are used to claim free space in relatively large chunks,
+ * allowing us to do less seeky writes. They are used for all metadata
+ * allocations. In ssd_spread mode they are also used for data allocations.
+ */
+struct btrfs_free_cluster {
+ spinlock_t lock;
+ spinlock_t refill_lock;
+ struct rb_root root;
+
+ /* largest extent in this cluster */
+ u64 max_size;
+
+ /* first extent starting offset */
+ u64 window_start;
+
+ /* We did a full search and couldn't create a cluster */
+ bool fragmented;
+
+ struct btrfs_block_group *block_group;
+ /*
+ * when a cluster is allocated from a block group, we put the
+ * cluster onto a list in the block group so that it can
+ * be freed before the block group is freed.
+ */
+ struct list_head block_group_list;
+};
+
+/* Discard control. */
+/*
+ * Async discard uses multiple lists to differentiate the discard filter
+ * parameters. Index 0 is for completely free block groups where we need to
+ * ensure the entire block group is trimmed without being lossy. Indices
+ * afterwards represent monotonically decreasing discard filter sizes to
+ * prioritize what should be discarded next.
+ */
+#define BTRFS_NR_DISCARD_LISTS 3
+#define BTRFS_DISCARD_INDEX_UNUSED 0
+#define BTRFS_DISCARD_INDEX_START 1
+
+struct btrfs_discard_ctl {
+ struct workqueue_struct *discard_workers;
+ struct delayed_work work;
+ spinlock_t lock;
+ struct btrfs_block_group *block_group;
+ struct list_head discard_list[BTRFS_NR_DISCARD_LISTS];
+ u64 prev_discard;
+ u64 prev_discard_time;
+ atomic_t discardable_extents;
+ atomic64_t discardable_bytes;
+ u64 max_discard_size;
+ u64 delay_ms;
+ u32 iops_limit;
+ u32 kbps_limit;
+ u64 discard_extent_bytes;
+ u64 discard_bitmap_bytes;
+ atomic64_t discard_bytes_saved;
+};
+
+enum {
+ BTRFS_FS_CLOSING_START,
+ BTRFS_FS_CLOSING_DONE,
+ BTRFS_FS_LOG_RECOVERING,
+ BTRFS_FS_OPEN,
+ BTRFS_FS_QUOTA_ENABLED,
+ BTRFS_FS_UPDATE_UUID_TREE_GEN,
+ BTRFS_FS_CREATING_FREE_SPACE_TREE,
+ BTRFS_FS_BTREE_ERR,
+ BTRFS_FS_LOG1_ERR,
+ BTRFS_FS_LOG2_ERR,
+ BTRFS_FS_QUOTA_OVERRIDE,
+ /* Used to record internally whether fs has been frozen */
+ BTRFS_FS_FROZEN,
+ /*
+ * Indicate that balance has been set up from the ioctl and is in the
+ * main phase. The fs_info::balance_ctl is initialized.
+ */
+ BTRFS_FS_BALANCE_RUNNING,
+
+ /*
+ * Indicate that relocation of a chunk has started, it's set per chunk
+ * and is toggled between chunks.
+ */
+ BTRFS_FS_RELOC_RUNNING,
+
+ /* Indicate that the cleaner thread is awake and doing something. */
+ BTRFS_FS_CLEANER_RUNNING,
+
+ /*
+ * The checksumming has an optimized version and is considered fast,
+ * so we don't need to offload checksums to workqueues.
+ */
+ BTRFS_FS_CSUM_IMPL_FAST,
+
+ /* Indicate that the discard workqueue can service discards. */
+ BTRFS_FS_DISCARD_RUNNING,
+
+ /* Indicate that we need to cleanup space cache v1 */
+ BTRFS_FS_CLEANUP_SPACE_CACHE_V1,
+
+ /* Indicate that we can't trust the free space tree for caching yet */
+ BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED,
+
+ /* Indicate whether there are any tree modification log users */
+ BTRFS_FS_TREE_MOD_LOG_USERS,
+
+ /* Indicate that we want the transaction kthread to commit right now. */
+ BTRFS_FS_COMMIT_TRANS,
+
+ /* Indicate we have half completed snapshot deletions pending. */
+ BTRFS_FS_UNFINISHED_DROPS,
+
+ /* Indicate we have to finish a zone to do next allocation. */
+ BTRFS_FS_NEED_ZONE_FINISH,
+
+ /* This is set when active zone tracking is needed. */
+ BTRFS_FS_ACTIVE_ZONE_TRACKING,
+
+#if BITS_PER_LONG == 32
+ /* Indicate if we have error/warn message printed on 32bit systems */
+ BTRFS_FS_32BIT_ERROR,
+ BTRFS_FS_32BIT_WARN,
+#endif
+};
+
+/*
+ * Exclusive operations (device replace, resize, device add/remove, balance)
+ */
+enum btrfs_exclusive_operation {
+ BTRFS_EXCLOP_NONE,
+ BTRFS_EXCLOP_BALANCE_PAUSED,
+ BTRFS_EXCLOP_BALANCE,
+ BTRFS_EXCLOP_DEV_ADD,
+ BTRFS_EXCLOP_DEV_REMOVE,
+ BTRFS_EXCLOP_DEV_REPLACE,
+ BTRFS_EXCLOP_RESIZE,
+ BTRFS_EXCLOP_SWAP_ACTIVATE,
+};
+
+/* Store data about transaction commits, exported via sysfs. */
+struct btrfs_commit_stats {
+ /* Total number of commits */
+ u64 commit_count;
+ /* The maximum commit duration so far in ns */
+ u64 max_commit_dur;
+ /* The last commit duration in ns */
+ u64 last_commit_dur;
+ /* The total commit duration in ns */
+ u64 total_commit_dur;
+};
+
+struct btrfs_fs_info {
+ u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
+ unsigned long flags;
+ struct btrfs_root *tree_root;
+ struct btrfs_root *chunk_root;
+ struct btrfs_root *dev_root;
+ struct btrfs_root *fs_root;
+ struct btrfs_root *quota_root;
+ struct btrfs_root *uuid_root;
+ struct btrfs_root *data_reloc_root;
+ struct btrfs_root *block_group_root;
+
+ /* the log root tree is a directory of all the other log roots */
+ struct btrfs_root *log_root_tree;
+
+ /* The tree that holds the global roots (csum, extent, etc) */
+ rwlock_t global_root_lock;
+ struct rb_root global_root_tree;
+
+ spinlock_t fs_roots_radix_lock;
+ struct radix_tree_root fs_roots_radix;
+
+ /* block group cache stuff */
+ rwlock_t block_group_cache_lock;
+ struct rb_root_cached block_group_cache_tree;
+
+ /* keep track of unallocated space */
+ atomic64_t free_chunk_space;
+
+ /* Track ranges which are used by log trees blocks/logged data extents */
+ struct extent_io_tree excluded_extents;
+
+ /* logical->physical extent mapping */
+ struct extent_map_tree mapping_tree;
+
+ /*
+ * block reservation for extent, checksum, root tree and
+ * delayed dir index item
+ */
+ struct btrfs_block_rsv global_block_rsv;
+ /* block reservation for metadata operations */
+ struct btrfs_block_rsv trans_block_rsv;
+ /* block reservation for chunk tree */
+ struct btrfs_block_rsv chunk_block_rsv;
+ /* block reservation for delayed operations */
+ struct btrfs_block_rsv delayed_block_rsv;
+ /* block reservation for delayed refs */
+ struct btrfs_block_rsv delayed_refs_rsv;
+
+ struct btrfs_block_rsv empty_block_rsv;
+
+ u64 generation;
+ u64 last_trans_committed;
+ /*
+ * Generation of the last transaction used for block group relocation
+ * since the filesystem was last mounted (or 0 if none happened yet).
+ * Must be written and read while holding btrfs_fs_info::commit_root_sem.
+ */
+ u64 last_reloc_trans;
+ u64 avg_delayed_ref_runtime;
+
+ /*
+ * this is updated to the current trans every time a full commit
+ * is required instead of the faster short fsync log commits
+ */
+ u64 last_trans_log_full_commit;
+ unsigned long mount_opt;
+ /*
+ * Track requests for actions that need to be done during transaction
+ * commit (like for some mount options).
+ */
+ unsigned long pending_changes;
+ unsigned long compress_type:4;
+ unsigned int compress_level;
+ u32 commit_interval;
+ /*
+ * It is a suggestive number, the read side is safe even it gets a
+ * wrong number because we will write out the data into a regular
+ * extent. The write side(mount/remount) is under ->s_umount lock,
+ * so it is also safe.
+ */
+ u64 max_inline;
+
+ struct btrfs_transaction *running_transaction;
+ wait_queue_head_t transaction_throttle;
+ wait_queue_head_t transaction_wait;
+ wait_queue_head_t transaction_blocked_wait;
+ wait_queue_head_t async_submit_wait;
+
+ /*
+ * Used to protect the incompat_flags, compat_flags, compat_ro_flags
+ * when they are updated.
+ *
+ * Because we do not clear the flags for ever, so we needn't use
+ * the lock on the read side.
+ *
+ * We also needn't use the lock when we mount the fs, because
+ * there is no other task which will update the flag.
+ */
+ spinlock_t super_lock;
+ struct btrfs_super_block *super_copy;
+ struct btrfs_super_block *super_for_commit;
+ struct super_block *sb;
+ struct inode *btree_inode;
+ struct mutex tree_log_mutex;
+ struct mutex transaction_kthread_mutex;
+ struct mutex cleaner_mutex;
+ struct mutex chunk_mutex;
+
+ /*
+ * this is taken to make sure we don't set block groups ro after
+ * the free space cache has been allocated on them
+ */
+ struct mutex ro_block_group_mutex;
+
+ /* this is used during read/modify/write to make sure
+ * no two ios are trying to mod the same stripe at the same
+ * time
+ */
+ struct btrfs_stripe_hash_table *stripe_hash_table;
+
+ /*
+ * this protects the ordered operations list only while we are
+ * processing all of the entries on it. This way we make
+ * sure the commit code doesn't find the list temporarily empty
+ * because another function happens to be doing non-waiting preflush
+ * before jumping into the main commit.
+ */
+ struct mutex ordered_operations_mutex;
+
+ struct rw_semaphore commit_root_sem;
+
+ struct rw_semaphore cleanup_work_sem;
+
+ struct rw_semaphore subvol_sem;
+
+ spinlock_t trans_lock;
+ /*
+ * the reloc mutex goes with the trans lock, it is taken
+ * during commit to protect us from the relocation code
+ */
+ struct mutex reloc_mutex;
+
+ struct list_head trans_list;
+ struct list_head dead_roots;
+ struct list_head caching_block_groups;
+
+ spinlock_t delayed_iput_lock;
+ struct list_head delayed_iputs;
+ atomic_t nr_delayed_iputs;
+ wait_queue_head_t delayed_iputs_wait;
+
+ atomic64_t tree_mod_seq;
+
+ /* this protects tree_mod_log and tree_mod_seq_list */
+ rwlock_t tree_mod_log_lock;
+ struct rb_root tree_mod_log;
+ struct list_head tree_mod_seq_list;
+
+ atomic_t async_delalloc_pages;
+
+ /*
+ * this is used to protect the following list -- ordered_roots.
+ */
+ spinlock_t ordered_root_lock;
+
+ /*
+ * all fs/file tree roots in which there are data=ordered extents
+ * pending writeback are added into this list.
+ *
+ * these can span multiple transactions and basically include
+ * every dirty data page that isn't from nodatacow
+ */
+ struct list_head ordered_roots;
+
+ struct mutex delalloc_root_mutex;
+ spinlock_t delalloc_root_lock;
+ /* all fs/file tree roots that have delalloc inodes. */
+ struct list_head delalloc_roots;
+
+ /*
+ * there is a pool of worker threads for checksumming during writes
+ * and a pool for checksumming after reads. This is because readers
+ * can run with FS locks held, and the writers may be waiting for
+ * those locks. We don't want ordering in the pending list to cause
+ * deadlocks, and so the two are serviced separately.
+ *
+ * A third pool does submit_bio to avoid deadlocking with the other
+ * two
+ */
+ struct btrfs_workqueue *workers;
+ struct btrfs_workqueue *hipri_workers;
+ struct btrfs_workqueue *delalloc_workers;
+ struct btrfs_workqueue *flush_workers;
+ struct workqueue_struct *endio_workers;
+ struct workqueue_struct *endio_meta_workers;
+ struct workqueue_struct *endio_raid56_workers;
+ struct workqueue_struct *rmw_workers;
+ struct workqueue_struct *compressed_write_workers;
+ struct btrfs_workqueue *endio_write_workers;
+ struct btrfs_workqueue *endio_freespace_worker;
+ struct btrfs_workqueue *caching_workers;
+
+ /*
+ * fixup workers take dirty pages that didn't properly go through
+ * the cow mechanism and make them safe to write. It happens
+ * for the sys_munmap function call path
+ */
+ struct btrfs_workqueue *fixup_workers;
+ struct btrfs_workqueue *delayed_workers;
+
+ struct task_struct *transaction_kthread;
+ struct task_struct *cleaner_kthread;
+ u32 thread_pool_size;
+
+ struct kobject *space_info_kobj;
+ struct kobject *qgroups_kobj;
+ struct kobject *discard_kobj;
+
+ /* used to keep from writing metadata until there is a nice batch */
+ struct percpu_counter dirty_metadata_bytes;
+ struct percpu_counter delalloc_bytes;
+ struct percpu_counter ordered_bytes;
+ s32 dirty_metadata_batch;
+ s32 delalloc_batch;
+
+ struct list_head dirty_cowonly_roots;
+
+ struct btrfs_fs_devices *fs_devices;
+
+ /*
+ * The space_info list is effectively read only after initial
+ * setup. It is populated at mount time and cleaned up after
+ * all block groups are removed. RCU is used to protect it.
+ */
+ struct list_head space_info;
+
+ struct btrfs_space_info *data_sinfo;
+
+ struct reloc_control *reloc_ctl;
+
+ /* data_alloc_cluster is only used in ssd_spread mode */
+ struct btrfs_free_cluster data_alloc_cluster;
+
+ /* all metadata allocations go through this cluster */
+ struct btrfs_free_cluster meta_alloc_cluster;
+
+ /* auto defrag inodes go here */
+ spinlock_t defrag_inodes_lock;
+ struct rb_root defrag_inodes;
+ atomic_t defrag_running;
+
+ /* Used to protect avail_{data, metadata, system}_alloc_bits */
+ seqlock_t profiles_lock;
+ /*
+ * these three are in extended format (availability of single
+ * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
+ * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits)
+ */
+ u64 avail_data_alloc_bits;
+ u64 avail_metadata_alloc_bits;
+ u64 avail_system_alloc_bits;
+
+ /* restriper state */
+ spinlock_t balance_lock;
+ struct mutex balance_mutex;
+ atomic_t balance_pause_req;
+ atomic_t balance_cancel_req;
+ struct btrfs_balance_control *balance_ctl;
+ wait_queue_head_t balance_wait_q;
+
+ /* Cancellation requests for chunk relocation */
+ atomic_t reloc_cancel_req;
+
+ u32 data_chunk_allocations;
+ u32 metadata_ratio;
+
+ void *bdev_holder;
+
+ /* private scrub information */
+ struct mutex scrub_lock;
+ atomic_t scrubs_running;
+ atomic_t scrub_pause_req;
+ atomic_t scrubs_paused;
+ atomic_t scrub_cancel_req;
+ wait_queue_head_t scrub_pause_wait;
+ /*
+ * The worker pointers are NULL iff the refcount is 0, ie. scrub is not
+ * running.
+ */
+ refcount_t scrub_workers_refcnt;
+ struct workqueue_struct *scrub_workers;
+ struct workqueue_struct *scrub_wr_completion_workers;
+ struct workqueue_struct *scrub_parity_workers;
+ struct btrfs_subpage_info *subpage_info;
+
+ struct btrfs_discard_ctl discard_ctl;
+
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ u32 check_integrity_print_mask;
+#endif
+ /* is qgroup tracking in a consistent state? */
+ u64 qgroup_flags;
+
+ /* holds configuration and tracking. Protected by qgroup_lock */
+ struct rb_root qgroup_tree;
+ spinlock_t qgroup_lock;
+
+ /*
+ * used to avoid frequently calling ulist_alloc()/ulist_free()
+ * when doing qgroup accounting, it must be protected by qgroup_lock.
+ */
+ struct ulist *qgroup_ulist;
+
+ /*
+ * Protect user change for quota operations. If a transaction is needed,
+ * it must be started before locking this lock.
+ */
+ struct mutex qgroup_ioctl_lock;
+
+ /* list of dirty qgroups to be written at next commit */
+ struct list_head dirty_qgroups;
+
+ /* used by qgroup for an efficient tree traversal */
+ u64 qgroup_seq;
+
+ /* qgroup rescan items */
+ struct mutex qgroup_rescan_lock; /* protects the progress item */
+ struct btrfs_key qgroup_rescan_progress;
+ struct btrfs_workqueue *qgroup_rescan_workers;
+ struct completion qgroup_rescan_completion;
+ struct btrfs_work qgroup_rescan_work;
+ bool qgroup_rescan_running; /* protected by qgroup_rescan_lock */
+ u8 qgroup_drop_subtree_thres;
+
+ /* filesystem state */
+ unsigned long fs_state;
+
+ struct btrfs_delayed_root *delayed_root;
+
+ /* Extent buffer radix tree */
+ spinlock_t buffer_lock;
+ /* Entries are eb->start / sectorsize */
+ struct radix_tree_root buffer_radix;
+
+ /* next backup root to be overwritten */
+ int backup_root_index;
+
+ /* device replace state */
+ struct btrfs_dev_replace dev_replace;
+
+ struct semaphore uuid_tree_rescan_sem;
+
+ /* Used to reclaim the metadata space in the background. */
+ struct work_struct async_reclaim_work;
+ struct work_struct async_data_reclaim_work;
+ struct work_struct preempt_reclaim_work;
+
+ /* Reclaim partially filled block groups in the background */
+ struct work_struct reclaim_bgs_work;
+ struct list_head reclaim_bgs;
+ int bg_reclaim_threshold;
+
+ spinlock_t unused_bgs_lock;
+ struct list_head unused_bgs;
+ struct mutex unused_bg_unpin_mutex;
+ /* Protect block groups that are going to be deleted */
+ struct mutex reclaim_bgs_lock;
+
+ /* Cached block sizes */
+ u32 nodesize;
+ u32 sectorsize;
+ /* ilog2 of sectorsize, use to avoid 64bit division */
+ u32 sectorsize_bits;
+ u32 csum_size;
+ u32 csums_per_leaf;
+ u32 stripesize;
+
+ /*
+ * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular
+ * filesystem, on zoned it depends on the device constraints.
+ */
+ u64 max_extent_size;
+
+ /* Block groups and devices containing active swapfiles. */
+ spinlock_t swapfile_pins_lock;
+ struct rb_root swapfile_pins;
+
+ struct crypto_shash *csum_shash;
+
+ /* Type of exclusive operation running, protected by super_lock */
+ enum btrfs_exclusive_operation exclusive_operation;
+
+ /*
+ * Zone size > 0 when in ZONED mode, otherwise it's used for a check
+ * if the mode is enabled
+ */
+ u64 zone_size;
+
+ /* Max size to emit ZONE_APPEND write command */
+ u64 max_zone_append_size;
+ struct mutex zoned_meta_io_lock;
+ spinlock_t treelog_bg_lock;
+ u64 treelog_bg;
+
+ /*
+ * Start of the dedicated data relocation block group, protected by
+ * relocation_bg_lock.
+ */
+ spinlock_t relocation_bg_lock;
+ u64 data_reloc_bg;
+ struct mutex zoned_data_reloc_io_lock;
+
+ u64 nr_global_roots;
+
+ spinlock_t zone_active_bgs_lock;
+ struct list_head zone_active_bgs;
+
+ /* Updates are not protected by any lock */
+ struct btrfs_commit_stats commit_stats;
+
+ /*
+ * Last generation where we dropped a non-relocation root.
+ * Use btrfs_set_last_root_drop_gen() and btrfs_get_last_root_drop_gen()
+ * to change it and to read it, respectively.
+ */
+ u64 last_root_drop_gen;
+
+ /*
+ * Annotations for transaction events (structures are empty when
+ * compiled without lockdep).
+ */
+ struct lockdep_map btrfs_trans_num_writers_map;
+ struct lockdep_map btrfs_trans_num_extwriters_map;
+ struct lockdep_map btrfs_state_change_map[4];
+ struct lockdep_map btrfs_trans_pending_ordered_map;
+ struct lockdep_map btrfs_ordered_extent_map;
+
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ spinlock_t ref_verify_lock;
+ struct rb_root block_tree;
+#endif
+
+#ifdef CONFIG_BTRFS_DEBUG
+ struct kobject *debug_kobj;
+ struct list_head allocated_roots;
+
+ spinlock_t eb_leak_lock;
+ struct list_head allocated_ebs;
+#endif
+};
+
+static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info,
+ u64 gen)
+{
+ WRITE_ONCE(fs_info->last_root_drop_gen, gen);
+}
+
+static inline u64 btrfs_get_last_root_drop_gen(const struct btrfs_fs_info *fs_info)
+{
+ return READ_ONCE(fs_info->last_root_drop_gen);
+}
+
+static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
+{
+ return sb->s_fs_info;
+}
+
+/*
+ * Take the number of bytes to be checksummed and figure out how many leaves
+ * it would require to store the csums for that many bytes.
+ */
+static inline u64 btrfs_csum_bytes_to_leaves(
+ const struct btrfs_fs_info *fs_info, u64 csum_bytes)
+{
+ const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits;
+
+ return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf);
+}
+
+/*
+ * Use this if we would be adding new items, as we could split nodes as we cow
+ * down the tree.
+ */
+static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info,
+ unsigned num_items)
+{
+ return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
+}
+
+/*
+ * Doing a truncate or a modification won't result in new nodes or leaves, just
+ * what we need for COW.
+ */
+static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info,
+ unsigned num_items)
+{
+ return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
+}
+
+#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
+ sizeof(struct btrfs_item))
+
+static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
+{
+ return fs_info->zone_size > 0;
+}
+
+/*
+ * Count how many fs_info->max_extent_size cover the @size
+ */
+static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size)
+{
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (!fs_info)
+ return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE);
+#endif
+
+ return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size);
+}
+
+bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type);
+bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type);
+void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info);
+void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
+void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation op);
+
+/*
+ * The state of btrfs root
+ */
+enum {
+ /*
+ * btrfs_record_root_in_trans is a multi-step process, and it can race
+ * with the balancing code. But the race is very small, and only the
+ * first time the root is added to each transaction. So IN_TRANS_SETUP
+ * is used to tell us when more checks are required
+ */
+ BTRFS_ROOT_IN_TRANS_SETUP,
+
+ /*
+ * Set if tree blocks of this root can be shared by other roots.
+ * Only subvolume trees and their reloc trees have this bit set.
+ * Conflicts with TRACK_DIRTY bit.
+ *
+ * This affects two things:
+ *
+ * - How balance works
+ * For shareable roots, we need to use reloc tree and do path
+ * replacement for balance, and need various pre/post hooks for
+ * snapshot creation to handle them.
+ *
+ * While for non-shareable trees, we just simply do a tree search
+ * with COW.
+ *
+ * - How dirty roots are tracked
+ * For shareable roots, btrfs_record_root_in_trans() is needed to
+ * track them, while non-subvolume roots have TRACK_DIRTY bit, they
+ * don't need to set this manually.
+ */
+ BTRFS_ROOT_SHAREABLE,
+ BTRFS_ROOT_TRACK_DIRTY,
+ BTRFS_ROOT_IN_RADIX,
+ BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
+ BTRFS_ROOT_DEFRAG_RUNNING,
+ BTRFS_ROOT_FORCE_COW,
+ BTRFS_ROOT_MULTI_LOG_TASKS,
+ BTRFS_ROOT_DIRTY,
+ BTRFS_ROOT_DELETING,
+
+ /*
+ * Reloc tree is orphan, only kept here for qgroup delayed subtree scan
+ *
+ * Set for the subvolume tree owning the reloc tree.
+ */
+ BTRFS_ROOT_DEAD_RELOC_TREE,
+ /* Mark dead root stored on device whose cleanup needs to be resumed */
+ BTRFS_ROOT_DEAD_TREE,
+ /* The root has a log tree. Used for subvolume roots and the tree root. */
+ BTRFS_ROOT_HAS_LOG_TREE,
+ /* Qgroup flushing is in progress */
+ BTRFS_ROOT_QGROUP_FLUSHING,
+ /* We started the orphan cleanup for this root. */
+ BTRFS_ROOT_ORPHAN_CLEANUP,
+ /* This root has a drop operation that was started previously. */
+ BTRFS_ROOT_UNFINISHED_DROP,
+ /* This reloc root needs to have its buffers lockdep class reset. */
+ BTRFS_ROOT_RESET_LOCKDEP_CLASS,
+};
+
+enum btrfs_lockdep_trans_states {
+ BTRFS_LOCKDEP_TRANS_COMMIT_START,
+ BTRFS_LOCKDEP_TRANS_UNBLOCKED,
+ BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED,
+ BTRFS_LOCKDEP_TRANS_COMPLETED,
+};
+
+/*
+ * Lockdep annotation for wait events.
+ *
+ * @owner: The struct where the lockdep map is defined
+ * @lock: The lockdep map corresponding to a wait event
+ *
+ * This macro is used to annotate a wait event. In this case a thread acquires
+ * the lockdep map as writer (exclusive lock) because it has to block until all
+ * the threads that hold the lock as readers signal the condition for the wait
+ * event and release their locks.
+ */
+#define btrfs_might_wait_for_event(owner, lock) \
+ do { \
+ rwsem_acquire(&owner->lock##_map, 0, 0, _THIS_IP_); \
+ rwsem_release(&owner->lock##_map, _THIS_IP_); \
+ } while (0)
+
+/*
+ * Protection for the resource/condition of a wait event.
+ *
+ * @owner: The struct where the lockdep map is defined
+ * @lock: The lockdep map corresponding to a wait event
+ *
+ * Many threads can modify the condition for the wait event at the same time
+ * and signal the threads that block on the wait event. The threads that modify
+ * the condition and do the signaling acquire the lock as readers (shared
+ * lock).
+ */
+#define btrfs_lockdep_acquire(owner, lock) \
+ rwsem_acquire_read(&owner->lock##_map, 0, 0, _THIS_IP_)
+
+/*
+ * Used after signaling the condition for a wait event to release the lockdep
+ * map held by a reader thread.
+ */
+#define btrfs_lockdep_release(owner, lock) \
+ rwsem_release(&owner->lock##_map, _THIS_IP_)
+
+/*
+ * Macros for the transaction states wait events, similar to the generic wait
+ * event macros.
+ */
+#define btrfs_might_wait_for_state(owner, i) \
+ do { \
+ rwsem_acquire(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_); \
+ rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_); \
+ } while (0)
+
+#define btrfs_trans_state_lockdep_acquire(owner, i) \
+ rwsem_acquire_read(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_)
+
+#define btrfs_trans_state_lockdep_release(owner, i) \
+ rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_)
+
+/* Initialization of the lockdep map */
+#define btrfs_lockdep_init_map(owner, lock) \
+ do { \
+ static struct lock_class_key lock##_key; \
+ lockdep_init_map(&owner->lock##_map, #lock, &lock##_key, 0); \
+ } while (0)
+
+/* Initialization of the transaction states lockdep maps. */
+#define btrfs_state_lockdep_init_map(owner, lock, state) \
+ do { \
+ static struct lock_class_key lock##_key; \
+ lockdep_init_map(&owner->btrfs_state_change_map[state], #lock, \
+ &lock##_key, 0); \
+ } while (0)
+
+static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
+{
+ clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
+}
+
+/*
+ * Record swapped tree blocks of a subvolume tree for delayed subtree trace
+ * code. For detail check comment in fs/btrfs/qgroup.c.
+ */
+struct btrfs_qgroup_swapped_blocks {
+ spinlock_t lock;
+ /* RM_EMPTY_ROOT() of above blocks[] */
+ bool swapped;
+ struct rb_root blocks[BTRFS_MAX_LEVEL];
+};
+
+/*
+ * in ram representation of the tree. extent_root is used for all allocations
+ * and for the extent tree extent_root root.
+ */
+struct btrfs_root {
+ struct rb_node rb_node;
+
+ struct extent_buffer *node;
+
+ struct extent_buffer *commit_root;
+ struct btrfs_root *log_root;
+ struct btrfs_root *reloc_root;
+
+ unsigned long state;
+ struct btrfs_root_item root_item;
+ struct btrfs_key root_key;
+ struct btrfs_fs_info *fs_info;
+ struct extent_io_tree dirty_log_pages;
+
+ struct mutex objectid_mutex;
+
+ spinlock_t accounting_lock;
+ struct btrfs_block_rsv *block_rsv;
+
+ struct mutex log_mutex;
+ wait_queue_head_t log_writer_wait;
+ wait_queue_head_t log_commit_wait[2];
+ struct list_head log_ctxs[2];
+ /* Used only for log trees of subvolumes, not for the log root tree */
+ atomic_t log_writers;
+ atomic_t log_commit[2];
+ /* Used only for log trees of subvolumes, not for the log root tree */
+ atomic_t log_batch;
+ int log_transid;
+ /* No matter the commit succeeds or not*/
+ int log_transid_committed;
+ /* Just be updated when the commit succeeds. */
+ int last_log_commit;
+ pid_t log_start_pid;
+
+ u64 last_trans;
+
+ u32 type;
+
+ u64 free_objectid;
+
+ struct btrfs_key defrag_progress;
+ struct btrfs_key defrag_max;
+
+ /* The dirty list is only used by non-shareable roots */
+ struct list_head dirty_list;
+
+ struct list_head root_list;
+
+ spinlock_t log_extents_lock[2];
+ struct list_head logged_list[2];
+
+ spinlock_t inode_lock;
+ /* red-black tree that keeps track of in-memory inodes */
+ struct rb_root inode_tree;
+
+ /*
+ * radix tree that keeps track of delayed nodes of every inode,
+ * protected by inode_lock
+ */
+ struct radix_tree_root delayed_nodes_tree;
+ /*
+ * right now this just gets used so that a root has its own devid
+ * for stat. It may be used for more later
+ */
+ dev_t anon_dev;
+
+ spinlock_t root_item_lock;
+ refcount_t refs;
+
+ struct mutex delalloc_mutex;
+ spinlock_t delalloc_lock;
+ /*
+ * all of the inodes that have delalloc bytes. It is possible for
+ * this list to be empty even when there is still dirty data=ordered
+ * extents waiting to finish IO.
+ */
+ struct list_head delalloc_inodes;
+ struct list_head delalloc_root;
+ u64 nr_delalloc_inodes;
+
+ struct mutex ordered_extent_mutex;
+ /*
+ * this is used by the balancing code to wait for all the pending
+ * ordered extents
+ */
+ spinlock_t ordered_extent_lock;
+
+ /*
+ * all of the data=ordered extents pending writeback
+ * these can span multiple transactions and basically include
+ * every dirty data page that isn't from nodatacow
+ */
+ struct list_head ordered_extents;
+ struct list_head ordered_root;
+ u64 nr_ordered_extents;
+
+ /*
+ * Not empty if this subvolume root has gone through tree block swap
+ * (relocation)
+ *
+ * Will be used by reloc_control::dirty_subvol_roots.
+ */
+ struct list_head reloc_dirty_list;
+
+ /*
+ * Number of currently running SEND ioctls to prevent
+ * manipulation with the read-only status via SUBVOL_SETFLAGS
+ */
+ int send_in_progress;
+ /*
+ * Number of currently running deduplication operations that have a
+ * destination inode belonging to this root. Protected by the lock
+ * root_item_lock.
+ */
+ int dedupe_in_progress;
+ /* For exclusion of snapshot creation and nocow writes */
+ struct btrfs_drew_lock snapshot_lock;
+
+ atomic_t snapshot_force_cow;
+
+ /* For qgroup metadata reserved space */
+ spinlock_t qgroup_meta_rsv_lock;
+ u64 qgroup_meta_rsv_pertrans;
+ u64 qgroup_meta_rsv_prealloc;
+ wait_queue_head_t qgroup_flush_wait;
+
+ /* Number of active swapfiles */
+ atomic_t nr_swapfiles;
+
+ /* Record pairs of swapped blocks for qgroup */
+ struct btrfs_qgroup_swapped_blocks swapped_blocks;
+
+ /* Used only by log trees, when logging csum items */
+ struct extent_io_tree log_csum_range;
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ u64 alloc_bytenr;
+#endif
+
+#ifdef CONFIG_BTRFS_DEBUG
+ struct list_head leak_list;
+#endif
+};
+
+/*
+ * Structure that conveys information about an extent that is going to replace
+ * all the extents in a file range.
+ */
+struct btrfs_replace_extent_info {
+ u64 disk_offset;
+ u64 disk_len;
+ u64 data_offset;
+ u64 data_len;
+ u64 file_offset;
+ /* Pointer to a file extent item of type regular or prealloc. */
+ char *extent_buf;
+ /*
+ * Set to true when attempting to replace a file range with a new extent
+ * described by this structure, set to false when attempting to clone an
+ * existing extent into a file range.
+ */
+ bool is_new_extent;
+ /* Indicate if we should update the inode's mtime and ctime. */
+ bool update_times;
+ /* Meaningful only if is_new_extent is true. */
+ int qgroup_reserved;
+ /*
+ * Meaningful only if is_new_extent is true.
+ * Used to track how many extent items we have already inserted in a
+ * subvolume tree that refer to the extent described by this structure,
+ * so that we know when to create a new delayed ref or update an existing
+ * one.
+ */
+ int insertions;
+};
+
+/* Arguments for btrfs_drop_extents() */
+struct btrfs_drop_extents_args {
+ /* Input parameters */
+
+ /*
+ * If NULL, btrfs_drop_extents() will allocate and free its own path.
+ * If 'replace_extent' is true, this must not be NULL. Also the path
+ * is always released except if 'replace_extent' is true and
+ * btrfs_drop_extents() sets 'extent_inserted' to true, in which case
+ * the path is kept locked.
+ */
+ struct btrfs_path *path;
+ /* Start offset of the range to drop extents from */
+ u64 start;
+ /* End (exclusive, last byte + 1) of the range to drop extents from */
+ u64 end;
+ /* If true drop all the extent maps in the range */
+ bool drop_cache;
+ /*
+ * If true it means we want to insert a new extent after dropping all
+ * the extents in the range. If this is true, the 'extent_item_size'
+ * parameter must be set as well and the 'extent_inserted' field will
+ * be set to true by btrfs_drop_extents() if it could insert the new
+ * extent.
+ * Note: when this is set to true the path must not be NULL.
+ */
+ bool replace_extent;
+ /*
+ * Used if 'replace_extent' is true. Size of the file extent item to
+ * insert after dropping all existing extents in the range
+ */
+ u32 extent_item_size;
+
+ /* Output parameters */
+
+ /*
+ * Set to the minimum between the input parameter 'end' and the end
+ * (exclusive, last byte + 1) of the last dropped extent. This is always
+ * set even if btrfs_drop_extents() returns an error.
+ */
+ u64 drop_end;
+ /*
+ * The number of allocated bytes found in the range. This can be smaller
+ * than the range's length when there are holes in the range.
+ */
+ u64 bytes_found;
+ /*
+ * Only set if 'replace_extent' is true. Set to true if we were able
+ * to insert a replacement extent after dropping all extents in the
+ * range, otherwise set to false by btrfs_drop_extents().
+ * Also, if btrfs_drop_extents() has set this to true it means it
+ * returned with the path locked, otherwise if it has set this to
+ * false it has returned with the path released.
+ */
+ bool extent_inserted;
+};
+
+struct btrfs_file_private {
+ void *filldir_buf;
+ u64 last_index;
+};
+
+
+static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info)
+{
+
+ return info->nodesize - sizeof(struct btrfs_header);
+}
+
+#define BTRFS_LEAF_DATA_OFFSET offsetof(struct btrfs_leaf, items)
+
+static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_fs_info *info)
+{
+ return BTRFS_LEAF_DATA_SIZE(info) - sizeof(struct btrfs_item);
+}
+
+static inline u32 BTRFS_NODEPTRS_PER_BLOCK(const struct btrfs_fs_info *info)
+{
+ return BTRFS_LEAF_DATA_SIZE(info) / sizeof(struct btrfs_key_ptr);
+}
+
+#define BTRFS_FILE_EXTENT_INLINE_DATA_START \
+ (offsetof(struct btrfs_file_extent_item, disk_bytenr))
+static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_fs_info *info)
+{
+ return BTRFS_MAX_ITEM_SIZE(info) -
+ BTRFS_FILE_EXTENT_INLINE_DATA_START;
+}
+
+static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
+{
+ return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item);
+}
+
+/*
+ * Flags for mount options.
+ *
+ * Note: don't forget to add new options to btrfs_show_options()
+ */
+enum {
+ BTRFS_MOUNT_NODATASUM = (1UL << 0),
+ BTRFS_MOUNT_NODATACOW = (1UL << 1),
+ BTRFS_MOUNT_NOBARRIER = (1UL << 2),
+ BTRFS_MOUNT_SSD = (1UL << 3),
+ BTRFS_MOUNT_DEGRADED = (1UL << 4),
+ BTRFS_MOUNT_COMPRESS = (1UL << 5),
+ BTRFS_MOUNT_NOTREELOG = (1UL << 6),
+ BTRFS_MOUNT_FLUSHONCOMMIT = (1UL << 7),
+ BTRFS_MOUNT_SSD_SPREAD = (1UL << 8),
+ BTRFS_MOUNT_NOSSD = (1UL << 9),
+ BTRFS_MOUNT_DISCARD_SYNC = (1UL << 10),
+ BTRFS_MOUNT_FORCE_COMPRESS = (1UL << 11),
+ BTRFS_MOUNT_SPACE_CACHE = (1UL << 12),
+ BTRFS_MOUNT_CLEAR_CACHE = (1UL << 13),
+ BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1UL << 14),
+ BTRFS_MOUNT_ENOSPC_DEBUG = (1UL << 15),
+ BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16),
+ BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17),
+ BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18),
+ BTRFS_MOUNT_CHECK_INTEGRITY = (1UL << 19),
+ BTRFS_MOUNT_CHECK_INTEGRITY_DATA = (1UL << 20),
+ BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 21),
+ BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 22),
+ BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 23),
+ BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 24),
+ BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 25),
+ BTRFS_MOUNT_NOLOGREPLAY = (1UL << 26),
+ BTRFS_MOUNT_REF_VERIFY = (1UL << 27),
+ BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28),
+ BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29),
+ BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30),
+};
+
+#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
+#define BTRFS_DEFAULT_MAX_INLINE (2048)
+
+#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
+#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
+#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt)
+#define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \
+ BTRFS_MOUNT_##opt)
+
+#define btrfs_set_and_info(fs_info, opt, fmt, args...) \
+do { \
+ if (!btrfs_test_opt(fs_info, opt)) \
+ btrfs_info(fs_info, fmt, ##args); \
+ btrfs_set_opt(fs_info->mount_opt, opt); \
+} while (0)
+
+#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \
+do { \
+ if (btrfs_test_opt(fs_info, opt)) \
+ btrfs_info(fs_info, fmt, ##args); \
+ btrfs_clear_opt(fs_info->mount_opt, opt); \
+} while (0)
+
+/*
+ * Requests for changes that need to be done during transaction commit.
+ *
+ * Internal mount options that are used for special handling of the real
+ * mount options (eg. cannot be set during remount and have to be set during
+ * transaction commit)
+ */
+
+#define BTRFS_PENDING_COMMIT (0)
+
+#define btrfs_test_pending(info, opt) \
+ test_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
+#define btrfs_set_pending(info, opt) \
+ set_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
+#define btrfs_clear_pending(info, opt) \
+ clear_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
+
+/*
+ * Helpers for setting pending mount option changes.
+ *
+ * Expects corresponding macros
+ * BTRFS_PENDING_SET_ and CLEAR_ + short mount option name
+ */
+#define btrfs_set_pending_and_info(info, opt, fmt, args...) \
+do { \
+ if (!btrfs_raw_test_opt((info)->mount_opt, opt)) { \
+ btrfs_info((info), fmt, ##args); \
+ btrfs_set_pending((info), SET_##opt); \
+ btrfs_clear_pending((info), CLEAR_##opt); \
+ } \
+} while(0)
+
+#define btrfs_clear_pending_and_info(info, opt, fmt, args...) \
+do { \
+ if (btrfs_raw_test_opt((info)->mount_opt, opt)) { \
+ btrfs_info((info), fmt, ##args); \
+ btrfs_set_pending((info), CLEAR_##opt); \
+ btrfs_clear_pending((info), SET_##opt); \
+ } \
+} while(0)
+
+/*
+ * Inode flags
+ */
+#define BTRFS_INODE_NODATASUM (1U << 0)
+#define BTRFS_INODE_NODATACOW (1U << 1)
+#define BTRFS_INODE_READONLY (1U << 2)
+#define BTRFS_INODE_NOCOMPRESS (1U << 3)
+#define BTRFS_INODE_PREALLOC (1U << 4)
+#define BTRFS_INODE_SYNC (1U << 5)
+#define BTRFS_INODE_IMMUTABLE (1U << 6)
+#define BTRFS_INODE_APPEND (1U << 7)
+#define BTRFS_INODE_NODUMP (1U << 8)
+#define BTRFS_INODE_NOATIME (1U << 9)
+#define BTRFS_INODE_DIRSYNC (1U << 10)
+#define BTRFS_INODE_COMPRESS (1U << 11)
+
+#define BTRFS_INODE_ROOT_ITEM_INIT (1U << 31)
+
+#define BTRFS_INODE_FLAG_MASK \
+ (BTRFS_INODE_NODATASUM | \
+ BTRFS_INODE_NODATACOW | \
+ BTRFS_INODE_READONLY | \
+ BTRFS_INODE_NOCOMPRESS | \
+ BTRFS_INODE_PREALLOC | \
+ BTRFS_INODE_SYNC | \
+ BTRFS_INODE_IMMUTABLE | \
+ BTRFS_INODE_APPEND | \
+ BTRFS_INODE_NODUMP | \
+ BTRFS_INODE_NOATIME | \
+ BTRFS_INODE_DIRSYNC | \
+ BTRFS_INODE_COMPRESS | \
+ BTRFS_INODE_ROOT_ITEM_INIT)
+
+#define BTRFS_INODE_RO_VERITY (1U << 0)
+
+#define BTRFS_INODE_RO_FLAG_MASK (BTRFS_INODE_RO_VERITY)
+
+struct btrfs_map_token {
+ struct extent_buffer *eb;
+ char *kaddr;
+ unsigned long offset;
+};
+
+#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \
+ ((bytes) >> (fs_info)->sectorsize_bits)
+
+static inline void btrfs_init_map_token(struct btrfs_map_token *token,
+ struct extent_buffer *eb)
+{
+ token->eb = eb;
+ token->kaddr = page_address(eb->pages[0]);
+ token->offset = 0;
+}
+
+/* some macros to generate set/get functions for the struct fields. This
+ * assumes there is a lefoo_to_cpu for every type, so lets make a simple
+ * one for u8:
+ */
+#define le8_to_cpu(v) (v)
+#define cpu_to_le8(v) (v)
+#define __le8 u8
+
+static inline u8 get_unaligned_le8(const void *p)
+{
+ return *(u8 *)p;
+}
+
+static inline void put_unaligned_le8(u8 val, void *p)
+{
+ *(u8 *)p = val;
+}
+
+#define read_eb_member(eb, ptr, type, member, result) (\
+ read_extent_buffer(eb, (char *)(result), \
+ ((unsigned long)(ptr)) + \
+ offsetof(type, member), \
+ sizeof(((type *)0)->member)))
+
+#define write_eb_member(eb, ptr, type, member, result) (\
+ write_extent_buffer(eb, (char *)(result), \
+ ((unsigned long)(ptr)) + \
+ offsetof(type, member), \
+ sizeof(((type *)0)->member)))
+
+#define DECLARE_BTRFS_SETGET_BITS(bits) \
+u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \
+ const void *ptr, unsigned long off); \
+void btrfs_set_token_##bits(struct btrfs_map_token *token, \
+ const void *ptr, unsigned long off, \
+ u##bits val); \
+u##bits btrfs_get_##bits(const struct extent_buffer *eb, \
+ const void *ptr, unsigned long off); \
+void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \
+ unsigned long off, u##bits val);
+
+DECLARE_BTRFS_SETGET_BITS(8)
+DECLARE_BTRFS_SETGET_BITS(16)
+DECLARE_BTRFS_SETGET_BITS(32)
+DECLARE_BTRFS_SETGET_BITS(64)
+
+#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
+static inline u##bits btrfs_##name(const struct extent_buffer *eb, \
+ const type *s) \
+{ \
+ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
+ return btrfs_get_##bits(eb, s, offsetof(type, member)); \
+} \
+static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \
+ u##bits val) \
+{ \
+ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
+ btrfs_set_##bits(eb, s, offsetof(type, member), val); \
+} \
+static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \
+ const type *s) \
+{ \
+ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
+ return btrfs_get_token_##bits(token, s, offsetof(type, member));\
+} \
+static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\
+ type *s, u##bits val) \
+{ \
+ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
+ btrfs_set_token_##bits(token, s, offsetof(type, member), val); \
+}
+
+#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
+static inline u##bits btrfs_##name(const struct extent_buffer *eb) \
+{ \
+ const type *p = page_address(eb->pages[0]) + \
+ offset_in_page(eb->start); \
+ return get_unaligned_le##bits(&p->member); \
+} \
+static inline void btrfs_set_##name(const struct extent_buffer *eb, \
+ u##bits val) \
+{ \
+ type *p = page_address(eb->pages[0]) + offset_in_page(eb->start); \
+ put_unaligned_le##bits(val, &p->member); \
+}
+
+#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \
+static inline u##bits btrfs_##name(const type *s) \
+{ \
+ return get_unaligned_le##bits(&s->member); \
+} \
+static inline void btrfs_set_##name(type *s, u##bits val) \
+{ \
+ put_unaligned_le##bits(val, &s->member); \
+}
+
+static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb,
+ struct btrfs_dev_item *s)
+{
+ static_assert(sizeof(u64) ==
+ sizeof(((struct btrfs_dev_item *)0))->total_bytes);
+ return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item,
+ total_bytes));
+}
+static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb,
+ struct btrfs_dev_item *s,
+ u64 val)
+{
+ static_assert(sizeof(u64) ==
+ sizeof(((struct btrfs_dev_item *)0))->total_bytes);
+ WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize));
+ btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val);
+}
+
+
+BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64);
+BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64);
+BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32);
+BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
+BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item,
+ start_offset, 64);
+BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32);
+BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
+BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32);
+BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8);
+BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8);
+BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item,
+ total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item,
+ bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item,
+ io_align, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item,
+ io_width, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item,
+ sector_size, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item,
+ dev_group, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item,
+ seek_speed, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item,
+ bandwidth, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item,
+ generation, 64);
+
+static inline unsigned long btrfs_device_uuid(struct btrfs_dev_item *d)
+{
+ return (unsigned long)d + offsetof(struct btrfs_dev_item, uuid);
+}
+
+static inline unsigned long btrfs_device_fsid(struct btrfs_dev_item *d)
+{
+ return (unsigned long)d + offsetof(struct btrfs_dev_item, fsid);
+}
+
+BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64);
+BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64);
+BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64);
+BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32);
+BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32);
+BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32);
+BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64);
+BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16);
+BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16);
+BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64);
+BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64);
+
+static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s)
+{
+ return (char *)s + offsetof(struct btrfs_stripe, dev_uuid);
+}
+
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk,
+ stripe_len, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk,
+ io_align, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk,
+ io_width, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk,
+ sector_size, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk,
+ num_stripes, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk,
+ sub_stripes, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64);
+
+static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c,
+ int nr)
+{
+ unsigned long offset = (unsigned long)c;
+ offset += offsetof(struct btrfs_chunk, stripe);
+ offset += nr * sizeof(struct btrfs_stripe);
+ return (struct btrfs_stripe *)offset;
+}
+
+static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr)
+{
+ return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr));
+}
+
+static inline u64 btrfs_stripe_offset_nr(const struct extent_buffer *eb,
+ struct btrfs_chunk *c, int nr)
+{
+ return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr));
+}
+
+static inline u64 btrfs_stripe_devid_nr(const struct extent_buffer *eb,
+ struct btrfs_chunk *c, int nr)
+{
+ return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr));
+}
+
+/* struct btrfs_block_group_item */
+BTRFS_SETGET_STACK_FUNCS(stack_block_group_used, struct btrfs_block_group_item,
+ used, 64);
+BTRFS_SETGET_FUNCS(block_group_used, struct btrfs_block_group_item,
+ used, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_block_group_chunk_objectid,
+ struct btrfs_block_group_item, chunk_objectid, 64);
+
+BTRFS_SETGET_FUNCS(block_group_chunk_objectid,
+ struct btrfs_block_group_item, chunk_objectid, 64);
+BTRFS_SETGET_FUNCS(block_group_flags,
+ struct btrfs_block_group_item, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags,
+ struct btrfs_block_group_item, flags, 64);
+
+/* struct btrfs_free_space_info */
+BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info,
+ extent_count, 32);
+BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32);
+
+/* struct btrfs_inode_ref */
+BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
+BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
+
+/* struct btrfs_inode_extref */
+BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref,
+ parent_objectid, 64);
+BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref,
+ name_len, 16);
+BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64);
+
+/* struct btrfs_inode_item */
+BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
+BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64);
+BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64);
+BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64);
+BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64);
+BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64);
+BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32);
+BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32);
+BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32);
+BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32);
+BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64);
+BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item,
+ generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item,
+ sequence, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item,
+ transid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item,
+ nbytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item,
+ block_group, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64);
+BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
+BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
+
+/* struct btrfs_dev_extent */
+BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent,
+ chunk_tree, 64);
+BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
+ chunk_objectid, 64);
+BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent,
+ chunk_offset, 64);
+BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64);
+BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64);
+BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item,
+ generation, 64);
+BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64);
+
+BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8);
+
+static inline void btrfs_tree_block_key(const struct extent_buffer *eb,
+ struct btrfs_tree_block_info *item,
+ struct btrfs_disk_key *key)
+{
+ read_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
+}
+
+static inline void btrfs_set_tree_block_key(const struct extent_buffer *eb,
+ struct btrfs_tree_block_info *item,
+ struct btrfs_disk_key *key)
+{
+ write_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
+}
+
+BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref,
+ root, 64);
+BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref,
+ objectid, 64);
+BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref,
+ offset, 64);
+BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref,
+ count, 32);
+
+BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref,
+ count, 32);
+
+BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref,
+ type, 8);
+BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref,
+ offset, 64);
+
+static inline u32 btrfs_extent_inline_ref_size(int type)
+{
+ if (type == BTRFS_TREE_BLOCK_REF_KEY ||
+ type == BTRFS_SHARED_BLOCK_REF_KEY)
+ return sizeof(struct btrfs_extent_inline_ref);
+ if (type == BTRFS_SHARED_DATA_REF_KEY)
+ return sizeof(struct btrfs_shared_data_ref) +
+ sizeof(struct btrfs_extent_inline_ref);
+ if (type == BTRFS_EXTENT_DATA_REF_KEY)
+ return sizeof(struct btrfs_extent_data_ref) +
+ offsetof(struct btrfs_extent_inline_ref, offset);
+ return 0;
+}
+
+/* struct btrfs_node */
+BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
+BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_key_blockptr, struct btrfs_key_ptr,
+ blockptr, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_key_generation, struct btrfs_key_ptr,
+ generation, 64);
+
+static inline u64 btrfs_node_blockptr(const struct extent_buffer *eb, int nr)
+{
+ unsigned long ptr;
+ ptr = offsetof(struct btrfs_node, ptrs) +
+ sizeof(struct btrfs_key_ptr) * nr;
+ return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr);
+}
+
+static inline void btrfs_set_node_blockptr(const struct extent_buffer *eb,
+ int nr, u64 val)
+{
+ unsigned long ptr;
+ ptr = offsetof(struct btrfs_node, ptrs) +
+ sizeof(struct btrfs_key_ptr) * nr;
+ btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val);
+}
+
+static inline u64 btrfs_node_ptr_generation(const struct extent_buffer *eb, int nr)
+{
+ unsigned long ptr;
+ ptr = offsetof(struct btrfs_node, ptrs) +
+ sizeof(struct btrfs_key_ptr) * nr;
+ return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr);
+}
+
+static inline void btrfs_set_node_ptr_generation(const struct extent_buffer *eb,
+ int nr, u64 val)
+{
+ unsigned long ptr;
+ ptr = offsetof(struct btrfs_node, ptrs) +
+ sizeof(struct btrfs_key_ptr) * nr;
+ btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val);
+}
+
+static inline unsigned long btrfs_node_key_ptr_offset(int nr)
+{
+ return offsetof(struct btrfs_node, ptrs) +
+ sizeof(struct btrfs_key_ptr) * nr;
+}
+
+void btrfs_node_key(const struct extent_buffer *eb,
+ struct btrfs_disk_key *disk_key, int nr);
+
+static inline void btrfs_set_node_key(const struct extent_buffer *eb,
+ struct btrfs_disk_key *disk_key, int nr)
+{
+ unsigned long ptr;
+ ptr = btrfs_node_key_ptr_offset(nr);
+ write_eb_member(eb, (struct btrfs_key_ptr *)ptr,
+ struct btrfs_key_ptr, key, disk_key);
+}
+
+/* struct btrfs_item */
+BTRFS_SETGET_FUNCS(raw_item_offset, struct btrfs_item, offset, 32);
+BTRFS_SETGET_FUNCS(raw_item_size, struct btrfs_item, size, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32);
+
+static inline unsigned long btrfs_item_nr_offset(int nr)
+{
+ return offsetof(struct btrfs_leaf, items) +
+ sizeof(struct btrfs_item) * nr;
+}
+
+static inline struct btrfs_item *btrfs_item_nr(int nr)
+{
+ return (struct btrfs_item *)btrfs_item_nr_offset(nr);
+}
+
+#define BTRFS_ITEM_SETGET_FUNCS(member) \
+static inline u32 btrfs_item_##member(const struct extent_buffer *eb, \
+ int slot) \
+{ \
+ return btrfs_raw_item_##member(eb, btrfs_item_nr(slot)); \
+} \
+static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \
+ int slot, u32 val) \
+{ \
+ btrfs_set_raw_item_##member(eb, btrfs_item_nr(slot), val); \
+} \
+static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \
+ int slot) \
+{ \
+ struct btrfs_item *item = btrfs_item_nr(slot); \
+ return btrfs_token_raw_item_##member(token, item); \
+} \
+static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \
+ int slot, u32 val) \
+{ \
+ struct btrfs_item *item = btrfs_item_nr(slot); \
+ btrfs_set_token_raw_item_##member(token, item, val); \
+}
+
+BTRFS_ITEM_SETGET_FUNCS(offset)
+BTRFS_ITEM_SETGET_FUNCS(size);
+
+static inline u32 btrfs_item_data_end(const struct extent_buffer *eb, int nr)
+{
+ return btrfs_item_offset(eb, nr) + btrfs_item_size(eb, nr);
+}
+
+static inline void btrfs_item_key(const struct extent_buffer *eb,
+ struct btrfs_disk_key *disk_key, int nr)
+{
+ struct btrfs_item *item = btrfs_item_nr(nr);
+ read_eb_member(eb, item, struct btrfs_item, key, disk_key);
+}
+
+static inline void btrfs_set_item_key(struct extent_buffer *eb,
+ struct btrfs_disk_key *disk_key, int nr)
+{
+ struct btrfs_item *item = btrfs_item_nr(nr);
+ write_eb_member(eb, item, struct btrfs_item, key, disk_key);
+}
+
+BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64);
+
+/*
+ * struct btrfs_root_ref
+ */
+BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64);
+BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64);
+BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16);
+
+/* struct btrfs_dir_item */
+BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16);
+BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8);
+BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16);
+BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dir_type, struct btrfs_dir_item, type, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_dir_data_len, struct btrfs_dir_item,
+ data_len, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item,
+ name_len, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item,
+ transid, 64);
+
+static inline void btrfs_dir_item_key(const struct extent_buffer *eb,
+ const struct btrfs_dir_item *item,
+ struct btrfs_disk_key *key)
+{
+ read_eb_member(eb, item, struct btrfs_dir_item, location, key);
+}
+
+static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
+ struct btrfs_dir_item *item,
+ const struct btrfs_disk_key *key)
+{
+ write_eb_member(eb, item, struct btrfs_dir_item, location, key);
+}
+
+BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header,
+ num_entries, 64);
+BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header,
+ num_bitmaps, 64);
+BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header,
+ generation, 64);
+
+static inline void btrfs_free_space_key(const struct extent_buffer *eb,
+ const struct btrfs_free_space_header *h,
+ struct btrfs_disk_key *key)
+{
+ read_eb_member(eb, h, struct btrfs_free_space_header, location, key);
+}
+
+static inline void btrfs_set_free_space_key(struct extent_buffer *eb,
+ struct btrfs_free_space_header *h,
+ const struct btrfs_disk_key *key)
+{
+ write_eb_member(eb, h, struct btrfs_free_space_header, location, key);
+}
+
+/* struct btrfs_disk_key */
+BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key,
+ objectid, 64);
+BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64);
+BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8);
+
+#ifdef __LITTLE_ENDIAN
+
+/*
+ * Optimized helpers for little-endian architectures where CPU and on-disk
+ * structures have the same endianness and we can skip conversions.
+ */
+
+static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu_key,
+ const struct btrfs_disk_key *disk_key)
+{
+ memcpy(cpu_key, disk_key, sizeof(struct btrfs_key));
+}
+
+static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk_key,
+ const struct btrfs_key *cpu_key)
+{
+ memcpy(disk_key, cpu_key, sizeof(struct btrfs_key));
+}
+
+static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb,
+ struct btrfs_key *cpu_key, int nr)
+{
+ struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key;
+
+ btrfs_node_key(eb, disk_key, nr);
+}
+
+static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb,
+ struct btrfs_key *cpu_key, int nr)
+{
+ struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key;
+
+ btrfs_item_key(eb, disk_key, nr);
+}
+
+static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb,
+ const struct btrfs_dir_item *item,
+ struct btrfs_key *cpu_key)
+{
+ struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key;
+
+ btrfs_dir_item_key(eb, item, disk_key);
+}
+
+#else
+
+static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu,
+ const struct btrfs_disk_key *disk)
+{
+ cpu->offset = le64_to_cpu(disk->offset);
+ cpu->type = disk->type;
+ cpu->objectid = le64_to_cpu(disk->objectid);
+}
+
+static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk,
+ const struct btrfs_key *cpu)
+{
+ disk->offset = cpu_to_le64(cpu->offset);
+ disk->type = cpu->type;
+ disk->objectid = cpu_to_le64(cpu->objectid);
+}
+
+static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb,
+ struct btrfs_key *key, int nr)
+{
+ struct btrfs_disk_key disk_key;
+ btrfs_node_key(eb, &disk_key, nr);
+ btrfs_disk_key_to_cpu(key, &disk_key);
+}
+
+static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb,
+ struct btrfs_key *key, int nr)
+{
+ struct btrfs_disk_key disk_key;
+ btrfs_item_key(eb, &disk_key, nr);
+ btrfs_disk_key_to_cpu(key, &disk_key);
+}
+
+static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb,
+ const struct btrfs_dir_item *item,
+ struct btrfs_key *key)
+{
+ struct btrfs_disk_key disk_key;
+ btrfs_dir_item_key(eb, item, &disk_key);
+ btrfs_disk_key_to_cpu(key, &disk_key);
+}
+
+#endif
+
+/* struct btrfs_header */
+BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header,
+ generation, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32);
+BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_header_generation, struct btrfs_header,
+ generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_header_owner, struct btrfs_header, owner, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header,
+ nritems, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64);
+
+static inline int btrfs_header_flag(const struct extent_buffer *eb, u64 flag)
+{
+ return (btrfs_header_flags(eb) & flag) == flag;
+}
+
+static inline void btrfs_set_header_flag(struct extent_buffer *eb, u64 flag)
+{
+ u64 flags = btrfs_header_flags(eb);
+ btrfs_set_header_flags(eb, flags | flag);
+}
+
+static inline void btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
+{
+ u64 flags = btrfs_header_flags(eb);
+ btrfs_set_header_flags(eb, flags & ~flag);
+}
+
+static inline int btrfs_header_backref_rev(const struct extent_buffer *eb)
+{
+ u64 flags = btrfs_header_flags(eb);
+ return flags >> BTRFS_BACKREF_REV_SHIFT;
+}
+
+static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb,
+ int rev)
+{
+ u64 flags = btrfs_header_flags(eb);
+ flags &= ~BTRFS_BACKREF_REV_MASK;
+ flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT;
+ btrfs_set_header_flags(eb, flags);
+}
+
+static inline int btrfs_is_leaf(const struct extent_buffer *eb)
+{
+ return btrfs_header_level(eb) == 0;
+}
+
+/* struct btrfs_root_item */
+BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item,
+ generation, 64);
+BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32);
+BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64);
+BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item,
+ generation, 64);
+BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64);
+BTRFS_SETGET_STACK_FUNCS(root_drop_level, struct btrfs_root_item, drop_level, 8);
+BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8);
+BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32);
+BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
+BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
+ last_snapshot, 64);
+BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item,
+ generation_v2, 64);
+BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item,
+ ctransid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item,
+ otransid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item,
+ stransid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item,
+ rtransid, 64);
+
+static inline bool btrfs_root_readonly(const struct btrfs_root *root)
+{
+ /* Byte-swap the constant at compile time, root_item::flags is LE */
+ return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0;
+}
+
+static inline bool btrfs_root_dead(const struct btrfs_root *root)
+{
+ /* Byte-swap the constant at compile time, root_item::flags is LE */
+ return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0;
+}
+
+static inline u64 btrfs_root_id(const struct btrfs_root *root)
+{
+ return root->root_key.objectid;
+}
+
+/* struct btrfs_root_backup */
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
+ tree_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup,
+ tree_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup,
+ tree_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup,
+ chunk_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup,
+ chunk_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup,
+ chunk_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup,
+ extent_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup,
+ extent_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup,
+ extent_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup,
+ fs_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup,
+ fs_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup,
+ fs_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup,
+ dev_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup,
+ dev_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup,
+ dev_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup,
+ csum_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup,
+ csum_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup,
+ csum_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup,
+ total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
+ bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
+ num_devices, 64);
+
+/* struct btrfs_balance_item */
+BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
+
+static inline void btrfs_balance_data(const struct extent_buffer *eb,
+ const struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ read_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_set_balance_data(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ const struct btrfs_disk_balance_args *ba)
+{
+ write_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_balance_meta(const struct extent_buffer *eb,
+ const struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
+
+static inline void btrfs_set_balance_meta(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ const struct btrfs_disk_balance_args *ba)
+{
+ write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
+
+static inline void btrfs_balance_sys(const struct extent_buffer *eb,
+ const struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
+
+static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ const struct btrfs_disk_balance_args *ba)
+{
+ write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
+
+static inline void
+btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
+ const struct btrfs_disk_balance_args *disk)
+{
+ memset(cpu, 0, sizeof(*cpu));
+
+ cpu->profiles = le64_to_cpu(disk->profiles);
+ cpu->usage = le64_to_cpu(disk->usage);
+ cpu->devid = le64_to_cpu(disk->devid);
+ cpu->pstart = le64_to_cpu(disk->pstart);
+ cpu->pend = le64_to_cpu(disk->pend);
+ cpu->vstart = le64_to_cpu(disk->vstart);
+ cpu->vend = le64_to_cpu(disk->vend);
+ cpu->target = le64_to_cpu(disk->target);
+ cpu->flags = le64_to_cpu(disk->flags);
+ cpu->limit = le64_to_cpu(disk->limit);
+ cpu->stripes_min = le32_to_cpu(disk->stripes_min);
+ cpu->stripes_max = le32_to_cpu(disk->stripes_max);
+}
+
+static inline void
+btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
+ const struct btrfs_balance_args *cpu)
+{
+ memset(disk, 0, sizeof(*disk));
+
+ disk->profiles = cpu_to_le64(cpu->profiles);
+ disk->usage = cpu_to_le64(cpu->usage);
+ disk->devid = cpu_to_le64(cpu->devid);
+ disk->pstart = cpu_to_le64(cpu->pstart);
+ disk->pend = cpu_to_le64(cpu->pend);
+ disk->vstart = cpu_to_le64(cpu->vstart);
+ disk->vend = cpu_to_le64(cpu->vend);
+ disk->target = cpu_to_le64(cpu->target);
+ disk->flags = cpu_to_le64(cpu->flags);
+ disk->limit = cpu_to_le64(cpu->limit);
+ disk->stripes_min = cpu_to_le32(cpu->stripes_min);
+ disk->stripes_max = cpu_to_le32(cpu->stripes_max);
+}
+
+/* struct btrfs_super_block */
+BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
+BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
+ generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_sys_array_size,
+ struct btrfs_super_block, sys_chunk_array_size, 32);
+BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation,
+ struct btrfs_super_block, chunk_root_generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block,
+ root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block,
+ chunk_root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block,
+ chunk_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block,
+ log_root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block,
+ log_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
+ total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block,
+ bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block,
+ sectorsize, 32);
+BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block,
+ nodesize, 32);
+BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block,
+ stripesize, 32);
+BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block,
+ root_dir_objectid, 64);
+BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
+ num_devices, 64);
+BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block,
+ compat_flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block,
+ compat_ro_flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
+ incompat_flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
+ csum_type, 16);
+BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block,
+ cache_generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64);
+BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block,
+ uuid_tree_generation, 64);
+
+int btrfs_super_csum_size(const struct btrfs_super_block *s);
+const char *btrfs_super_csum_name(u16 csum_type);
+const char *btrfs_super_csum_driver(u16 csum_type);
+size_t __attribute_const__ btrfs_get_num_csums(void);
+
+
+/*
+ * The leaf data grows from end-to-front in the node.
+ * this returns the address of the start of the last item,
+ * which is the stop of the leaf data stack
+ */
+static inline unsigned int leaf_data_end(const struct extent_buffer *leaf)
+{
+ u32 nr = btrfs_header_nritems(leaf);
+
+ if (nr == 0)
+ return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
+ return btrfs_item_offset(leaf, nr - 1);
+}
+
+/* struct btrfs_file_extent_item */
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item,
+ type, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr,
+ struct btrfs_file_extent_item, disk_bytenr, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset,
+ struct btrfs_file_extent_item, offset, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation,
+ struct btrfs_file_extent_item, generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes,
+ struct btrfs_file_extent_item, num_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_ram_bytes,
+ struct btrfs_file_extent_item, ram_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes,
+ struct btrfs_file_extent_item, disk_num_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression,
+ struct btrfs_file_extent_item, compression, 8);
+
+static inline unsigned long
+btrfs_file_extent_inline_start(const struct btrfs_file_extent_item *e)
+{
+ return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START;
+}
+
+static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
+{
+ return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize;
+}
+
+BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
+BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
+ disk_bytenr, 64);
+BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
+ generation, 64);
+BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item,
+ disk_num_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item,
+ offset, 64);
+BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item,
+ num_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item,
+ ram_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item,
+ compression, 8);
+BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
+ encryption, 8);
+BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
+ other_encoding, 16);
+
+/*
+ * this returns the number of bytes used by the item on disk, minus the
+ * size of any extent headers. If a file is compressed on disk, this is
+ * the compressed size
+ */
+static inline u32 btrfs_file_extent_inline_item_len(
+ const struct extent_buffer *eb,
+ int nr)
+{
+ return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
+}
+
+/* btrfs_qgroup_status_item */
+BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item,
+ generation, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item,
+ version, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item,
+ flags, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item,
+ rescan, 64);
+
+/* btrfs_qgroup_info_item */
+BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item,
+ generation, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item,
+ rfer_cmpr, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item,
+ excl_cmpr, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation,
+ struct btrfs_qgroup_info_item, generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item,
+ rfer, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr,
+ struct btrfs_qgroup_info_item, rfer_cmpr, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item,
+ excl, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr,
+ struct btrfs_qgroup_info_item, excl_cmpr, 64);
+
+/* btrfs_qgroup_limit_item */
+BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item,
+ flags, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item,
+ max_rfer, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item,
+ max_excl, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item,
+ rsv_rfer, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item,
+ rsv_excl, 64);
+
+/* btrfs_dev_replace_item */
+BTRFS_SETGET_FUNCS(dev_replace_src_devid,
+ struct btrfs_dev_replace_item, src_devid, 64);
+BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode,
+ struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode,
+ 64);
+BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item,
+ replace_state, 64);
+BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item,
+ time_started, 64);
+BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item,
+ time_stopped, 64);
+BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item,
+ num_write_errors, 64);
+BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors,
+ struct btrfs_dev_replace_item, num_uncorrectable_read_errors,
+ 64);
+BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item,
+ cursor_left, 64);
+BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item,
+ cursor_right, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid,
+ struct btrfs_dev_replace_item, src_devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode,
+ struct btrfs_dev_replace_item,
+ cont_reading_from_srcdev_mode, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state,
+ struct btrfs_dev_replace_item, replace_state, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started,
+ struct btrfs_dev_replace_item, time_started, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped,
+ struct btrfs_dev_replace_item, time_stopped, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors,
+ struct btrfs_dev_replace_item, num_write_errors, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors,
+ struct btrfs_dev_replace_item,
+ num_uncorrectable_read_errors, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left,
+ struct btrfs_dev_replace_item, cursor_left, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
+ struct btrfs_dev_replace_item, cursor_right, 64);
+
+/* helper function to cast into the data area of the leaf. */
+#define btrfs_item_ptr(leaf, slot, type) \
+ ((type *)(BTRFS_LEAF_DATA_OFFSET + \
+ btrfs_item_offset(leaf, slot)))
+
+#define btrfs_item_ptr_offset(leaf, slot) \
+ ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \
+ btrfs_item_offset(leaf, slot)))
+
+static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length)
+{
+ return crc32c(crc, address, length);
+}
+
+static inline void btrfs_crc32c_final(u32 crc, u8 *result)
+{
+ put_unaligned_le32(~crc, result);
+}
+
+static inline u64 btrfs_name_hash(const char *name, int len)
+{
+ return crc32c((u32)~1, name, len);
+}
+
+/*
+ * Figure the key offset of an extended inode ref
+ */
+static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name,
+ int len)
+{
+ return (u64) crc32c(parent_objectid, name, len);
+}
+
+static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
+{
+ return mapping_gfp_constraint(mapping, ~__GFP_FS);
+}
+
+/* extent-tree.c */
+
+enum btrfs_inline_ref_type {
+ BTRFS_REF_TYPE_INVALID,
+ BTRFS_REF_TYPE_BLOCK,
+ BTRFS_REF_TYPE_DATA,
+ BTRFS_REF_TYPE_ANY,
+};
+
+int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
+ struct btrfs_extent_inline_ref *iref,
+ enum btrfs_inline_ref_type is_data);
+u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset);
+
+
+int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
+ u64 start, u64 num_bytes);
+void btrfs_free_excluded_extents(struct btrfs_block_group *cache);
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+ unsigned long count);
+void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head);
+int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len);
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 offset, int metadata, u64 *refs, u64 *flags);
+int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num,
+ int reserved);
+int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes);
+int btrfs_exclude_logged_extents(struct extent_buffer *eb);
+int btrfs_cross_ref_exist(struct btrfs_root *root,
+ u64 objectid, u64 offset, u64 bytenr, bool strict,
+ struct btrfs_path *path);
+struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 parent, u64 root_objectid,
+ const struct btrfs_disk_key *key,
+ int level, u64 hint,
+ u64 empty_size,
+ enum btrfs_lock_nesting nest);
+void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+ u64 root_id,
+ struct extent_buffer *buf,
+ u64 parent, int last_ref);
+int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 owner,
+ u64 offset, u64 ram_bytes,
+ struct btrfs_key *ins);
+int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
+ u64 root_objectid, u64 owner, u64 offset,
+ struct btrfs_key *ins);
+int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes,
+ u64 min_alloc_size, u64 empty_size, u64 hint_byte,
+ struct btrfs_key *ins, int is_data, int delalloc);
+int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct extent_buffer *buf, int full_backref);
+int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct extent_buffer *buf, int full_backref);
+int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
+ struct extent_buffer *eb, u64 flags, int level);
+int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);
+
+int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
+ u64 start, u64 len, int delalloc);
+int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
+ u64 len);
+int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
+int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_ref *generic_ref);
+
+void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
+
+/*
+ * Different levels for to flush space when doing space reservations.
+ *
+ * The higher the level, the more methods we try to reclaim space.
+ */
+enum btrfs_reserve_flush_enum {
+ /* If we are in the transaction, we can't flush anything.*/
+ BTRFS_RESERVE_NO_FLUSH,
+
+ /*
+ * Flush space by:
+ * - Running delayed inode items
+ * - Allocating a new chunk
+ */
+ BTRFS_RESERVE_FLUSH_LIMIT,
+
+ /*
+ * Flush space by:
+ * - Running delayed inode items
+ * - Running delayed refs
+ * - Running delalloc and waiting for ordered extents
+ * - Allocating a new chunk
+ */
+ BTRFS_RESERVE_FLUSH_EVICT,
+
+ /*
+ * Flush space by above mentioned methods and by:
+ * - Running delayed iputs
+ * - Committing transaction
+ *
+ * Can be interrupted by a fatal signal.
+ */
+ BTRFS_RESERVE_FLUSH_DATA,
+ BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE,
+ BTRFS_RESERVE_FLUSH_ALL,
+
+ /*
+ * Pretty much the same as FLUSH_ALL, but can also steal space from
+ * global rsv.
+ *
+ * Can be interrupted by a fatal signal.
+ */
+ BTRFS_RESERVE_FLUSH_ALL_STEAL,
+};
+
+enum btrfs_flush_state {
+ FLUSH_DELAYED_ITEMS_NR = 1,
+ FLUSH_DELAYED_ITEMS = 2,
+ FLUSH_DELAYED_REFS_NR = 3,
+ FLUSH_DELAYED_REFS = 4,
+ FLUSH_DELALLOC = 5,
+ FLUSH_DELALLOC_WAIT = 6,
+ FLUSH_DELALLOC_FULL = 7,
+ ALLOC_CHUNK = 8,
+ ALLOC_CHUNK_FORCE = 9,
+ RUN_DELAYED_IPUTS = 10,
+ COMMIT_TRANS = 11,
+};
+
+int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv,
+ int nitems, bool use_global_rsv);
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv);
+void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
+
+int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
+ u64 disk_num_bytes, bool noflush);
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
+int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
+ u64 start, u64 end);
+int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 num_bytes, u64 *actual_bytes);
+int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
+
+int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
+int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int btrfs_start_write_no_snapshotting(struct btrfs_root *root);
+void btrfs_end_write_no_snapshotting(struct btrfs_root *root);
+void btrfs_wait_for_snapshot_creation(struct btrfs_root *root);
+
+/* ctree.c */
+int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
+ int *slot);
+int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2);
+int btrfs_previous_item(struct btrfs_root *root,
+ struct btrfs_path *path, u64 min_objectid,
+ int type);
+int btrfs_previous_extent_item(struct btrfs_root *root,
+ struct btrfs_path *path, u64 min_objectid);
+void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ const struct btrfs_key *new_key);
+struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
+int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *key, int lowest_level,
+ u64 min_trans);
+int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
+ struct btrfs_path *path,
+ u64 min_trans);
+struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
+ int slot);
+
+int btrfs_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *buf,
+ struct extent_buffer *parent, int parent_slot,
+ struct extent_buffer **cow_ret,
+ enum btrfs_lock_nesting nest);
+int btrfs_copy_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ struct extent_buffer **cow_ret, u64 new_root_objectid);
+int btrfs_block_can_be_shared(struct btrfs_root *root,
+ struct extent_buffer *buf);
+void btrfs_extend_item(struct btrfs_path *path, u32 data_size);
+void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end);
+int btrfs_split_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct btrfs_key *new_key,
+ unsigned long split_offset);
+int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct btrfs_key *new_key);
+int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
+ u64 inum, u64 ioff, u8 key_type, struct btrfs_key *found_key);
+int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ const struct btrfs_key *key, struct btrfs_path *p,
+ int ins_len, int cow);
+int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
+ struct btrfs_path *p, u64 time_seq);
+int btrfs_search_slot_for_read(struct btrfs_root *root,
+ const struct btrfs_key *key,
+ struct btrfs_path *p, int find_higher,
+ int return_any);
+int btrfs_realloc_node(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *parent,
+ int start_slot, u64 *last_ret,
+ struct btrfs_key *progress);
+void btrfs_release_path(struct btrfs_path *p);
+struct btrfs_path *btrfs_alloc_path(void);
+void btrfs_free_path(struct btrfs_path *p);
+
+int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_path *path, int slot, int nr);
+static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path)
+{
+ return btrfs_del_items(trans, root, path, path->slots[0], 1);
+}
+
+/*
+ * Describes a batch of items to insert in a btree. This is used by
+ * btrfs_insert_empty_items().
+ */
+struct btrfs_item_batch {
+ /*
+ * Pointer to an array containing the keys of the items to insert (in
+ * sorted order).
+ */
+ const struct btrfs_key *keys;
+ /* Pointer to an array containing the data size for each item to insert. */
+ const u32 *data_sizes;
+ /*
+ * The sum of data sizes for all items. The caller can compute this while
+ * setting up the data_sizes array, so it ends up being more efficient
+ * than having btrfs_insert_empty_items() or setup_item_for_insert()
+ * doing it, as it would avoid an extra loop over a potentially large
+ * array, and in the case of setup_item_for_insert(), we would be doing
+ * it while holding a write lock on a leaf and often on upper level nodes
+ * too, unnecessarily increasing the size of a critical section.
+ */
+ u32 total_data_size;
+ /* Size of the keys and data_sizes arrays (number of items in the batch). */
+ int nr;
+};
+
+void btrfs_setup_item_for_insert(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct btrfs_key *key,
+ u32 data_size);
+int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ const struct btrfs_key *key, void *data, u32 data_size);
+int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct btrfs_item_batch *batch);
+
+static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct btrfs_key *key,
+ u32 data_size)
+{
+ struct btrfs_item_batch batch;
+
+ batch.keys = key;
+ batch.data_sizes = &data_size;
+ batch.total_data_size = data_size;
+ batch.nr = 1;
+
+ return btrfs_insert_empty_items(trans, root, path, &batch);
+}
+
+int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
+int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
+ u64 time_seq);
+
+int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
+ struct btrfs_path *path);
+
+int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
+ struct btrfs_path *path);
+
+/*
+ * Search in @root for a given @key, and store the slot found in @found_key.
+ *
+ * @root: The root node of the tree.
+ * @key: The key we are looking for.
+ * @found_key: Will hold the found item.
+ * @path: Holds the current slot/leaf.
+ * @iter_ret: Contains the value returned from btrfs_search_slot or
+ * btrfs_get_next_valid_item, whichever was executed last.
+ *
+ * The @iter_ret is an output variable that will contain the return value of
+ * btrfs_search_slot, if it encountered an error, or the value returned from
+ * btrfs_get_next_valid_item otherwise. That return value can be 0, if a valid
+ * slot was found, 1 if there were no more leaves, and <0 if there was an error.
+ *
+ * It's recommended to use a separate variable for iter_ret and then use it to
+ * set the function return value so there's no confusion of the 0/1/errno
+ * values stemming from btrfs_search_slot.
+ */
+#define btrfs_for_each_slot(root, key, found_key, path, iter_ret) \
+ for (iter_ret = btrfs_search_slot(NULL, (root), (key), (path), 0, 0); \
+ (iter_ret) >= 0 && \
+ (iter_ret = btrfs_get_next_valid_item((root), (found_key), (path))) == 0; \
+ (path)->slots[0]++ \
+ )
+
+static inline int btrfs_next_old_item(struct btrfs_root *root,
+ struct btrfs_path *p, u64 time_seq)
+{
+ ++p->slots[0];
+ if (p->slots[0] >= btrfs_header_nritems(p->nodes[0]))
+ return btrfs_next_old_leaf(root, p, time_seq);
+ return 0;
+}
+
+/*
+ * Search the tree again to find a leaf with greater keys.
+ *
+ * Returns 0 if it found something or 1 if there are no greater leaves.
+ * Returns < 0 on error.
+ */
+static inline int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
+{
+ return btrfs_next_old_leaf(root, path, 0);
+}
+
+static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
+{
+ return btrfs_next_old_item(root, p, 0);
+}
+int btrfs_leaf_free_space(struct extent_buffer *leaf);
+int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref,
+ int for_reloc);
+int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *node,
+ struct extent_buffer *parent);
+static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
+{
+ /*
+ * Do it this way so we only ever do one test_bit in the normal case.
+ */
+ if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)) {
+ if (test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags))
+ return 2;
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * If we remount the fs to be R/O or umount the fs, the cleaner needn't do
+ * anything except sleeping. This function is used to check the status of
+ * the fs.
+ * We check for BTRFS_FS_STATE_RO to avoid races with a concurrent remount,
+ * since setting and checking for SB_RDONLY in the superblock's flags is not
+ * atomic.
+ */
+static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info)
+{
+ return test_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state) ||
+ btrfs_fs_closing(fs_info);
+}
+
+static inline void btrfs_set_sb_rdonly(struct super_block *sb)
+{
+ sb->s_flags |= SB_RDONLY;
+ set_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state);
+}
+
+static inline void btrfs_clear_sb_rdonly(struct super_block *sb)
+{
+ sb->s_flags &= ~SB_RDONLY;
+ clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state);
+}
+
+/* root-item.c */
+int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
+ u64 ref_id, u64 dirid, u64 sequence,
+ const struct fscrypt_str *name);
+int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
+ u64 ref_id, u64 dirid, u64 *sequence,
+ const struct fscrypt_str *name);
+int btrfs_del_root(struct btrfs_trans_handle *trans,
+ const struct btrfs_key *key);
+int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ const struct btrfs_key *key,
+ struct btrfs_root_item *item);
+int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_key *key,
+ struct btrfs_root_item *item);
+int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key,
+ struct btrfs_path *path, struct btrfs_root_item *root_item,
+ struct btrfs_key *root_key);
+int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info);
+void btrfs_set_root_node(struct btrfs_root_item *item,
+ struct extent_buffer *node);
+void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
+void btrfs_update_root_times(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+
+/* uuid-tree.c */
+int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
+ u64 subid);
+int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
+ u64 subid);
+int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info);
+
+/* dir-item.c */
+int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
+ const struct fscrypt_str *name);
+int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
+ const struct fscrypt_str *name, struct btrfs_inode *dir,
+ struct btrfs_key *location, u8 type, u64 index);
+struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 dir,
+ const struct fscrypt_str *name, int mod);
+struct btrfs_dir_item *
+btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 dir,
+ u64 index, const struct fscrypt_str *name, int mod);
+struct btrfs_dir_item *
+btrfs_search_dir_index_item(struct btrfs_root *root,
+ struct btrfs_path *path, u64 dirid,
+ const struct fscrypt_str *name);
+int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_dir_item *di);
+int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid,
+ const char *name, u16 name_len,
+ const void *data, u16 data_len);
+struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 dir,
+ const char *name, u16 name_len,
+ int mod);
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ const char *name,
+ int name_len);
+
+/* orphan.c */
+int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 offset);
+int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 offset);
+int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
+
+/* file-item.c */
+int btrfs_del_csums(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr, u64 len);
+blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst);
+int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 objectid, u64 pos,
+ u64 num_bytes);
+int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid,
+ u64 bytenr, int mod);
+int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_ordered_sum *sums);
+blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
+ u64 offset, bool one_ordered);
+int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
+ struct list_head *list, int search_commit,
+ bool nowait);
+void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
+ const struct btrfs_path *path,
+ struct btrfs_file_extent_item *fi,
+ const bool new_inline,
+ struct extent_map *em);
+int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
+ u64 len);
+int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
+ u64 len);
+void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size);
+u64 btrfs_file_extent_end(const struct btrfs_path *path);
+
+/* inode.c */
+void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num);
+void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, enum btrfs_compression_type compress_type);
+int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
+ u32 pgoff, u8 *csum, const u8 * const csum_expected);
+int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
+ u32 bio_offset, struct page *page, u32 pgoff);
+unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
+ u32 bio_offset, struct page *page,
+ u64 start, u64 end);
+int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
+ u32 bio_offset, struct page *page, u32 pgoff);
+noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
+ u64 *orig_start, u64 *orig_block_len,
+ u64 *ram_bytes, bool nowait, bool strict);
+
+void __btrfs_del_delalloc_inode(struct btrfs_root *root,
+ struct btrfs_inode *inode);
+struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
+int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index);
+int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *dir, struct btrfs_inode *inode,
+ const struct fscrypt_str *name);
+int btrfs_add_link(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
+ const struct fscrypt_str *name, int add_backref, u64 index);
+int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry);
+int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
+ int front);
+
+int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context);
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
+ bool in_reclaim_context);
+int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
+ unsigned int extra_bits,
+ struct extent_state **cached_state);
+struct btrfs_new_inode_args {
+ /* Input */
+ struct inode *dir;
+ struct dentry *dentry;
+ struct inode *inode;
+ bool orphan;
+ bool subvol;
+
+ /*
+ * Output from btrfs_new_inode_prepare(), input to
+ * btrfs_create_new_inode().
+ */
+ struct posix_acl *default_acl;
+ struct posix_acl *acl;
+ struct fscrypt_name fname;
+};
+int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
+ unsigned int *trans_num_items);
+int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_new_inode_args *args);
+void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args);
+struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns,
+ struct inode *dir);
+ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
+ u32 bits);
+void btrfs_clear_delalloc_extent(struct inode *inode,
+ struct extent_state *state, u32 bits);
+void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
+ struct extent_state *other);
+void btrfs_split_delalloc_extent(struct inode *inode,
+ struct extent_state *orig, u64 split);
+void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
+vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
+void btrfs_evict_inode(struct inode *inode);
+struct inode *btrfs_alloc_inode(struct super_block *sb);
+void btrfs_destroy_inode(struct inode *inode);
+void btrfs_free_inode(struct inode *inode);
+int btrfs_drop_inode(struct inode *inode);
+int __init btrfs_init_cachep(void);
+void __cold btrfs_destroy_cachep(void);
+struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
+ struct btrfs_root *root, struct btrfs_path *path);
+struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root);
+struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
+ struct page *page, size_t pg_offset,
+ u64 start, u64 end);
+int btrfs_update_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_inode *inode);
+int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_inode *inode);
+int btrfs_orphan_add(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode);
+int btrfs_orphan_cleanup(struct btrfs_root *root);
+int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size);
+void btrfs_add_delayed_iput(struct inode *inode);
+void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info);
+int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info);
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
+ u64 start, u64 num_bytes, u64 min_size,
+ loff_t actual_len, u64 *alloc_hint);
+int btrfs_prealloc_file_range_trans(struct inode *inode,
+ struct btrfs_trans_handle *trans, int mode,
+ u64 start, u64 num_bytes, u64 min_size,
+ loff_t actual_len, u64 *alloc_hint);
+int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
+ u64 start, u64 end, int *page_started, unsigned long *nr_written,
+ struct writeback_control *wbc);
+int btrfs_writepage_cow_fixup(struct page *page);
+void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
+ struct page *page, u64 start,
+ u64 end, bool uptodate);
+int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
+ int compress_type);
+int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
+ u64 file_offset, u64 disk_bytenr,
+ u64 disk_io_size,
+ struct page **pages);
+ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
+ struct btrfs_ioctl_encoded_io_args *encoded);
+ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
+ const struct btrfs_ioctl_encoded_io_args *encoded);
+
+ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
+ size_t done_before);
+struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
+ size_t done_before);
+
+extern const struct dentry_operations btrfs_dentry_operations;
+
+/* Inode locking type flags, by default the exclusive lock is taken */
+#define BTRFS_ILOCK_SHARED (1U << 0)
+#define BTRFS_ILOCK_TRY (1U << 1)
+#define BTRFS_ILOCK_MMAP (1U << 2)
+
+int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags);
+void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags);
+void btrfs_update_inode_bytes(struct btrfs_inode *inode,
+ const u64 add_bytes,
+ const u64 del_bytes);
+void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end);
+
+/* ioctl.c */
+long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int btrfs_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa);
+int btrfs_ioctl_get_supported_features(void __user *arg);
+void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
+int __pure btrfs_is_empty_uuid(u8 *uuid);
+int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
+ struct btrfs_ioctl_defrag_range_args *range,
+ u64 newer_than, unsigned long max_to_defrag);
+void btrfs_get_block_group_info(struct list_head *groups_list,
+ struct btrfs_ioctl_space_info *space);
+void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_balance_args *bargs);
+
+/* file.c */
+int __init btrfs_auto_defrag_init(void);
+void __cold btrfs_auto_defrag_exit(void);
+int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode, u32 extent_thresh);
+int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
+void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
+int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
+extern const struct file_operations btrfs_file_operations;
+int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_drop_extents_args *args);
+int btrfs_replace_file_extents(struct btrfs_inode *inode,
+ struct btrfs_path *path, const u64 start,
+ const u64 end,
+ struct btrfs_replace_extent_info *extent_info,
+ struct btrfs_trans_handle **trans_out);
+int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode, u64 start, u64 end);
+ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
+ const struct btrfs_ioctl_encoded_io_args *encoded);
+int btrfs_release_file(struct inode *inode, struct file *file);
+int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
+ size_t num_pages, loff_t pos, size_t write_bytes,
+ struct extent_state **cached, bool noreserve);
+int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
+int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
+ size_t *write_bytes, bool nowait);
+void btrfs_check_nocow_unlock(struct btrfs_inode *inode);
+bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
+ u64 *delalloc_start_ret, u64 *delalloc_end_ret);
+
+/* tree-defrag.c */
+int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+
+/* super.c */
+int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
+ unsigned long new_flags);
+int btrfs_sync_fs(struct super_block *sb, int wait);
+char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
+ u64 subvol_objectid);
+
+static inline __printf(2, 3) __cold
+void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
+{
+}
+
+#ifdef CONFIG_PRINTK_INDEX
+
+#define btrfs_printk(fs_info, fmt, args...) \
+do { \
+ printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); \
+ _btrfs_printk(fs_info, fmt, ##args); \
+} while (0)
+
+__printf(2, 3)
+__cold
+void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
+
+#elif defined(CONFIG_PRINTK)
+
+#define btrfs_printk(fs_info, fmt, args...) \
+ _btrfs_printk(fs_info, fmt, ##args)
+
+__printf(2, 3)
+__cold
+void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
+
+#else
+
+#define btrfs_printk(fs_info, fmt, args...) \
+ btrfs_no_printk(fs_info, fmt, ##args)
+#endif
+
+#define btrfs_emerg(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_INFO fmt, ##args)
+
+/*
+ * Wrappers that use printk_in_rcu
+ */
+#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args)
+
+/*
+ * Wrappers that use a ratelimited printk_in_rcu
+ */
+#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args)
+
+/*
+ * Wrappers that use a ratelimited printk
+ */
+#define btrfs_emerg_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args)
+
+#if defined(CONFIG_DYNAMIC_DEBUG)
+#define btrfs_debug(fs_info, fmt, args...) \
+ _dynamic_func_call_no_desc(fmt, btrfs_printk, \
+ fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
+ _dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \
+ fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
+ _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \
+ fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl(fs_info, fmt, args...) \
+ _dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited, \
+ fs_info, KERN_DEBUG fmt, ##args)
+#elif defined(DEBUG)
+#define btrfs_debug(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args)
+#else
+#define btrfs_debug(fs_info, fmt, args...) \
+ btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
+ btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl(fs_info, fmt, args...) \
+ btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args)
+#endif
+
+#define btrfs_printk_in_rcu(fs_info, fmt, args...) \
+do { \
+ rcu_read_lock(); \
+ btrfs_printk(fs_info, fmt, ##args); \
+ rcu_read_unlock(); \
+} while (0)
+
+#define btrfs_no_printk_in_rcu(fs_info, fmt, args...) \
+do { \
+ rcu_read_lock(); \
+ btrfs_no_printk(fs_info, fmt, ##args); \
+ rcu_read_unlock(); \
+} while (0)
+
+#define btrfs_printk_ratelimited(fs_info, fmt, args...) \
+do { \
+ static DEFINE_RATELIMIT_STATE(_rs, \
+ DEFAULT_RATELIMIT_INTERVAL, \
+ DEFAULT_RATELIMIT_BURST); \
+ if (__ratelimit(&_rs)) \
+ btrfs_printk(fs_info, fmt, ##args); \
+} while (0)
+
+#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \
+do { \
+ rcu_read_lock(); \
+ btrfs_printk_ratelimited(fs_info, fmt, ##args); \
+ rcu_read_unlock(); \
+} while (0)
+
+#ifdef CONFIG_BTRFS_ASSERT
+__cold __noreturn
+static inline void assertfail(const char *expr, const char *file, int line)
+{
+ pr_err("assertion failed: %s, in %s:%d\n", expr, file, line);
+ BUG();
+}
+
+#define ASSERT(expr) \
+ (likely(expr) ? (void)0 : assertfail(#expr, __FILE__, __LINE__))
+
+#else
+static inline void assertfail(const char *expr, const char* file, int line) { }
+#define ASSERT(expr) (void)(expr)
+#endif
+
+#if BITS_PER_LONG == 32
+#define BTRFS_32BIT_MAX_FILE_SIZE (((u64)ULONG_MAX + 1) << PAGE_SHIFT)
+/*
+ * The warning threshold is 5/8th of the MAX_LFS_FILESIZE that limits the logical
+ * addresses of extents.
+ *
+ * For 4K page size it's about 10T, for 64K it's 160T.
+ */
+#define BTRFS_32BIT_EARLY_WARN_THRESHOLD (BTRFS_32BIT_MAX_FILE_SIZE * 5 / 8)
+void btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info);
+void btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info);
+#endif
+
+/*
+ * Get the correct offset inside the page of extent buffer.
+ *
+ * @eb: target extent buffer
+ * @start: offset inside the extent buffer
+ *
+ * Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases.
+ */
+static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb,
+ unsigned long offset)
+{
+ /*
+ * For sectorsize == PAGE_SIZE case, eb->start will always be aligned
+ * to PAGE_SIZE, thus adding it won't cause any difference.
+ *
+ * For sectorsize < PAGE_SIZE, we must only read the data that belongs
+ * to the eb, thus we have to take the eb->start into consideration.
+ */
+ return offset_in_page(offset + eb->start);
+}
+
+static inline unsigned long get_eb_page_index(unsigned long offset)
+{
+ /*
+ * For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough.
+ *
+ * For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE,
+ * and have ensured that all tree blocks are contained in one page,
+ * thus we always get index == 0.
+ */
+ return offset >> PAGE_SHIFT;
+}
+
+/*
+ * Use that for functions that are conditionally exported for sanity tests but
+ * otherwise static
+ */
+#ifndef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+#define EXPORT_FOR_TESTS static
+#else
+#define EXPORT_FOR_TESTS
+#endif
+
+__cold
+static inline void btrfs_print_v0_err(struct btrfs_fs_info *fs_info)
+{
+ btrfs_err(fs_info,
+"Unsupported V0 extent filesystem detected. Aborting. Please re-create your filesystem with a newer kernel");
+}
+
+__printf(5, 6)
+__cold
+void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int errno, const char *fmt, ...);
+
+const char * __attribute_const__ btrfs_decode_error(int errno);
+
+__cold
+void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
+ const char *function,
+ unsigned int line, int errno, bool first_hit);
+
+bool __cold abort_should_print_stack(int errno);
+
+/*
+ * Call btrfs_abort_transaction as early as possible when an error condition is
+ * detected, that way the exact stack trace is reported for some errors.
+ */
+#define btrfs_abort_transaction(trans, errno) \
+do { \
+ bool first = false; \
+ /* Report first abort since mount */ \
+ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
+ &((trans)->fs_info->fs_state))) { \
+ first = true; \
+ if (WARN(abort_should_print_stack(errno), \
+ KERN_DEBUG \
+ "BTRFS: Transaction aborted (error %d)\n", \
+ (errno))) { \
+ /* Stack trace printed. */ \
+ } else { \
+ btrfs_debug((trans)->fs_info, \
+ "Transaction aborted (error %d)", \
+ (errno)); \
+ } \
+ } \
+ __btrfs_abort_transaction((trans), __func__, \
+ __LINE__, (errno), first); \
+} while (0)
+
+#ifdef CONFIG_PRINTK_INDEX
+
+#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \
+do { \
+ printk_index_subsys_emit( \
+ "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", \
+ KERN_CRIT, fmt); \
+ __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
+ (errno), fmt, ##args); \
+} while (0)
+
+#else
+
+#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \
+ __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
+ (errno), fmt, ##args)
+
+#endif
+
+#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \
+ &(fs_info)->fs_state)))
+#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \
+ (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \
+ &(fs_info)->fs_state)))
+
+__printf(5, 6)
+__cold
+void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int errno, const char *fmt, ...);
+/*
+ * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
+ * will panic(). Otherwise we BUG() here.
+ */
+#define btrfs_panic(fs_info, errno, fmt, args...) \
+do { \
+ __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \
+ BUG(); \
+} while (0)
+
+
+/* compatibility and incompatibility defines */
+
+#define btrfs_set_fs_incompat(__fs_info, opt) \
+ __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \
+ #opt)
+
+static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
+ u64 flag, const char* name)
+{
+ struct btrfs_super_block *disk_super;
+ u64 features;
+
+ disk_super = fs_info->super_copy;
+ features = btrfs_super_incompat_flags(disk_super);
+ if (!(features & flag)) {
+ spin_lock(&fs_info->super_lock);
+ features = btrfs_super_incompat_flags(disk_super);
+ if (!(features & flag)) {
+ features |= flag;
+ btrfs_set_super_incompat_flags(disk_super, features);
+ btrfs_info(fs_info,
+ "setting incompat feature flag for %s (0x%llx)",
+ name, flag);
+ }
+ spin_unlock(&fs_info->super_lock);
+ }
+}
+
+#define btrfs_clear_fs_incompat(__fs_info, opt) \
+ __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \
+ #opt)
+
+static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info,
+ u64 flag, const char* name)
+{
+ struct btrfs_super_block *disk_super;
+ u64 features;
+
+ disk_super = fs_info->super_copy;
+ features = btrfs_super_incompat_flags(disk_super);
+ if (features & flag) {
+ spin_lock(&fs_info->super_lock);
+ features = btrfs_super_incompat_flags(disk_super);
+ if (features & flag) {
+ features &= ~flag;
+ btrfs_set_super_incompat_flags(disk_super, features);
+ btrfs_info(fs_info,
+ "clearing incompat feature flag for %s (0x%llx)",
+ name, flag);
+ }
+ spin_unlock(&fs_info->super_lock);
+ }
+}
+
+#define btrfs_fs_incompat(fs_info, opt) \
+ __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
+
+static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag)
+{
+ struct btrfs_super_block *disk_super;
+ disk_super = fs_info->super_copy;
+ return !!(btrfs_super_incompat_flags(disk_super) & flag);
+}
+
+#define btrfs_set_fs_compat_ro(__fs_info, opt) \
+ __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \
+ #opt)
+
+static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info,
+ u64 flag, const char *name)
+{
+ struct btrfs_super_block *disk_super;
+ u64 features;
+
+ disk_super = fs_info->super_copy;
+ features = btrfs_super_compat_ro_flags(disk_super);
+ if (!(features & flag)) {
+ spin_lock(&fs_info->super_lock);
+ features = btrfs_super_compat_ro_flags(disk_super);
+ if (!(features & flag)) {
+ features |= flag;
+ btrfs_set_super_compat_ro_flags(disk_super, features);
+ btrfs_info(fs_info,
+ "setting compat-ro feature flag for %s (0x%llx)",
+ name, flag);
+ }
+ spin_unlock(&fs_info->super_lock);
+ }
+}
+
+#define btrfs_clear_fs_compat_ro(__fs_info, opt) \
+ __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \
+ #opt)
+
+static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info,
+ u64 flag, const char *name)
+{
+ struct btrfs_super_block *disk_super;
+ u64 features;
+
+ disk_super = fs_info->super_copy;
+ features = btrfs_super_compat_ro_flags(disk_super);
+ if (features & flag) {
+ spin_lock(&fs_info->super_lock);
+ features = btrfs_super_compat_ro_flags(disk_super);
+ if (features & flag) {
+ features &= ~flag;
+ btrfs_set_super_compat_ro_flags(disk_super, features);
+ btrfs_info(fs_info,
+ "clearing compat-ro feature flag for %s (0x%llx)",
+ name, flag);
+ }
+ spin_unlock(&fs_info->super_lock);
+ }
+}
+
+#define btrfs_fs_compat_ro(fs_info, opt) \
+ __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+
+static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag)
+{
+ struct btrfs_super_block *disk_super;
+ disk_super = fs_info->super_copy;
+ return !!(btrfs_super_compat_ro_flags(disk_super) & flag);
+}
+
+/* acl.c */
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu);
+int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+ struct posix_acl *acl, int type);
+int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
+ struct posix_acl *acl, int type);
+#else
+#define btrfs_get_acl NULL
+#define btrfs_set_acl NULL
+static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct posix_acl *acl,
+ int type)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
+/* relocation.c */
+int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start);
+int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_recover_relocation(struct btrfs_fs_info *fs_info);
+int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len);
+int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *buf,
+ struct extent_buffer *cow);
+void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
+ u64 *bytes_to_reserve);
+int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending);
+int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info);
+struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info,
+ u64 bytenr);
+int btrfs_should_ignore_reloc_root(struct btrfs_root *root);
+
+/* scrub.c */
+int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
+ u64 end, struct btrfs_scrub_progress *progress,
+ int readonly, int is_dev_replace);
+void btrfs_scrub_pause(struct btrfs_fs_info *fs_info);
+void btrfs_scrub_continue(struct btrfs_fs_info *fs_info);
+int btrfs_scrub_cancel(struct btrfs_fs_info *info);
+int btrfs_scrub_cancel_dev(struct btrfs_device *dev);
+int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
+ struct btrfs_scrub_progress *progress);
+
+/* dev-replace.c */
+void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
+void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount);
+
+static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
+{
+ btrfs_bio_counter_sub(fs_info, 1);
+}
+
+static inline int is_fstree(u64 rootid)
+{
+ if (rootid == BTRFS_FS_TREE_OBJECTID ||
+ ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID &&
+ !btrfs_qgroup_level(rootid)))
+ return 1;
+ return 0;
+}
+
+static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
+{
+ return signal_pending(current);
+}
+
+/* verity.c */
+#ifdef CONFIG_FS_VERITY
+
+extern const struct fsverity_operations btrfs_verityops;
+int btrfs_drop_verity_items(struct btrfs_inode *inode);
+int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size);
+
+BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item,
+ encryption, 8);
+BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item,
+ size, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption,
+ struct btrfs_verity_descriptor_item, encryption, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size,
+ struct btrfs_verity_descriptor_item, size, 64);
+
+#else
+
+static inline int btrfs_drop_verity_items(struct btrfs_inode *inode)
+{
+ return 0;
+}
+
+static inline int btrfs_get_verity_descriptor(struct inode *inode, void *buf,
+ size_t buf_size)
+{
+ return -EPERM;
+}
+
+#endif
+
+/* Sanity test specific functions */
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_destroy_inode(struct inode *inode);
+static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
+{
+ return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
+}
+#else
+static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
+{
+ return 0;
+}
+#endif
+
+static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
+{
+ return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID;
+}
+
+/*
+ * We use page status Private2 to indicate there is an ordered extent with
+ * unfinished IO.
+ *
+ * Rename the Private2 accessors to Ordered, to improve readability.
+ */
+#define PageOrdered(page) PagePrivate2(page)
+#define SetPageOrdered(page) SetPagePrivate2(page)
+#define ClearPageOrdered(page) ClearPagePrivate2(page)
+#define folio_test_ordered(folio) folio_test_private_2(folio)
+#define folio_set_ordered(folio) folio_set_private_2(folio)
+#define folio_clear_ordered(folio) folio_clear_private_2(folio)
+
+#endif
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
new file mode 100644
index 000000000..f2bc5563c
--- /dev/null
+++ b/fs/btrfs/delalloc-space.c
@@ -0,0 +1,493 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "ctree.h"
+#include "delalloc-space.h"
+#include "block-rsv.h"
+#include "btrfs_inode.h"
+#include "space-info.h"
+#include "transaction.h"
+#include "qgroup.h"
+#include "block-group.h"
+
+/*
+ * HOW DOES THIS WORK
+ *
+ * There are two stages to data reservations, one for data and one for metadata
+ * to handle the new extents and checksums generated by writing data.
+ *
+ *
+ * DATA RESERVATION
+ * The general flow of the data reservation is as follows
+ *
+ * -> Reserve
+ * We call into btrfs_reserve_data_bytes() for the user request bytes that
+ * they wish to write. We make this reservation and add it to
+ * space_info->bytes_may_use. We set EXTENT_DELALLOC on the inode io_tree
+ * for the range and carry on if this is buffered, or follow up trying to
+ * make a real allocation if we are pre-allocating or doing O_DIRECT.
+ *
+ * -> Use
+ * At writepages()/prealloc/O_DIRECT time we will call into
+ * btrfs_reserve_extent() for some part or all of this range of bytes. We
+ * will make the allocation and subtract space_info->bytes_may_use by the
+ * original requested length and increase the space_info->bytes_reserved by
+ * the allocated length. This distinction is important because compression
+ * may allocate a smaller on disk extent than we previously reserved.
+ *
+ * -> Allocation
+ * finish_ordered_io() will insert the new file extent item for this range,
+ * and then add a delayed ref update for the extent tree. Once that delayed
+ * ref is written the extent size is subtracted from
+ * space_info->bytes_reserved and added to space_info->bytes_used.
+ *
+ * Error handling
+ *
+ * -> By the reservation maker
+ * This is the simplest case, we haven't completed our operation and we know
+ * how much we reserved, we can simply call
+ * btrfs_free_reserved_data_space*() and it will be removed from
+ * space_info->bytes_may_use.
+ *
+ * -> After the reservation has been made, but before cow_file_range()
+ * This is specifically for the delalloc case. You must clear
+ * EXTENT_DELALLOC with the EXTENT_CLEAR_DATA_RESV bit, and the range will
+ * be subtracted from space_info->bytes_may_use.
+ *
+ * METADATA RESERVATION
+ * The general metadata reservation lifetimes are discussed elsewhere, this
+ * will just focus on how it is used for delalloc space.
+ *
+ * We keep track of two things on a per inode bases
+ *
+ * ->outstanding_extents
+ * This is the number of file extent items we'll need to handle all of the
+ * outstanding DELALLOC space we have in this inode. We limit the maximum
+ * size of an extent, so a large contiguous dirty area may require more than
+ * one outstanding_extent, which is why count_max_extents() is used to
+ * determine how many outstanding_extents get added.
+ *
+ * ->csum_bytes
+ * This is essentially how many dirty bytes we have for this inode, so we
+ * can calculate the number of checksum items we would have to add in order
+ * to checksum our outstanding data.
+ *
+ * We keep a per-inode block_rsv in order to make it easier to keep track of
+ * our reservation. We use btrfs_calculate_inode_block_rsv_size() to
+ * calculate the current theoretical maximum reservation we would need for the
+ * metadata for this inode. We call this and then adjust our reservation as
+ * necessary, either by attempting to reserve more space, or freeing up excess
+ * space.
+ *
+ * OUTSTANDING_EXTENTS HANDLING
+ *
+ * ->outstanding_extents is used for keeping track of how many extents we will
+ * need to use for this inode, and it will fluctuate depending on where you are
+ * in the life cycle of the dirty data. Consider the following normal case for
+ * a completely clean inode, with a num_bytes < our maximum allowed extent size
+ *
+ * -> reserve
+ * ->outstanding_extents += 1 (current value is 1)
+ *
+ * -> set_delalloc
+ * ->outstanding_extents += 1 (current value is 2)
+ *
+ * -> btrfs_delalloc_release_extents()
+ * ->outstanding_extents -= 1 (current value is 1)
+ *
+ * We must call this once we are done, as we hold our reservation for the
+ * duration of our operation, and then assume set_delalloc will update the
+ * counter appropriately.
+ *
+ * -> add ordered extent
+ * ->outstanding_extents += 1 (current value is 2)
+ *
+ * -> btrfs_clear_delalloc_extent
+ * ->outstanding_extents -= 1 (current value is 1)
+ *
+ * -> finish_ordered_io/btrfs_remove_ordered_extent
+ * ->outstanding_extents -= 1 (current value is 0)
+ *
+ * Each stage is responsible for their own accounting of the extent, thus
+ * making error handling and cleanup easier.
+ */
+
+int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA;
+
+ /* Make sure bytes are sectorsize aligned */
+ bytes = ALIGN(bytes, fs_info->sectorsize);
+
+ if (btrfs_is_free_space_inode(inode))
+ flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE;
+
+ return btrfs_reserve_data_bytes(fs_info, bytes, flush);
+}
+
+int btrfs_check_data_free_space(struct btrfs_inode *inode,
+ struct extent_changeset **reserved, u64 start,
+ u64 len, bool noflush)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA;
+ int ret;
+
+ /* align the range */
+ len = round_up(start + len, fs_info->sectorsize) -
+ round_down(start, fs_info->sectorsize);
+ start = round_down(start, fs_info->sectorsize);
+
+ if (noflush)
+ flush = BTRFS_RESERVE_NO_FLUSH;
+ else if (btrfs_is_free_space_inode(inode))
+ flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE;
+
+ ret = btrfs_reserve_data_bytes(fs_info, len, flush);
+ if (ret < 0)
+ return ret;
+
+ /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
+ ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
+ if (ret < 0) {
+ btrfs_free_reserved_data_space_noquota(fs_info, len);
+ extent_changeset_free(*reserved);
+ *reserved = NULL;
+ } else {
+ ret = 0;
+ }
+ return ret;
+}
+
+/*
+ * Called if we need to clear a data reservation for this inode
+ * Normally in a error case.
+ *
+ * This one will *NOT* use accurate qgroup reserved space API, just for case
+ * which we can't sleep and is sure it won't affect qgroup reserved space.
+ * Like clear_bit_hook().
+ */
+void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info,
+ u64 len)
+{
+ struct btrfs_space_info *data_sinfo;
+
+ ASSERT(IS_ALIGNED(len, fs_info->sectorsize));
+
+ data_sinfo = fs_info->data_sinfo;
+ btrfs_space_info_free_bytes_may_use(fs_info, data_sinfo, len);
+}
+
+/*
+ * Called if we need to clear a data reservation for this inode
+ * Normally in a error case.
+ *
+ * This one will handle the per-inode data rsv map for accurate reserved
+ * space framework.
+ */
+void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
+ struct extent_changeset *reserved, u64 start, u64 len)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+
+ /* Make sure the range is aligned to sectorsize */
+ len = round_up(start + len, fs_info->sectorsize) -
+ round_down(start, fs_info->sectorsize);
+ start = round_down(start, fs_info->sectorsize);
+
+ btrfs_free_reserved_data_space_noquota(fs_info, len);
+ btrfs_qgroup_free_data(inode, reserved, start, len, NULL);
+}
+
+/**
+ * Release any excessive reservation
+ *
+ * @inode: the inode we need to release from
+ * @qgroup_free: free or convert qgroup meta. Unlike normal operation, qgroup
+ * meta reservation needs to know if we are freeing qgroup
+ * reservation or just converting it into per-trans. Normally
+ * @qgroup_free is true for error handling, and false for normal
+ * release.
+ *
+ * This is the same as btrfs_block_rsv_release, except that it handles the
+ * tracepoint for the reservation.
+ */
+static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
+ u64 released = 0;
+ u64 qgroup_to_release = 0;
+
+ /*
+ * Since we statically set the block_rsv->size we just want to say we
+ * are releasing 0 bytes, and then we'll just get the reservation over
+ * the size free'd.
+ */
+ released = btrfs_block_rsv_release(fs_info, block_rsv, 0,
+ &qgroup_to_release);
+ if (released > 0)
+ trace_btrfs_space_reservation(fs_info, "delalloc",
+ btrfs_ino(inode), released, 0);
+ if (qgroup_free)
+ btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
+ else
+ btrfs_qgroup_convert_reserved_meta(inode->root,
+ qgroup_to_release);
+}
+
+static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
+ struct btrfs_inode *inode)
+{
+ struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
+ u64 reserve_size = 0;
+ u64 qgroup_rsv_size = 0;
+ u64 csum_leaves;
+ unsigned outstanding_extents;
+
+ lockdep_assert_held(&inode->lock);
+ outstanding_extents = inode->outstanding_extents;
+
+ /*
+ * Insert size for the number of outstanding extents, 1 normal size for
+ * updating the inode.
+ */
+ if (outstanding_extents) {
+ reserve_size = btrfs_calc_insert_metadata_size(fs_info,
+ outstanding_extents);
+ reserve_size += btrfs_calc_metadata_size(fs_info, 1);
+ }
+ csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
+ inode->csum_bytes);
+ reserve_size += btrfs_calc_insert_metadata_size(fs_info,
+ csum_leaves);
+ /*
+ * For qgroup rsv, the calculation is very simple:
+ * account one nodesize for each outstanding extent
+ *
+ * This is overestimating in most cases.
+ */
+ qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize;
+
+ spin_lock(&block_rsv->lock);
+ block_rsv->size = reserve_size;
+ block_rsv->qgroup_rsv_size = qgroup_rsv_size;
+ spin_unlock(&block_rsv->lock);
+}
+
+static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
+ u64 num_bytes, u64 disk_num_bytes,
+ u64 *meta_reserve, u64 *qgroup_reserve)
+{
+ u64 nr_extents = count_max_extents(fs_info, num_bytes);
+ u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
+ u64 inode_update = btrfs_calc_metadata_size(fs_info, 1);
+
+ *meta_reserve = btrfs_calc_insert_metadata_size(fs_info,
+ nr_extents + csum_leaves);
+
+ /*
+ * finish_ordered_io has to update the inode, so add the space required
+ * for an inode update.
+ */
+ *meta_reserve += inode_update;
+ *qgroup_reserve = nr_extents * fs_info->nodesize;
+}
+
+int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
+ u64 disk_num_bytes, bool noflush)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
+ u64 meta_reserve, qgroup_reserve;
+ unsigned nr_extents;
+ enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
+ int ret = 0;
+
+ /*
+ * If we are a free space inode we need to not flush since we will be in
+ * the middle of a transaction commit. We also don't need the delalloc
+ * mutex since we won't race with anybody. We need this mostly to make
+ * lockdep shut its filthy mouth.
+ *
+ * If we have a transaction open (can happen if we call truncate_block
+ * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
+ */
+ if (noflush || btrfs_is_free_space_inode(inode)) {
+ flush = BTRFS_RESERVE_NO_FLUSH;
+ } else {
+ if (current->journal_info)
+ flush = BTRFS_RESERVE_FLUSH_LIMIT;
+ }
+
+ num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
+ disk_num_bytes = ALIGN(disk_num_bytes, fs_info->sectorsize);
+
+ /*
+ * We always want to do it this way, every other way is wrong and ends
+ * in tears. Pre-reserving the amount we are going to add will always
+ * be the right way, because otherwise if we have enough parallelism we
+ * could end up with thousands of inodes all holding little bits of
+ * reservations they were able to make previously and the only way to
+ * reclaim that space is to ENOSPC out the operations and clear
+ * everything out and try again, which is bad. This way we just
+ * over-reserve slightly, and clean up the mess when we are done.
+ */
+ calc_inode_reservations(fs_info, num_bytes, disk_num_bytes,
+ &meta_reserve, &qgroup_reserve);
+ ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true,
+ noflush);
+ if (ret)
+ return ret;
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush);
+ if (ret) {
+ btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
+ return ret;
+ }
+
+ /*
+ * Now we need to update our outstanding extents and csum bytes _first_
+ * and then add the reservation to the block_rsv. This keeps us from
+ * racing with an ordered completion or some such that would think it
+ * needs to free the reservation we just made.
+ */
+ spin_lock(&inode->lock);
+ nr_extents = count_max_extents(fs_info, num_bytes);
+ btrfs_mod_outstanding_extents(inode, nr_extents);
+ inode->csum_bytes += disk_num_bytes;
+ btrfs_calculate_inode_block_rsv_size(fs_info, inode);
+ spin_unlock(&inode->lock);
+
+ /* Now we can safely add our space to our block rsv */
+ btrfs_block_rsv_add_bytes(block_rsv, meta_reserve, false);
+ trace_btrfs_space_reservation(root->fs_info, "delalloc",
+ btrfs_ino(inode), meta_reserve, 1);
+
+ spin_lock(&block_rsv->lock);
+ block_rsv->qgroup_rsv_reserved += qgroup_reserve;
+ spin_unlock(&block_rsv->lock);
+
+ return 0;
+}
+
+/**
+ * Release a metadata reservation for an inode
+ *
+ * @inode: the inode to release the reservation for.
+ * @num_bytes: the number of bytes we are releasing.
+ * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
+ *
+ * This will release the metadata reservation for an inode. This can be called
+ * once we complete IO for a given set of bytes to release their metadata
+ * reservations, or on error for the same reason.
+ */
+void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
+ bool qgroup_free)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+
+ num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
+ spin_lock(&inode->lock);
+ inode->csum_bytes -= num_bytes;
+ btrfs_calculate_inode_block_rsv_size(fs_info, inode);
+ spin_unlock(&inode->lock);
+
+ if (btrfs_is_testing(fs_info))
+ return;
+
+ btrfs_inode_rsv_release(inode, qgroup_free);
+}
+
+/**
+ * btrfs_delalloc_release_extents - release our outstanding_extents
+ * @inode: the inode to balance the reservation for.
+ * @num_bytes: the number of bytes we originally reserved with
+ *
+ * When we reserve space we increase outstanding_extents for the extents we may
+ * add. Once we've set the range as delalloc or created our ordered extents we
+ * have outstanding_extents to track the real usage, so we use this to free our
+ * temporarily tracked outstanding_extents. This _must_ be used in conjunction
+ * with btrfs_delalloc_reserve_metadata.
+ */
+void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ unsigned num_extents;
+
+ spin_lock(&inode->lock);
+ num_extents = count_max_extents(fs_info, num_bytes);
+ btrfs_mod_outstanding_extents(inode, -num_extents);
+ btrfs_calculate_inode_block_rsv_size(fs_info, inode);
+ spin_unlock(&inode->lock);
+
+ if (btrfs_is_testing(fs_info))
+ return;
+
+ btrfs_inode_rsv_release(inode, true);
+}
+
+/**
+ * btrfs_delalloc_reserve_space - reserve data and metadata space for
+ * delalloc
+ * @inode: inode we're writing to
+ * @start: start range we are writing to
+ * @len: how long the range we are writing to
+ * @reserved: mandatory parameter, record actually reserved qgroup ranges of
+ * current reservation.
+ *
+ * This will do the following things
+ *
+ * - reserve space in data space info for num bytes
+ * and reserve precious corresponding qgroup space
+ * (Done in check_data_free_space)
+ *
+ * - reserve space for metadata space, based on the number of outstanding
+ * extents and how much csums will be needed
+ * also reserve metadata space in a per root over-reserve method.
+ * - add to the inodes->delalloc_bytes
+ * - add it to the fs_info's delalloc inodes list.
+ * (Above 3 all done in delalloc_reserve_metadata)
+ *
+ * Return 0 for success
+ * Return <0 for error(-ENOSPC or -EQUOT)
+ */
+int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
+ struct extent_changeset **reserved, u64 start, u64 len)
+{
+ int ret;
+
+ ret = btrfs_check_data_free_space(inode, reserved, start, len, false);
+ if (ret < 0)
+ return ret;
+ ret = btrfs_delalloc_reserve_metadata(inode, len, len, false);
+ if (ret < 0) {
+ btrfs_free_reserved_data_space(inode, *reserved, start, len);
+ extent_changeset_free(*reserved);
+ *reserved = NULL;
+ }
+ return ret;
+}
+
+/**
+ * Release data and metadata space for delalloc
+ *
+ * @inode: inode we're releasing space for
+ * @reserved: list of changed/reserved ranges
+ * @start: start position of the space already reserved
+ * @len: length of the space already reserved
+ * @qgroup_free: should qgroup reserved-space also be freed
+ *
+ * This function will release the metadata space that was not used and will
+ * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
+ * list if there are no delalloc bytes left.
+ * Also it will handle the qgroup reserved space.
+ */
+void btrfs_delalloc_release_space(struct btrfs_inode *inode,
+ struct extent_changeset *reserved,
+ u64 start, u64 len, bool qgroup_free)
+{
+ btrfs_delalloc_release_metadata(inode, len, qgroup_free);
+ btrfs_free_reserved_data_space(inode, reserved, start, len);
+}
diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h
new file mode 100644
index 000000000..e07d46043
--- /dev/null
+++ b/fs/btrfs/delalloc-space.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_DELALLOC_SPACE_H
+#define BTRFS_DELALLOC_SPACE_H
+
+struct extent_changeset;
+
+int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
+int btrfs_check_data_free_space(struct btrfs_inode *inode,
+ struct extent_changeset **reserved, u64 start, u64 len,
+ bool noflush);
+void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
+ struct extent_changeset *reserved, u64 start, u64 len);
+void btrfs_delalloc_release_space(struct btrfs_inode *inode,
+ struct extent_changeset *reserved,
+ u64 start, u64 len, bool qgroup_free);
+void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info,
+ u64 len);
+void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
+ bool qgroup_free);
+int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
+ struct extent_changeset **reserved, u64 start, u64 len);
+
+#endif /* BTRFS_DELALLOC_SPACE_H */
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
new file mode 100644
index 000000000..c6426080c
--- /dev/null
+++ b/fs/btrfs/delayed-inode.c
@@ -0,0 +1,2198 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2011 Fujitsu. All rights reserved.
+ * Written by Miao Xie <miaox@cn.fujitsu.com>
+ */
+
+#include <linux/slab.h>
+#include <linux/iversion.h>
+#include "misc.h"
+#include "delayed-inode.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "ctree.h"
+#include "qgroup.h"
+#include "locking.h"
+#include "inode-item.h"
+
+#define BTRFS_DELAYED_WRITEBACK 512
+#define BTRFS_DELAYED_BACKGROUND 128
+#define BTRFS_DELAYED_BATCH 16
+
+static struct kmem_cache *delayed_node_cache;
+
+int __init btrfs_delayed_inode_init(void)
+{
+ delayed_node_cache = kmem_cache_create("btrfs_delayed_node",
+ sizeof(struct btrfs_delayed_node),
+ 0,
+ SLAB_MEM_SPREAD,
+ NULL);
+ if (!delayed_node_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void __cold btrfs_delayed_inode_exit(void)
+{
+ kmem_cache_destroy(delayed_node_cache);
+}
+
+static inline void btrfs_init_delayed_node(
+ struct btrfs_delayed_node *delayed_node,
+ struct btrfs_root *root, u64 inode_id)
+{
+ delayed_node->root = root;
+ delayed_node->inode_id = inode_id;
+ refcount_set(&delayed_node->refs, 0);
+ delayed_node->ins_root = RB_ROOT_CACHED;
+ delayed_node->del_root = RB_ROOT_CACHED;
+ mutex_init(&delayed_node->mutex);
+ INIT_LIST_HEAD(&delayed_node->n_list);
+ INIT_LIST_HEAD(&delayed_node->p_list);
+}
+
+static struct btrfs_delayed_node *btrfs_get_delayed_node(
+ struct btrfs_inode *btrfs_inode)
+{
+ struct btrfs_root *root = btrfs_inode->root;
+ u64 ino = btrfs_ino(btrfs_inode);
+ struct btrfs_delayed_node *node;
+
+ node = READ_ONCE(btrfs_inode->delayed_node);
+ if (node) {
+ refcount_inc(&node->refs);
+ return node;
+ }
+
+ spin_lock(&root->inode_lock);
+ node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
+
+ if (node) {
+ if (btrfs_inode->delayed_node) {
+ refcount_inc(&node->refs); /* can be accessed */
+ BUG_ON(btrfs_inode->delayed_node != node);
+ spin_unlock(&root->inode_lock);
+ return node;
+ }
+
+ /*
+ * It's possible that we're racing into the middle of removing
+ * this node from the radix tree. In this case, the refcount
+ * was zero and it should never go back to one. Just return
+ * NULL like it was never in the radix at all; our release
+ * function is in the process of removing it.
+ *
+ * Some implementations of refcount_inc refuse to bump the
+ * refcount once it has hit zero. If we don't do this dance
+ * here, refcount_inc() may decide to just WARN_ONCE() instead
+ * of actually bumping the refcount.
+ *
+ * If this node is properly in the radix, we want to bump the
+ * refcount twice, once for the inode and once for this get
+ * operation.
+ */
+ if (refcount_inc_not_zero(&node->refs)) {
+ refcount_inc(&node->refs);
+ btrfs_inode->delayed_node = node;
+ } else {
+ node = NULL;
+ }
+
+ spin_unlock(&root->inode_lock);
+ return node;
+ }
+ spin_unlock(&root->inode_lock);
+
+ return NULL;
+}
+
+/* Will return either the node or PTR_ERR(-ENOMEM) */
+static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
+ struct btrfs_inode *btrfs_inode)
+{
+ struct btrfs_delayed_node *node;
+ struct btrfs_root *root = btrfs_inode->root;
+ u64 ino = btrfs_ino(btrfs_inode);
+ int ret;
+
+again:
+ node = btrfs_get_delayed_node(btrfs_inode);
+ if (node)
+ return node;
+
+ node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS);
+ if (!node)
+ return ERR_PTR(-ENOMEM);
+ btrfs_init_delayed_node(node, root, ino);
+
+ /* cached in the btrfs inode and can be accessed */
+ refcount_set(&node->refs, 2);
+
+ ret = radix_tree_preload(GFP_NOFS);
+ if (ret) {
+ kmem_cache_free(delayed_node_cache, node);
+ return ERR_PTR(ret);
+ }
+
+ spin_lock(&root->inode_lock);
+ ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
+ if (ret == -EEXIST) {
+ spin_unlock(&root->inode_lock);
+ kmem_cache_free(delayed_node_cache, node);
+ radix_tree_preload_end();
+ goto again;
+ }
+ btrfs_inode->delayed_node = node;
+ spin_unlock(&root->inode_lock);
+ radix_tree_preload_end();
+
+ return node;
+}
+
+/*
+ * Call it when holding delayed_node->mutex
+ *
+ * If mod = 1, add this node into the prepared list.
+ */
+static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root,
+ struct btrfs_delayed_node *node,
+ int mod)
+{
+ spin_lock(&root->lock);
+ if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
+ if (!list_empty(&node->p_list))
+ list_move_tail(&node->p_list, &root->prepare_list);
+ else if (mod)
+ list_add_tail(&node->p_list, &root->prepare_list);
+ } else {
+ list_add_tail(&node->n_list, &root->node_list);
+ list_add_tail(&node->p_list, &root->prepare_list);
+ refcount_inc(&node->refs); /* inserted into list */
+ root->nodes++;
+ set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
+ }
+ spin_unlock(&root->lock);
+}
+
+/* Call it when holding delayed_node->mutex */
+static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
+ struct btrfs_delayed_node *node)
+{
+ spin_lock(&root->lock);
+ if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
+ root->nodes--;
+ refcount_dec(&node->refs); /* not in the list */
+ list_del_init(&node->n_list);
+ if (!list_empty(&node->p_list))
+ list_del_init(&node->p_list);
+ clear_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
+ }
+ spin_unlock(&root->lock);
+}
+
+static struct btrfs_delayed_node *btrfs_first_delayed_node(
+ struct btrfs_delayed_root *delayed_root)
+{
+ struct list_head *p;
+ struct btrfs_delayed_node *node = NULL;
+
+ spin_lock(&delayed_root->lock);
+ if (list_empty(&delayed_root->node_list))
+ goto out;
+
+ p = delayed_root->node_list.next;
+ node = list_entry(p, struct btrfs_delayed_node, n_list);
+ refcount_inc(&node->refs);
+out:
+ spin_unlock(&delayed_root->lock);
+
+ return node;
+}
+
+static struct btrfs_delayed_node *btrfs_next_delayed_node(
+ struct btrfs_delayed_node *node)
+{
+ struct btrfs_delayed_root *delayed_root;
+ struct list_head *p;
+ struct btrfs_delayed_node *next = NULL;
+
+ delayed_root = node->root->fs_info->delayed_root;
+ spin_lock(&delayed_root->lock);
+ if (!test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
+ /* not in the list */
+ if (list_empty(&delayed_root->node_list))
+ goto out;
+ p = delayed_root->node_list.next;
+ } else if (list_is_last(&node->n_list, &delayed_root->node_list))
+ goto out;
+ else
+ p = node->n_list.next;
+
+ next = list_entry(p, struct btrfs_delayed_node, n_list);
+ refcount_inc(&next->refs);
+out:
+ spin_unlock(&delayed_root->lock);
+
+ return next;
+}
+
+static void __btrfs_release_delayed_node(
+ struct btrfs_delayed_node *delayed_node,
+ int mod)
+{
+ struct btrfs_delayed_root *delayed_root;
+
+ if (!delayed_node)
+ return;
+
+ delayed_root = delayed_node->root->fs_info->delayed_root;
+
+ mutex_lock(&delayed_node->mutex);
+ if (delayed_node->count)
+ btrfs_queue_delayed_node(delayed_root, delayed_node, mod);
+ else
+ btrfs_dequeue_delayed_node(delayed_root, delayed_node);
+ mutex_unlock(&delayed_node->mutex);
+
+ if (refcount_dec_and_test(&delayed_node->refs)) {
+ struct btrfs_root *root = delayed_node->root;
+
+ spin_lock(&root->inode_lock);
+ /*
+ * Once our refcount goes to zero, nobody is allowed to bump it
+ * back up. We can delete it now.
+ */
+ ASSERT(refcount_read(&delayed_node->refs) == 0);
+ radix_tree_delete(&root->delayed_nodes_tree,
+ delayed_node->inode_id);
+ spin_unlock(&root->inode_lock);
+ kmem_cache_free(delayed_node_cache, delayed_node);
+ }
+}
+
+static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node)
+{
+ __btrfs_release_delayed_node(node, 0);
+}
+
+static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
+ struct btrfs_delayed_root *delayed_root)
+{
+ struct list_head *p;
+ struct btrfs_delayed_node *node = NULL;
+
+ spin_lock(&delayed_root->lock);
+ if (list_empty(&delayed_root->prepare_list))
+ goto out;
+
+ p = delayed_root->prepare_list.next;
+ list_del_init(p);
+ node = list_entry(p, struct btrfs_delayed_node, p_list);
+ refcount_inc(&node->refs);
+out:
+ spin_unlock(&delayed_root->lock);
+
+ return node;
+}
+
+static inline void btrfs_release_prepared_delayed_node(
+ struct btrfs_delayed_node *node)
+{
+ __btrfs_release_delayed_node(node, 1);
+}
+
+static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len,
+ struct btrfs_delayed_node *node,
+ enum btrfs_delayed_item_type type)
+{
+ struct btrfs_delayed_item *item;
+
+ item = kmalloc(sizeof(*item) + data_len, GFP_NOFS);
+ if (item) {
+ item->data_len = data_len;
+ item->type = type;
+ item->bytes_reserved = 0;
+ item->delayed_node = node;
+ RB_CLEAR_NODE(&item->rb_node);
+ INIT_LIST_HEAD(&item->log_list);
+ item->logged = false;
+ refcount_set(&item->refs, 1);
+ }
+ return item;
+}
+
+/*
+ * __btrfs_lookup_delayed_item - look up the delayed item by key
+ * @delayed_node: pointer to the delayed node
+ * @index: the dir index value to lookup (offset of a dir index key)
+ *
+ * Note: if we don't find the right item, we will return the prev item and
+ * the next item.
+ */
+static struct btrfs_delayed_item *__btrfs_lookup_delayed_item(
+ struct rb_root *root,
+ u64 index)
+{
+ struct rb_node *node = root->rb_node;
+ struct btrfs_delayed_item *delayed_item = NULL;
+
+ while (node) {
+ delayed_item = rb_entry(node, struct btrfs_delayed_item,
+ rb_node);
+ if (delayed_item->index < index)
+ node = node->rb_right;
+ else if (delayed_item->index > index)
+ node = node->rb_left;
+ else
+ return delayed_item;
+ }
+
+ return NULL;
+}
+
+static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
+ struct btrfs_delayed_item *ins)
+{
+ struct rb_node **p, *node;
+ struct rb_node *parent_node = NULL;
+ struct rb_root_cached *root;
+ struct btrfs_delayed_item *item;
+ bool leftmost = true;
+
+ if (ins->type == BTRFS_DELAYED_INSERTION_ITEM)
+ root = &delayed_node->ins_root;
+ else
+ root = &delayed_node->del_root;
+
+ p = &root->rb_root.rb_node;
+ node = &ins->rb_node;
+
+ while (*p) {
+ parent_node = *p;
+ item = rb_entry(parent_node, struct btrfs_delayed_item,
+ rb_node);
+
+ if (item->index < ins->index) {
+ p = &(*p)->rb_right;
+ leftmost = false;
+ } else if (item->index > ins->index) {
+ p = &(*p)->rb_left;
+ } else {
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(node, parent_node, p);
+ rb_insert_color_cached(node, root, leftmost);
+
+ if (ins->type == BTRFS_DELAYED_INSERTION_ITEM &&
+ ins->index >= delayed_node->index_cnt)
+ delayed_node->index_cnt = ins->index + 1;
+
+ delayed_node->count++;
+ atomic_inc(&delayed_node->root->fs_info->delayed_root->items);
+ return 0;
+}
+
+static void finish_one_item(struct btrfs_delayed_root *delayed_root)
+{
+ int seq = atomic_inc_return(&delayed_root->items_seq);
+
+ /* atomic_dec_return implies a barrier */
+ if ((atomic_dec_return(&delayed_root->items) <
+ BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0))
+ cond_wake_up_nomb(&delayed_root->wait);
+}
+
+static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
+{
+ struct btrfs_delayed_node *delayed_node = delayed_item->delayed_node;
+ struct rb_root_cached *root;
+ struct btrfs_delayed_root *delayed_root;
+
+ /* Not inserted, ignore it. */
+ if (RB_EMPTY_NODE(&delayed_item->rb_node))
+ return;
+
+ /* If it's in a rbtree, then we need to have delayed node locked. */
+ lockdep_assert_held(&delayed_node->mutex);
+
+ delayed_root = delayed_node->root->fs_info->delayed_root;
+
+ BUG_ON(!delayed_root);
+
+ if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM)
+ root = &delayed_node->ins_root;
+ else
+ root = &delayed_node->del_root;
+
+ rb_erase_cached(&delayed_item->rb_node, root);
+ RB_CLEAR_NODE(&delayed_item->rb_node);
+ delayed_node->count--;
+
+ finish_one_item(delayed_root);
+}
+
+static void btrfs_release_delayed_item(struct btrfs_delayed_item *item)
+{
+ if (item) {
+ __btrfs_remove_delayed_item(item);
+ if (refcount_dec_and_test(&item->refs))
+ kfree(item);
+ }
+}
+
+static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item(
+ struct btrfs_delayed_node *delayed_node)
+{
+ struct rb_node *p;
+ struct btrfs_delayed_item *item = NULL;
+
+ p = rb_first_cached(&delayed_node->ins_root);
+ if (p)
+ item = rb_entry(p, struct btrfs_delayed_item, rb_node);
+
+ return item;
+}
+
+static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item(
+ struct btrfs_delayed_node *delayed_node)
+{
+ struct rb_node *p;
+ struct btrfs_delayed_item *item = NULL;
+
+ p = rb_first_cached(&delayed_node->del_root);
+ if (p)
+ item = rb_entry(p, struct btrfs_delayed_item, rb_node);
+
+ return item;
+}
+
+static struct btrfs_delayed_item *__btrfs_next_delayed_item(
+ struct btrfs_delayed_item *item)
+{
+ struct rb_node *p;
+ struct btrfs_delayed_item *next = NULL;
+
+ p = rb_next(&item->rb_node);
+ if (p)
+ next = rb_entry(p, struct btrfs_delayed_item, rb_node);
+
+ return next;
+}
+
+static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_item *item)
+{
+ struct btrfs_block_rsv *src_rsv;
+ struct btrfs_block_rsv *dst_rsv;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ u64 num_bytes;
+ int ret;
+
+ if (!trans->bytes_reserved)
+ return 0;
+
+ src_rsv = trans->block_rsv;
+ dst_rsv = &fs_info->delayed_block_rsv;
+
+ num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
+
+ /*
+ * Here we migrate space rsv from transaction rsv, since have already
+ * reserved space when starting a transaction. So no need to reserve
+ * qgroup space here.
+ */
+ ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
+ if (!ret) {
+ trace_btrfs_space_reservation(fs_info, "delayed_item",
+ item->delayed_node->inode_id,
+ num_bytes, 1);
+ /*
+ * For insertions we track reserved metadata space by accounting
+ * for the number of leaves that will be used, based on the delayed
+ * node's index_items_size field.
+ */
+ if (item->type == BTRFS_DELAYED_DELETION_ITEM)
+ item->bytes_reserved = num_bytes;
+ }
+
+ return ret;
+}
+
+static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
+ struct btrfs_delayed_item *item)
+{
+ struct btrfs_block_rsv *rsv;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ if (!item->bytes_reserved)
+ return;
+
+ rsv = &fs_info->delayed_block_rsv;
+ /*
+ * Check btrfs_delayed_item_reserve_metadata() to see why we don't need
+ * to release/reserve qgroup space.
+ */
+ trace_btrfs_space_reservation(fs_info, "delayed_item",
+ item->delayed_node->inode_id,
+ item->bytes_reserved, 0);
+ btrfs_block_rsv_release(fs_info, rsv, item->bytes_reserved, NULL);
+}
+
+static void btrfs_delayed_item_release_leaves(struct btrfs_delayed_node *node,
+ unsigned int num_leaves)
+{
+ struct btrfs_fs_info *fs_info = node->root->fs_info;
+ const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, num_leaves);
+
+ /* There are no space reservations during log replay, bail out. */
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
+ return;
+
+ trace_btrfs_space_reservation(fs_info, "delayed_item", node->inode_id,
+ bytes, 0);
+ btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv, bytes, NULL);
+}
+
+static int btrfs_delayed_inode_reserve_metadata(
+ struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_delayed_node *node)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_rsv *src_rsv;
+ struct btrfs_block_rsv *dst_rsv;
+ u64 num_bytes;
+ int ret;
+
+ src_rsv = trans->block_rsv;
+ dst_rsv = &fs_info->delayed_block_rsv;
+
+ num_bytes = btrfs_calc_metadata_size(fs_info, 1);
+
+ /*
+ * btrfs_dirty_inode will update the inode under btrfs_join_transaction
+ * which doesn't reserve space for speed. This is a problem since we
+ * still need to reserve space for this update, so try to reserve the
+ * space.
+ *
+ * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
+ * we always reserve enough to update the inode item.
+ */
+ if (!src_rsv || (!trans->bytes_reserved &&
+ src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
+ ret = btrfs_qgroup_reserve_meta(root, num_bytes,
+ BTRFS_QGROUP_RSV_META_PREALLOC, true);
+ if (ret < 0)
+ return ret;
+ ret = btrfs_block_rsv_add(fs_info, dst_rsv, num_bytes,
+ BTRFS_RESERVE_NO_FLUSH);
+ /* NO_FLUSH could only fail with -ENOSPC */
+ ASSERT(ret == 0 || ret == -ENOSPC);
+ if (ret)
+ btrfs_qgroup_free_meta_prealloc(root, num_bytes);
+ } else {
+ ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
+ }
+
+ if (!ret) {
+ trace_btrfs_space_reservation(fs_info, "delayed_inode",
+ node->inode_id, num_bytes, 1);
+ node->bytes_reserved = num_bytes;
+ }
+
+ return ret;
+}
+
+static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_node *node,
+ bool qgroup_free)
+{
+ struct btrfs_block_rsv *rsv;
+
+ if (!node->bytes_reserved)
+ return;
+
+ rsv = &fs_info->delayed_block_rsv;
+ trace_btrfs_space_reservation(fs_info, "delayed_inode",
+ node->inode_id, node->bytes_reserved, 0);
+ btrfs_block_rsv_release(fs_info, rsv, node->bytes_reserved, NULL);
+ if (qgroup_free)
+ btrfs_qgroup_free_meta_prealloc(node->root,
+ node->bytes_reserved);
+ else
+ btrfs_qgroup_convert_reserved_meta(node->root,
+ node->bytes_reserved);
+ node->bytes_reserved = 0;
+}
+
+/*
+ * Insert a single delayed item or a batch of delayed items, as many as possible
+ * that fit in a leaf. The delayed items (dir index keys) are sorted by their key
+ * in the rbtree, and if there's a gap between two consecutive dir index items,
+ * then it means at some point we had delayed dir indexes to add but they got
+ * removed (by btrfs_delete_delayed_dir_index()) before we attempted to flush them
+ * into the subvolume tree. Dir index keys also have their offsets coming from a
+ * monotonically increasing counter, so we can't get new keys with an offset that
+ * fits within a gap between delayed dir index items.
+ */
+static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_delayed_item *first_item)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_delayed_node *node = first_item->delayed_node;
+ LIST_HEAD(item_list);
+ struct btrfs_delayed_item *curr;
+ struct btrfs_delayed_item *next;
+ const int max_size = BTRFS_LEAF_DATA_SIZE(fs_info);
+ struct btrfs_item_batch batch;
+ struct btrfs_key first_key;
+ const u32 first_data_size = first_item->data_len;
+ int total_size;
+ char *ins_data = NULL;
+ int ret;
+ bool continuous_keys_only = false;
+
+ lockdep_assert_held(&node->mutex);
+
+ /*
+ * During normal operation the delayed index offset is continuously
+ * increasing, so we can batch insert all items as there will not be any
+ * overlapping keys in the tree.
+ *
+ * The exception to this is log replay, where we may have interleaved
+ * offsets in the tree, so our batch needs to be continuous keys only in
+ * order to ensure we do not end up with out of order items in our leaf.
+ */
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
+ continuous_keys_only = true;
+
+ /*
+ * For delayed items to insert, we track reserved metadata bytes based
+ * on the number of leaves that we will use.
+ * See btrfs_insert_delayed_dir_index() and
+ * btrfs_delayed_item_reserve_metadata()).
+ */
+ ASSERT(first_item->bytes_reserved == 0);
+
+ list_add_tail(&first_item->tree_list, &item_list);
+ batch.total_data_size = first_data_size;
+ batch.nr = 1;
+ total_size = first_data_size + sizeof(struct btrfs_item);
+ curr = first_item;
+
+ while (true) {
+ int next_size;
+
+ next = __btrfs_next_delayed_item(curr);
+ if (!next)
+ break;
+
+ /*
+ * We cannot allow gaps in the key space if we're doing log
+ * replay.
+ */
+ if (continuous_keys_only && (next->index != curr->index + 1))
+ break;
+
+ ASSERT(next->bytes_reserved == 0);
+
+ next_size = next->data_len + sizeof(struct btrfs_item);
+ if (total_size + next_size > max_size)
+ break;
+
+ list_add_tail(&next->tree_list, &item_list);
+ batch.nr++;
+ total_size += next_size;
+ batch.total_data_size += next->data_len;
+ curr = next;
+ }
+
+ if (batch.nr == 1) {
+ first_key.objectid = node->inode_id;
+ first_key.type = BTRFS_DIR_INDEX_KEY;
+ first_key.offset = first_item->index;
+ batch.keys = &first_key;
+ batch.data_sizes = &first_data_size;
+ } else {
+ struct btrfs_key *ins_keys;
+ u32 *ins_sizes;
+ int i = 0;
+
+ ins_data = kmalloc(batch.nr * sizeof(u32) +
+ batch.nr * sizeof(struct btrfs_key), GFP_NOFS);
+ if (!ins_data) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ins_sizes = (u32 *)ins_data;
+ ins_keys = (struct btrfs_key *)(ins_data + batch.nr * sizeof(u32));
+ batch.keys = ins_keys;
+ batch.data_sizes = ins_sizes;
+ list_for_each_entry(curr, &item_list, tree_list) {
+ ins_keys[i].objectid = node->inode_id;
+ ins_keys[i].type = BTRFS_DIR_INDEX_KEY;
+ ins_keys[i].offset = curr->index;
+ ins_sizes[i] = curr->data_len;
+ i++;
+ }
+ }
+
+ ret = btrfs_insert_empty_items(trans, root, path, &batch);
+ if (ret)
+ goto out;
+
+ list_for_each_entry(curr, &item_list, tree_list) {
+ char *data_ptr;
+
+ data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
+ write_extent_buffer(path->nodes[0], &curr->data,
+ (unsigned long)data_ptr, curr->data_len);
+ path->slots[0]++;
+ }
+
+ /*
+ * Now release our path before releasing the delayed items and their
+ * metadata reservations, so that we don't block other tasks for more
+ * time than needed.
+ */
+ btrfs_release_path(path);
+
+ ASSERT(node->index_item_leaves > 0);
+
+ /*
+ * For normal operations we will batch an entire leaf's worth of delayed
+ * items, so if there are more items to process we can decrement
+ * index_item_leaves by 1 as we inserted 1 leaf's worth of items.
+ *
+ * However for log replay we may not have inserted an entire leaf's
+ * worth of items, we may have not had continuous items, so decrementing
+ * here would mess up the index_item_leaves accounting. For this case
+ * only clean up the accounting when there are no items left.
+ */
+ if (next && !continuous_keys_only) {
+ /*
+ * We inserted one batch of items into a leaf a there are more
+ * items to flush in a future batch, now release one unit of
+ * metadata space from the delayed block reserve, corresponding
+ * the leaf we just flushed to.
+ */
+ btrfs_delayed_item_release_leaves(node, 1);
+ node->index_item_leaves--;
+ } else if (!next) {
+ /*
+ * There are no more items to insert. We can have a number of
+ * reserved leaves > 1 here - this happens when many dir index
+ * items are added and then removed before they are flushed (file
+ * names with a very short life, never span a transaction). So
+ * release all remaining leaves.
+ */
+ btrfs_delayed_item_release_leaves(node, node->index_item_leaves);
+ node->index_item_leaves = 0;
+ }
+
+ list_for_each_entry_safe(curr, next, &item_list, tree_list) {
+ list_del(&curr->tree_list);
+ btrfs_release_delayed_item(curr);
+ }
+out:
+ kfree(ins_data);
+ return ret;
+}
+
+static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_root *root,
+ struct btrfs_delayed_node *node)
+{
+ int ret = 0;
+
+ while (ret == 0) {
+ struct btrfs_delayed_item *curr;
+
+ mutex_lock(&node->mutex);
+ curr = __btrfs_first_delayed_insertion_item(node);
+ if (!curr) {
+ mutex_unlock(&node->mutex);
+ break;
+ }
+ ret = btrfs_insert_delayed_item(trans, root, path, curr);
+ mutex_unlock(&node->mutex);
+ }
+
+ return ret;
+}
+
+static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_delayed_item *item)
+{
+ const u64 ino = item->delayed_node->inode_id;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_delayed_item *curr, *next;
+ struct extent_buffer *leaf = path->nodes[0];
+ LIST_HEAD(batch_list);
+ int nitems, slot, last_slot;
+ int ret;
+ u64 total_reserved_size = item->bytes_reserved;
+
+ ASSERT(leaf != NULL);
+
+ slot = path->slots[0];
+ last_slot = btrfs_header_nritems(leaf) - 1;
+ /*
+ * Our caller always gives us a path pointing to an existing item, so
+ * this can not happen.
+ */
+ ASSERT(slot <= last_slot);
+ if (WARN_ON(slot > last_slot))
+ return -ENOENT;
+
+ nitems = 1;
+ curr = item;
+ list_add_tail(&curr->tree_list, &batch_list);
+
+ /*
+ * Keep checking if the next delayed item matches the next item in the
+ * leaf - if so, we can add it to the batch of items to delete from the
+ * leaf.
+ */
+ while (slot < last_slot) {
+ struct btrfs_key key;
+
+ next = __btrfs_next_delayed_item(curr);
+ if (!next)
+ break;
+
+ slot++;
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != ino ||
+ key.type != BTRFS_DIR_INDEX_KEY ||
+ key.offset != next->index)
+ break;
+ nitems++;
+ curr = next;
+ list_add_tail(&curr->tree_list, &batch_list);
+ total_reserved_size += curr->bytes_reserved;
+ }
+
+ ret = btrfs_del_items(trans, root, path, path->slots[0], nitems);
+ if (ret)
+ return ret;
+
+ /* In case of BTRFS_FS_LOG_RECOVERING items won't have reserved space */
+ if (total_reserved_size > 0) {
+ /*
+ * Check btrfs_delayed_item_reserve_metadata() to see why we
+ * don't need to release/reserve qgroup space.
+ */
+ trace_btrfs_space_reservation(fs_info, "delayed_item", ino,
+ total_reserved_size, 0);
+ btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv,
+ total_reserved_size, NULL);
+ }
+
+ list_for_each_entry_safe(curr, next, &batch_list, tree_list) {
+ list_del(&curr->tree_list);
+ btrfs_release_delayed_item(curr);
+ }
+
+ return 0;
+}
+
+static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_root *root,
+ struct btrfs_delayed_node *node)
+{
+ struct btrfs_key key;
+ int ret = 0;
+
+ key.objectid = node->inode_id;
+ key.type = BTRFS_DIR_INDEX_KEY;
+
+ while (ret == 0) {
+ struct btrfs_delayed_item *item;
+
+ mutex_lock(&node->mutex);
+ item = __btrfs_first_delayed_deletion_item(node);
+ if (!item) {
+ mutex_unlock(&node->mutex);
+ break;
+ }
+
+ key.offset = item->index;
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0) {
+ /*
+ * There's no matching item in the leaf. This means we
+ * have already deleted this item in a past run of the
+ * delayed items. We ignore errors when running delayed
+ * items from an async context, through a work queue job
+ * running btrfs_async_run_delayed_root(), and don't
+ * release delayed items that failed to complete. This
+ * is because we will retry later, and at transaction
+ * commit time we always run delayed items and will
+ * then deal with errors if they fail to run again.
+ *
+ * So just release delayed items for which we can't find
+ * an item in the tree, and move to the next item.
+ */
+ btrfs_release_path(path);
+ btrfs_release_delayed_item(item);
+ ret = 0;
+ } else if (ret == 0) {
+ ret = btrfs_batch_delete_items(trans, root, path, item);
+ btrfs_release_path(path);
+ }
+
+ /*
+ * We unlock and relock on each iteration, this is to prevent
+ * blocking other tasks for too long while we are being run from
+ * the async context (work queue job). Those tasks are typically
+ * running system calls like creat/mkdir/rename/unlink/etc which
+ * need to add delayed items to this delayed node.
+ */
+ mutex_unlock(&node->mutex);
+ }
+
+ return ret;
+}
+
+static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
+{
+ struct btrfs_delayed_root *delayed_root;
+
+ if (delayed_node &&
+ test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
+ BUG_ON(!delayed_node->root);
+ clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
+ delayed_node->count--;
+
+ delayed_root = delayed_node->root->fs_info->delayed_root;
+ finish_one_item(delayed_root);
+ }
+}
+
+static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node)
+{
+
+ if (test_and_clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) {
+ struct btrfs_delayed_root *delayed_root;
+
+ ASSERT(delayed_node->root);
+ delayed_node->count--;
+
+ delayed_root = delayed_node->root->fs_info->delayed_root;
+ finish_one_item(delayed_root);
+ }
+}
+
+static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_delayed_node *node)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_key key;
+ struct btrfs_inode_item *inode_item;
+ struct extent_buffer *leaf;
+ int mod;
+ int ret;
+
+ key.objectid = node->inode_id;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
+ mod = -1;
+ else
+ mod = 1;
+
+ ret = btrfs_lookup_inode(trans, root, path, &key, mod);
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret < 0)
+ goto out;
+
+ leaf = path->nodes[0];
+ inode_item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_inode_item);
+ write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item,
+ sizeof(struct btrfs_inode_item));
+ btrfs_mark_buffer_dirty(leaf);
+
+ if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
+ goto out;
+
+ path->slots[0]++;
+ if (path->slots[0] >= btrfs_header_nritems(leaf))
+ goto search;
+again:
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != node->inode_id)
+ goto out;
+
+ if (key.type != BTRFS_INODE_REF_KEY &&
+ key.type != BTRFS_INODE_EXTREF_KEY)
+ goto out;
+
+ /*
+ * Delayed iref deletion is for the inode who has only one link,
+ * so there is only one iref. The case that several irefs are
+ * in the same item doesn't exist.
+ */
+ btrfs_del_item(trans, root, path);
+out:
+ btrfs_release_delayed_iref(node);
+ btrfs_release_path(path);
+err_out:
+ btrfs_delayed_inode_release_metadata(fs_info, node, (ret < 0));
+ btrfs_release_delayed_inode(node);
+
+ /*
+ * If we fail to update the delayed inode we need to abort the
+ * transaction, because we could leave the inode with the improper
+ * counts behind.
+ */
+ if (ret && ret != -ENOENT)
+ btrfs_abort_transaction(trans, ret);
+
+ return ret;
+
+search:
+ btrfs_release_path(path);
+
+ key.type = BTRFS_INODE_EXTREF_KEY;
+ key.offset = -1;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto err_out;
+ ASSERT(ret);
+
+ ret = 0;
+ leaf = path->nodes[0];
+ path->slots[0]--;
+ goto again;
+}
+
+static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_delayed_node *node)
+{
+ int ret;
+
+ mutex_lock(&node->mutex);
+ if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &node->flags)) {
+ mutex_unlock(&node->mutex);
+ return 0;
+ }
+
+ ret = __btrfs_update_delayed_inode(trans, root, path, node);
+ mutex_unlock(&node->mutex);
+ return ret;
+}
+
+static inline int
+__btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_delayed_node *node)
+{
+ int ret;
+
+ ret = btrfs_insert_delayed_items(trans, path, node->root, node);
+ if (ret)
+ return ret;
+
+ ret = btrfs_delete_delayed_items(trans, path, node->root, node);
+ if (ret)
+ return ret;
+
+ ret = btrfs_update_delayed_inode(trans, node->root, path, node);
+ return ret;
+}
+
+/*
+ * Called when committing the transaction.
+ * Returns 0 on success.
+ * Returns < 0 on error and returns with an aborted transaction with any
+ * outstanding delayed items cleaned up.
+ */
+static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_delayed_root *delayed_root;
+ struct btrfs_delayed_node *curr_node, *prev_node;
+ struct btrfs_path *path;
+ struct btrfs_block_rsv *block_rsv;
+ int ret = 0;
+ bool count = (nr > 0);
+
+ if (TRANS_ABORTED(trans))
+ return -EIO;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ block_rsv = trans->block_rsv;
+ trans->block_rsv = &fs_info->delayed_block_rsv;
+
+ delayed_root = fs_info->delayed_root;
+
+ curr_node = btrfs_first_delayed_node(delayed_root);
+ while (curr_node && (!count || nr--)) {
+ ret = __btrfs_commit_inode_delayed_items(trans, path,
+ curr_node);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+
+ prev_node = curr_node;
+ curr_node = btrfs_next_delayed_node(curr_node);
+ /*
+ * See the comment below about releasing path before releasing
+ * node. If the commit of delayed items was successful the path
+ * should always be released, but in case of an error, it may
+ * point to locked extent buffers (a leaf at the very least).
+ */
+ ASSERT(path->nodes[0] == NULL);
+ btrfs_release_delayed_node(prev_node);
+ }
+
+ /*
+ * Release the path to avoid a potential deadlock and lockdep splat when
+ * releasing the delayed node, as that requires taking the delayed node's
+ * mutex. If another task starts running delayed items before we take
+ * the mutex, it will first lock the mutex and then it may try to lock
+ * the same btree path (leaf).
+ */
+ btrfs_free_path(path);
+
+ if (curr_node)
+ btrfs_release_delayed_node(curr_node);
+ trans->block_rsv = block_rsv;
+
+ return ret;
+}
+
+int btrfs_run_delayed_items(struct btrfs_trans_handle *trans)
+{
+ return __btrfs_run_delayed_items(trans, -1);
+}
+
+int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr)
+{
+ return __btrfs_run_delayed_items(trans, nr);
+}
+
+int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode)
+{
+ struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+ struct btrfs_path *path;
+ struct btrfs_block_rsv *block_rsv;
+ int ret;
+
+ if (!delayed_node)
+ return 0;
+
+ mutex_lock(&delayed_node->mutex);
+ if (!delayed_node->count) {
+ mutex_unlock(&delayed_node->mutex);
+ btrfs_release_delayed_node(delayed_node);
+ return 0;
+ }
+ mutex_unlock(&delayed_node->mutex);
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ btrfs_release_delayed_node(delayed_node);
+ return -ENOMEM;
+ }
+
+ block_rsv = trans->block_rsv;
+ trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
+
+ ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
+
+ btrfs_release_delayed_node(delayed_node);
+ btrfs_free_path(path);
+ trans->block_rsv = block_rsv;
+
+ return ret;
+}
+
+int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+ struct btrfs_path *path;
+ struct btrfs_block_rsv *block_rsv;
+ int ret;
+
+ if (!delayed_node)
+ return 0;
+
+ mutex_lock(&delayed_node->mutex);
+ if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
+ mutex_unlock(&delayed_node->mutex);
+ btrfs_release_delayed_node(delayed_node);
+ return 0;
+ }
+ mutex_unlock(&delayed_node->mutex);
+
+ trans = btrfs_join_transaction(delayed_node->root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto trans_out;
+ }
+
+ block_rsv = trans->block_rsv;
+ trans->block_rsv = &fs_info->delayed_block_rsv;
+
+ mutex_lock(&delayed_node->mutex);
+ if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags))
+ ret = __btrfs_update_delayed_inode(trans, delayed_node->root,
+ path, delayed_node);
+ else
+ ret = 0;
+ mutex_unlock(&delayed_node->mutex);
+
+ btrfs_free_path(path);
+ trans->block_rsv = block_rsv;
+trans_out:
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty(fs_info);
+out:
+ btrfs_release_delayed_node(delayed_node);
+
+ return ret;
+}
+
+void btrfs_remove_delayed_node(struct btrfs_inode *inode)
+{
+ struct btrfs_delayed_node *delayed_node;
+
+ delayed_node = READ_ONCE(inode->delayed_node);
+ if (!delayed_node)
+ return;
+
+ inode->delayed_node = NULL;
+ btrfs_release_delayed_node(delayed_node);
+}
+
+struct btrfs_async_delayed_work {
+ struct btrfs_delayed_root *delayed_root;
+ int nr;
+ struct btrfs_work work;
+};
+
+static void btrfs_async_run_delayed_root(struct btrfs_work *work)
+{
+ struct btrfs_async_delayed_work *async_work;
+ struct btrfs_delayed_root *delayed_root;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_path *path;
+ struct btrfs_delayed_node *delayed_node = NULL;
+ struct btrfs_root *root;
+ struct btrfs_block_rsv *block_rsv;
+ int total_done = 0;
+
+ async_work = container_of(work, struct btrfs_async_delayed_work, work);
+ delayed_root = async_work->delayed_root;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ goto out;
+
+ do {
+ if (atomic_read(&delayed_root->items) <
+ BTRFS_DELAYED_BACKGROUND / 2)
+ break;
+
+ delayed_node = btrfs_first_prepared_delayed_node(delayed_root);
+ if (!delayed_node)
+ break;
+
+ root = delayed_node->root;
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ btrfs_release_path(path);
+ btrfs_release_prepared_delayed_node(delayed_node);
+ total_done++;
+ continue;
+ }
+
+ block_rsv = trans->block_rsv;
+ trans->block_rsv = &root->fs_info->delayed_block_rsv;
+
+ __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
+
+ trans->block_rsv = block_rsv;
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty_nodelay(root->fs_info);
+
+ btrfs_release_path(path);
+ btrfs_release_prepared_delayed_node(delayed_node);
+ total_done++;
+
+ } while ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK)
+ || total_done < async_work->nr);
+
+ btrfs_free_path(path);
+out:
+ wake_up(&delayed_root->wait);
+ kfree(async_work);
+}
+
+
+static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
+ struct btrfs_fs_info *fs_info, int nr)
+{
+ struct btrfs_async_delayed_work *async_work;
+
+ async_work = kmalloc(sizeof(*async_work), GFP_NOFS);
+ if (!async_work)
+ return -ENOMEM;
+
+ async_work->delayed_root = delayed_root;
+ btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, NULL,
+ NULL);
+ async_work->nr = nr;
+
+ btrfs_queue_work(fs_info->delayed_workers, &async_work->work);
+ return 0;
+}
+
+void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info)
+{
+ WARN_ON(btrfs_first_delayed_node(fs_info->delayed_root));
+}
+
+static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
+{
+ int val = atomic_read(&delayed_root->items_seq);
+
+ if (val < seq || val >= seq + BTRFS_DELAYED_BATCH)
+ return 1;
+
+ if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
+ return 1;
+
+ return 0;
+}
+
+void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_delayed_root *delayed_root = fs_info->delayed_root;
+
+ if ((atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) ||
+ btrfs_workqueue_normal_congested(fs_info->delayed_workers))
+ return;
+
+ if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) {
+ int seq;
+ int ret;
+
+ seq = atomic_read(&delayed_root->items_seq);
+
+ ret = btrfs_wq_run_delayed_node(delayed_root, fs_info, 0);
+ if (ret)
+ return;
+
+ wait_event_interruptible(delayed_root->wait,
+ could_end_wait(delayed_root, seq));
+ return;
+ }
+
+ btrfs_wq_run_delayed_node(delayed_root, fs_info, BTRFS_DELAYED_BATCH);
+}
+
+static void btrfs_release_dir_index_item_space(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
+
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
+ return;
+
+ /*
+ * Adding the new dir index item does not require touching another
+ * leaf, so we can release 1 unit of metadata that was previously
+ * reserved when starting the transaction. This applies only to
+ * the case where we had a transaction start and excludes the
+ * transaction join case (when replaying log trees).
+ */
+ trace_btrfs_space_reservation(fs_info, "transaction",
+ trans->transid, bytes, 0);
+ btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL);
+ ASSERT(trans->bytes_reserved >= bytes);
+ trans->bytes_reserved -= bytes;
+}
+
+/* Will return 0, -ENOMEM or -EEXIST (index number collision, unexpected). */
+int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
+ const char *name, int name_len,
+ struct btrfs_inode *dir,
+ struct btrfs_disk_key *disk_key, u8 type,
+ u64 index)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ const unsigned int leaf_data_size = BTRFS_LEAF_DATA_SIZE(fs_info);
+ struct btrfs_delayed_node *delayed_node;
+ struct btrfs_delayed_item *delayed_item;
+ struct btrfs_dir_item *dir_item;
+ bool reserve_leaf_space;
+ u32 data_len;
+ int ret;
+
+ delayed_node = btrfs_get_or_create_delayed_node(dir);
+ if (IS_ERR(delayed_node))
+ return PTR_ERR(delayed_node);
+
+ delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len,
+ delayed_node,
+ BTRFS_DELAYED_INSERTION_ITEM);
+ if (!delayed_item) {
+ ret = -ENOMEM;
+ goto release_node;
+ }
+
+ delayed_item->index = index;
+
+ dir_item = (struct btrfs_dir_item *)delayed_item->data;
+ dir_item->location = *disk_key;
+ btrfs_set_stack_dir_transid(dir_item, trans->transid);
+ btrfs_set_stack_dir_data_len(dir_item, 0);
+ btrfs_set_stack_dir_name_len(dir_item, name_len);
+ btrfs_set_stack_dir_type(dir_item, type);
+ memcpy((char *)(dir_item + 1), name, name_len);
+
+ data_len = delayed_item->data_len + sizeof(struct btrfs_item);
+
+ mutex_lock(&delayed_node->mutex);
+
+ /*
+ * First attempt to insert the delayed item. This is to make the error
+ * handling path simpler in case we fail (-EEXIST). There's no risk of
+ * any other task coming in and running the delayed item before we do
+ * the metadata space reservation below, because we are holding the
+ * delayed node's mutex and that mutex must also be locked before the
+ * node's delayed items can be run.
+ */
+ ret = __btrfs_add_delayed_item(delayed_node, delayed_item);
+ if (unlikely(ret)) {
+ btrfs_err(trans->fs_info,
+"error adding delayed dir index item, name: %.*s, index: %llu, root: %llu, dir: %llu, dir->index_cnt: %llu, delayed_node->index_cnt: %llu, error: %d",
+ name_len, name, index, btrfs_root_id(delayed_node->root),
+ delayed_node->inode_id, dir->index_cnt,
+ delayed_node->index_cnt, ret);
+ btrfs_release_delayed_item(delayed_item);
+ btrfs_release_dir_index_item_space(trans);
+ mutex_unlock(&delayed_node->mutex);
+ goto release_node;
+ }
+
+ if (delayed_node->index_item_leaves == 0 ||
+ delayed_node->curr_index_batch_size + data_len > leaf_data_size) {
+ delayed_node->curr_index_batch_size = data_len;
+ reserve_leaf_space = true;
+ } else {
+ delayed_node->curr_index_batch_size += data_len;
+ reserve_leaf_space = false;
+ }
+
+ if (reserve_leaf_space) {
+ ret = btrfs_delayed_item_reserve_metadata(trans, delayed_item);
+ /*
+ * Space was reserved for a dir index item insertion when we
+ * started the transaction, so getting a failure here should be
+ * impossible.
+ */
+ if (WARN_ON(ret)) {
+ btrfs_release_delayed_item(delayed_item);
+ mutex_unlock(&delayed_node->mutex);
+ goto release_node;
+ }
+
+ delayed_node->index_item_leaves++;
+ } else {
+ btrfs_release_dir_index_item_space(trans);
+ }
+ mutex_unlock(&delayed_node->mutex);
+
+release_node:
+ btrfs_release_delayed_node(delayed_node);
+ return ret;
+}
+
+static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_node *node,
+ u64 index)
+{
+ struct btrfs_delayed_item *item;
+
+ mutex_lock(&node->mutex);
+ item = __btrfs_lookup_delayed_item(&node->ins_root.rb_root, index);
+ if (!item) {
+ mutex_unlock(&node->mutex);
+ return 1;
+ }
+
+ /*
+ * For delayed items to insert, we track reserved metadata bytes based
+ * on the number of leaves that we will use.
+ * See btrfs_insert_delayed_dir_index() and
+ * btrfs_delayed_item_reserve_metadata()).
+ */
+ ASSERT(item->bytes_reserved == 0);
+ ASSERT(node->index_item_leaves > 0);
+
+ /*
+ * If there's only one leaf reserved, we can decrement this item from the
+ * current batch, otherwise we can not because we don't know which leaf
+ * it belongs to. With the current limit on delayed items, we rarely
+ * accumulate enough dir index items to fill more than one leaf (even
+ * when using a leaf size of 4K).
+ */
+ if (node->index_item_leaves == 1) {
+ const u32 data_len = item->data_len + sizeof(struct btrfs_item);
+
+ ASSERT(node->curr_index_batch_size >= data_len);
+ node->curr_index_batch_size -= data_len;
+ }
+
+ btrfs_release_delayed_item(item);
+
+ /* If we now have no more dir index items, we can release all leaves. */
+ if (RB_EMPTY_ROOT(&node->ins_root.rb_root)) {
+ btrfs_delayed_item_release_leaves(node, node->index_item_leaves);
+ node->index_item_leaves = 0;
+ }
+
+ mutex_unlock(&node->mutex);
+ return 0;
+}
+
+int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *dir, u64 index)
+{
+ struct btrfs_delayed_node *node;
+ struct btrfs_delayed_item *item;
+ int ret;
+
+ node = btrfs_get_or_create_delayed_node(dir);
+ if (IS_ERR(node))
+ return PTR_ERR(node);
+
+ ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node, index);
+ if (!ret)
+ goto end;
+
+ item = btrfs_alloc_delayed_item(0, node, BTRFS_DELAYED_DELETION_ITEM);
+ if (!item) {
+ ret = -ENOMEM;
+ goto end;
+ }
+
+ item->index = index;
+
+ ret = btrfs_delayed_item_reserve_metadata(trans, item);
+ /*
+ * we have reserved enough space when we start a new transaction,
+ * so reserving metadata failure is impossible.
+ */
+ if (ret < 0) {
+ btrfs_err(trans->fs_info,
+"metadata reservation failed for delayed dir item deltiona, should have been reserved");
+ btrfs_release_delayed_item(item);
+ goto end;
+ }
+
+ mutex_lock(&node->mutex);
+ ret = __btrfs_add_delayed_item(node, item);
+ if (unlikely(ret)) {
+ btrfs_err(trans->fs_info,
+ "err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
+ index, node->root->root_key.objectid,
+ node->inode_id, ret);
+ btrfs_delayed_item_release_metadata(dir->root, item);
+ btrfs_release_delayed_item(item);
+ }
+ mutex_unlock(&node->mutex);
+end:
+ btrfs_release_delayed_node(node);
+ return ret;
+}
+
+int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode)
+{
+ struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+
+ if (!delayed_node)
+ return -ENOENT;
+
+ /*
+ * Since we have held i_mutex of this directory, it is impossible that
+ * a new directory index is added into the delayed node and index_cnt
+ * is updated now. So we needn't lock the delayed node.
+ */
+ if (!delayed_node->index_cnt) {
+ btrfs_release_delayed_node(delayed_node);
+ return -EINVAL;
+ }
+
+ inode->index_cnt = delayed_node->index_cnt;
+ btrfs_release_delayed_node(delayed_node);
+ return 0;
+}
+
+bool btrfs_readdir_get_delayed_items(struct inode *inode,
+ u64 last_index,
+ struct list_head *ins_list,
+ struct list_head *del_list)
+{
+ struct btrfs_delayed_node *delayed_node;
+ struct btrfs_delayed_item *item;
+
+ delayed_node = btrfs_get_delayed_node(BTRFS_I(inode));
+ if (!delayed_node)
+ return false;
+
+ /*
+ * We can only do one readdir with delayed items at a time because of
+ * item->readdir_list.
+ */
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ btrfs_inode_lock(inode, 0);
+
+ mutex_lock(&delayed_node->mutex);
+ item = __btrfs_first_delayed_insertion_item(delayed_node);
+ while (item && item->index <= last_index) {
+ refcount_inc(&item->refs);
+ list_add_tail(&item->readdir_list, ins_list);
+ item = __btrfs_next_delayed_item(item);
+ }
+
+ item = __btrfs_first_delayed_deletion_item(delayed_node);
+ while (item && item->index <= last_index) {
+ refcount_inc(&item->refs);
+ list_add_tail(&item->readdir_list, del_list);
+ item = __btrfs_next_delayed_item(item);
+ }
+ mutex_unlock(&delayed_node->mutex);
+ /*
+ * This delayed node is still cached in the btrfs inode, so refs
+ * must be > 1 now, and we needn't check it is going to be freed
+ * or not.
+ *
+ * Besides that, this function is used to read dir, we do not
+ * insert/delete delayed items in this period. So we also needn't
+ * requeue or dequeue this delayed node.
+ */
+ refcount_dec(&delayed_node->refs);
+
+ return true;
+}
+
+void btrfs_readdir_put_delayed_items(struct inode *inode,
+ struct list_head *ins_list,
+ struct list_head *del_list)
+{
+ struct btrfs_delayed_item *curr, *next;
+
+ list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
+ list_del(&curr->readdir_list);
+ if (refcount_dec_and_test(&curr->refs))
+ kfree(curr);
+ }
+
+ list_for_each_entry_safe(curr, next, del_list, readdir_list) {
+ list_del(&curr->readdir_list);
+ if (refcount_dec_and_test(&curr->refs))
+ kfree(curr);
+ }
+
+ /*
+ * The VFS is going to do up_read(), so we need to downgrade back to a
+ * read lock.
+ */
+ downgrade_write(&inode->i_rwsem);
+}
+
+int btrfs_should_delete_dir_index(struct list_head *del_list,
+ u64 index)
+{
+ struct btrfs_delayed_item *curr;
+ int ret = 0;
+
+ list_for_each_entry(curr, del_list, readdir_list) {
+ if (curr->index > index)
+ break;
+ if (curr->index == index) {
+ ret = 1;
+ break;
+ }
+ }
+ return ret;
+}
+
+/*
+ * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree
+ *
+ */
+int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
+ struct list_head *ins_list)
+{
+ struct btrfs_dir_item *di;
+ struct btrfs_delayed_item *curr, *next;
+ struct btrfs_key location;
+ char *name;
+ int name_len;
+ int over = 0;
+ unsigned char d_type;
+
+ if (list_empty(ins_list))
+ return 0;
+
+ /*
+ * Changing the data of the delayed item is impossible. So
+ * we needn't lock them. And we have held i_mutex of the
+ * directory, nobody can delete any directory indexes now.
+ */
+ list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
+ list_del(&curr->readdir_list);
+
+ if (curr->index < ctx->pos) {
+ if (refcount_dec_and_test(&curr->refs))
+ kfree(curr);
+ continue;
+ }
+
+ ctx->pos = curr->index;
+
+ di = (struct btrfs_dir_item *)curr->data;
+ name = (char *)(di + 1);
+ name_len = btrfs_stack_dir_name_len(di);
+
+ d_type = fs_ftype_to_dtype(di->type);
+ btrfs_disk_key_to_cpu(&location, &di->location);
+
+ over = !dir_emit(ctx, name, name_len,
+ location.objectid, d_type);
+
+ if (refcount_dec_and_test(&curr->refs))
+ kfree(curr);
+
+ if (over)
+ return 1;
+ ctx->pos++;
+ }
+ return 0;
+}
+
+static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
+ struct btrfs_inode_item *inode_item,
+ struct inode *inode)
+{
+ u64 flags;
+
+ btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode));
+ btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode));
+ btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size);
+ btrfs_set_stack_inode_mode(inode_item, inode->i_mode);
+ btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink);
+ btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode));
+ btrfs_set_stack_inode_generation(inode_item,
+ BTRFS_I(inode)->generation);
+ btrfs_set_stack_inode_sequence(inode_item,
+ inode_peek_iversion(inode));
+ btrfs_set_stack_inode_transid(inode_item, trans->transid);
+ btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
+ flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
+ BTRFS_I(inode)->ro_flags);
+ btrfs_set_stack_inode_flags(inode_item, flags);
+ btrfs_set_stack_inode_block_group(inode_item, 0);
+
+ btrfs_set_stack_timespec_sec(&inode_item->atime,
+ inode->i_atime.tv_sec);
+ btrfs_set_stack_timespec_nsec(&inode_item->atime,
+ inode->i_atime.tv_nsec);
+
+ btrfs_set_stack_timespec_sec(&inode_item->mtime,
+ inode->i_mtime.tv_sec);
+ btrfs_set_stack_timespec_nsec(&inode_item->mtime,
+ inode->i_mtime.tv_nsec);
+
+ btrfs_set_stack_timespec_sec(&inode_item->ctime,
+ inode->i_ctime.tv_sec);
+ btrfs_set_stack_timespec_nsec(&inode_item->ctime,
+ inode->i_ctime.tv_nsec);
+
+ btrfs_set_stack_timespec_sec(&inode_item->otime,
+ BTRFS_I(inode)->i_otime.tv_sec);
+ btrfs_set_stack_timespec_nsec(&inode_item->otime,
+ BTRFS_I(inode)->i_otime.tv_nsec);
+}
+
+int btrfs_fill_inode(struct inode *inode, u32 *rdev)
+{
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct btrfs_delayed_node *delayed_node;
+ struct btrfs_inode_item *inode_item;
+
+ delayed_node = btrfs_get_delayed_node(BTRFS_I(inode));
+ if (!delayed_node)
+ return -ENOENT;
+
+ mutex_lock(&delayed_node->mutex);
+ if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
+ mutex_unlock(&delayed_node->mutex);
+ btrfs_release_delayed_node(delayed_node);
+ return -ENOENT;
+ }
+
+ inode_item = &delayed_node->inode_item;
+
+ i_uid_write(inode, btrfs_stack_inode_uid(inode_item));
+ i_gid_write(inode, btrfs_stack_inode_gid(inode_item));
+ btrfs_i_size_write(BTRFS_I(inode), btrfs_stack_inode_size(inode_item));
+ btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
+ round_up(i_size_read(inode), fs_info->sectorsize));
+ inode->i_mode = btrfs_stack_inode_mode(inode_item);
+ set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
+ inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
+ BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
+ BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item);
+
+ inode_set_iversion_queried(inode,
+ btrfs_stack_inode_sequence(inode_item));
+ inode->i_rdev = 0;
+ *rdev = btrfs_stack_inode_rdev(inode_item);
+ btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item),
+ &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
+
+ inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime);
+ inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime);
+
+ inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime);
+ inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime);
+
+ inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(&inode_item->ctime);
+ inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->ctime);
+
+ BTRFS_I(inode)->i_otime.tv_sec =
+ btrfs_stack_timespec_sec(&inode_item->otime);
+ BTRFS_I(inode)->i_otime.tv_nsec =
+ btrfs_stack_timespec_nsec(&inode_item->otime);
+
+ inode->i_generation = BTRFS_I(inode)->generation;
+ BTRFS_I(inode)->index_cnt = (u64)-1;
+
+ mutex_unlock(&delayed_node->mutex);
+ btrfs_release_delayed_node(delayed_node);
+ return 0;
+}
+
+int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_inode *inode)
+{
+ struct btrfs_delayed_node *delayed_node;
+ int ret = 0;
+
+ delayed_node = btrfs_get_or_create_delayed_node(inode);
+ if (IS_ERR(delayed_node))
+ return PTR_ERR(delayed_node);
+
+ mutex_lock(&delayed_node->mutex);
+ if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
+ fill_stack_inode_item(trans, &delayed_node->inode_item,
+ &inode->vfs_inode);
+ goto release_node;
+ }
+
+ ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node);
+ if (ret)
+ goto release_node;
+
+ fill_stack_inode_item(trans, &delayed_node->inode_item, &inode->vfs_inode);
+ set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
+ delayed_node->count++;
+ atomic_inc(&root->fs_info->delayed_root->items);
+release_node:
+ mutex_unlock(&delayed_node->mutex);
+ btrfs_release_delayed_node(delayed_node);
+ return ret;
+}
+
+int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_delayed_node *delayed_node;
+
+ /*
+ * we don't do delayed inode updates during log recovery because it
+ * leads to enospc problems. This means we also can't do
+ * delayed inode refs
+ */
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
+ return -EAGAIN;
+
+ delayed_node = btrfs_get_or_create_delayed_node(inode);
+ if (IS_ERR(delayed_node))
+ return PTR_ERR(delayed_node);
+
+ /*
+ * We don't reserve space for inode ref deletion is because:
+ * - We ONLY do async inode ref deletion for the inode who has only
+ * one link(i_nlink == 1), it means there is only one inode ref.
+ * And in most case, the inode ref and the inode item are in the
+ * same leaf, and we will deal with them at the same time.
+ * Since we are sure we will reserve the space for the inode item,
+ * it is unnecessary to reserve space for inode ref deletion.
+ * - If the inode ref and the inode item are not in the same leaf,
+ * We also needn't worry about enospc problem, because we reserve
+ * much more space for the inode update than it needs.
+ * - At the worst, we can steal some space from the global reservation.
+ * It is very rare.
+ */
+ mutex_lock(&delayed_node->mutex);
+ if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags))
+ goto release_node;
+
+ set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags);
+ delayed_node->count++;
+ atomic_inc(&fs_info->delayed_root->items);
+release_node:
+ mutex_unlock(&delayed_node->mutex);
+ btrfs_release_delayed_node(delayed_node);
+ return 0;
+}
+
+static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
+{
+ struct btrfs_root *root = delayed_node->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_delayed_item *curr_item, *prev_item;
+
+ mutex_lock(&delayed_node->mutex);
+ curr_item = __btrfs_first_delayed_insertion_item(delayed_node);
+ while (curr_item) {
+ prev_item = curr_item;
+ curr_item = __btrfs_next_delayed_item(prev_item);
+ btrfs_release_delayed_item(prev_item);
+ }
+
+ if (delayed_node->index_item_leaves > 0) {
+ btrfs_delayed_item_release_leaves(delayed_node,
+ delayed_node->index_item_leaves);
+ delayed_node->index_item_leaves = 0;
+ }
+
+ curr_item = __btrfs_first_delayed_deletion_item(delayed_node);
+ while (curr_item) {
+ btrfs_delayed_item_release_metadata(root, curr_item);
+ prev_item = curr_item;
+ curr_item = __btrfs_next_delayed_item(prev_item);
+ btrfs_release_delayed_item(prev_item);
+ }
+
+ btrfs_release_delayed_iref(delayed_node);
+
+ if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
+ btrfs_delayed_inode_release_metadata(fs_info, delayed_node, false);
+ btrfs_release_delayed_inode(delayed_node);
+ }
+ mutex_unlock(&delayed_node->mutex);
+}
+
+void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode)
+{
+ struct btrfs_delayed_node *delayed_node;
+
+ delayed_node = btrfs_get_delayed_node(inode);
+ if (!delayed_node)
+ return;
+
+ __btrfs_kill_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node);
+}
+
+void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
+{
+ u64 inode_id = 0;
+ struct btrfs_delayed_node *delayed_nodes[8];
+ int i, n;
+
+ while (1) {
+ spin_lock(&root->inode_lock);
+ n = radix_tree_gang_lookup(&root->delayed_nodes_tree,
+ (void **)delayed_nodes, inode_id,
+ ARRAY_SIZE(delayed_nodes));
+ if (!n) {
+ spin_unlock(&root->inode_lock);
+ break;
+ }
+
+ inode_id = delayed_nodes[n - 1]->inode_id + 1;
+ for (i = 0; i < n; i++) {
+ /*
+ * Don't increase refs in case the node is dead and
+ * about to be removed from the tree in the loop below
+ */
+ if (!refcount_inc_not_zero(&delayed_nodes[i]->refs))
+ delayed_nodes[i] = NULL;
+ }
+ spin_unlock(&root->inode_lock);
+
+ for (i = 0; i < n; i++) {
+ if (!delayed_nodes[i])
+ continue;
+ __btrfs_kill_delayed_node(delayed_nodes[i]);
+ btrfs_release_delayed_node(delayed_nodes[i]);
+ }
+ }
+}
+
+void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_delayed_node *curr_node, *prev_node;
+
+ curr_node = btrfs_first_delayed_node(fs_info->delayed_root);
+ while (curr_node) {
+ __btrfs_kill_delayed_node(curr_node);
+
+ prev_node = curr_node;
+ curr_node = btrfs_next_delayed_node(curr_node);
+ btrfs_release_delayed_node(prev_node);
+ }
+}
+
+void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
+ struct list_head *ins_list,
+ struct list_head *del_list)
+{
+ struct btrfs_delayed_node *node;
+ struct btrfs_delayed_item *item;
+
+ node = btrfs_get_delayed_node(inode);
+ if (!node)
+ return;
+
+ mutex_lock(&node->mutex);
+ item = __btrfs_first_delayed_insertion_item(node);
+ while (item) {
+ /*
+ * It's possible that the item is already in a log list. This
+ * can happen in case two tasks are trying to log the same
+ * directory. For example if we have tasks A and task B:
+ *
+ * Task A collected the delayed items into a log list while
+ * under the inode's log_mutex (at btrfs_log_inode()), but it
+ * only releases the items after logging the inodes they point
+ * to (if they are new inodes), which happens after unlocking
+ * the log mutex;
+ *
+ * Task B enters btrfs_log_inode() and acquires the log_mutex
+ * of the same directory inode, before task B releases the
+ * delayed items. This can happen for example when logging some
+ * inode we need to trigger logging of its parent directory, so
+ * logging two files that have the same parent directory can
+ * lead to this.
+ *
+ * If this happens, just ignore delayed items already in a log
+ * list. All the tasks logging the directory are under a log
+ * transaction and whichever finishes first can not sync the log
+ * before the other completes and leaves the log transaction.
+ */
+ if (!item->logged && list_empty(&item->log_list)) {
+ refcount_inc(&item->refs);
+ list_add_tail(&item->log_list, ins_list);
+ }
+ item = __btrfs_next_delayed_item(item);
+ }
+
+ item = __btrfs_first_delayed_deletion_item(node);
+ while (item) {
+ /* It may be non-empty, for the same reason mentioned above. */
+ if (!item->logged && list_empty(&item->log_list)) {
+ refcount_inc(&item->refs);
+ list_add_tail(&item->log_list, del_list);
+ }
+ item = __btrfs_next_delayed_item(item);
+ }
+ mutex_unlock(&node->mutex);
+
+ /*
+ * We are called during inode logging, which means the inode is in use
+ * and can not be evicted before we finish logging the inode. So we never
+ * have the last reference on the delayed inode.
+ * Also, we don't use btrfs_release_delayed_node() because that would
+ * requeue the delayed inode (change its order in the list of prepared
+ * nodes) and we don't want to do such change because we don't create or
+ * delete delayed items.
+ */
+ ASSERT(refcount_read(&node->refs) > 1);
+ refcount_dec(&node->refs);
+}
+
+void btrfs_log_put_delayed_items(struct btrfs_inode *inode,
+ struct list_head *ins_list,
+ struct list_head *del_list)
+{
+ struct btrfs_delayed_node *node;
+ struct btrfs_delayed_item *item;
+ struct btrfs_delayed_item *next;
+
+ node = btrfs_get_delayed_node(inode);
+ if (!node)
+ return;
+
+ mutex_lock(&node->mutex);
+
+ list_for_each_entry_safe(item, next, ins_list, log_list) {
+ item->logged = true;
+ list_del_init(&item->log_list);
+ if (refcount_dec_and_test(&item->refs))
+ kfree(item);
+ }
+
+ list_for_each_entry_safe(item, next, del_list, log_list) {
+ item->logged = true;
+ list_del_init(&item->log_list);
+ if (refcount_dec_and_test(&item->refs))
+ kfree(item);
+ }
+
+ mutex_unlock(&node->mutex);
+
+ /*
+ * We are called during inode logging, which means the inode is in use
+ * and can not be evicted before we finish logging the inode. So we never
+ * have the last reference on the delayed inode.
+ * Also, we don't use btrfs_release_delayed_node() because that would
+ * requeue the delayed inode (change its order in the list of prepared
+ * nodes) and we don't want to do such change because we don't create or
+ * delete delayed items.
+ */
+ ASSERT(refcount_read(&node->refs) > 1);
+ refcount_dec(&node->refs);
+}
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
new file mode 100644
index 000000000..fc6fae964
--- /dev/null
+++ b/fs/btrfs/delayed-inode.h
@@ -0,0 +1,177 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2011 Fujitsu. All rights reserved.
+ * Written by Miao Xie <miaox@cn.fujitsu.com>
+ */
+
+#ifndef BTRFS_DELAYED_INODE_H
+#define BTRFS_DELAYED_INODE_H
+
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/wait.h>
+#include <linux/atomic.h>
+#include <linux/refcount.h>
+#include "ctree.h"
+
+enum btrfs_delayed_item_type {
+ BTRFS_DELAYED_INSERTION_ITEM,
+ BTRFS_DELAYED_DELETION_ITEM
+};
+
+struct btrfs_delayed_root {
+ spinlock_t lock;
+ struct list_head node_list;
+ /*
+ * Used for delayed nodes which is waiting to be dealt with by the
+ * worker. If the delayed node is inserted into the work queue, we
+ * drop it from this list.
+ */
+ struct list_head prepare_list;
+ atomic_t items; /* for delayed items */
+ atomic_t items_seq; /* for delayed items */
+ int nodes; /* for delayed nodes */
+ wait_queue_head_t wait;
+};
+
+#define BTRFS_DELAYED_NODE_IN_LIST 0
+#define BTRFS_DELAYED_NODE_INODE_DIRTY 1
+#define BTRFS_DELAYED_NODE_DEL_IREF 2
+
+struct btrfs_delayed_node {
+ u64 inode_id;
+ u64 bytes_reserved;
+ struct btrfs_root *root;
+ /* Used to add the node into the delayed root's node list. */
+ struct list_head n_list;
+ /*
+ * Used to add the node into the prepare list, the nodes in this list
+ * is waiting to be dealt with by the async worker.
+ */
+ struct list_head p_list;
+ struct rb_root_cached ins_root;
+ struct rb_root_cached del_root;
+ struct mutex mutex;
+ struct btrfs_inode_item inode_item;
+ refcount_t refs;
+ u64 index_cnt;
+ unsigned long flags;
+ int count;
+ /*
+ * The size of the next batch of dir index items to insert (if this
+ * node is from a directory inode). Protected by @mutex.
+ */
+ u32 curr_index_batch_size;
+ /*
+ * Number of leaves reserved for inserting dir index items (if this
+ * node belongs to a directory inode). This may be larger then the
+ * actual number of leaves we end up using. Protected by @mutex.
+ */
+ u32 index_item_leaves;
+};
+
+struct btrfs_delayed_item {
+ struct rb_node rb_node;
+ /* Offset value of the corresponding dir index key. */
+ u64 index;
+ struct list_head tree_list; /* used for batch insert/delete items */
+ struct list_head readdir_list; /* used for readdir items */
+ /*
+ * Used when logging a directory.
+ * Insertions and deletions to this list are protected by the parent
+ * delayed node's mutex.
+ */
+ struct list_head log_list;
+ u64 bytes_reserved;
+ struct btrfs_delayed_node *delayed_node;
+ refcount_t refs;
+ enum btrfs_delayed_item_type type:8;
+ /*
+ * Track if this delayed item was already logged.
+ * Protected by the mutex of the parent delayed inode.
+ */
+ bool logged;
+ /* The maximum leaf size is 64K, so u16 is more than enough. */
+ u16 data_len;
+ char data[];
+};
+
+static inline void btrfs_init_delayed_root(
+ struct btrfs_delayed_root *delayed_root)
+{
+ atomic_set(&delayed_root->items, 0);
+ atomic_set(&delayed_root->items_seq, 0);
+ delayed_root->nodes = 0;
+ spin_lock_init(&delayed_root->lock);
+ init_waitqueue_head(&delayed_root->wait);
+ INIT_LIST_HEAD(&delayed_root->node_list);
+ INIT_LIST_HEAD(&delayed_root->prepare_list);
+}
+
+int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
+ const char *name, int name_len,
+ struct btrfs_inode *dir,
+ struct btrfs_disk_key *disk_key, u8 type,
+ u64 index);
+
+int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *dir, u64 index);
+
+int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode);
+
+int btrfs_run_delayed_items(struct btrfs_trans_handle *trans);
+int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr);
+
+void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info);
+
+int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode);
+/* Used for evicting the inode. */
+void btrfs_remove_delayed_node(struct btrfs_inode *inode);
+void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode);
+int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode);
+
+
+int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_inode *inode);
+int btrfs_fill_inode(struct inode *inode, u32 *rdev);
+int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode);
+
+/* Used for drop dead root */
+void btrfs_kill_all_delayed_nodes(struct btrfs_root *root);
+
+/* Used for clean the transaction */
+void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info);
+
+/* Used for readdir() */
+bool btrfs_readdir_get_delayed_items(struct inode *inode,
+ u64 last_index,
+ struct list_head *ins_list,
+ struct list_head *del_list);
+void btrfs_readdir_put_delayed_items(struct inode *inode,
+ struct list_head *ins_list,
+ struct list_head *del_list);
+int btrfs_should_delete_dir_index(struct list_head *del_list,
+ u64 index);
+int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
+ struct list_head *ins_list);
+
+/* Used during directory logging. */
+void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
+ struct list_head *ins_list,
+ struct list_head *del_list);
+void btrfs_log_put_delayed_items(struct btrfs_inode *inode,
+ struct list_head *ins_list,
+ struct list_head *del_list);
+
+/* for init */
+int __init btrfs_delayed_inode_init(void);
+void __cold btrfs_delayed_inode_exit(void);
+
+/* for debugging */
+void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info);
+
+#endif
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
new file mode 100644
index 000000000..e08e3852c
--- /dev/null
+++ b/fs/btrfs/delayed-ref.c
@@ -0,0 +1,1171 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2009 Oracle. All rights reserved.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+#include "ctree.h"
+#include "delayed-ref.h"
+#include "transaction.h"
+#include "qgroup.h"
+#include "space-info.h"
+#include "tree-mod-log.h"
+
+struct kmem_cache *btrfs_delayed_ref_head_cachep;
+struct kmem_cache *btrfs_delayed_tree_ref_cachep;
+struct kmem_cache *btrfs_delayed_data_ref_cachep;
+struct kmem_cache *btrfs_delayed_extent_op_cachep;
+/*
+ * delayed back reference update tracking. For subvolume trees
+ * we queue up extent allocations and backref maintenance for
+ * delayed processing. This avoids deep call chains where we
+ * add extents in the middle of btrfs_search_slot, and it allows
+ * us to buffer up frequently modified backrefs in an rb tree instead
+ * of hammering updates on the extent allocation tree.
+ */
+
+bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ bool ret = false;
+ u64 reserved;
+
+ spin_lock(&global_rsv->lock);
+ reserved = global_rsv->reserved;
+ spin_unlock(&global_rsv->lock);
+
+ /*
+ * Since the global reserve is just kind of magic we don't really want
+ * to rely on it to save our bacon, so if our size is more than the
+ * delayed_refs_rsv and the global rsv then it's time to think about
+ * bailing.
+ */
+ spin_lock(&delayed_refs_rsv->lock);
+ reserved += delayed_refs_rsv->reserved;
+ if (delayed_refs_rsv->size >= reserved)
+ ret = true;
+ spin_unlock(&delayed_refs_rsv->lock);
+ return ret;
+}
+
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
+{
+ u64 num_entries =
+ atomic_read(&trans->transaction->delayed_refs.num_entries);
+ u64 avg_runtime;
+ u64 val;
+
+ smp_mb();
+ avg_runtime = trans->fs_info->avg_delayed_ref_runtime;
+ val = num_entries * avg_runtime;
+ if (val >= NSEC_PER_SEC)
+ return 1;
+ if (val >= NSEC_PER_SEC / 2)
+ return 2;
+
+ return btrfs_check_space_for_delayed_refs(trans->fs_info);
+}
+
+/**
+ * Release a ref head's reservation
+ *
+ * @fs_info: the filesystem
+ * @nr: number of items to drop
+ *
+ * This drops the delayed ref head's count from the delayed refs rsv and frees
+ * any excess reservation we had.
+ */
+void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
+{
+ struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
+ u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr);
+ u64 released = 0;
+
+ /*
+ * We have to check the mount option here because we could be enabling
+ * the free space tree for the first time and don't have the compat_ro
+ * option set yet.
+ *
+ * We need extra reservations if we have the free space tree because
+ * we'll have to modify that tree as well.
+ */
+ if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
+ num_bytes *= 2;
+
+ released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
+ if (released)
+ trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
+ 0, released, 0);
+}
+
+/*
+ * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv
+ * @trans - the trans that may have generated delayed refs
+ *
+ * This is to be called anytime we may have adjusted trans->delayed_ref_updates,
+ * it'll calculate the additional size and add it to the delayed_refs_rsv.
+ */
+void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+ u64 num_bytes;
+
+ if (!trans->delayed_ref_updates)
+ return;
+
+ num_bytes = btrfs_calc_insert_metadata_size(fs_info,
+ trans->delayed_ref_updates);
+ /*
+ * We have to check the mount option here because we could be enabling
+ * the free space tree for the first time and don't have the compat_ro
+ * option set yet.
+ *
+ * We need extra reservations if we have the free space tree because
+ * we'll have to modify that tree as well.
+ */
+ if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
+ num_bytes *= 2;
+
+ spin_lock(&delayed_rsv->lock);
+ delayed_rsv->size += num_bytes;
+ delayed_rsv->full = false;
+ spin_unlock(&delayed_rsv->lock);
+ trans->delayed_ref_updates = 0;
+}
+
+/**
+ * Transfer bytes to our delayed refs rsv
+ *
+ * @fs_info: the filesystem
+ * @num_bytes: number of bytes to transfer
+ *
+ * This transfers up to the num_bytes amount, previously reserved, to the
+ * delayed_refs_rsv. Any extra bytes are returned to the space info.
+ */
+void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
+ u64 num_bytes)
+{
+ struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+ u64 to_free = 0;
+
+ spin_lock(&delayed_refs_rsv->lock);
+ if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
+ u64 delta = delayed_refs_rsv->size -
+ delayed_refs_rsv->reserved;
+ if (num_bytes > delta) {
+ to_free = num_bytes - delta;
+ num_bytes = delta;
+ }
+ } else {
+ to_free = num_bytes;
+ num_bytes = 0;
+ }
+
+ if (num_bytes)
+ delayed_refs_rsv->reserved += num_bytes;
+ if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
+ delayed_refs_rsv->full = true;
+ spin_unlock(&delayed_refs_rsv->lock);
+
+ if (num_bytes)
+ trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
+ 0, num_bytes, 1);
+ if (to_free)
+ btrfs_space_info_free_bytes_may_use(fs_info,
+ delayed_refs_rsv->space_info, to_free);
+}
+
+/**
+ * Refill based on our delayed refs usage
+ *
+ * @fs_info: the filesystem
+ * @flush: control how we can flush for this reservation.
+ *
+ * This will refill the delayed block_rsv up to 1 items size worth of space and
+ * will return -ENOSPC if we can't make the reservation.
+ */
+int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
+ enum btrfs_reserve_flush_enum flush)
+{
+ struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
+ u64 limit = btrfs_calc_insert_metadata_size(fs_info, 1);
+ u64 num_bytes = 0;
+ int ret = -ENOSPC;
+
+ spin_lock(&block_rsv->lock);
+ if (block_rsv->reserved < block_rsv->size) {
+ num_bytes = block_rsv->size - block_rsv->reserved;
+ num_bytes = min(num_bytes, limit);
+ }
+ spin_unlock(&block_rsv->lock);
+
+ if (!num_bytes)
+ return 0;
+
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
+ if (ret)
+ return ret;
+ btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0);
+ trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
+ 0, num_bytes, 1);
+ return 0;
+}
+
+/*
+ * compare two delayed tree backrefs with same bytenr and type
+ */
+static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref1,
+ struct btrfs_delayed_tree_ref *ref2)
+{
+ if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
+ if (ref1->root < ref2->root)
+ return -1;
+ if (ref1->root > ref2->root)
+ return 1;
+ } else {
+ if (ref1->parent < ref2->parent)
+ return -1;
+ if (ref1->parent > ref2->parent)
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * compare two delayed data backrefs with same bytenr and type
+ */
+static int comp_data_refs(struct btrfs_delayed_data_ref *ref1,
+ struct btrfs_delayed_data_ref *ref2)
+{
+ if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) {
+ if (ref1->root < ref2->root)
+ return -1;
+ if (ref1->root > ref2->root)
+ return 1;
+ if (ref1->objectid < ref2->objectid)
+ return -1;
+ if (ref1->objectid > ref2->objectid)
+ return 1;
+ if (ref1->offset < ref2->offset)
+ return -1;
+ if (ref1->offset > ref2->offset)
+ return 1;
+ } else {
+ if (ref1->parent < ref2->parent)
+ return -1;
+ if (ref1->parent > ref2->parent)
+ return 1;
+ }
+ return 0;
+}
+
+static int comp_refs(struct btrfs_delayed_ref_node *ref1,
+ struct btrfs_delayed_ref_node *ref2,
+ bool check_seq)
+{
+ int ret = 0;
+
+ if (ref1->type < ref2->type)
+ return -1;
+ if (ref1->type > ref2->type)
+ return 1;
+ if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
+ ref1->type == BTRFS_SHARED_BLOCK_REF_KEY)
+ ret = comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref1),
+ btrfs_delayed_node_to_tree_ref(ref2));
+ else
+ ret = comp_data_refs(btrfs_delayed_node_to_data_ref(ref1),
+ btrfs_delayed_node_to_data_ref(ref2));
+ if (ret)
+ return ret;
+ if (check_seq) {
+ if (ref1->seq < ref2->seq)
+ return -1;
+ if (ref1->seq > ref2->seq)
+ return 1;
+ }
+ return 0;
+}
+
+/* insert a new ref to head ref rbtree */
+static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root,
+ struct rb_node *node)
+{
+ struct rb_node **p = &root->rb_root.rb_node;
+ struct rb_node *parent_node = NULL;
+ struct btrfs_delayed_ref_head *entry;
+ struct btrfs_delayed_ref_head *ins;
+ u64 bytenr;
+ bool leftmost = true;
+
+ ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
+ bytenr = ins->bytenr;
+ while (*p) {
+ parent_node = *p;
+ entry = rb_entry(parent_node, struct btrfs_delayed_ref_head,
+ href_node);
+
+ if (bytenr < entry->bytenr) {
+ p = &(*p)->rb_left;
+ } else if (bytenr > entry->bytenr) {
+ p = &(*p)->rb_right;
+ leftmost = false;
+ } else {
+ return entry;
+ }
+ }
+
+ rb_link_node(node, parent_node, p);
+ rb_insert_color_cached(node, root, leftmost);
+ return NULL;
+}
+
+static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root,
+ struct btrfs_delayed_ref_node *ins)
+{
+ struct rb_node **p = &root->rb_root.rb_node;
+ struct rb_node *node = &ins->ref_node;
+ struct rb_node *parent_node = NULL;
+ struct btrfs_delayed_ref_node *entry;
+ bool leftmost = true;
+
+ while (*p) {
+ int comp;
+
+ parent_node = *p;
+ entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
+ ref_node);
+ comp = comp_refs(ins, entry, true);
+ if (comp < 0) {
+ p = &(*p)->rb_left;
+ } else if (comp > 0) {
+ p = &(*p)->rb_right;
+ leftmost = false;
+ } else {
+ return entry;
+ }
+ }
+
+ rb_link_node(node, parent_node, p);
+ rb_insert_color_cached(node, root, leftmost);
+ return NULL;
+}
+
+static struct btrfs_delayed_ref_head *find_first_ref_head(
+ struct btrfs_delayed_ref_root *dr)
+{
+ struct rb_node *n;
+ struct btrfs_delayed_ref_head *entry;
+
+ n = rb_first_cached(&dr->href_root);
+ if (!n)
+ return NULL;
+
+ entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
+
+ return entry;
+}
+
+/*
+ * Find a head entry based on bytenr. This returns the delayed ref head if it
+ * was able to find one, or NULL if nothing was in that spot. If return_bigger
+ * is given, the next bigger entry is returned if no exact match is found.
+ */
+static struct btrfs_delayed_ref_head *find_ref_head(
+ struct btrfs_delayed_ref_root *dr, u64 bytenr,
+ bool return_bigger)
+{
+ struct rb_root *root = &dr->href_root.rb_root;
+ struct rb_node *n;
+ struct btrfs_delayed_ref_head *entry;
+
+ n = root->rb_node;
+ entry = NULL;
+ while (n) {
+ entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
+
+ if (bytenr < entry->bytenr)
+ n = n->rb_left;
+ else if (bytenr > entry->bytenr)
+ n = n->rb_right;
+ else
+ return entry;
+ }
+ if (entry && return_bigger) {
+ if (bytenr > entry->bytenr) {
+ n = rb_next(&entry->href_node);
+ if (!n)
+ return NULL;
+ entry = rb_entry(n, struct btrfs_delayed_ref_head,
+ href_node);
+ }
+ return entry;
+ }
+ return NULL;
+}
+
+int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head)
+{
+ lockdep_assert_held(&delayed_refs->lock);
+ if (mutex_trylock(&head->mutex))
+ return 0;
+
+ refcount_inc(&head->refs);
+ spin_unlock(&delayed_refs->lock);
+
+ mutex_lock(&head->mutex);
+ spin_lock(&delayed_refs->lock);
+ if (RB_EMPTY_NODE(&head->href_node)) {
+ mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref_head(head);
+ return -EAGAIN;
+ }
+ btrfs_put_delayed_ref_head(head);
+ return 0;
+}
+
+static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head,
+ struct btrfs_delayed_ref_node *ref)
+{
+ lockdep_assert_held(&head->lock);
+ rb_erase_cached(&ref->ref_node, &head->ref_tree);
+ RB_CLEAR_NODE(&ref->ref_node);
+ if (!list_empty(&ref->add_list))
+ list_del(&ref->add_list);
+ ref->in_tree = 0;
+ btrfs_put_delayed_ref(ref);
+ atomic_dec(&delayed_refs->num_entries);
+}
+
+static bool merge_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head,
+ struct btrfs_delayed_ref_node *ref,
+ u64 seq)
+{
+ struct btrfs_delayed_ref_node *next;
+ struct rb_node *node = rb_next(&ref->ref_node);
+ bool done = false;
+
+ while (!done && node) {
+ int mod;
+
+ next = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
+ node = rb_next(node);
+ if (seq && next->seq >= seq)
+ break;
+ if (comp_refs(ref, next, false))
+ break;
+
+ if (ref->action == next->action) {
+ mod = next->ref_mod;
+ } else {
+ if (ref->ref_mod < next->ref_mod) {
+ swap(ref, next);
+ done = true;
+ }
+ mod = -next->ref_mod;
+ }
+
+ drop_delayed_ref(trans, delayed_refs, head, next);
+ ref->ref_mod += mod;
+ if (ref->ref_mod == 0) {
+ drop_delayed_ref(trans, delayed_refs, head, ref);
+ done = true;
+ } else {
+ /*
+ * Can't have multiples of the same ref on a tree block.
+ */
+ WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
+ ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
+ }
+ }
+
+ return done;
+}
+
+void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_delayed_ref_node *ref;
+ struct rb_node *node;
+ u64 seq = 0;
+
+ lockdep_assert_held(&head->lock);
+
+ if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
+ return;
+
+ /* We don't have too many refs to merge for data. */
+ if (head->is_data)
+ return;
+
+ seq = btrfs_tree_mod_log_lowest_seq(fs_info);
+again:
+ for (node = rb_first_cached(&head->ref_tree); node;
+ node = rb_next(node)) {
+ ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
+ if (seq && ref->seq >= seq)
+ continue;
+ if (merge_ref(trans, delayed_refs, head, ref, seq))
+ goto again;
+ }
+}
+
+int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
+{
+ int ret = 0;
+ u64 min_seq = btrfs_tree_mod_log_lowest_seq(fs_info);
+
+ if (min_seq != 0 && seq >= min_seq) {
+ btrfs_debug(fs_info,
+ "holding back delayed_ref %llu, lowest is %llu",
+ seq, min_seq);
+ ret = 1;
+ }
+
+ return ret;
+}
+
+struct btrfs_delayed_ref_head *btrfs_select_ref_head(
+ struct btrfs_delayed_ref_root *delayed_refs)
+{
+ struct btrfs_delayed_ref_head *head;
+
+again:
+ head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start,
+ true);
+ if (!head && delayed_refs->run_delayed_start != 0) {
+ delayed_refs->run_delayed_start = 0;
+ head = find_first_ref_head(delayed_refs);
+ }
+ if (!head)
+ return NULL;
+
+ while (head->processing) {
+ struct rb_node *node;
+
+ node = rb_next(&head->href_node);
+ if (!node) {
+ if (delayed_refs->run_delayed_start == 0)
+ return NULL;
+ delayed_refs->run_delayed_start = 0;
+ goto again;
+ }
+ head = rb_entry(node, struct btrfs_delayed_ref_head,
+ href_node);
+ }
+
+ head->processing = 1;
+ WARN_ON(delayed_refs->num_heads_ready == 0);
+ delayed_refs->num_heads_ready--;
+ delayed_refs->run_delayed_start = head->bytenr +
+ head->num_bytes;
+ return head;
+}
+
+void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head)
+{
+ lockdep_assert_held(&delayed_refs->lock);
+ lockdep_assert_held(&head->lock);
+
+ rb_erase_cached(&head->href_node, &delayed_refs->href_root);
+ RB_CLEAR_NODE(&head->href_node);
+ atomic_dec(&delayed_refs->num_entries);
+ delayed_refs->num_heads--;
+ if (head->processing == 0)
+ delayed_refs->num_heads_ready--;
+}
+
+/*
+ * Helper to insert the ref_node to the tail or merge with tail.
+ *
+ * Return 0 for insert.
+ * Return >0 for merge.
+ */
+static int insert_delayed_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_root *root,
+ struct btrfs_delayed_ref_head *href,
+ struct btrfs_delayed_ref_node *ref)
+{
+ struct btrfs_delayed_ref_node *exist;
+ int mod;
+ int ret = 0;
+
+ spin_lock(&href->lock);
+ exist = tree_insert(&href->ref_tree, ref);
+ if (!exist)
+ goto inserted;
+
+ /* Now we are sure we can merge */
+ ret = 1;
+ if (exist->action == ref->action) {
+ mod = ref->ref_mod;
+ } else {
+ /* Need to change action */
+ if (exist->ref_mod < ref->ref_mod) {
+ exist->action = ref->action;
+ mod = -exist->ref_mod;
+ exist->ref_mod = ref->ref_mod;
+ if (ref->action == BTRFS_ADD_DELAYED_REF)
+ list_add_tail(&exist->add_list,
+ &href->ref_add_list);
+ else if (ref->action == BTRFS_DROP_DELAYED_REF) {
+ ASSERT(!list_empty(&exist->add_list));
+ list_del(&exist->add_list);
+ } else {
+ ASSERT(0);
+ }
+ } else
+ mod = -ref->ref_mod;
+ }
+ exist->ref_mod += mod;
+
+ /* remove existing tail if its ref_mod is zero */
+ if (exist->ref_mod == 0)
+ drop_delayed_ref(trans, root, href, exist);
+ spin_unlock(&href->lock);
+ return ret;
+inserted:
+ if (ref->action == BTRFS_ADD_DELAYED_REF)
+ list_add_tail(&ref->add_list, &href->ref_add_list);
+ atomic_inc(&root->num_entries);
+ spin_unlock(&href->lock);
+ return ret;
+}
+
+/*
+ * helper function to update the accounting in the head ref
+ * existing and update must have the same bytenr
+ */
+static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *existing,
+ struct btrfs_delayed_ref_head *update)
+{
+ struct btrfs_delayed_ref_root *delayed_refs =
+ &trans->transaction->delayed_refs;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ int old_ref_mod;
+
+ BUG_ON(existing->is_data != update->is_data);
+
+ spin_lock(&existing->lock);
+ if (update->must_insert_reserved) {
+ /* if the extent was freed and then
+ * reallocated before the delayed ref
+ * entries were processed, we can end up
+ * with an existing head ref without
+ * the must_insert_reserved flag set.
+ * Set it again here
+ */
+ existing->must_insert_reserved = update->must_insert_reserved;
+
+ /*
+ * update the num_bytes so we make sure the accounting
+ * is done correctly
+ */
+ existing->num_bytes = update->num_bytes;
+
+ }
+
+ if (update->extent_op) {
+ if (!existing->extent_op) {
+ existing->extent_op = update->extent_op;
+ } else {
+ if (update->extent_op->update_key) {
+ memcpy(&existing->extent_op->key,
+ &update->extent_op->key,
+ sizeof(update->extent_op->key));
+ existing->extent_op->update_key = true;
+ }
+ if (update->extent_op->update_flags) {
+ existing->extent_op->flags_to_set |=
+ update->extent_op->flags_to_set;
+ existing->extent_op->update_flags = true;
+ }
+ btrfs_free_delayed_extent_op(update->extent_op);
+ }
+ }
+ /*
+ * update the reference mod on the head to reflect this new operation,
+ * only need the lock for this case cause we could be processing it
+ * currently, for refs we just added we know we're a-ok.
+ */
+ old_ref_mod = existing->total_ref_mod;
+ existing->ref_mod += update->ref_mod;
+ existing->total_ref_mod += update->ref_mod;
+
+ /*
+ * If we are going to from a positive ref mod to a negative or vice
+ * versa we need to make sure to adjust pending_csums accordingly.
+ */
+ if (existing->is_data) {
+ u64 csum_leaves =
+ btrfs_csum_bytes_to_leaves(fs_info,
+ existing->num_bytes);
+
+ if (existing->total_ref_mod >= 0 && old_ref_mod < 0) {
+ delayed_refs->pending_csums -= existing->num_bytes;
+ btrfs_delayed_refs_rsv_release(fs_info, csum_leaves);
+ }
+ if (existing->total_ref_mod < 0 && old_ref_mod >= 0) {
+ delayed_refs->pending_csums += existing->num_bytes;
+ trans->delayed_ref_updates += csum_leaves;
+ }
+ }
+
+ spin_unlock(&existing->lock);
+}
+
+static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
+ struct btrfs_qgroup_extent_record *qrecord,
+ u64 bytenr, u64 num_bytes, u64 ref_root,
+ u64 reserved, int action, bool is_data,
+ bool is_system)
+{
+ int count_mod = 1;
+ int must_insert_reserved = 0;
+
+ /* If reserved is provided, it must be a data extent. */
+ BUG_ON(!is_data && reserved);
+
+ /*
+ * The head node stores the sum of all the mods, so dropping a ref
+ * should drop the sum in the head node by one.
+ */
+ if (action == BTRFS_UPDATE_DELAYED_HEAD)
+ count_mod = 0;
+ else if (action == BTRFS_DROP_DELAYED_REF)
+ count_mod = -1;
+
+ /*
+ * BTRFS_ADD_DELAYED_EXTENT means that we need to update the reserved
+ * accounting when the extent is finally added, or if a later
+ * modification deletes the delayed ref without ever inserting the
+ * extent into the extent allocation tree. ref->must_insert_reserved
+ * is the flag used to record that accounting mods are required.
+ *
+ * Once we record must_insert_reserved, switch the action to
+ * BTRFS_ADD_DELAYED_REF because other special casing is not required.
+ */
+ if (action == BTRFS_ADD_DELAYED_EXTENT)
+ must_insert_reserved = 1;
+ else
+ must_insert_reserved = 0;
+
+ refcount_set(&head_ref->refs, 1);
+ head_ref->bytenr = bytenr;
+ head_ref->num_bytes = num_bytes;
+ head_ref->ref_mod = count_mod;
+ head_ref->must_insert_reserved = must_insert_reserved;
+ head_ref->is_data = is_data;
+ head_ref->is_system = is_system;
+ head_ref->ref_tree = RB_ROOT_CACHED;
+ INIT_LIST_HEAD(&head_ref->ref_add_list);
+ RB_CLEAR_NODE(&head_ref->href_node);
+ head_ref->processing = 0;
+ head_ref->total_ref_mod = count_mod;
+ spin_lock_init(&head_ref->lock);
+ mutex_init(&head_ref->mutex);
+
+ if (qrecord) {
+ if (ref_root && reserved) {
+ qrecord->data_rsv = reserved;
+ qrecord->data_rsv_refroot = ref_root;
+ }
+ qrecord->bytenr = bytenr;
+ qrecord->num_bytes = num_bytes;
+ qrecord->old_roots = NULL;
+ }
+}
+
+/*
+ * helper function to actually insert a head node into the rbtree.
+ * this does all the dirty work in terms of maintaining the correct
+ * overall modification count.
+ */
+static noinline struct btrfs_delayed_ref_head *
+add_delayed_ref_head(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *head_ref,
+ struct btrfs_qgroup_extent_record *qrecord,
+ int action, int *qrecord_inserted_ret)
+{
+ struct btrfs_delayed_ref_head *existing;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ int qrecord_inserted = 0;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+
+ /* Record qgroup extent info if provided */
+ if (qrecord) {
+ if (btrfs_qgroup_trace_extent_nolock(trans->fs_info,
+ delayed_refs, qrecord))
+ kfree(qrecord);
+ else
+ qrecord_inserted = 1;
+ }
+
+ trace_add_delayed_ref_head(trans->fs_info, head_ref, action);
+
+ existing = htree_insert(&delayed_refs->href_root,
+ &head_ref->href_node);
+ if (existing) {
+ update_existing_head_ref(trans, existing, head_ref);
+ /*
+ * we've updated the existing ref, free the newly
+ * allocated ref
+ */
+ kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
+ head_ref = existing;
+ } else {
+ if (head_ref->is_data && head_ref->ref_mod < 0) {
+ delayed_refs->pending_csums += head_ref->num_bytes;
+ trans->delayed_ref_updates +=
+ btrfs_csum_bytes_to_leaves(trans->fs_info,
+ head_ref->num_bytes);
+ }
+ delayed_refs->num_heads++;
+ delayed_refs->num_heads_ready++;
+ atomic_inc(&delayed_refs->num_entries);
+ trans->delayed_ref_updates++;
+ }
+ if (qrecord_inserted_ret)
+ *qrecord_inserted_ret = qrecord_inserted;
+
+ return head_ref;
+}
+
+/*
+ * init_delayed_ref_common - Initialize the structure which represents a
+ * modification to a an extent.
+ *
+ * @fs_info: Internal to the mounted filesystem mount structure.
+ *
+ * @ref: The structure which is going to be initialized.
+ *
+ * @bytenr: The logical address of the extent for which a modification is
+ * going to be recorded.
+ *
+ * @num_bytes: Size of the extent whose modification is being recorded.
+ *
+ * @ref_root: The id of the root where this modification has originated, this
+ * can be either one of the well-known metadata trees or the
+ * subvolume id which references this extent.
+ *
+ * @action: Can be one of BTRFS_ADD_DELAYED_REF/BTRFS_DROP_DELAYED_REF or
+ * BTRFS_ADD_DELAYED_EXTENT
+ *
+ * @ref_type: Holds the type of the extent which is being recorded, can be
+ * one of BTRFS_SHARED_BLOCK_REF_KEY/BTRFS_TREE_BLOCK_REF_KEY
+ * when recording a metadata extent or BTRFS_SHARED_DATA_REF_KEY/
+ * BTRFS_EXTENT_DATA_REF_KEY when recording data extent
+ */
+static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_node *ref,
+ u64 bytenr, u64 num_bytes, u64 ref_root,
+ int action, u8 ref_type)
+{
+ u64 seq = 0;
+
+ if (action == BTRFS_ADD_DELAYED_EXTENT)
+ action = BTRFS_ADD_DELAYED_REF;
+
+ if (is_fstree(ref_root))
+ seq = atomic64_read(&fs_info->tree_mod_seq);
+
+ refcount_set(&ref->refs, 1);
+ ref->bytenr = bytenr;
+ ref->num_bytes = num_bytes;
+ ref->ref_mod = 1;
+ ref->action = action;
+ ref->is_head = 0;
+ ref->in_tree = 1;
+ ref->seq = seq;
+ ref->type = ref_type;
+ RB_CLEAR_NODE(&ref->ref_node);
+ INIT_LIST_HEAD(&ref->add_list);
+}
+
+/*
+ * add a delayed tree ref. This does all of the accounting required
+ * to make sure the delayed ref is eventually processed before this
+ * transaction commits.
+ */
+int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_ref *generic_ref,
+ struct btrfs_delayed_extent_op *extent_op)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_delayed_tree_ref *ref;
+ struct btrfs_delayed_ref_head *head_ref;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_qgroup_extent_record *record = NULL;
+ int qrecord_inserted;
+ bool is_system;
+ int action = generic_ref->action;
+ int level = generic_ref->tree_ref.level;
+ int ret;
+ u64 bytenr = generic_ref->bytenr;
+ u64 num_bytes = generic_ref->len;
+ u64 parent = generic_ref->parent;
+ u8 ref_type;
+
+ is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID);
+
+ ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
+ ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
+ if (!ref)
+ return -ENOMEM;
+
+ head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
+ if (!head_ref) {
+ kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+ return -ENOMEM;
+ }
+
+ if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
+ !generic_ref->skip_qgroup) {
+ record = kzalloc(sizeof(*record), GFP_NOFS);
+ if (!record) {
+ kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+ kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
+ return -ENOMEM;
+ }
+ }
+
+ if (parent)
+ ref_type = BTRFS_SHARED_BLOCK_REF_KEY;
+ else
+ ref_type = BTRFS_TREE_BLOCK_REF_KEY;
+
+ init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
+ generic_ref->tree_ref.owning_root, action,
+ ref_type);
+ ref->root = generic_ref->tree_ref.owning_root;
+ ref->parent = parent;
+ ref->level = level;
+
+ init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
+ generic_ref->tree_ref.owning_root, 0, action,
+ false, is_system);
+ head_ref->extent_op = extent_op;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+
+ /*
+ * insert both the head node and the new ref without dropping
+ * the spin lock
+ */
+ head_ref = add_delayed_ref_head(trans, head_ref, record,
+ action, &qrecord_inserted);
+
+ ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
+ spin_unlock(&delayed_refs->lock);
+
+ /*
+ * Need to update the delayed_refs_rsv with any changes we may have
+ * made.
+ */
+ btrfs_update_delayed_refs_rsv(trans);
+
+ trace_add_delayed_tree_ref(fs_info, &ref->node, ref,
+ action == BTRFS_ADD_DELAYED_EXTENT ?
+ BTRFS_ADD_DELAYED_REF : action);
+ if (ret > 0)
+ kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+
+ if (qrecord_inserted)
+ btrfs_qgroup_trace_extent_post(trans, record);
+
+ return 0;
+}
+
+/*
+ * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
+ */
+int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_ref *generic_ref,
+ u64 reserved)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_delayed_data_ref *ref;
+ struct btrfs_delayed_ref_head *head_ref;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_qgroup_extent_record *record = NULL;
+ int qrecord_inserted;
+ int action = generic_ref->action;
+ int ret;
+ u64 bytenr = generic_ref->bytenr;
+ u64 num_bytes = generic_ref->len;
+ u64 parent = generic_ref->parent;
+ u64 ref_root = generic_ref->data_ref.owning_root;
+ u64 owner = generic_ref->data_ref.ino;
+ u64 offset = generic_ref->data_ref.offset;
+ u8 ref_type;
+
+ ASSERT(generic_ref->type == BTRFS_REF_DATA && action);
+ ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
+ if (!ref)
+ return -ENOMEM;
+
+ if (parent)
+ ref_type = BTRFS_SHARED_DATA_REF_KEY;
+ else
+ ref_type = BTRFS_EXTENT_DATA_REF_KEY;
+ init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
+ ref_root, action, ref_type);
+ ref->root = ref_root;
+ ref->parent = parent;
+ ref->objectid = owner;
+ ref->offset = offset;
+
+
+ head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
+ if (!head_ref) {
+ kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
+ return -ENOMEM;
+ }
+
+ if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
+ !generic_ref->skip_qgroup) {
+ record = kzalloc(sizeof(*record), GFP_NOFS);
+ if (!record) {
+ kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
+ kmem_cache_free(btrfs_delayed_ref_head_cachep,
+ head_ref);
+ return -ENOMEM;
+ }
+ }
+
+ init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root,
+ reserved, action, true, false);
+ head_ref->extent_op = NULL;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+
+ /*
+ * insert both the head node and the new ref without dropping
+ * the spin lock
+ */
+ head_ref = add_delayed_ref_head(trans, head_ref, record,
+ action, &qrecord_inserted);
+
+ ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
+ spin_unlock(&delayed_refs->lock);
+
+ /*
+ * Need to update the delayed_refs_rsv with any changes we may have
+ * made.
+ */
+ btrfs_update_delayed_refs_rsv(trans);
+
+ trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref,
+ action == BTRFS_ADD_DELAYED_EXTENT ?
+ BTRFS_ADD_DELAYED_REF : action);
+ if (ret > 0)
+ kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
+
+
+ if (qrecord_inserted)
+ return btrfs_qgroup_trace_extent_post(trans, record);
+ return 0;
+}
+
+int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes,
+ struct btrfs_delayed_extent_op *extent_op)
+{
+ struct btrfs_delayed_ref_head *head_ref;
+ struct btrfs_delayed_ref_root *delayed_refs;
+
+ head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
+ if (!head_ref)
+ return -ENOMEM;
+
+ init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0,
+ BTRFS_UPDATE_DELAYED_HEAD, false, false);
+ head_ref->extent_op = extent_op;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+
+ add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD,
+ NULL);
+
+ spin_unlock(&delayed_refs->lock);
+
+ /*
+ * Need to update the delayed_refs_rsv with any changes we may have
+ * made.
+ */
+ btrfs_update_delayed_refs_rsv(trans);
+ return 0;
+}
+
+/*
+ * This does a simple search for the head node for a given extent. Returns the
+ * head node if found, or NULL if not.
+ */
+struct btrfs_delayed_ref_head *
+btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr)
+{
+ lockdep_assert_held(&delayed_refs->lock);
+
+ return find_ref_head(delayed_refs, bytenr, false);
+}
+
+void __cold btrfs_delayed_ref_exit(void)
+{
+ kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
+ kmem_cache_destroy(btrfs_delayed_tree_ref_cachep);
+ kmem_cache_destroy(btrfs_delayed_data_ref_cachep);
+ kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
+}
+
+int __init btrfs_delayed_ref_init(void)
+{
+ btrfs_delayed_ref_head_cachep = kmem_cache_create(
+ "btrfs_delayed_ref_head",
+ sizeof(struct btrfs_delayed_ref_head), 0,
+ SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_delayed_ref_head_cachep)
+ goto fail;
+
+ btrfs_delayed_tree_ref_cachep = kmem_cache_create(
+ "btrfs_delayed_tree_ref",
+ sizeof(struct btrfs_delayed_tree_ref), 0,
+ SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_delayed_tree_ref_cachep)
+ goto fail;
+
+ btrfs_delayed_data_ref_cachep = kmem_cache_create(
+ "btrfs_delayed_data_ref",
+ sizeof(struct btrfs_delayed_data_ref), 0,
+ SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_delayed_data_ref_cachep)
+ goto fail;
+
+ btrfs_delayed_extent_op_cachep = kmem_cache_create(
+ "btrfs_delayed_extent_op",
+ sizeof(struct btrfs_delayed_extent_op), 0,
+ SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_delayed_extent_op_cachep)
+ goto fail;
+
+ return 0;
+fail:
+ btrfs_delayed_ref_exit();
+ return -ENOMEM;
+}
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
new file mode 100644
index 000000000..712a6315e
--- /dev/null
+++ b/fs/btrfs/delayed-ref.h
@@ -0,0 +1,405 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ */
+
+#ifndef BTRFS_DELAYED_REF_H
+#define BTRFS_DELAYED_REF_H
+
+#include <linux/refcount.h>
+
+/* these are the possible values of struct btrfs_delayed_ref_node->action */
+#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */
+#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */
+#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
+#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */
+
+struct btrfs_delayed_ref_node {
+ struct rb_node ref_node;
+ /*
+ * If action is BTRFS_ADD_DELAYED_REF, also link this node to
+ * ref_head->ref_add_list, then we do not need to iterate the
+ * whole ref_head->ref_list to find BTRFS_ADD_DELAYED_REF nodes.
+ */
+ struct list_head add_list;
+
+ /* the starting bytenr of the extent */
+ u64 bytenr;
+
+ /* the size of the extent */
+ u64 num_bytes;
+
+ /* seq number to keep track of insertion order */
+ u64 seq;
+
+ /* ref count on this data structure */
+ refcount_t refs;
+
+ /*
+ * how many refs is this entry adding or deleting. For
+ * head refs, this may be a negative number because it is keeping
+ * track of the total mods done to the reference count.
+ * For individual refs, this will always be a positive number
+ *
+ * It may be more than one, since it is possible for a single
+ * parent to have more than one ref on an extent
+ */
+ int ref_mod;
+
+ unsigned int action:8;
+ unsigned int type:8;
+ /* is this node still in the rbtree? */
+ unsigned int is_head:1;
+ unsigned int in_tree:1;
+};
+
+struct btrfs_delayed_extent_op {
+ struct btrfs_disk_key key;
+ u8 level;
+ bool update_key;
+ bool update_flags;
+ u64 flags_to_set;
+};
+
+/*
+ * the head refs are used to hold a lock on a given extent, which allows us
+ * to make sure that only one process is running the delayed refs
+ * at a time for a single extent. They also store the sum of all the
+ * reference count modifications we've queued up.
+ */
+struct btrfs_delayed_ref_head {
+ u64 bytenr;
+ u64 num_bytes;
+ refcount_t refs;
+ /*
+ * the mutex is held while running the refs, and it is also
+ * held when checking the sum of reference modifications.
+ */
+ struct mutex mutex;
+
+ spinlock_t lock;
+ struct rb_root_cached ref_tree;
+ /* accumulate add BTRFS_ADD_DELAYED_REF nodes to this ref_add_list. */
+ struct list_head ref_add_list;
+
+ struct rb_node href_node;
+
+ struct btrfs_delayed_extent_op *extent_op;
+
+ /*
+ * This is used to track the final ref_mod from all the refs associated
+ * with this head ref, this is not adjusted as delayed refs are run,
+ * this is meant to track if we need to do the csum accounting or not.
+ */
+ int total_ref_mod;
+
+ /*
+ * This is the current outstanding mod references for this bytenr. This
+ * is used with lookup_extent_info to get an accurate reference count
+ * for a bytenr, so it is adjusted as delayed refs are run so that any
+ * on disk reference count + ref_mod is accurate.
+ */
+ int ref_mod;
+
+ /*
+ * when a new extent is allocated, it is just reserved in memory
+ * The actual extent isn't inserted into the extent allocation tree
+ * until the delayed ref is processed. must_insert_reserved is
+ * used to flag a delayed ref so the accounting can be updated
+ * when a full insert is done.
+ *
+ * It is possible the extent will be freed before it is ever
+ * inserted into the extent allocation tree. In this case
+ * we need to update the in ram accounting to properly reflect
+ * the free has happened.
+ */
+ unsigned int must_insert_reserved:1;
+ unsigned int is_data:1;
+ unsigned int is_system:1;
+ unsigned int processing:1;
+};
+
+struct btrfs_delayed_tree_ref {
+ struct btrfs_delayed_ref_node node;
+ u64 root;
+ u64 parent;
+ int level;
+};
+
+struct btrfs_delayed_data_ref {
+ struct btrfs_delayed_ref_node node;
+ u64 root;
+ u64 parent;
+ u64 objectid;
+ u64 offset;
+};
+
+enum btrfs_delayed_ref_flags {
+ /* Indicate that we are flushing delayed refs for the commit */
+ BTRFS_DELAYED_REFS_FLUSHING,
+};
+
+struct btrfs_delayed_ref_root {
+ /* head ref rbtree */
+ struct rb_root_cached href_root;
+
+ /* dirty extent records */
+ struct rb_root dirty_extent_root;
+
+ /* this spin lock protects the rbtree and the entries inside */
+ spinlock_t lock;
+
+ /* how many delayed ref updates we've queued, used by the
+ * throttling code
+ */
+ atomic_t num_entries;
+
+ /* total number of head nodes in tree */
+ unsigned long num_heads;
+
+ /* total number of head nodes ready for processing */
+ unsigned long num_heads_ready;
+
+ u64 pending_csums;
+
+ unsigned long flags;
+
+ u64 run_delayed_start;
+
+ /*
+ * To make qgroup to skip given root.
+ * This is for snapshot, as btrfs_qgroup_inherit() will manually
+ * modify counters for snapshot and its source, so we should skip
+ * the snapshot in new_root/old_roots or it will get calculated twice
+ */
+ u64 qgroup_to_skip;
+};
+
+enum btrfs_ref_type {
+ BTRFS_REF_NOT_SET,
+ BTRFS_REF_DATA,
+ BTRFS_REF_METADATA,
+ BTRFS_REF_LAST,
+};
+
+struct btrfs_data_ref {
+ /* For EXTENT_DATA_REF */
+
+ /* Original root this data extent belongs to */
+ u64 owning_root;
+
+ /* Inode which refers to this data extent */
+ u64 ino;
+
+ /*
+ * file_offset - extent_offset
+ *
+ * file_offset is the key.offset of the EXTENT_DATA key.
+ * extent_offset is btrfs_file_extent_offset() of the EXTENT_DATA data.
+ */
+ u64 offset;
+};
+
+struct btrfs_tree_ref {
+ /*
+ * Level of this tree block
+ *
+ * Shared for skinny (TREE_BLOCK_REF) and normal tree ref.
+ */
+ int level;
+
+ /*
+ * Root which owns this tree block.
+ *
+ * For TREE_BLOCK_REF (skinny metadata, either inline or keyed)
+ */
+ u64 owning_root;
+
+ /* For non-skinny metadata, no special member needed */
+};
+
+struct btrfs_ref {
+ enum btrfs_ref_type type;
+ int action;
+
+ /*
+ * Whether this extent should go through qgroup record.
+ *
+ * Normally false, but for certain cases like delayed subtree scan,
+ * setting this flag can hugely reduce qgroup overhead.
+ */
+ bool skip_qgroup;
+
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ /* Through which root is this modification. */
+ u64 real_root;
+#endif
+ u64 bytenr;
+ u64 len;
+
+ /* Bytenr of the parent tree block */
+ u64 parent;
+ union {
+ struct btrfs_data_ref data_ref;
+ struct btrfs_tree_ref tree_ref;
+ };
+};
+
+extern struct kmem_cache *btrfs_delayed_ref_head_cachep;
+extern struct kmem_cache *btrfs_delayed_tree_ref_cachep;
+extern struct kmem_cache *btrfs_delayed_data_ref_cachep;
+extern struct kmem_cache *btrfs_delayed_extent_op_cachep;
+
+int __init btrfs_delayed_ref_init(void);
+void __cold btrfs_delayed_ref_exit(void);
+
+static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref,
+ int action, u64 bytenr, u64 len, u64 parent)
+{
+ generic_ref->action = action;
+ generic_ref->bytenr = bytenr;
+ generic_ref->len = len;
+ generic_ref->parent = parent;
+}
+
+static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref,
+ int level, u64 root, u64 mod_root, bool skip_qgroup)
+{
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ /* If @real_root not set, use @root as fallback */
+ generic_ref->real_root = mod_root ?: root;
+#endif
+ generic_ref->tree_ref.level = level;
+ generic_ref->tree_ref.owning_root = root;
+ generic_ref->type = BTRFS_REF_METADATA;
+ if (skip_qgroup || !(is_fstree(root) &&
+ (!mod_root || is_fstree(mod_root))))
+ generic_ref->skip_qgroup = true;
+ else
+ generic_ref->skip_qgroup = false;
+
+}
+
+static inline void btrfs_init_data_ref(struct btrfs_ref *generic_ref,
+ u64 ref_root, u64 ino, u64 offset, u64 mod_root,
+ bool skip_qgroup)
+{
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ /* If @real_root not set, use @root as fallback */
+ generic_ref->real_root = mod_root ?: ref_root;
+#endif
+ generic_ref->data_ref.owning_root = ref_root;
+ generic_ref->data_ref.ino = ino;
+ generic_ref->data_ref.offset = offset;
+ generic_ref->type = BTRFS_REF_DATA;
+ if (skip_qgroup || !(is_fstree(ref_root) &&
+ (!mod_root || is_fstree(mod_root))))
+ generic_ref->skip_qgroup = true;
+ else
+ generic_ref->skip_qgroup = false;
+}
+
+static inline struct btrfs_delayed_extent_op *
+btrfs_alloc_delayed_extent_op(void)
+{
+ return kmem_cache_alloc(btrfs_delayed_extent_op_cachep, GFP_NOFS);
+}
+
+static inline void
+btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op)
+{
+ if (op)
+ kmem_cache_free(btrfs_delayed_extent_op_cachep, op);
+}
+
+static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
+{
+ WARN_ON(refcount_read(&ref->refs) == 0);
+ if (refcount_dec_and_test(&ref->refs)) {
+ WARN_ON(ref->in_tree);
+ switch (ref->type) {
+ case BTRFS_TREE_BLOCK_REF_KEY:
+ case BTRFS_SHARED_BLOCK_REF_KEY:
+ kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+ break;
+ case BTRFS_EXTENT_DATA_REF_KEY:
+ case BTRFS_SHARED_DATA_REF_KEY:
+ kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
+ break;
+ default:
+ BUG();
+ }
+ }
+}
+
+static inline u64 btrfs_ref_head_to_space_flags(
+ struct btrfs_delayed_ref_head *head_ref)
+{
+ if (head_ref->is_data)
+ return BTRFS_BLOCK_GROUP_DATA;
+ else if (head_ref->is_system)
+ return BTRFS_BLOCK_GROUP_SYSTEM;
+ return BTRFS_BLOCK_GROUP_METADATA;
+}
+
+static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *head)
+{
+ if (refcount_dec_and_test(&head->refs))
+ kmem_cache_free(btrfs_delayed_ref_head_cachep, head);
+}
+
+int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_ref *generic_ref,
+ struct btrfs_delayed_extent_op *extent_op);
+int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_ref *generic_ref,
+ u64 reserved);
+int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes,
+ struct btrfs_delayed_extent_op *extent_op);
+void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head);
+
+struct btrfs_delayed_ref_head *
+btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+ u64 bytenr);
+int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head);
+static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
+{
+ mutex_unlock(&head->mutex);
+}
+void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head);
+
+struct btrfs_delayed_ref_head *btrfs_select_ref_head(
+ struct btrfs_delayed_ref_root *delayed_refs);
+
+int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq);
+
+void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr);
+void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans);
+int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
+ enum btrfs_reserve_flush_enum flush);
+void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
+ u64 num_bytes);
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans);
+bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);
+
+/*
+ * helper functions to cast a node into its container
+ */
+static inline struct btrfs_delayed_tree_ref *
+btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node)
+{
+ return container_of(node, struct btrfs_delayed_tree_ref, node);
+}
+
+static inline struct btrfs_delayed_data_ref *
+btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node)
+{
+ return container_of(node, struct btrfs_delayed_data_ref, node);
+}
+
+#endif
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
new file mode 100644
index 000000000..61e58066b
--- /dev/null
+++ b/fs/btrfs/dev-replace.c
@@ -0,0 +1,1305 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) STRATO AG 2012. All rights reserved.
+ */
+
+#include <linux/sched.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/kthread.h>
+#include <linux/math64.h>
+#include "misc.h"
+#include "ctree.h"
+#include "extent_map.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "async-thread.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+#include "dev-replace.h"
+#include "sysfs.h"
+#include "zoned.h"
+#include "block-group.h"
+
+/*
+ * Device replace overview
+ *
+ * [Objective]
+ * To copy all extents (both new and on-disk) from source device to target
+ * device, while still keeping the filesystem read-write.
+ *
+ * [Method]
+ * There are two main methods involved:
+ *
+ * - Write duplication
+ *
+ * All new writes will be written to both target and source devices, so even
+ * if replace gets canceled, sources device still contains up-to-date data.
+ *
+ * Location: handle_ops_on_dev_replace() from __btrfs_map_block()
+ * Start: btrfs_dev_replace_start()
+ * End: btrfs_dev_replace_finishing()
+ * Content: Latest data/metadata
+ *
+ * - Copy existing extents
+ *
+ * This happens by re-using scrub facility, as scrub also iterates through
+ * existing extents from commit root.
+ *
+ * Location: scrub_write_block_to_dev_replace() from
+ * scrub_block_complete()
+ * Content: Data/meta from commit root.
+ *
+ * Due to the content difference, we need to avoid nocow write when dev-replace
+ * is happening. This is done by marking the block group read-only and waiting
+ * for NOCOW writes.
+ *
+ * After replace is done, the finishing part is done by swapping the target and
+ * source devices.
+ *
+ * Location: btrfs_dev_replace_update_device_in_mapping_tree() from
+ * btrfs_dev_replace_finishing()
+ */
+
+static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
+ int scrub_ret);
+static int btrfs_dev_replace_kthread(void *data);
+
+int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_dev_lookup_args args = { .devid = BTRFS_DEV_REPLACE_DEVID };
+ struct btrfs_key key;
+ struct btrfs_root *dev_root = fs_info->dev_root;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct extent_buffer *eb;
+ int slot;
+ int ret = 0;
+ struct btrfs_path *path = NULL;
+ int item_size;
+ struct btrfs_dev_replace_item *ptr;
+ u64 src_devid;
+
+ if (!dev_root)
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ key.objectid = 0;
+ key.type = BTRFS_DEV_REPLACE_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
+ if (ret) {
+no_valid_dev_replace_entry_found:
+ /*
+ * We don't have a replace item or it's corrupted. If there is
+ * a replace target, fail the mount.
+ */
+ if (btrfs_find_device(fs_info->fs_devices, &args)) {
+ btrfs_err(fs_info,
+ "found replace target device without a valid replace item");
+ ret = -EUCLEAN;
+ goto out;
+ }
+ ret = 0;
+ dev_replace->replace_state =
+ BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
+ dev_replace->cont_reading_from_srcdev_mode =
+ BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
+ dev_replace->time_started = 0;
+ dev_replace->time_stopped = 0;
+ atomic64_set(&dev_replace->num_write_errors, 0);
+ atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
+ dev_replace->cursor_left = 0;
+ dev_replace->committed_cursor_left = 0;
+ dev_replace->cursor_left_last_write_of_item = 0;
+ dev_replace->cursor_right = 0;
+ dev_replace->srcdev = NULL;
+ dev_replace->tgtdev = NULL;
+ dev_replace->is_valid = 0;
+ dev_replace->item_needs_writeback = 0;
+ goto out;
+ }
+ slot = path->slots[0];
+ eb = path->nodes[0];
+ item_size = btrfs_item_size(eb, slot);
+ ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
+
+ if (item_size != sizeof(struct btrfs_dev_replace_item)) {
+ btrfs_warn(fs_info,
+ "dev_replace entry found has unexpected size, ignore entry");
+ goto no_valid_dev_replace_entry_found;
+ }
+
+ src_devid = btrfs_dev_replace_src_devid(eb, ptr);
+ dev_replace->cont_reading_from_srcdev_mode =
+ btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
+ dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
+ dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
+ dev_replace->time_stopped =
+ btrfs_dev_replace_time_stopped(eb, ptr);
+ atomic64_set(&dev_replace->num_write_errors,
+ btrfs_dev_replace_num_write_errors(eb, ptr));
+ atomic64_set(&dev_replace->num_uncorrectable_read_errors,
+ btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
+ dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
+ dev_replace->committed_cursor_left = dev_replace->cursor_left;
+ dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
+ dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
+ dev_replace->is_valid = 1;
+
+ dev_replace->item_needs_writeback = 0;
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ /*
+ * We don't have an active replace item but if there is a
+ * replace target, fail the mount.
+ */
+ if (btrfs_find_device(fs_info->fs_devices, &args)) {
+ btrfs_err(fs_info,
+"replace without active item, run 'device scan --forget' on the target device");
+ ret = -EUCLEAN;
+ } else {
+ dev_replace->srcdev = NULL;
+ dev_replace->tgtdev = NULL;
+ }
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices, &args);
+ args.devid = src_devid;
+ dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices, &args);
+
+ /*
+ * allow 'btrfs dev replace_cancel' if src/tgt device is
+ * missing
+ */
+ if (!dev_replace->srcdev &&
+ !btrfs_test_opt(fs_info, DEGRADED)) {
+ ret = -EIO;
+ btrfs_warn(fs_info,
+ "cannot mount because device replace operation is ongoing and");
+ btrfs_warn(fs_info,
+ "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
+ src_devid);
+ }
+ if (!dev_replace->tgtdev &&
+ !btrfs_test_opt(fs_info, DEGRADED)) {
+ ret = -EIO;
+ btrfs_warn(fs_info,
+ "cannot mount because device replace operation is ongoing and");
+ btrfs_warn(fs_info,
+ "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
+ BTRFS_DEV_REPLACE_DEVID);
+ }
+ if (dev_replace->tgtdev) {
+ if (dev_replace->srcdev) {
+ dev_replace->tgtdev->total_bytes =
+ dev_replace->srcdev->total_bytes;
+ dev_replace->tgtdev->disk_total_bytes =
+ dev_replace->srcdev->disk_total_bytes;
+ dev_replace->tgtdev->commit_total_bytes =
+ dev_replace->srcdev->commit_total_bytes;
+ dev_replace->tgtdev->bytes_used =
+ dev_replace->srcdev->bytes_used;
+ dev_replace->tgtdev->commit_bytes_used =
+ dev_replace->srcdev->commit_bytes_used;
+ }
+ set_bit(BTRFS_DEV_STATE_REPLACE_TGT,
+ &dev_replace->tgtdev->dev_state);
+
+ WARN_ON(fs_info->fs_devices->rw_devices == 0);
+ dev_replace->tgtdev->io_width = fs_info->sectorsize;
+ dev_replace->tgtdev->io_align = fs_info->sectorsize;
+ dev_replace->tgtdev->sector_size = fs_info->sectorsize;
+ dev_replace->tgtdev->fs_info = fs_info;
+ set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+ &dev_replace->tgtdev->dev_state);
+ }
+ break;
+ }
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Initialize a new device for device replace target from a given source dev
+ * and path.
+ *
+ * Return 0 and new device in @device_out, otherwise return < 0
+ */
+static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
+ const char *device_path,
+ struct btrfs_device *srcdev,
+ struct btrfs_device **device_out)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
+ struct block_device *bdev;
+ struct rcu_string *name;
+ u64 devid = BTRFS_DEV_REPLACE_DEVID;
+ int ret = 0;
+
+ *device_out = NULL;
+ if (srcdev->fs_devices->seeding) {
+ btrfs_err(fs_info, "the filesystem is a seed filesystem!");
+ return -EINVAL;
+ }
+
+ bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
+ fs_info->bdev_holder);
+ if (IS_ERR(bdev)) {
+ btrfs_err(fs_info, "target device %s is invalid!", device_path);
+ return PTR_ERR(bdev);
+ }
+
+ if (!btrfs_check_device_zone_type(fs_info, bdev)) {
+ btrfs_err(fs_info,
+ "dev-replace: zoned type of target device mismatch with filesystem");
+ ret = -EINVAL;
+ goto error;
+ }
+
+ sync_blockdev(bdev);
+
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (device->bdev == bdev) {
+ btrfs_err(fs_info,
+ "target device is in the filesystem!");
+ ret = -EEXIST;
+ goto error;
+ }
+ }
+
+
+ if (bdev_nr_bytes(bdev) < btrfs_device_get_total_bytes(srcdev)) {
+ btrfs_err(fs_info,
+ "target device is smaller than source device!");
+ ret = -EINVAL;
+ goto error;
+ }
+
+
+ device = btrfs_alloc_device(NULL, &devid, NULL);
+ if (IS_ERR(device)) {
+ ret = PTR_ERR(device);
+ goto error;
+ }
+
+ name = rcu_string_strdup(device_path, GFP_KERNEL);
+ if (!name) {
+ btrfs_free_device(device);
+ ret = -ENOMEM;
+ goto error;
+ }
+ rcu_assign_pointer(device->name, name);
+ ret = lookup_bdev(device_path, &device->devt);
+ if (ret)
+ goto error;
+
+ set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+ device->generation = 0;
+ device->io_width = fs_info->sectorsize;
+ device->io_align = fs_info->sectorsize;
+ device->sector_size = fs_info->sectorsize;
+ device->total_bytes = btrfs_device_get_total_bytes(srcdev);
+ device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
+ device->bytes_used = btrfs_device_get_bytes_used(srcdev);
+ device->commit_total_bytes = srcdev->commit_total_bytes;
+ device->commit_bytes_used = device->bytes_used;
+ device->fs_info = fs_info;
+ device->bdev = bdev;
+ set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
+ set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
+ device->mode = FMODE_EXCL;
+ device->dev_stats_valid = 1;
+ set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
+ device->fs_devices = fs_devices;
+
+ ret = btrfs_get_dev_zone_info(device, false);
+ if (ret)
+ goto error;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_add(&device->dev_list, &fs_devices->devices);
+ fs_devices->num_devices++;
+ fs_devices->open_devices++;
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ *device_out = device;
+ return 0;
+
+error:
+ blkdev_put(bdev, FMODE_EXCL);
+ return ret;
+}
+
+/*
+ * called from commit_transaction. Writes changed device replace state to
+ * disk.
+ */
+int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ int ret;
+ struct btrfs_root *dev_root = fs_info->dev_root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *eb;
+ struct btrfs_dev_replace_item *ptr;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+ down_read(&dev_replace->rwsem);
+ if (!dev_replace->is_valid ||
+ !dev_replace->item_needs_writeback) {
+ up_read(&dev_replace->rwsem);
+ return 0;
+ }
+ up_read(&dev_replace->rwsem);
+
+ key.objectid = 0;
+ key.type = BTRFS_DEV_REPLACE_KEY;
+ key.offset = 0;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
+ if (ret < 0) {
+ btrfs_warn(fs_info,
+ "error %d while searching for dev_replace item!",
+ ret);
+ goto out;
+ }
+
+ if (ret == 0 &&
+ btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
+ /*
+ * need to delete old one and insert a new one.
+ * Since no attempt is made to recover any old state, if the
+ * dev_replace state is 'running', the data on the target
+ * drive is lost.
+ * It would be possible to recover the state: just make sure
+ * that the beginning of the item is never changed and always
+ * contains all the essential information. Then read this
+ * minimal set of information and use it as a base for the
+ * new state.
+ */
+ ret = btrfs_del_item(trans, dev_root, path);
+ if (ret != 0) {
+ btrfs_warn(fs_info,
+ "delete too small dev_replace item failed %d!",
+ ret);
+ goto out;
+ }
+ ret = 1;
+ }
+
+ if (ret == 1) {
+ /* need to insert a new item */
+ btrfs_release_path(path);
+ ret = btrfs_insert_empty_item(trans, dev_root, path,
+ &key, sizeof(*ptr));
+ if (ret < 0) {
+ btrfs_warn(fs_info,
+ "insert dev_replace item failed %d!", ret);
+ goto out;
+ }
+ }
+
+ eb = path->nodes[0];
+ ptr = btrfs_item_ptr(eb, path->slots[0],
+ struct btrfs_dev_replace_item);
+
+ down_write(&dev_replace->rwsem);
+ if (dev_replace->srcdev)
+ btrfs_set_dev_replace_src_devid(eb, ptr,
+ dev_replace->srcdev->devid);
+ else
+ btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
+ btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
+ dev_replace->cont_reading_from_srcdev_mode);
+ btrfs_set_dev_replace_replace_state(eb, ptr,
+ dev_replace->replace_state);
+ btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
+ btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
+ btrfs_set_dev_replace_num_write_errors(eb, ptr,
+ atomic64_read(&dev_replace->num_write_errors));
+ btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
+ atomic64_read(&dev_replace->num_uncorrectable_read_errors));
+ dev_replace->cursor_left_last_write_of_item =
+ dev_replace->cursor_left;
+ btrfs_set_dev_replace_cursor_left(eb, ptr,
+ dev_replace->cursor_left_last_write_of_item);
+ btrfs_set_dev_replace_cursor_right(eb, ptr,
+ dev_replace->cursor_right);
+ dev_replace->item_needs_writeback = 0;
+ up_write(&dev_replace->rwsem);
+
+ btrfs_mark_buffer_dirty(eb);
+
+out:
+ btrfs_free_path(path);
+
+ return ret;
+}
+
+static char* btrfs_dev_name(struct btrfs_device *device)
+{
+ if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+ return "<missing disk>";
+ else
+ return rcu_str_deref(device->name);
+}
+
+static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *src_dev)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_root *root = fs_info->dev_root;
+ struct btrfs_dev_extent *dev_extent = NULL;
+ struct btrfs_block_group *cache;
+ struct btrfs_trans_handle *trans;
+ int iter_ret = 0;
+ int ret = 0;
+ u64 chunk_offset;
+
+ /* Do not use "to_copy" on non zoned filesystem for now */
+ if (!btrfs_is_zoned(fs_info))
+ return 0;
+
+ mutex_lock(&fs_info->chunk_mutex);
+
+ /* Ensure we don't have pending new block group */
+ spin_lock(&fs_info->trans_lock);
+ while (fs_info->running_transaction &&
+ !list_empty(&fs_info->running_transaction->dev_update_list)) {
+ spin_unlock(&fs_info->trans_lock);
+ mutex_unlock(&fs_info->chunk_mutex);
+ trans = btrfs_attach_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ mutex_lock(&fs_info->chunk_mutex);
+ if (ret == -ENOENT) {
+ spin_lock(&fs_info->trans_lock);
+ continue;
+ } else {
+ goto unlock;
+ }
+ }
+
+ ret = btrfs_commit_transaction(trans);
+ mutex_lock(&fs_info->chunk_mutex);
+ if (ret)
+ goto unlock;
+
+ spin_lock(&fs_info->trans_lock);
+ }
+ spin_unlock(&fs_info->trans_lock);
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ path->reada = READA_FORWARD;
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+
+ key.objectid = src_dev->devid;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+ key.offset = 0;
+
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
+ struct extent_buffer *leaf = path->nodes[0];
+
+ if (found_key.objectid != src_dev->devid)
+ break;
+
+ if (found_key.type != BTRFS_DEV_EXTENT_KEY)
+ break;
+
+ if (found_key.offset < key.offset)
+ break;
+
+ dev_extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
+
+ chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dev_extent);
+
+ cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+ if (!cache)
+ continue;
+
+ set_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
+ btrfs_put_block_group(cache);
+ }
+ if (iter_ret < 0)
+ ret = iter_ret;
+
+ btrfs_free_path(path);
+unlock:
+ mutex_unlock(&fs_info->chunk_mutex);
+
+ return ret;
+}
+
+bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
+ struct btrfs_block_group *cache,
+ u64 physical)
+{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct extent_map *em;
+ struct map_lookup *map;
+ u64 chunk_offset = cache->start;
+ int num_extents, cur_extent;
+ int i;
+
+ /* Do not use "to_copy" on non zoned filesystem for now */
+ if (!btrfs_is_zoned(fs_info))
+ return true;
+
+ spin_lock(&cache->lock);
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
+ spin_unlock(&cache->lock);
+ return true;
+ }
+ spin_unlock(&cache->lock);
+
+ em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
+ ASSERT(!IS_ERR(em));
+ map = em->map_lookup;
+
+ num_extents = 0;
+ cur_extent = 0;
+ for (i = 0; i < map->num_stripes; i++) {
+ /* We have more device extent to copy */
+ if (srcdev != map->stripes[i].dev)
+ continue;
+
+ num_extents++;
+ if (physical == map->stripes[i].physical)
+ cur_extent = i;
+ }
+
+ free_extent_map(em);
+
+ if (num_extents > 1 && cur_extent < num_extents - 1) {
+ /*
+ * Has more stripes on this device. Keep this block group
+ * readonly until we finish all the stripes.
+ */
+ return false;
+ }
+
+ /* Last stripe on this device */
+ clear_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
+
+ return true;
+}
+
+static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
+ const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
+ int read_src)
+{
+ struct btrfs_root *root = fs_info->dev_root;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ int ret;
+ struct btrfs_device *tgt_device = NULL;
+ struct btrfs_device *src_device = NULL;
+
+ src_device = btrfs_find_device_by_devspec(fs_info, srcdevid,
+ srcdev_name);
+ if (IS_ERR(src_device))
+ return PTR_ERR(src_device);
+
+ if (btrfs_pinned_by_swapfile(fs_info, src_device)) {
+ btrfs_warn_in_rcu(fs_info,
+ "cannot replace device %s (devid %llu) due to active swapfile",
+ btrfs_dev_name(src_device), src_device->devid);
+ return -ETXTBSY;
+ }
+
+ /*
+ * Here we commit the transaction to make sure commit_total_bytes
+ * of all the devices are updated.
+ */
+ trans = btrfs_attach_transaction(root);
+ if (!IS_ERR(trans)) {
+ ret = btrfs_commit_transaction(trans);
+ if (ret)
+ return ret;
+ } else if (PTR_ERR(trans) != -ENOENT) {
+ return PTR_ERR(trans);
+ }
+
+ ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
+ src_device, &tgt_device);
+ if (ret)
+ return ret;
+
+ ret = mark_block_group_to_copy(fs_info, src_device);
+ if (ret)
+ return ret;
+
+ down_write(&dev_replace->rwsem);
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ ASSERT(0);
+ ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
+ up_write(&dev_replace->rwsem);
+ goto leave;
+ }
+
+ dev_replace->cont_reading_from_srcdev_mode = read_src;
+ dev_replace->srcdev = src_device;
+ dev_replace->tgtdev = tgt_device;
+
+ btrfs_info_in_rcu(fs_info,
+ "dev_replace from %s (devid %llu) to %s started",
+ btrfs_dev_name(src_device),
+ src_device->devid,
+ rcu_str_deref(tgt_device->name));
+
+ /*
+ * from now on, the writes to the srcdev are all duplicated to
+ * go to the tgtdev as well (refer to btrfs_map_block()).
+ */
+ dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
+ dev_replace->time_started = ktime_get_real_seconds();
+ dev_replace->cursor_left = 0;
+ dev_replace->committed_cursor_left = 0;
+ dev_replace->cursor_left_last_write_of_item = 0;
+ dev_replace->cursor_right = 0;
+ dev_replace->is_valid = 1;
+ dev_replace->item_needs_writeback = 1;
+ atomic64_set(&dev_replace->num_write_errors, 0);
+ atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
+ up_write(&dev_replace->rwsem);
+
+ ret = btrfs_sysfs_add_device(tgt_device);
+ if (ret)
+ btrfs_err(fs_info, "kobj add dev failed %d", ret);
+
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+
+ /*
+ * Commit dev_replace state and reserve 1 item for it.
+ * This is crucial to ensure we won't miss copying extents for new block
+ * groups that are allocated after we started the device replace, and
+ * must be done after setting up the device replace state.
+ */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ down_write(&dev_replace->rwsem);
+ dev_replace->replace_state =
+ BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
+ dev_replace->srcdev = NULL;
+ dev_replace->tgtdev = NULL;
+ up_write(&dev_replace->rwsem);
+ goto leave;
+ }
+
+ ret = btrfs_commit_transaction(trans);
+ WARN_ON(ret);
+
+ /* the disk copy procedure reuses the scrub code */
+ ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
+ btrfs_device_get_total_bytes(src_device),
+ &dev_replace->scrub_progress, 0, 1);
+
+ ret = btrfs_dev_replace_finishing(fs_info, ret);
+ if (ret == -EINPROGRESS)
+ ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
+
+ return ret;
+
+leave:
+ btrfs_destroy_dev_replace_tgtdev(tgt_device);
+ return ret;
+}
+
+int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_dev_replace_args *args)
+{
+ int ret;
+
+ switch (args->start.cont_reading_from_srcdev_mode) {
+ case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
+ case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
+ args->start.tgtdev_name[0] == '\0')
+ return -EINVAL;
+
+ ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name,
+ args->start.srcdevid,
+ args->start.srcdev_name,
+ args->start.cont_reading_from_srcdev_mode);
+ args->result = ret;
+ /* don't warn if EINPROGRESS, someone else might be running scrub */
+ if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS ||
+ ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR)
+ return 0;
+
+ return ret;
+}
+
+/*
+ * blocked until all in-flight bios operations are finished.
+ */
+static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
+{
+ set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
+ wait_event(fs_info->dev_replace.replace_wait, !percpu_counter_sum(
+ &fs_info->dev_replace.bio_counter));
+}
+
+/*
+ * we have removed target device, it is safe to allow new bios request.
+ */
+static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
+{
+ clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
+ wake_up(&fs_info->dev_replace.replace_wait);
+}
+
+/*
+ * When finishing the device replace, before swapping the source device with the
+ * target device we must update the chunk allocation state in the target device,
+ * as it is empty because replace works by directly copying the chunks and not
+ * through the normal chunk allocation path.
+ */
+static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev,
+ struct btrfs_device *tgtdev)
+{
+ struct extent_state *cached_state = NULL;
+ u64 start = 0;
+ u64 found_start;
+ u64 found_end;
+ int ret = 0;
+
+ lockdep_assert_held(&srcdev->fs_info->chunk_mutex);
+
+ while (!find_first_extent_bit(&srcdev->alloc_state, start,
+ &found_start, &found_end,
+ CHUNK_ALLOCATED, &cached_state)) {
+ ret = set_extent_bits(&tgtdev->alloc_state, found_start,
+ found_end, CHUNK_ALLOCATED);
+ if (ret)
+ break;
+ start = found_end + 1;
+ }
+
+ free_extent_state(cached_state);
+ return ret;
+}
+
+static void btrfs_dev_replace_update_device_in_mapping_tree(
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_device *srcdev,
+ struct btrfs_device *tgtdev)
+{
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ u64 start = 0;
+ int i;
+
+ write_lock(&em_tree->lock);
+ do {
+ em = lookup_extent_mapping(em_tree, start, (u64)-1);
+ if (!em)
+ break;
+ map = em->map_lookup;
+ for (i = 0; i < map->num_stripes; i++)
+ if (srcdev == map->stripes[i].dev)
+ map->stripes[i].dev = tgtdev;
+ start = em->start + em->len;
+ free_extent_map(em);
+ } while (start);
+ write_unlock(&em_tree->lock);
+}
+
+static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
+ int scrub_ret)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *tgt_device;
+ struct btrfs_device *src_device;
+ struct btrfs_root *root = fs_info->tree_root;
+ u8 uuid_tmp[BTRFS_UUID_SIZE];
+ struct btrfs_trans_handle *trans;
+ int ret = 0;
+
+ /* don't allow cancel or unmount to disturb the finishing procedure */
+ mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
+
+ down_read(&dev_replace->rwsem);
+ /* was the operation canceled, or is it finished? */
+ if (dev_replace->replace_state !=
+ BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
+ up_read(&dev_replace->rwsem);
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return 0;
+ }
+
+ tgt_device = dev_replace->tgtdev;
+ src_device = dev_replace->srcdev;
+ up_read(&dev_replace->rwsem);
+
+ /*
+ * flush all outstanding I/O and inode extent mappings before the
+ * copy operation is declared as being finished
+ */
+ ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
+ if (ret) {
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return ret;
+ }
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+
+ /*
+ * We have to use this loop approach because at this point src_device
+ * has to be available for transaction commit to complete, yet new
+ * chunks shouldn't be allocated on the device.
+ */
+ while (1) {
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return PTR_ERR(trans);
+ }
+ ret = btrfs_commit_transaction(trans);
+ WARN_ON(ret);
+
+ /* Prevent write_all_supers() during the finishing procedure */
+ mutex_lock(&fs_devices->device_list_mutex);
+ /* Prevent new chunks being allocated on the source device */
+ mutex_lock(&fs_info->chunk_mutex);
+
+ if (!list_empty(&src_device->post_commit_list)) {
+ mutex_unlock(&fs_devices->device_list_mutex);
+ mutex_unlock(&fs_info->chunk_mutex);
+ } else {
+ break;
+ }
+ }
+
+ down_write(&dev_replace->rwsem);
+ dev_replace->replace_state =
+ scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
+ : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
+ dev_replace->tgtdev = NULL;
+ dev_replace->srcdev = NULL;
+ dev_replace->time_stopped = ktime_get_real_seconds();
+ dev_replace->item_needs_writeback = 1;
+
+ /*
+ * Update allocation state in the new device and replace the old device
+ * with the new one in the mapping tree.
+ */
+ if (!scrub_ret) {
+ scrub_ret = btrfs_set_target_alloc_state(src_device, tgt_device);
+ if (scrub_ret)
+ goto error;
+ btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
+ src_device,
+ tgt_device);
+ } else {
+ if (scrub_ret != -ECANCELED)
+ btrfs_err_in_rcu(fs_info,
+ "btrfs_scrub_dev(%s, %llu, %s) failed %d",
+ btrfs_dev_name(src_device),
+ src_device->devid,
+ rcu_str_deref(tgt_device->name), scrub_ret);
+error:
+ up_write(&dev_replace->rwsem);
+ mutex_unlock(&fs_info->chunk_mutex);
+ mutex_unlock(&fs_devices->device_list_mutex);
+ btrfs_rm_dev_replace_blocked(fs_info);
+ if (tgt_device)
+ btrfs_destroy_dev_replace_tgtdev(tgt_device);
+ btrfs_rm_dev_replace_unblocked(fs_info);
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+
+ return scrub_ret;
+ }
+
+ btrfs_info_in_rcu(fs_info,
+ "dev_replace from %s (devid %llu) to %s finished",
+ btrfs_dev_name(src_device),
+ src_device->devid,
+ rcu_str_deref(tgt_device->name));
+ clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &tgt_device->dev_state);
+ tgt_device->devid = src_device->devid;
+ src_device->devid = BTRFS_DEV_REPLACE_DEVID;
+ memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
+ memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
+ memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
+ btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes);
+ btrfs_device_set_disk_total_bytes(tgt_device,
+ src_device->disk_total_bytes);
+ btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
+ tgt_device->commit_bytes_used = src_device->bytes_used;
+
+ btrfs_assign_next_active_device(src_device, tgt_device);
+
+ list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list);
+ fs_devices->rw_devices++;
+
+ up_write(&dev_replace->rwsem);
+ btrfs_rm_dev_replace_blocked(fs_info);
+
+ btrfs_rm_dev_replace_remove_srcdev(src_device);
+
+ btrfs_rm_dev_replace_unblocked(fs_info);
+
+ /*
+ * Increment dev_stats_ccnt so that btrfs_run_dev_stats() will
+ * update on-disk dev stats value during commit transaction
+ */
+ atomic_inc(&tgt_device->dev_stats_ccnt);
+
+ /*
+ * this is again a consistent state where no dev_replace procedure
+ * is running, the target device is part of the filesystem, the
+ * source device is not part of the filesystem anymore and its 1st
+ * superblock is scratched out so that it is no longer marked to
+ * belong to this filesystem.
+ */
+ mutex_unlock(&fs_info->chunk_mutex);
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ /* replace the sysfs entry */
+ btrfs_sysfs_remove_device(src_device);
+ btrfs_sysfs_update_devid(tgt_device);
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state))
+ btrfs_scratch_superblocks(fs_info, src_device->bdev,
+ src_device->name->str);
+
+ /* write back the superblocks */
+ trans = btrfs_start_transaction(root, 0);
+ if (!IS_ERR(trans))
+ btrfs_commit_transaction(trans);
+
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+
+ btrfs_rm_dev_replace_free_srcdev(src_device);
+
+ return 0;
+}
+
+/*
+ * Read progress of device replace status according to the state and last
+ * stored position. The value format is the same as for
+ * btrfs_dev_replace::progress_1000
+ */
+static u64 btrfs_dev_replace_progress(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ u64 ret = 0;
+
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ ret = 0;
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ ret = 1000;
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ ret = div64_u64(dev_replace->cursor_left,
+ div_u64(btrfs_device_get_total_bytes(
+ dev_replace->srcdev), 1000));
+ break;
+ }
+
+ return ret;
+}
+
+void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_dev_replace_args *args)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+ down_read(&dev_replace->rwsem);
+ /* even if !dev_replace_is_valid, the values are good enough for
+ * the replace_status ioctl */
+ args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+ args->status.replace_state = dev_replace->replace_state;
+ args->status.time_started = dev_replace->time_started;
+ args->status.time_stopped = dev_replace->time_stopped;
+ args->status.num_write_errors =
+ atomic64_read(&dev_replace->num_write_errors);
+ args->status.num_uncorrectable_read_errors =
+ atomic64_read(&dev_replace->num_uncorrectable_read_errors);
+ args->status.progress_1000 = btrfs_dev_replace_progress(fs_info);
+ up_read(&dev_replace->rwsem);
+}
+
+int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct btrfs_device *tgt_device = NULL;
+ struct btrfs_device *src_device = NULL;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = fs_info->tree_root;
+ int result;
+ int ret;
+
+ if (sb_rdonly(fs_info->sb))
+ return -EROFS;
+
+ mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
+ down_write(&dev_replace->rwsem);
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
+ up_write(&dev_replace->rwsem);
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ tgt_device = dev_replace->tgtdev;
+ src_device = dev_replace->srcdev;
+ up_write(&dev_replace->rwsem);
+ ret = btrfs_scrub_cancel(fs_info);
+ if (ret < 0) {
+ result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
+ } else {
+ result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+ /*
+ * btrfs_dev_replace_finishing() will handle the
+ * cleanup part
+ */
+ btrfs_info_in_rcu(fs_info,
+ "dev_replace from %s (devid %llu) to %s canceled",
+ btrfs_dev_name(src_device), src_device->devid,
+ btrfs_dev_name(tgt_device));
+ }
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ /*
+ * Scrub doing the replace isn't running so we need to do the
+ * cleanup step of btrfs_dev_replace_finishing() here
+ */
+ result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+ tgt_device = dev_replace->tgtdev;
+ src_device = dev_replace->srcdev;
+ dev_replace->tgtdev = NULL;
+ dev_replace->srcdev = NULL;
+ dev_replace->replace_state =
+ BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
+ dev_replace->time_stopped = ktime_get_real_seconds();
+ dev_replace->item_needs_writeback = 1;
+
+ up_write(&dev_replace->rwsem);
+
+ /* Scrub for replace must not be running in suspended state */
+ btrfs_scrub_cancel(fs_info);
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return PTR_ERR(trans);
+ }
+ ret = btrfs_commit_transaction(trans);
+ WARN_ON(ret);
+
+ btrfs_info_in_rcu(fs_info,
+ "suspended dev_replace from %s (devid %llu) to %s canceled",
+ btrfs_dev_name(src_device), src_device->devid,
+ btrfs_dev_name(tgt_device));
+
+ if (tgt_device)
+ btrfs_destroy_dev_replace_tgtdev(tgt_device);
+ break;
+ default:
+ up_write(&dev_replace->rwsem);
+ result = -EINVAL;
+ }
+
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return result;
+}
+
+void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+ mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
+ down_write(&dev_replace->rwsem);
+
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ dev_replace->replace_state =
+ BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
+ dev_replace->time_stopped = ktime_get_real_seconds();
+ dev_replace->item_needs_writeback = 1;
+ btrfs_info(fs_info, "suspending dev_replace for unmount");
+ break;
+ }
+
+ up_write(&dev_replace->rwsem);
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+}
+
+/* resume dev_replace procedure that was interrupted by unmount */
+int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
+{
+ struct task_struct *task;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+ down_write(&dev_replace->rwsem);
+
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ up_write(&dev_replace->rwsem);
+ return 0;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ dev_replace->replace_state =
+ BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
+ break;
+ }
+ if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
+ btrfs_info(fs_info,
+ "cannot continue dev_replace, tgtdev is missing");
+ btrfs_info(fs_info,
+ "you may cancel the operation after 'mount -o degraded'");
+ dev_replace->replace_state =
+ BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
+ up_write(&dev_replace->rwsem);
+ return 0;
+ }
+ up_write(&dev_replace->rwsem);
+
+ /*
+ * This could collide with a paused balance, but the exclusive op logic
+ * should never allow both to start and pause. We don't want to allow
+ * dev-replace to start anyway.
+ */
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
+ down_write(&dev_replace->rwsem);
+ dev_replace->replace_state =
+ BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
+ up_write(&dev_replace->rwsem);
+ btrfs_info(fs_info,
+ "cannot resume dev-replace, other exclusive operation running");
+ return 0;
+ }
+
+ task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
+ return PTR_ERR_OR_ZERO(task);
+}
+
+static int btrfs_dev_replace_kthread(void *data)
+{
+ struct btrfs_fs_info *fs_info = data;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ u64 progress;
+ int ret;
+
+ progress = btrfs_dev_replace_progress(fs_info);
+ progress = div_u64(progress, 10);
+ btrfs_info_in_rcu(fs_info,
+ "continuing dev_replace from %s (devid %llu) to target %s @%u%%",
+ btrfs_dev_name(dev_replace->srcdev),
+ dev_replace->srcdev->devid,
+ btrfs_dev_name(dev_replace->tgtdev),
+ (unsigned int)progress);
+
+ ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
+ dev_replace->committed_cursor_left,
+ btrfs_device_get_total_bytes(dev_replace->srcdev),
+ &dev_replace->scrub_progress, 0, 1);
+ ret = btrfs_dev_replace_finishing(fs_info, ret);
+ WARN_ON(ret && ret != -ECANCELED);
+
+ btrfs_exclop_finish(fs_info);
+ return 0;
+}
+
+int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
+{
+ if (!dev_replace->is_valid)
+ return 0;
+
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ return 0;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ /*
+ * return true even if tgtdev is missing (this is
+ * something that can happen if the dev_replace
+ * procedure is suspended by an umount and then
+ * the tgtdev is missing (or "btrfs dev scan") was
+ * not called and the filesystem is remounted
+ * in degraded state. This does not stop the
+ * dev_replace procedure. It needs to be canceled
+ * manually if the cancellation is wanted.
+ */
+ break;
+ }
+ return 1;
+}
+
+void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
+{
+ percpu_counter_sub(&fs_info->dev_replace.bio_counter, amount);
+ cond_wake_up_nomb(&fs_info->dev_replace.replace_wait);
+}
+
+void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
+{
+ while (1) {
+ percpu_counter_inc(&fs_info->dev_replace.bio_counter);
+ if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
+ &fs_info->fs_state)))
+ break;
+
+ btrfs_bio_counter_dec(fs_info);
+ wait_event(fs_info->dev_replace.replace_wait,
+ !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
+ &fs_info->fs_state));
+ }
+}
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
new file mode 100644
index 000000000..6084b3130
--- /dev/null
+++ b/fs/btrfs/dev-replace.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) STRATO AG 2012. All rights reserved.
+ */
+
+#ifndef BTRFS_DEV_REPLACE_H
+#define BTRFS_DEV_REPLACE_H
+
+struct btrfs_ioctl_dev_replace_args;
+struct btrfs_fs_info;
+struct btrfs_trans_handle;
+struct btrfs_dev_replace;
+struct btrfs_block_group;
+
+int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
+int btrfs_run_dev_replace(struct btrfs_trans_handle *trans);
+int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_dev_replace_args *args);
+void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_dev_replace_args *args);
+int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
+void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
+int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
+int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
+bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
+ struct btrfs_block_group *cache,
+ u64 physical);
+
+#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
new file mode 100644
index 000000000..fdab48c1a
--- /dev/null
+++ b/fs/btrfs/dir-item.c
@@ -0,0 +1,439 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ */
+
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+
+/*
+ * insert a name into a directory, doing overflow properly if there is a hash
+ * collision. data_size indicates how big the item inserted should be. On
+ * success a struct btrfs_dir_item pointer is returned, otherwise it is
+ * an ERR_PTR.
+ *
+ * The name is not copied into the dir item, you have to do that yourself.
+ */
+static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
+ *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *cpu_key,
+ u32 data_size,
+ const char *name,
+ int name_len)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret;
+ char *ptr;
+ struct extent_buffer *leaf;
+
+ ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
+ if (ret == -EEXIST) {
+ struct btrfs_dir_item *di;
+ di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
+ if (di)
+ return ERR_PTR(-EEXIST);
+ btrfs_extend_item(path, data_size);
+ } else if (ret < 0)
+ return ERR_PTR(ret);
+ WARN_ON(ret > 0);
+ leaf = path->nodes[0];
+ ptr = btrfs_item_ptr(leaf, path->slots[0], char);
+ ASSERT(data_size <= btrfs_item_size(leaf, path->slots[0]));
+ ptr += btrfs_item_size(leaf, path->slots[0]) - data_size;
+ return (struct btrfs_dir_item *)ptr;
+}
+
+/*
+ * xattrs work a lot like directories, this inserts an xattr item
+ * into the tree
+ */
+int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid,
+ const char *name, u16 name_len,
+ const void *data, u16 data_len)
+{
+ int ret = 0;
+ struct btrfs_dir_item *dir_item;
+ unsigned long name_ptr, data_ptr;
+ struct btrfs_key key, location;
+ struct btrfs_disk_key disk_key;
+ struct extent_buffer *leaf;
+ u32 data_size;
+
+ if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root->fs_info))
+ return -ENOSPC;
+
+ key.objectid = objectid;
+ key.type = BTRFS_XATTR_ITEM_KEY;
+ key.offset = btrfs_name_hash(name, name_len);
+
+ data_size = sizeof(*dir_item) + name_len + data_len;
+ dir_item = insert_with_overflow(trans, root, path, &key, data_size,
+ name, name_len);
+ if (IS_ERR(dir_item))
+ return PTR_ERR(dir_item);
+ memset(&location, 0, sizeof(location));
+
+ leaf = path->nodes[0];
+ btrfs_cpu_key_to_disk(&disk_key, &location);
+ btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
+ btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR);
+ btrfs_set_dir_name_len(leaf, dir_item, name_len);
+ btrfs_set_dir_transid(leaf, dir_item, trans->transid);
+ btrfs_set_dir_data_len(leaf, dir_item, data_len);
+ name_ptr = (unsigned long)(dir_item + 1);
+ data_ptr = (unsigned long)((char *)name_ptr + name_len);
+
+ write_extent_buffer(leaf, name, name_ptr, name_len);
+ write_extent_buffer(leaf, data, data_ptr, data_len);
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+
+ return ret;
+}
+
+/*
+ * insert a directory item in the tree, doing all the magic for
+ * both indexes. 'dir' indicates which objectid to insert it into,
+ * 'location' is the key to stuff into the directory item, 'type' is the
+ * type of the inode we're pointing to, and 'index' is the sequence number
+ * to use for the second index (if one is created).
+ * Will return 0 or -ENOMEM
+ */
+int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
+ const struct fscrypt_str *name, struct btrfs_inode *dir,
+ struct btrfs_key *location, u8 type, u64 index)
+{
+ int ret = 0;
+ int ret2 = 0;
+ struct btrfs_root *root = dir->root;
+ struct btrfs_path *path;
+ struct btrfs_dir_item *dir_item;
+ struct extent_buffer *leaf;
+ unsigned long name_ptr;
+ struct btrfs_key key;
+ struct btrfs_disk_key disk_key;
+ u32 data_size;
+
+ key.objectid = btrfs_ino(dir);
+ key.type = BTRFS_DIR_ITEM_KEY;
+ key.offset = btrfs_name_hash(name->name, name->len);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ btrfs_cpu_key_to_disk(&disk_key, location);
+
+ data_size = sizeof(*dir_item) + name->len;
+ dir_item = insert_with_overflow(trans, root, path, &key, data_size,
+ name->name, name->len);
+ if (IS_ERR(dir_item)) {
+ ret = PTR_ERR(dir_item);
+ if (ret == -EEXIST)
+ goto second_insert;
+ goto out_free;
+ }
+
+ leaf = path->nodes[0];
+ btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
+ btrfs_set_dir_type(leaf, dir_item, type);
+ btrfs_set_dir_data_len(leaf, dir_item, 0);
+ btrfs_set_dir_name_len(leaf, dir_item, name->len);
+ btrfs_set_dir_transid(leaf, dir_item, trans->transid);
+ name_ptr = (unsigned long)(dir_item + 1);
+
+ write_extent_buffer(leaf, name->name, name_ptr, name->len);
+ btrfs_mark_buffer_dirty(leaf);
+
+second_insert:
+ /* FIXME, use some real flag for selecting the extra index */
+ if (root == root->fs_info->tree_root) {
+ ret = 0;
+ goto out_free;
+ }
+ btrfs_release_path(path);
+
+ ret2 = btrfs_insert_delayed_dir_index(trans, name->name, name->len, dir,
+ &disk_key, type, index);
+out_free:
+ btrfs_free_path(path);
+ if (ret)
+ return ret;
+ if (ret2)
+ return ret2;
+ return 0;
+}
+
+static struct btrfs_dir_item *btrfs_lookup_match_dir(
+ struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *key, const char *name,
+ int name_len, int mod)
+{
+ const int ins_len = (mod < 0 ? -1 : 0);
+ const int cow = (mod != 0);
+ int ret;
+
+ ret = btrfs_search_slot(trans, root, key, path, ins_len, cow);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0)
+ return ERR_PTR(-ENOENT);
+
+ return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
+}
+
+/*
+ * Lookup for a directory item by name.
+ *
+ * @trans: The transaction handle to use. Can be NULL if @mod is 0.
+ * @root: The root of the target tree.
+ * @path: Path to use for the search.
+ * @dir: The inode number (objectid) of the directory.
+ * @name: The name associated to the directory entry we are looking for.
+ * @name_len: The length of the name.
+ * @mod: Used to indicate if the tree search is meant for a read only
+ * lookup, for a modification lookup or for a deletion lookup, so
+ * its value should be 0, 1 or -1, respectively.
+ *
+ * Returns: NULL if the dir item does not exists, an error pointer if an error
+ * happened, or a pointer to a dir item if a dir item exists for the given name.
+ */
+struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 dir,
+ const struct fscrypt_str *name,
+ int mod)
+{
+ struct btrfs_key key;
+ struct btrfs_dir_item *di;
+
+ key.objectid = dir;
+ key.type = BTRFS_DIR_ITEM_KEY;
+ key.offset = btrfs_name_hash(name->name, name->len);
+
+ di = btrfs_lookup_match_dir(trans, root, path, &key, name->name,
+ name->len, mod);
+ if (IS_ERR(di) && PTR_ERR(di) == -ENOENT)
+ return NULL;
+
+ return di;
+}
+
+int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
+ const struct fscrypt_str *name)
+{
+ int ret;
+ struct btrfs_key key;
+ struct btrfs_dir_item *di;
+ int data_size;
+ struct extent_buffer *leaf;
+ int slot;
+ struct btrfs_path *path;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = dir;
+ key.type = BTRFS_DIR_ITEM_KEY;
+ key.offset = btrfs_name_hash(name->name, name->len);
+
+ di = btrfs_lookup_match_dir(NULL, root, path, &key, name->name,
+ name->len, 0);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ /* Nothing found, we're safe */
+ if (ret == -ENOENT) {
+ ret = 0;
+ goto out;
+ }
+
+ if (ret < 0)
+ goto out;
+ }
+
+ /* we found an item, look for our name in the item */
+ if (di) {
+ /* our exact name was found */
+ ret = -EEXIST;
+ goto out;
+ }
+
+ /* See if there is room in the item to insert this name. */
+ data_size = sizeof(*di) + name->len;
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ if (data_size + btrfs_item_size(leaf, slot) +
+ sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root->fs_info)) {
+ ret = -EOVERFLOW;
+ } else {
+ /* plenty of insertion room */
+ ret = 0;
+ }
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Lookup for a directory index item by name and index number.
+ *
+ * @trans: The transaction handle to use. Can be NULL if @mod is 0.
+ * @root: The root of the target tree.
+ * @path: Path to use for the search.
+ * @dir: The inode number (objectid) of the directory.
+ * @index: The index number.
+ * @name: The name associated to the directory entry we are looking for.
+ * @name_len: The length of the name.
+ * @mod: Used to indicate if the tree search is meant for a read only
+ * lookup, for a modification lookup or for a deletion lookup, so
+ * its value should be 0, 1 or -1, respectively.
+ *
+ * Returns: NULL if the dir index item does not exists, an error pointer if an
+ * error happened, or a pointer to a dir item if the dir index item exists and
+ * matches the criteria (name and index number).
+ */
+struct btrfs_dir_item *
+btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 dir,
+ u64 index, const struct fscrypt_str *name, int mod)
+{
+ struct btrfs_dir_item *di;
+ struct btrfs_key key;
+
+ key.objectid = dir;
+ key.type = BTRFS_DIR_INDEX_KEY;
+ key.offset = index;
+
+ di = btrfs_lookup_match_dir(trans, root, path, &key, name->name,
+ name->len, mod);
+ if (di == ERR_PTR(-ENOENT))
+ return NULL;
+
+ return di;
+}
+
+struct btrfs_dir_item *
+btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path,
+ u64 dirid, const struct fscrypt_str *name)
+{
+ struct btrfs_dir_item *di;
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = dirid;
+ key.type = BTRFS_DIR_INDEX_KEY;
+ key.offset = 0;
+
+ btrfs_for_each_slot(root, &key, &key, path, ret) {
+ if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY)
+ break;
+
+ di = btrfs_match_dir_item_name(root->fs_info, path,
+ name->name, name->len);
+ if (di)
+ return di;
+ }
+ /* Adjust return code if the key was not found in the next leaf. */
+ if (ret > 0)
+ ret = 0;
+
+ return ERR_PTR(ret);
+}
+
+struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 dir,
+ const char *name, u16 name_len,
+ int mod)
+{
+ struct btrfs_key key;
+ struct btrfs_dir_item *di;
+
+ key.objectid = dir;
+ key.type = BTRFS_XATTR_ITEM_KEY;
+ key.offset = btrfs_name_hash(name, name_len);
+
+ di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
+ if (IS_ERR(di) && PTR_ERR(di) == -ENOENT)
+ return NULL;
+
+ return di;
+}
+
+/*
+ * helper function to look at the directory item pointed to by 'path'
+ * this walks through all the entries in a dir item and finds one
+ * for a specific name.
+ */
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ const char *name, int name_len)
+{
+ struct btrfs_dir_item *dir_item;
+ unsigned long name_ptr;
+ u32 total_len;
+ u32 cur = 0;
+ u32 this_len;
+ struct extent_buffer *leaf;
+
+ leaf = path->nodes[0];
+ dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
+
+ total_len = btrfs_item_size(leaf, path->slots[0]);
+ while (cur < total_len) {
+ this_len = sizeof(*dir_item) +
+ btrfs_dir_name_len(leaf, dir_item) +
+ btrfs_dir_data_len(leaf, dir_item);
+ name_ptr = (unsigned long)(dir_item + 1);
+
+ if (btrfs_dir_name_len(leaf, dir_item) == name_len &&
+ memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)
+ return dir_item;
+
+ cur += this_len;
+ dir_item = (struct btrfs_dir_item *)((char *)dir_item +
+ this_len);
+ }
+ return NULL;
+}
+
+/*
+ * given a pointer into a directory item, delete it. This
+ * handles items that have more than one entry in them.
+ */
+int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_dir_item *di)
+{
+
+ struct extent_buffer *leaf;
+ u32 sub_item_len;
+ u32 item_len;
+ int ret = 0;
+
+ leaf = path->nodes[0];
+ sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) +
+ btrfs_dir_data_len(leaf, di);
+ item_len = btrfs_item_size(leaf, path->slots[0]);
+ if (sub_item_len == item_len) {
+ ret = btrfs_del_item(trans, root, path);
+ } else {
+ /* MARKER */
+ unsigned long ptr = (unsigned long)di;
+ unsigned long start;
+
+ start = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
+ item_len - (ptr + sub_item_len - start));
+ btrfs_truncate_item(path, item_len - sub_item_len, 1);
+ }
+ return ret;
+}
diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
new file mode 100644
index 000000000..bd9dde374
--- /dev/null
+++ b/fs/btrfs/discard.c
@@ -0,0 +1,758 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/ktime.h>
+#include <linux/list.h>
+#include <linux/math64.h>
+#include <linux/sizes.h>
+#include <linux/workqueue.h>
+#include "ctree.h"
+#include "block-group.h"
+#include "discard.h"
+#include "free-space-cache.h"
+
+/*
+ * This contains the logic to handle async discard.
+ *
+ * Async discard manages trimming of free space outside of transaction commit.
+ * Discarding is done by managing the block_groups on a LRU list based on free
+ * space recency. Two passes are used to first prioritize discarding extents
+ * and then allow for trimming in the bitmap the best opportunity to coalesce.
+ * The block_groups are maintained on multiple lists to allow for multiple
+ * passes with different discard filter requirements. A delayed work item is
+ * used to manage discarding with timeout determined by a max of the delay
+ * incurred by the iops rate limit, the byte rate limit, and the max delay of
+ * BTRFS_DISCARD_MAX_DELAY.
+ *
+ * Note, this only keeps track of block_groups that are explicitly for data.
+ * Mixed block_groups are not supported.
+ *
+ * The first list is special to manage discarding of fully free block groups.
+ * This is necessary because we issue a final trim for a full free block group
+ * after forgetting it. When a block group becomes unused, instead of directly
+ * being added to the unused_bgs list, we add it to this first list. Then
+ * from there, if it becomes fully discarded, we place it onto the unused_bgs
+ * list.
+ *
+ * The in-memory free space cache serves as the backing state for discard.
+ * Consequently this means there is no persistence. We opt to load all the
+ * block groups in as not discarded, so the mount case degenerates to the
+ * crashing case.
+ *
+ * As the free space cache uses bitmaps, there exists a tradeoff between
+ * ease/efficiency for find_free_extent() and the accuracy of discard state.
+ * Here we opt to let untrimmed regions merge with everything while only letting
+ * trimmed regions merge with other trimmed regions. This can cause
+ * overtrimming, but the coalescing benefit seems to be worth it. Additionally,
+ * bitmap state is tracked as a whole. If we're able to fully trim a bitmap,
+ * the trimmed flag is set on the bitmap. Otherwise, if an allocation comes in,
+ * this resets the state and we will retry trimming the whole bitmap. This is a
+ * tradeoff between discard state accuracy and the cost of accounting.
+ */
+
+/* This is an initial delay to give some chance for block reuse */
+#define BTRFS_DISCARD_DELAY (120ULL * NSEC_PER_SEC)
+#define BTRFS_DISCARD_UNUSED_DELAY (10ULL * NSEC_PER_SEC)
+
+/* Target completion latency of discarding all discardable extents */
+#define BTRFS_DISCARD_TARGET_MSEC (6 * 60 * 60UL * MSEC_PER_SEC)
+#define BTRFS_DISCARD_MIN_DELAY_MSEC (1UL)
+#define BTRFS_DISCARD_MAX_DELAY_MSEC (1000UL)
+#define BTRFS_DISCARD_MAX_IOPS (10U)
+
+/* Montonically decreasing minimum length filters after index 0 */
+static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = {
+ 0,
+ BTRFS_ASYNC_DISCARD_MAX_FILTER,
+ BTRFS_ASYNC_DISCARD_MIN_FILTER
+};
+
+static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl,
+ struct btrfs_block_group *block_group)
+{
+ return &discard_ctl->discard_list[block_group->discard_index];
+}
+
+static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
+ struct btrfs_block_group *block_group)
+{
+ lockdep_assert_held(&discard_ctl->lock);
+ if (!btrfs_run_discard_work(discard_ctl))
+ return;
+
+ if (list_empty(&block_group->discard_list) ||
+ block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) {
+ if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED)
+ block_group->discard_index = BTRFS_DISCARD_INDEX_START;
+ block_group->discard_eligible_time = (ktime_get_ns() +
+ BTRFS_DISCARD_DELAY);
+ block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
+ }
+ if (list_empty(&block_group->discard_list))
+ btrfs_get_block_group(block_group);
+
+ list_move_tail(&block_group->discard_list,
+ get_discard_list(discard_ctl, block_group));
+}
+
+static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
+ struct btrfs_block_group *block_group)
+{
+ if (!btrfs_is_block_group_data_only(block_group))
+ return;
+
+ spin_lock(&discard_ctl->lock);
+ __add_to_discard_list(discard_ctl, block_group);
+ spin_unlock(&discard_ctl->lock);
+}
+
+static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl,
+ struct btrfs_block_group *block_group)
+{
+ bool queued;
+
+ spin_lock(&discard_ctl->lock);
+
+ queued = !list_empty(&block_group->discard_list);
+
+ if (!btrfs_run_discard_work(discard_ctl)) {
+ spin_unlock(&discard_ctl->lock);
+ return;
+ }
+
+ list_del_init(&block_group->discard_list);
+
+ block_group->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
+ block_group->discard_eligible_time = (ktime_get_ns() +
+ BTRFS_DISCARD_UNUSED_DELAY);
+ block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
+ if (!queued)
+ btrfs_get_block_group(block_group);
+ list_add_tail(&block_group->discard_list,
+ &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]);
+
+ spin_unlock(&discard_ctl->lock);
+}
+
+static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
+ struct btrfs_block_group *block_group)
+{
+ bool running = false;
+ bool queued = false;
+
+ spin_lock(&discard_ctl->lock);
+
+ if (block_group == discard_ctl->block_group) {
+ running = true;
+ discard_ctl->block_group = NULL;
+ }
+
+ block_group->discard_eligible_time = 0;
+ queued = !list_empty(&block_group->discard_list);
+ list_del_init(&block_group->discard_list);
+ /*
+ * If the block group is currently running in the discard workfn, we
+ * don't want to deref it, since it's still being used by the workfn.
+ * The workfn will notice this case and deref the block group when it is
+ * finished.
+ */
+ if (queued && !running)
+ btrfs_put_block_group(block_group);
+
+ spin_unlock(&discard_ctl->lock);
+
+ return running;
+}
+
+/**
+ * find_next_block_group - find block_group that's up next for discarding
+ * @discard_ctl: discard control
+ * @now: current time
+ *
+ * Iterate over the discard lists to find the next block_group up for
+ * discarding checking the discard_eligible_time of block_group.
+ */
+static struct btrfs_block_group *find_next_block_group(
+ struct btrfs_discard_ctl *discard_ctl,
+ u64 now)
+{
+ struct btrfs_block_group *ret_block_group = NULL, *block_group;
+ int i;
+
+ for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
+ struct list_head *discard_list = &discard_ctl->discard_list[i];
+
+ if (!list_empty(discard_list)) {
+ block_group = list_first_entry(discard_list,
+ struct btrfs_block_group,
+ discard_list);
+
+ if (!ret_block_group)
+ ret_block_group = block_group;
+
+ if (ret_block_group->discard_eligible_time < now)
+ break;
+
+ if (ret_block_group->discard_eligible_time >
+ block_group->discard_eligible_time)
+ ret_block_group = block_group;
+ }
+ }
+
+ return ret_block_group;
+}
+
+/**
+ * Wrap find_next_block_group()
+ *
+ * @discard_ctl: discard control
+ * @discard_state: the discard_state of the block_group after state management
+ * @discard_index: the discard_index of the block_group after state management
+ * @now: time when discard was invoked, in ns
+ *
+ * This wraps find_next_block_group() and sets the block_group to be in use.
+ * discard_state's control flow is managed here. Variables related to
+ * discard_state are reset here as needed (eg discard_cursor). @discard_state
+ * and @discard_index are remembered as it may change while we're discarding,
+ * but we want the discard to execute in the context determined here.
+ */
+static struct btrfs_block_group *peek_discard_list(
+ struct btrfs_discard_ctl *discard_ctl,
+ enum btrfs_discard_state *discard_state,
+ int *discard_index, u64 now)
+{
+ struct btrfs_block_group *block_group;
+
+ spin_lock(&discard_ctl->lock);
+again:
+ block_group = find_next_block_group(discard_ctl, now);
+
+ if (block_group && now >= block_group->discard_eligible_time) {
+ if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED &&
+ block_group->used != 0) {
+ if (btrfs_is_block_group_data_only(block_group)) {
+ __add_to_discard_list(discard_ctl, block_group);
+ } else {
+ list_del_init(&block_group->discard_list);
+ btrfs_put_block_group(block_group);
+ }
+ goto again;
+ }
+ if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) {
+ block_group->discard_cursor = block_group->start;
+ block_group->discard_state = BTRFS_DISCARD_EXTENTS;
+ }
+ discard_ctl->block_group = block_group;
+ }
+ if (block_group) {
+ *discard_state = block_group->discard_state;
+ *discard_index = block_group->discard_index;
+ }
+ spin_unlock(&discard_ctl->lock);
+
+ return block_group;
+}
+
+/**
+ * btrfs_discard_check_filter - updates a block groups filters
+ * @block_group: block group of interest
+ * @bytes: recently freed region size after coalescing
+ *
+ * Async discard maintains multiple lists with progressively smaller filters
+ * to prioritize discarding based on size. Should a free space that matches
+ * a larger filter be returned to the free_space_cache, prioritize that discard
+ * by moving @block_group to the proper filter.
+ */
+void btrfs_discard_check_filter(struct btrfs_block_group *block_group,
+ u64 bytes)
+{
+ struct btrfs_discard_ctl *discard_ctl;
+
+ if (!block_group ||
+ !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
+ return;
+
+ discard_ctl = &block_group->fs_info->discard_ctl;
+
+ if (block_group->discard_index > BTRFS_DISCARD_INDEX_START &&
+ bytes >= discard_minlen[block_group->discard_index - 1]) {
+ int i;
+
+ remove_from_discard_list(discard_ctl, block_group);
+
+ for (i = BTRFS_DISCARD_INDEX_START; i < BTRFS_NR_DISCARD_LISTS;
+ i++) {
+ if (bytes >= discard_minlen[i]) {
+ block_group->discard_index = i;
+ add_to_discard_list(discard_ctl, block_group);
+ break;
+ }
+ }
+ }
+}
+
+/**
+ * btrfs_update_discard_index - moves a block group along the discard lists
+ * @discard_ctl: discard control
+ * @block_group: block_group of interest
+ *
+ * Increment @block_group's discard_index. If it falls of the list, let it be.
+ * Otherwise add it back to the appropriate list.
+ */
+static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl,
+ struct btrfs_block_group *block_group)
+{
+ block_group->discard_index++;
+ if (block_group->discard_index == BTRFS_NR_DISCARD_LISTS) {
+ block_group->discard_index = 1;
+ return;
+ }
+
+ add_to_discard_list(discard_ctl, block_group);
+}
+
+/**
+ * btrfs_discard_cancel_work - remove a block_group from the discard lists
+ * @discard_ctl: discard control
+ * @block_group: block_group of interest
+ *
+ * This removes @block_group from the discard lists. If necessary, it waits on
+ * the current work and then reschedules the delayed work.
+ */
+void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
+ struct btrfs_block_group *block_group)
+{
+ if (remove_from_discard_list(discard_ctl, block_group)) {
+ cancel_delayed_work_sync(&discard_ctl->work);
+ btrfs_discard_schedule_work(discard_ctl, true);
+ }
+}
+
+/**
+ * btrfs_discard_queue_work - handles queuing the block_groups
+ * @discard_ctl: discard control
+ * @block_group: block_group of interest
+ *
+ * This maintains the LRU order of the discard lists.
+ */
+void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
+ struct btrfs_block_group *block_group)
+{
+ if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
+ return;
+
+ if (block_group->used == 0)
+ add_to_discard_unused_list(discard_ctl, block_group);
+ else
+ add_to_discard_list(discard_ctl, block_group);
+
+ if (!delayed_work_pending(&discard_ctl->work))
+ btrfs_discard_schedule_work(discard_ctl, false);
+}
+
+static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
+ u64 now, bool override)
+{
+ struct btrfs_block_group *block_group;
+
+ if (!btrfs_run_discard_work(discard_ctl))
+ return;
+ if (!override && delayed_work_pending(&discard_ctl->work))
+ return;
+
+ block_group = find_next_block_group(discard_ctl, now);
+ if (block_group) {
+ u64 delay = discard_ctl->delay_ms * NSEC_PER_MSEC;
+ u32 kbps_limit = READ_ONCE(discard_ctl->kbps_limit);
+
+ /*
+ * A single delayed workqueue item is responsible for
+ * discarding, so we can manage the bytes rate limit by keeping
+ * track of the previous discard.
+ */
+ if (kbps_limit && discard_ctl->prev_discard) {
+ u64 bps_limit = ((u64)kbps_limit) * SZ_1K;
+ u64 bps_delay = div64_u64(discard_ctl->prev_discard *
+ NSEC_PER_SEC, bps_limit);
+
+ delay = max(delay, bps_delay);
+ }
+
+ /*
+ * This timeout is to hopefully prevent immediate discarding
+ * in a recently allocated block group.
+ */
+ if (now < block_group->discard_eligible_time) {
+ u64 bg_timeout = block_group->discard_eligible_time - now;
+
+ delay = max(delay, bg_timeout);
+ }
+
+ if (override && discard_ctl->prev_discard) {
+ u64 elapsed = now - discard_ctl->prev_discard_time;
+
+ if (delay > elapsed)
+ delay -= elapsed;
+ else
+ delay = 0;
+ }
+
+ mod_delayed_work(discard_ctl->discard_workers,
+ &discard_ctl->work, nsecs_to_jiffies(delay));
+ }
+}
+
+/*
+ * btrfs_discard_schedule_work - responsible for scheduling the discard work
+ * @discard_ctl: discard control
+ * @override: override the current timer
+ *
+ * Discards are issued by a delayed workqueue item. @override is used to
+ * update the current delay as the baseline delay interval is reevaluated on
+ * transaction commit. This is also maxed with any other rate limit.
+ */
+void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
+ bool override)
+{
+ const u64 now = ktime_get_ns();
+
+ spin_lock(&discard_ctl->lock);
+ __btrfs_discard_schedule_work(discard_ctl, now, override);
+ spin_unlock(&discard_ctl->lock);
+}
+
+/**
+ * btrfs_finish_discard_pass - determine next step of a block_group
+ * @discard_ctl: discard control
+ * @block_group: block_group of interest
+ *
+ * This determines the next step for a block group after it's finished going
+ * through a pass on a discard list. If it is unused and fully trimmed, we can
+ * mark it unused and send it to the unused_bgs path. Otherwise, pass it onto
+ * the appropriate filter list or let it fall off.
+ */
+static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl,
+ struct btrfs_block_group *block_group)
+{
+ remove_from_discard_list(discard_ctl, block_group);
+
+ if (block_group->used == 0) {
+ if (btrfs_is_free_space_trimmed(block_group))
+ btrfs_mark_bg_unused(block_group);
+ else
+ add_to_discard_unused_list(discard_ctl, block_group);
+ } else {
+ btrfs_update_discard_index(discard_ctl, block_group);
+ }
+}
+
+/**
+ * btrfs_discard_workfn - discard work function
+ * @work: work
+ *
+ * This finds the next block_group to start discarding and then discards a
+ * single region. It does this in a two-pass fashion: first extents and second
+ * bitmaps. Completely discarded block groups are sent to the unused_bgs path.
+ */
+static void btrfs_discard_workfn(struct work_struct *work)
+{
+ struct btrfs_discard_ctl *discard_ctl;
+ struct btrfs_block_group *block_group;
+ enum btrfs_discard_state discard_state;
+ int discard_index = 0;
+ u64 trimmed = 0;
+ u64 minlen = 0;
+ u64 now = ktime_get_ns();
+
+ discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);
+
+ block_group = peek_discard_list(discard_ctl, &discard_state,
+ &discard_index, now);
+ if (!block_group || !btrfs_run_discard_work(discard_ctl))
+ return;
+ if (now < block_group->discard_eligible_time) {
+ btrfs_discard_schedule_work(discard_ctl, false);
+ return;
+ }
+
+ /* Perform discarding */
+ minlen = discard_minlen[discard_index];
+
+ if (discard_state == BTRFS_DISCARD_BITMAPS) {
+ u64 maxlen = 0;
+
+ /*
+ * Use the previous levels minimum discard length as the max
+ * length filter. In the case something is added to make a
+ * region go beyond the max filter, the entire bitmap is set
+ * back to BTRFS_TRIM_STATE_UNTRIMMED.
+ */
+ if (discard_index != BTRFS_DISCARD_INDEX_UNUSED)
+ maxlen = discard_minlen[discard_index - 1];
+
+ btrfs_trim_block_group_bitmaps(block_group, &trimmed,
+ block_group->discard_cursor,
+ btrfs_block_group_end(block_group),
+ minlen, maxlen, true);
+ discard_ctl->discard_bitmap_bytes += trimmed;
+ } else {
+ btrfs_trim_block_group_extents(block_group, &trimmed,
+ block_group->discard_cursor,
+ btrfs_block_group_end(block_group),
+ minlen, true);
+ discard_ctl->discard_extent_bytes += trimmed;
+ }
+
+ /* Determine next steps for a block_group */
+ if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
+ if (discard_state == BTRFS_DISCARD_BITMAPS) {
+ btrfs_finish_discard_pass(discard_ctl, block_group);
+ } else {
+ block_group->discard_cursor = block_group->start;
+ spin_lock(&discard_ctl->lock);
+ if (block_group->discard_state !=
+ BTRFS_DISCARD_RESET_CURSOR)
+ block_group->discard_state =
+ BTRFS_DISCARD_BITMAPS;
+ spin_unlock(&discard_ctl->lock);
+ }
+ }
+
+ now = ktime_get_ns();
+ spin_lock(&discard_ctl->lock);
+ discard_ctl->prev_discard = trimmed;
+ discard_ctl->prev_discard_time = now;
+ /*
+ * If the block group was removed from the discard list while it was
+ * running in this workfn, then we didn't deref it, since this function
+ * still owned that reference. But we set the discard_ctl->block_group
+ * back to NULL, so we can use that condition to know that now we need
+ * to deref the block_group.
+ */
+ if (discard_ctl->block_group == NULL)
+ btrfs_put_block_group(block_group);
+ discard_ctl->block_group = NULL;
+ __btrfs_discard_schedule_work(discard_ctl, now, false);
+ spin_unlock(&discard_ctl->lock);
+}
+
+/**
+ * btrfs_run_discard_work - determines if async discard should be running
+ * @discard_ctl: discard control
+ *
+ * Checks if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
+ */
+bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl)
+{
+ struct btrfs_fs_info *fs_info = container_of(discard_ctl,
+ struct btrfs_fs_info,
+ discard_ctl);
+
+ return (!(fs_info->sb->s_flags & SB_RDONLY) &&
+ test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags));
+}
+
+/**
+ * btrfs_discard_calc_delay - recalculate the base delay
+ * @discard_ctl: discard control
+ *
+ * Recalculate the base delay which is based off the total number of
+ * discardable_extents. Clamp this between the lower_limit (iops_limit or 1ms)
+ * and the upper_limit (BTRFS_DISCARD_MAX_DELAY_MSEC).
+ */
+void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
+{
+ s32 discardable_extents;
+ s64 discardable_bytes;
+ u32 iops_limit;
+ unsigned long delay;
+
+ discardable_extents = atomic_read(&discard_ctl->discardable_extents);
+ if (!discardable_extents)
+ return;
+
+ spin_lock(&discard_ctl->lock);
+
+ /*
+ * The following is to fix a potential -1 discrepenancy that we're not
+ * sure how to reproduce. But given that this is the only place that
+ * utilizes these numbers and this is only called by from
+ * btrfs_finish_extent_commit() which is synchronized, we can correct
+ * here.
+ */
+ if (discardable_extents < 0)
+ atomic_add(-discardable_extents,
+ &discard_ctl->discardable_extents);
+
+ discardable_bytes = atomic64_read(&discard_ctl->discardable_bytes);
+ if (discardable_bytes < 0)
+ atomic64_add(-discardable_bytes,
+ &discard_ctl->discardable_bytes);
+
+ if (discardable_extents <= 0) {
+ spin_unlock(&discard_ctl->lock);
+ return;
+ }
+
+ iops_limit = READ_ONCE(discard_ctl->iops_limit);
+ if (iops_limit)
+ delay = MSEC_PER_SEC / iops_limit;
+ else
+ delay = BTRFS_DISCARD_TARGET_MSEC / discardable_extents;
+
+ delay = clamp(delay, BTRFS_DISCARD_MIN_DELAY_MSEC,
+ BTRFS_DISCARD_MAX_DELAY_MSEC);
+ discard_ctl->delay_ms = delay;
+
+ spin_unlock(&discard_ctl->lock);
+}
+
+/**
+ * btrfs_discard_update_discardable - propagate discard counters
+ * @block_group: block_group of interest
+ *
+ * This propagates deltas of counters up to the discard_ctl. It maintains a
+ * current counter and a previous counter passing the delta up to the global
+ * stat. Then the current counter value becomes the previous counter value.
+ */
+void btrfs_discard_update_discardable(struct btrfs_block_group *block_group)
+{
+ struct btrfs_free_space_ctl *ctl;
+ struct btrfs_discard_ctl *discard_ctl;
+ s32 extents_delta;
+ s64 bytes_delta;
+
+ if (!block_group ||
+ !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC) ||
+ !btrfs_is_block_group_data_only(block_group))
+ return;
+
+ ctl = block_group->free_space_ctl;
+ discard_ctl = &block_group->fs_info->discard_ctl;
+
+ lockdep_assert_held(&ctl->tree_lock);
+ extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] -
+ ctl->discardable_extents[BTRFS_STAT_PREV];
+ if (extents_delta) {
+ atomic_add(extents_delta, &discard_ctl->discardable_extents);
+ ctl->discardable_extents[BTRFS_STAT_PREV] =
+ ctl->discardable_extents[BTRFS_STAT_CURR];
+ }
+
+ bytes_delta = ctl->discardable_bytes[BTRFS_STAT_CURR] -
+ ctl->discardable_bytes[BTRFS_STAT_PREV];
+ if (bytes_delta) {
+ atomic64_add(bytes_delta, &discard_ctl->discardable_bytes);
+ ctl->discardable_bytes[BTRFS_STAT_PREV] =
+ ctl->discardable_bytes[BTRFS_STAT_CURR];
+ }
+}
+
+/**
+ * btrfs_discard_punt_unused_bgs_list - punt unused_bgs list to discard lists
+ * @fs_info: fs_info of interest
+ *
+ * The unused_bgs list needs to be punted to the discard lists because the
+ * order of operations is changed. In the normal synchronous discard path, the
+ * block groups are trimmed via a single large trim in transaction commit. This
+ * is ultimately what we are trying to avoid with asynchronous discard. Thus,
+ * it must be done before going down the unused_bgs path.
+ */
+void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_group *block_group, *next;
+
+ spin_lock(&fs_info->unused_bgs_lock);
+ /* We enabled async discard, so punt all to the queue */
+ list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs,
+ bg_list) {
+ list_del_init(&block_group->bg_list);
+ btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
+ /*
+ * This put is for the get done by btrfs_mark_bg_unused.
+ * Queueing discard incremented it for discard's reference.
+ */
+ btrfs_put_block_group(block_group);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+}
+
+/**
+ * btrfs_discard_purge_list - purge discard lists
+ * @discard_ctl: discard control
+ *
+ * If we are disabling async discard, we may have intercepted block groups that
+ * are completely free and ready for the unused_bgs path. As discarding will
+ * now happen in transaction commit or not at all, we can safely mark the
+ * corresponding block groups as unused and they will be sent on their merry
+ * way to the unused_bgs list.
+ */
+static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl)
+{
+ struct btrfs_block_group *block_group, *next;
+ int i;
+
+ spin_lock(&discard_ctl->lock);
+ for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
+ list_for_each_entry_safe(block_group, next,
+ &discard_ctl->discard_list[i],
+ discard_list) {
+ list_del_init(&block_group->discard_list);
+ spin_unlock(&discard_ctl->lock);
+ if (block_group->used == 0)
+ btrfs_mark_bg_unused(block_group);
+ spin_lock(&discard_ctl->lock);
+ btrfs_put_block_group(block_group);
+ }
+ }
+ spin_unlock(&discard_ctl->lock);
+}
+
+void btrfs_discard_resume(struct btrfs_fs_info *fs_info)
+{
+ if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
+ btrfs_discard_cleanup(fs_info);
+ return;
+ }
+
+ btrfs_discard_punt_unused_bgs_list(fs_info);
+
+ set_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
+}
+
+void btrfs_discard_stop(struct btrfs_fs_info *fs_info)
+{
+ clear_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
+}
+
+void btrfs_discard_init(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
+ int i;
+
+ spin_lock_init(&discard_ctl->lock);
+ INIT_DELAYED_WORK(&discard_ctl->work, btrfs_discard_workfn);
+
+ for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++)
+ INIT_LIST_HEAD(&discard_ctl->discard_list[i]);
+
+ discard_ctl->prev_discard = 0;
+ discard_ctl->prev_discard_time = 0;
+ atomic_set(&discard_ctl->discardable_extents, 0);
+ atomic64_set(&discard_ctl->discardable_bytes, 0);
+ discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE;
+ discard_ctl->delay_ms = BTRFS_DISCARD_MAX_DELAY_MSEC;
+ discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS;
+ discard_ctl->kbps_limit = 0;
+ discard_ctl->discard_extent_bytes = 0;
+ discard_ctl->discard_bitmap_bytes = 0;
+ atomic64_set(&discard_ctl->discard_bytes_saved, 0);
+}
+
+void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info)
+{
+ btrfs_discard_stop(fs_info);
+ cancel_delayed_work_sync(&fs_info->discard_ctl.work);
+ btrfs_discard_purge_list(&fs_info->discard_ctl);
+}
diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h
new file mode 100644
index 000000000..57b9202f4
--- /dev/null
+++ b/fs/btrfs/discard.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_DISCARD_H
+#define BTRFS_DISCARD_H
+
+#include <linux/sizes.h>
+
+struct btrfs_fs_info;
+struct btrfs_discard_ctl;
+struct btrfs_block_group;
+
+/* Discard size limits */
+#define BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE (SZ_64M)
+#define BTRFS_ASYNC_DISCARD_MAX_FILTER (SZ_1M)
+#define BTRFS_ASYNC_DISCARD_MIN_FILTER (SZ_32K)
+
+/* List operations */
+void btrfs_discard_check_filter(struct btrfs_block_group *block_group, u64 bytes);
+
+/* Work operations */
+void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
+ struct btrfs_block_group *block_group);
+void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
+ struct btrfs_block_group *block_group);
+void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
+ bool override);
+bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl);
+
+/* Update operations */
+void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl);
+void btrfs_discard_update_discardable(struct btrfs_block_group *block_group);
+
+/* Setup/cleanup operations */
+void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info);
+void btrfs_discard_resume(struct btrfs_fs_info *fs_info);
+void btrfs_discard_stop(struct btrfs_fs_info *fs_info);
+void btrfs_discard_init(struct btrfs_fs_info *fs_info);
+void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info);
+
+#endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
new file mode 100644
index 000000000..40152458e
--- /dev/null
+++ b/fs/btrfs/disk-io.c
@@ -0,0 +1,5355 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ */
+
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/radix-tree.h>
+#include <linux/writeback.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <linux/slab.h>
+#include <linux/migrate.h>
+#include <linux/ratelimit.h>
+#include <linux/uuid.h>
+#include <linux/semaphore.h>
+#include <linux/error-injection.h>
+#include <linux/crc32c.h>
+#include <linux/sched/mm.h>
+#include <asm/unaligned.h>
+#include <crypto/hash.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "volumes.h"
+#include "print-tree.h"
+#include "locking.h"
+#include "tree-log.h"
+#include "free-space-cache.h"
+#include "free-space-tree.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+#include "dev-replace.h"
+#include "raid56.h"
+#include "sysfs.h"
+#include "qgroup.h"
+#include "compression.h"
+#include "tree-checker.h"
+#include "ref-verify.h"
+#include "block-group.h"
+#include "discard.h"
+#include "space-info.h"
+#include "zoned.h"
+#include "subpage.h"
+
+#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\
+ BTRFS_HEADER_FLAG_RELOC |\
+ BTRFS_SUPER_FLAG_ERROR |\
+ BTRFS_SUPER_FLAG_SEEDING |\
+ BTRFS_SUPER_FLAG_METADUMP |\
+ BTRFS_SUPER_FLAG_METADUMP_V2)
+
+static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
+static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+ struct btrfs_fs_info *fs_info);
+static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
+static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *dirty_pages,
+ int mark);
+static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *pinned_extents);
+static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info);
+static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info);
+
+static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
+{
+ if (fs_info->csum_shash)
+ crypto_free_shash(fs_info->csum_shash);
+}
+
+/*
+ * async submit bios are used to offload expensive checksumming
+ * onto the worker threads. They checksum file and metadata bios
+ * just before they are sent down the IO stack.
+ */
+struct async_submit_bio {
+ struct inode *inode;
+ struct bio *bio;
+ extent_submit_bio_start_t *submit_bio_start;
+ int mirror_num;
+
+ /* Optional parameter for submit_bio_start used by direct io */
+ u64 dio_file_offset;
+ struct btrfs_work work;
+ blk_status_t status;
+};
+
+/*
+ * Compute the csum of a btree block and store the result to provided buffer.
+ */
+static void csum_tree_block(struct extent_buffer *buf, u8 *result)
+{
+ struct btrfs_fs_info *fs_info = buf->fs_info;
+ const int num_pages = num_extent_pages(buf);
+ const int first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
+ char *kaddr;
+ int i;
+
+ shash->tfm = fs_info->csum_shash;
+ crypto_shash_init(shash);
+ kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start);
+ crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
+ first_page_part - BTRFS_CSUM_SIZE);
+
+ for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 1; i++) {
+ kaddr = page_address(buf->pages[i]);
+ crypto_shash_update(shash, kaddr, PAGE_SIZE);
+ }
+ memset(result, 0, BTRFS_CSUM_SIZE);
+ crypto_shash_final(shash, result);
+}
+
+/*
+ * we can't consider a given block up to date unless the transid of the
+ * block matches the transid in the parent node's pointer. This is how we
+ * detect blocks that either didn't get written at all or got written
+ * in the wrong place.
+ */
+static int verify_parent_transid(struct extent_io_tree *io_tree,
+ struct extent_buffer *eb, u64 parent_transid,
+ int atomic)
+{
+ struct extent_state *cached_state = NULL;
+ int ret;
+
+ if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
+ return 0;
+
+ if (atomic)
+ return -EAGAIN;
+
+ lock_extent(io_tree, eb->start, eb->start + eb->len - 1, &cached_state);
+ if (extent_buffer_uptodate(eb) &&
+ btrfs_header_generation(eb) == parent_transid) {
+ ret = 0;
+ goto out;
+ }
+ btrfs_err_rl(eb->fs_info,
+"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu",
+ eb->start, eb->read_mirror,
+ parent_transid, btrfs_header_generation(eb));
+ ret = 1;
+ clear_extent_buffer_uptodate(eb);
+out:
+ unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
+ &cached_state);
+ return ret;
+}
+
+static bool btrfs_supported_super_csum(u16 csum_type)
+{
+ switch (csum_type) {
+ case BTRFS_CSUM_TYPE_CRC32:
+ case BTRFS_CSUM_TYPE_XXHASH:
+ case BTRFS_CSUM_TYPE_SHA256:
+ case BTRFS_CSUM_TYPE_BLAKE2:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/*
+ * Return 0 if the superblock checksum type matches the checksum value of that
+ * algorithm. Pass the raw disk superblock data.
+ */
+int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
+ const struct btrfs_super_block *disk_sb)
+{
+ char result[BTRFS_CSUM_SIZE];
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
+
+ shash->tfm = fs_info->csum_shash;
+
+ /*
+ * The super_block structure does not span the whole
+ * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is
+ * filled with zeros and is included in the checksum.
+ */
+ crypto_shash_digest(shash, (const u8 *)disk_sb + BTRFS_CSUM_SIZE,
+ BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result);
+
+ if (memcmp(disk_sb->csum, result, fs_info->csum_size))
+ return 1;
+
+ return 0;
+}
+
+int btrfs_verify_level_key(struct extent_buffer *eb, int level,
+ struct btrfs_key *first_key, u64 parent_transid)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ int found_level;
+ struct btrfs_key found_key;
+ int ret;
+
+ found_level = btrfs_header_level(eb);
+ if (found_level != level) {
+ WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
+ KERN_ERR "BTRFS: tree level check failed\n");
+ btrfs_err(fs_info,
+"tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
+ eb->start, level, found_level);
+ return -EIO;
+ }
+
+ if (!first_key)
+ return 0;
+
+ /*
+ * For live tree block (new tree blocks in current transaction),
+ * we need proper lock context to avoid race, which is impossible here.
+ * So we only checks tree blocks which is read from disk, whose
+ * generation <= fs_info->last_trans_committed.
+ */
+ if (btrfs_header_generation(eb) > fs_info->last_trans_committed)
+ return 0;
+
+ /* We have @first_key, so this @eb must have at least one item */
+ if (btrfs_header_nritems(eb) == 0) {
+ btrfs_err(fs_info,
+ "invalid tree nritems, bytenr=%llu nritems=0 expect >0",
+ eb->start);
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ return -EUCLEAN;
+ }
+
+ if (found_level)
+ btrfs_node_key_to_cpu(eb, &found_key, 0);
+ else
+ btrfs_item_key_to_cpu(eb, &found_key, 0);
+ ret = btrfs_comp_cpu_keys(first_key, &found_key);
+
+ if (ret) {
+ WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
+ KERN_ERR "BTRFS: tree first key check failed\n");
+ btrfs_err(fs_info,
+"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
+ eb->start, parent_transid, first_key->objectid,
+ first_key->type, first_key->offset,
+ found_key.objectid, found_key.type,
+ found_key.offset);
+ }
+ return ret;
+}
+
+/*
+ * helper to read a given tree block, doing retries as required when
+ * the checksums don't match and we have alternate mirrors to try.
+ *
+ * @parent_transid: expected transid, skip check if 0
+ * @level: expected level, mandatory check
+ * @first_key: expected key of first slot, skip check if NULL
+ */
+int btrfs_read_extent_buffer(struct extent_buffer *eb,
+ u64 parent_transid, int level,
+ struct btrfs_key *first_key)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct extent_io_tree *io_tree;
+ int failed = 0;
+ int ret;
+ int num_copies = 0;
+ int mirror_num = 0;
+ int failed_mirror = 0;
+
+ io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
+ while (1) {
+ clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
+ ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num);
+ if (!ret) {
+ if (verify_parent_transid(io_tree, eb,
+ parent_transid, 0))
+ ret = -EIO;
+ else if (btrfs_verify_level_key(eb, level,
+ first_key, parent_transid))
+ ret = -EUCLEAN;
+ else
+ break;
+ }
+
+ num_copies = btrfs_num_copies(fs_info,
+ eb->start, eb->len);
+ if (num_copies == 1)
+ break;
+
+ if (!failed_mirror) {
+ failed = 1;
+ failed_mirror = eb->read_mirror;
+ }
+
+ mirror_num++;
+ if (mirror_num == failed_mirror)
+ mirror_num++;
+
+ if (mirror_num > num_copies)
+ break;
+ }
+
+ if (failed && !ret && failed_mirror)
+ btrfs_repair_eb_io_failure(eb, failed_mirror);
+
+ return ret;
+}
+
+static int csum_one_extent_buffer(struct extent_buffer *eb)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ u8 result[BTRFS_CSUM_SIZE];
+ int ret;
+
+ ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
+ offsetof(struct btrfs_header, fsid),
+ BTRFS_FSID_SIZE) == 0);
+ csum_tree_block(eb, result);
+
+ if (btrfs_header_level(eb))
+ ret = btrfs_check_node(eb);
+ else
+ ret = btrfs_check_leaf_full(eb);
+
+ if (ret < 0)
+ goto error;
+
+ /*
+ * Also check the generation, the eb reached here must be newer than
+ * last committed. Or something seriously wrong happened.
+ */
+ if (unlikely(btrfs_header_generation(eb) <= fs_info->last_trans_committed)) {
+ ret = -EUCLEAN;
+ btrfs_err(fs_info,
+ "block=%llu bad generation, have %llu expect > %llu",
+ eb->start, btrfs_header_generation(eb),
+ fs_info->last_trans_committed);
+ goto error;
+ }
+ write_extent_buffer(eb, result, 0, fs_info->csum_size);
+
+ return 0;
+
+error:
+ btrfs_print_tree(eb, 0);
+ btrfs_err(fs_info, "block=%llu write time tree block corruption detected",
+ eb->start);
+ /*
+ * Be noisy if this is an extent buffer from a log tree. We don't abort
+ * a transaction in case there's a bad log tree extent buffer, we just
+ * fallback to a transaction commit. Still we want to know when there is
+ * a bad log tree extent buffer, as that may signal a bug somewhere.
+ */
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG) ||
+ btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID);
+ return ret;
+}
+
+/* Checksum all dirty extent buffers in one bio_vec */
+static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info,
+ struct bio_vec *bvec)
+{
+ struct page *page = bvec->bv_page;
+ u64 bvec_start = page_offset(page) + bvec->bv_offset;
+ u64 cur;
+ int ret = 0;
+
+ for (cur = bvec_start; cur < bvec_start + bvec->bv_len;
+ cur += fs_info->nodesize) {
+ struct extent_buffer *eb;
+ bool uptodate;
+
+ eb = find_extent_buffer(fs_info, cur);
+ uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur,
+ fs_info->nodesize);
+
+ /* A dirty eb shouldn't disappear from buffer_radix */
+ if (WARN_ON(!eb))
+ return -EUCLEAN;
+
+ if (WARN_ON(cur != btrfs_header_bytenr(eb))) {
+ free_extent_buffer(eb);
+ return -EUCLEAN;
+ }
+ if (WARN_ON(!uptodate)) {
+ free_extent_buffer(eb);
+ return -EUCLEAN;
+ }
+
+ ret = csum_one_extent_buffer(eb);
+ free_extent_buffer(eb);
+ if (ret < 0)
+ return ret;
+ }
+ return ret;
+}
+
+/*
+ * Checksum a dirty tree block before IO. This has extra checks to make sure
+ * we only fill in the checksum field in the first page of a multi-page block.
+ * For subpage extent buffers we need bvec to also read the offset in the page.
+ */
+static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec)
+{
+ struct page *page = bvec->bv_page;
+ u64 start = page_offset(page);
+ u64 found_start;
+ struct extent_buffer *eb;
+
+ if (fs_info->nodesize < PAGE_SIZE)
+ return csum_dirty_subpage_buffers(fs_info, bvec);
+
+ eb = (struct extent_buffer *)page->private;
+ if (page != eb->pages[0])
+ return 0;
+
+ found_start = btrfs_header_bytenr(eb);
+
+ if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) {
+ WARN_ON(found_start != 0);
+ return 0;
+ }
+
+ /*
+ * Please do not consolidate these warnings into a single if.
+ * It is useful to know what went wrong.
+ */
+ if (WARN_ON(found_start != start))
+ return -EUCLEAN;
+ if (WARN_ON(!PageUptodate(page)))
+ return -EUCLEAN;
+
+ return csum_one_extent_buffer(eb);
+}
+
+static int check_tree_block_fsid(struct extent_buffer *eb)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
+ u8 fsid[BTRFS_FSID_SIZE];
+ u8 *metadata_uuid;
+
+ read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid),
+ BTRFS_FSID_SIZE);
+ /*
+ * Checking the incompat flag is only valid for the current fs. For
+ * seed devices it's forbidden to have their uuid changed so reading
+ * ->fsid in this case is fine
+ */
+ if (btrfs_fs_incompat(fs_info, METADATA_UUID))
+ metadata_uuid = fs_devices->metadata_uuid;
+ else
+ metadata_uuid = fs_devices->fsid;
+
+ if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE))
+ return 0;
+
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
+ if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE))
+ return 0;
+
+ return 1;
+}
+
+/* Do basic extent buffer checks at read time */
+static int validate_extent_buffer(struct extent_buffer *eb)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ u64 found_start;
+ const u32 csum_size = fs_info->csum_size;
+ u8 found_level;
+ u8 result[BTRFS_CSUM_SIZE];
+ const u8 *header_csum;
+ int ret = 0;
+
+ found_start = btrfs_header_bytenr(eb);
+ if (found_start != eb->start) {
+ btrfs_err_rl(fs_info,
+ "bad tree block start, mirror %u want %llu have %llu",
+ eb->read_mirror, eb->start, found_start);
+ ret = -EIO;
+ goto out;
+ }
+ if (check_tree_block_fsid(eb)) {
+ btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u",
+ eb->start, eb->read_mirror);
+ ret = -EIO;
+ goto out;
+ }
+ found_level = btrfs_header_level(eb);
+ if (found_level >= BTRFS_MAX_LEVEL) {
+ btrfs_err(fs_info,
+ "bad tree block level, mirror %u level %d on logical %llu",
+ eb->read_mirror, btrfs_header_level(eb), eb->start);
+ ret = -EIO;
+ goto out;
+ }
+
+ csum_tree_block(eb, result);
+ header_csum = page_address(eb->pages[0]) +
+ get_eb_offset_in_page(eb, offsetof(struct btrfs_header, csum));
+
+ if (memcmp(result, header_csum, csum_size) != 0) {
+ btrfs_warn_rl(fs_info,
+"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d",
+ eb->start, eb->read_mirror,
+ CSUM_FMT_VALUE(csum_size, header_csum),
+ CSUM_FMT_VALUE(csum_size, result),
+ btrfs_header_level(eb));
+ ret = -EUCLEAN;
+ goto out;
+ }
+
+ /*
+ * If this is a leaf block and it is corrupt, set the corrupt bit so
+ * that we don't try and read the other copies of this block, just
+ * return -EIO.
+ */
+ if (found_level == 0 && btrfs_check_leaf_full(eb)) {
+ set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
+ ret = -EIO;
+ }
+
+ if (found_level > 0 && btrfs_check_node(eb))
+ ret = -EIO;
+
+ if (!ret)
+ set_extent_buffer_uptodate(eb);
+ else
+ btrfs_err(fs_info,
+ "read time tree block corruption detected on logical %llu mirror %u",
+ eb->start, eb->read_mirror);
+out:
+ return ret;
+}
+
+static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
+ int mirror)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct extent_buffer *eb;
+ bool reads_done;
+ int ret = 0;
+
+ /*
+ * We don't allow bio merge for subpage metadata read, so we should
+ * only get one eb for each endio hook.
+ */
+ ASSERT(end == start + fs_info->nodesize - 1);
+ ASSERT(PagePrivate(page));
+
+ eb = find_extent_buffer(fs_info, start);
+ /*
+ * When we are reading one tree block, eb must have been inserted into
+ * the radix tree. If not, something is wrong.
+ */
+ ASSERT(eb);
+
+ reads_done = atomic_dec_and_test(&eb->io_pages);
+ /* Subpage read must finish in page read */
+ ASSERT(reads_done);
+
+ eb->read_mirror = mirror;
+ if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
+ ret = -EIO;
+ goto err;
+ }
+ ret = validate_extent_buffer(eb);
+ if (ret < 0)
+ goto err;
+
+ set_extent_buffer_uptodate(eb);
+
+ free_extent_buffer(eb);
+ return ret;
+err:
+ /*
+ * end_bio_extent_readpage decrements io_pages in case of error,
+ * make sure it has something to decrement.
+ */
+ atomic_inc(&eb->io_pages);
+ clear_extent_buffer_uptodate(eb);
+ free_extent_buffer(eb);
+ return ret;
+}
+
+int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
+ struct page *page, u64 start, u64 end,
+ int mirror)
+{
+ struct extent_buffer *eb;
+ int ret = 0;
+ int reads_done;
+
+ ASSERT(page->private);
+
+ if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
+ return validate_subpage_buffer(page, start, end, mirror);
+
+ eb = (struct extent_buffer *)page->private;
+
+ /*
+ * The pending IO might have been the only thing that kept this buffer
+ * in memory. Make sure we have a ref for all this other checks
+ */
+ atomic_inc(&eb->refs);
+
+ reads_done = atomic_dec_and_test(&eb->io_pages);
+ if (!reads_done)
+ goto err;
+
+ eb->read_mirror = mirror;
+ if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
+ ret = -EIO;
+ goto err;
+ }
+ ret = validate_extent_buffer(eb);
+err:
+ if (ret) {
+ /*
+ * our io error hook is going to dec the io pages
+ * again, we have to make sure it has something
+ * to decrement
+ */
+ atomic_inc(&eb->io_pages);
+ clear_extent_buffer_uptodate(eb);
+ }
+ free_extent_buffer(eb);
+
+ return ret;
+}
+
+static void run_one_async_start(struct btrfs_work *work)
+{
+ struct async_submit_bio *async;
+ blk_status_t ret;
+
+ async = container_of(work, struct async_submit_bio, work);
+ ret = async->submit_bio_start(async->inode, async->bio,
+ async->dio_file_offset);
+ if (ret)
+ async->status = ret;
+}
+
+/*
+ * In order to insert checksums into the metadata in large chunks, we wait
+ * until bio submission time. All the pages in the bio are checksummed and
+ * sums are attached onto the ordered extent record.
+ *
+ * At IO completion time the csums attached on the ordered extent record are
+ * inserted into the tree.
+ */
+static void run_one_async_done(struct btrfs_work *work)
+{
+ struct async_submit_bio *async =
+ container_of(work, struct async_submit_bio, work);
+ struct inode *inode = async->inode;
+ struct btrfs_bio *bbio = btrfs_bio(async->bio);
+
+ /* If an error occurred we just want to clean up the bio and move on */
+ if (async->status) {
+ btrfs_bio_end_io(bbio, async->status);
+ return;
+ }
+
+ /*
+ * All of the bios that pass through here are from async helpers.
+ * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context.
+ * This changes nothing when cgroups aren't in use.
+ */
+ async->bio->bi_opf |= REQ_CGROUP_PUNT;
+ btrfs_submit_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num);
+}
+
+static void run_one_async_free(struct btrfs_work *work)
+{
+ struct async_submit_bio *async;
+
+ async = container_of(work, struct async_submit_bio, work);
+ kfree(async);
+}
+
+/*
+ * Submit bio to an async queue.
+ *
+ * Retrun:
+ * - true if the work has been succesfuly submitted
+ * - false in case of error
+ */
+bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num,
+ u64 dio_file_offset,
+ extent_submit_bio_start_t *submit_bio_start)
+{
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct async_submit_bio *async;
+
+ async = kmalloc(sizeof(*async), GFP_NOFS);
+ if (!async)
+ return false;
+
+ async->inode = inode;
+ async->bio = bio;
+ async->mirror_num = mirror_num;
+ async->submit_bio_start = submit_bio_start;
+
+ btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
+ run_one_async_free);
+
+ async->dio_file_offset = dio_file_offset;
+
+ async->status = 0;
+
+ if (op_is_sync(bio->bi_opf))
+ btrfs_queue_work(fs_info->hipri_workers, &async->work);
+ else
+ btrfs_queue_work(fs_info->workers, &async->work);
+ return true;
+}
+
+static blk_status_t btree_csum_one_bio(struct bio *bio)
+{
+ struct bio_vec *bvec;
+ struct btrfs_root *root;
+ int ret = 0;
+ struct bvec_iter_all iter_all;
+
+ ASSERT(!bio_flagged(bio, BIO_CLONED));
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ root = BTRFS_I(bvec->bv_page->mapping->host)->root;
+ ret = csum_dirty_buffer(root->fs_info, bvec);
+ if (ret)
+ break;
+ }
+
+ return errno_to_blk_status(ret);
+}
+
+static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio,
+ u64 dio_file_offset)
+{
+ /*
+ * when we're called for a write, we're already in the async
+ * submission context. Just jump into btrfs_submit_bio.
+ */
+ return btree_csum_one_bio(bio);
+}
+
+static bool should_async_write(struct btrfs_fs_info *fs_info,
+ struct btrfs_inode *bi)
+{
+ if (btrfs_is_zoned(fs_info))
+ return false;
+ if (atomic_read(&bi->sync_writers))
+ return false;
+ if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
+ return false;
+ return true;
+}
+
+void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_bio *bbio = btrfs_bio(bio);
+ blk_status_t ret;
+
+ bio->bi_opf |= REQ_META;
+
+ if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
+ btrfs_submit_bio(fs_info, bio, mirror_num);
+ return;
+ }
+
+ /*
+ * Kthread helpers are used to submit writes so that checksumming can
+ * happen in parallel across all CPUs.
+ */
+ if (should_async_write(fs_info, BTRFS_I(inode)) &&
+ btrfs_wq_submit_bio(inode, bio, mirror_num, 0, btree_submit_bio_start))
+ return;
+
+ ret = btree_csum_one_bio(bio);
+ if (ret) {
+ btrfs_bio_end_io(bbio, ret);
+ return;
+ }
+
+ btrfs_submit_bio(fs_info, bio, mirror_num);
+}
+
+#ifdef CONFIG_MIGRATION
+static int btree_migrate_folio(struct address_space *mapping,
+ struct folio *dst, struct folio *src, enum migrate_mode mode)
+{
+ /*
+ * we can't safely write a btree page from here,
+ * we haven't done the locking hook
+ */
+ if (folio_test_dirty(src))
+ return -EAGAIN;
+ /*
+ * Buffers may be managed in a filesystem specific way.
+ * We must have no buffers or drop them.
+ */
+ if (folio_get_private(src) &&
+ !filemap_release_folio(src, GFP_KERNEL))
+ return -EAGAIN;
+ return migrate_folio(mapping, dst, src, mode);
+}
+#else
+#define btree_migrate_folio NULL
+#endif
+
+static int btree_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct btrfs_fs_info *fs_info;
+ int ret;
+
+ if (wbc->sync_mode == WB_SYNC_NONE) {
+
+ if (wbc->for_kupdate)
+ return 0;
+
+ fs_info = BTRFS_I(mapping->host)->root->fs_info;
+ /* this is a bit racy, but that's ok */
+ ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
+ BTRFS_DIRTY_METADATA_THRESH,
+ fs_info->dirty_metadata_batch);
+ if (ret < 0)
+ return 0;
+ }
+ return btree_write_cache_pages(mapping, wbc);
+}
+
+static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags)
+{
+ if (folio_test_writeback(folio) || folio_test_dirty(folio))
+ return false;
+
+ return try_release_extent_buffer(&folio->page);
+}
+
+static void btree_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
+{
+ struct extent_io_tree *tree;
+ tree = &BTRFS_I(folio->mapping->host)->io_tree;
+ extent_invalidate_folio(tree, folio, offset);
+ btree_release_folio(folio, GFP_NOFS);
+ if (folio_get_private(folio)) {
+ btrfs_warn(BTRFS_I(folio->mapping->host)->root->fs_info,
+ "folio private not zero on folio %llu",
+ (unsigned long long)folio_pos(folio));
+ folio_detach_private(folio);
+ }
+}
+
+#ifdef DEBUG
+static bool btree_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
+ struct btrfs_subpage_info *spi = fs_info->subpage_info;
+ struct btrfs_subpage *subpage;
+ struct extent_buffer *eb;
+ int cur_bit = 0;
+ u64 page_start = folio_pos(folio);
+
+ if (fs_info->sectorsize == PAGE_SIZE) {
+ eb = folio_get_private(folio);
+ BUG_ON(!eb);
+ BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+ BUG_ON(!atomic_read(&eb->refs));
+ btrfs_assert_tree_write_locked(eb);
+ return filemap_dirty_folio(mapping, folio);
+ }
+
+ ASSERT(spi);
+ subpage = folio_get_private(folio);
+
+ for (cur_bit = spi->dirty_offset;
+ cur_bit < spi->dirty_offset + spi->bitmap_nr_bits;
+ cur_bit++) {
+ unsigned long flags;
+ u64 cur;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ if (!test_bit(cur_bit, subpage->bitmaps)) {
+ spin_unlock_irqrestore(&subpage->lock, flags);
+ continue;
+ }
+ spin_unlock_irqrestore(&subpage->lock, flags);
+ cur = page_start + cur_bit * fs_info->sectorsize;
+
+ eb = find_extent_buffer(fs_info, cur);
+ ASSERT(eb);
+ ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+ ASSERT(atomic_read(&eb->refs));
+ btrfs_assert_tree_write_locked(eb);
+ free_extent_buffer(eb);
+
+ cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits) - 1;
+ }
+ return filemap_dirty_folio(mapping, folio);
+}
+#else
+#define btree_dirty_folio filemap_dirty_folio
+#endif
+
+static const struct address_space_operations btree_aops = {
+ .writepages = btree_writepages,
+ .release_folio = btree_release_folio,
+ .invalidate_folio = btree_invalidate_folio,
+ .migrate_folio = btree_migrate_folio,
+ .dirty_folio = btree_dirty_folio,
+};
+
+struct extent_buffer *btrfs_find_create_tree_block(
+ struct btrfs_fs_info *fs_info,
+ u64 bytenr, u64 owner_root,
+ int level)
+{
+ if (btrfs_is_testing(fs_info))
+ return alloc_test_extent_buffer(fs_info, bytenr);
+ return alloc_extent_buffer(fs_info, bytenr, owner_root, level);
+}
+
+/*
+ * Read tree block at logical address @bytenr and do variant basic but critical
+ * verification.
+ *
+ * @owner_root: the objectid of the root owner for this block.
+ * @parent_transid: expected transid of this tree block, skip check if 0
+ * @level: expected level, mandatory check
+ * @first_key: expected key in slot 0, skip check if NULL
+ */
+struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 owner_root, u64 parent_transid,
+ int level, struct btrfs_key *first_key)
+{
+ struct extent_buffer *buf = NULL;
+ int ret;
+
+ buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
+ if (IS_ERR(buf))
+ return buf;
+
+ ret = btrfs_read_extent_buffer(buf, parent_transid, level, first_key);
+ if (ret) {
+ free_extent_buffer_stale(buf);
+ return ERR_PTR(ret);
+ }
+ if (btrfs_check_eb_owner(buf, owner_root)) {
+ free_extent_buffer_stale(buf);
+ return ERR_PTR(-EUCLEAN);
+ }
+ return buf;
+
+}
+
+void btrfs_clean_tree_block(struct extent_buffer *buf)
+{
+ struct btrfs_fs_info *fs_info = buf->fs_info;
+ if (btrfs_header_generation(buf) ==
+ fs_info->running_transaction->transid) {
+ btrfs_assert_tree_write_locked(buf);
+
+ if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
+ percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
+ -buf->len,
+ fs_info->dirty_metadata_batch);
+ clear_extent_buffer_dirty(buf);
+ }
+ }
+}
+
+static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
+ u64 objectid)
+{
+ bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
+
+ memset(&root->root_key, 0, sizeof(root->root_key));
+ memset(&root->root_item, 0, sizeof(root->root_item));
+ memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
+ root->fs_info = fs_info;
+ root->root_key.objectid = objectid;
+ root->node = NULL;
+ root->commit_root = NULL;
+ root->state = 0;
+ RB_CLEAR_NODE(&root->rb_node);
+
+ root->last_trans = 0;
+ root->free_objectid = 0;
+ root->nr_delalloc_inodes = 0;
+ root->nr_ordered_extents = 0;
+ root->inode_tree = RB_ROOT;
+ INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
+
+ btrfs_init_root_block_rsv(root);
+
+ INIT_LIST_HEAD(&root->dirty_list);
+ INIT_LIST_HEAD(&root->root_list);
+ INIT_LIST_HEAD(&root->delalloc_inodes);
+ INIT_LIST_HEAD(&root->delalloc_root);
+ INIT_LIST_HEAD(&root->ordered_extents);
+ INIT_LIST_HEAD(&root->ordered_root);
+ INIT_LIST_HEAD(&root->reloc_dirty_list);
+ INIT_LIST_HEAD(&root->logged_list[0]);
+ INIT_LIST_HEAD(&root->logged_list[1]);
+ spin_lock_init(&root->inode_lock);
+ spin_lock_init(&root->delalloc_lock);
+ spin_lock_init(&root->ordered_extent_lock);
+ spin_lock_init(&root->accounting_lock);
+ spin_lock_init(&root->log_extents_lock[0]);
+ spin_lock_init(&root->log_extents_lock[1]);
+ spin_lock_init(&root->qgroup_meta_rsv_lock);
+ mutex_init(&root->objectid_mutex);
+ mutex_init(&root->log_mutex);
+ mutex_init(&root->ordered_extent_mutex);
+ mutex_init(&root->delalloc_mutex);
+ init_waitqueue_head(&root->qgroup_flush_wait);
+ init_waitqueue_head(&root->log_writer_wait);
+ init_waitqueue_head(&root->log_commit_wait[0]);
+ init_waitqueue_head(&root->log_commit_wait[1]);
+ INIT_LIST_HEAD(&root->log_ctxs[0]);
+ INIT_LIST_HEAD(&root->log_ctxs[1]);
+ atomic_set(&root->log_commit[0], 0);
+ atomic_set(&root->log_commit[1], 0);
+ atomic_set(&root->log_writers, 0);
+ atomic_set(&root->log_batch, 0);
+ refcount_set(&root->refs, 1);
+ atomic_set(&root->snapshot_force_cow, 0);
+ atomic_set(&root->nr_swapfiles, 0);
+ root->log_transid = 0;
+ root->log_transid_committed = -1;
+ root->last_log_commit = 0;
+ root->anon_dev = 0;
+ if (!dummy) {
+ extent_io_tree_init(fs_info, &root->dirty_log_pages,
+ IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL);
+ extent_io_tree_init(fs_info, &root->log_csum_range,
+ IO_TREE_LOG_CSUM_RANGE, NULL);
+ }
+
+ spin_lock_init(&root->root_item_lock);
+ btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
+#ifdef CONFIG_BTRFS_DEBUG
+ INIT_LIST_HEAD(&root->leak_list);
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ list_add_tail(&root->leak_list, &fs_info->allocated_roots);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+#endif
+}
+
+static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
+ u64 objectid, gfp_t flags)
+{
+ struct btrfs_root *root = kzalloc(sizeof(*root), flags);
+ if (root)
+ __setup_root(root, fs_info, objectid);
+ return root;
+}
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+/* Should only be used by the testing infrastructure */
+struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root;
+
+ if (!fs_info)
+ return ERR_PTR(-EINVAL);
+
+ root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, GFP_KERNEL);
+ if (!root)
+ return ERR_PTR(-ENOMEM);
+
+ /* We don't use the stripesize in selftest, set it as sectorsize */
+ root->alloc_bytenr = 0;
+
+ return root;
+}
+#endif
+
+static int global_root_cmp(struct rb_node *a_node, const struct rb_node *b_node)
+{
+ const struct btrfs_root *a = rb_entry(a_node, struct btrfs_root, rb_node);
+ const struct btrfs_root *b = rb_entry(b_node, struct btrfs_root, rb_node);
+
+ return btrfs_comp_cpu_keys(&a->root_key, &b->root_key);
+}
+
+static int global_root_key_cmp(const void *k, const struct rb_node *node)
+{
+ const struct btrfs_key *key = k;
+ const struct btrfs_root *root = rb_entry(node, struct btrfs_root, rb_node);
+
+ return btrfs_comp_cpu_keys(key, &root->root_key);
+}
+
+int btrfs_global_root_insert(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct rb_node *tmp;
+ int ret = 0;
+
+ write_lock(&fs_info->global_root_lock);
+ tmp = rb_find_add(&root->rb_node, &fs_info->global_root_tree, global_root_cmp);
+ write_unlock(&fs_info->global_root_lock);
+
+ if (tmp) {
+ ret = -EEXIST;
+ btrfs_warn(fs_info, "global root %llu %llu already exists",
+ root->root_key.objectid, root->root_key.offset);
+ }
+ return ret;
+}
+
+void btrfs_global_root_delete(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ write_lock(&fs_info->global_root_lock);
+ rb_erase(&root->rb_node, &fs_info->global_root_tree);
+ write_unlock(&fs_info->global_root_lock);
+}
+
+struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_key *key)
+{
+ struct rb_node *node;
+ struct btrfs_root *root = NULL;
+
+ read_lock(&fs_info->global_root_lock);
+ node = rb_find(key, &fs_info->global_root_tree, global_root_key_cmp);
+ if (node)
+ root = container_of(node, struct btrfs_root, rb_node);
+ read_unlock(&fs_info->global_root_lock);
+
+ return root;
+}
+
+static u64 btrfs_global_root_id(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+ struct btrfs_block_group *block_group;
+ u64 ret;
+
+ if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ return 0;
+
+ if (bytenr)
+ block_group = btrfs_lookup_block_group(fs_info, bytenr);
+ else
+ block_group = btrfs_lookup_first_block_group(fs_info, bytenr);
+ ASSERT(block_group);
+ if (!block_group)
+ return 0;
+ ret = block_group->global_root_id;
+ btrfs_put_block_group(block_group);
+
+ return ret;
+}
+
+struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+ struct btrfs_key key = {
+ .objectid = BTRFS_CSUM_TREE_OBJECTID,
+ .type = BTRFS_ROOT_ITEM_KEY,
+ .offset = btrfs_global_root_id(fs_info, bytenr),
+ };
+
+ return btrfs_global_root(fs_info, &key);
+}
+
+struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+ struct btrfs_key key = {
+ .objectid = BTRFS_EXTENT_TREE_OBJECTID,
+ .type = BTRFS_ROOT_ITEM_KEY,
+ .offset = btrfs_global_root_id(fs_info, bytenr),
+ };
+
+ return btrfs_global_root(fs_info, &key);
+}
+
+struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
+ u64 objectid)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct extent_buffer *leaf;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_root *root;
+ struct btrfs_key key;
+ unsigned int nofs_flag;
+ int ret = 0;
+
+ /*
+ * We're holding a transaction handle, so use a NOFS memory allocation
+ * context to avoid deadlock if reclaim happens.
+ */
+ nofs_flag = memalloc_nofs_save();
+ root = btrfs_alloc_root(fs_info, objectid, GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
+ if (!root)
+ return ERR_PTR(-ENOMEM);
+
+ root->root_key.objectid = objectid;
+ root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root->root_key.offset = 0;
+
+ leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
+ BTRFS_NESTING_NORMAL);
+ if (IS_ERR(leaf)) {
+ ret = PTR_ERR(leaf);
+ leaf = NULL;
+ goto fail_unlock;
+ }
+
+ root->node = leaf;
+ btrfs_mark_buffer_dirty(leaf);
+
+ root->commit_root = btrfs_root_node(root);
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+
+ btrfs_set_root_flags(&root->root_item, 0);
+ btrfs_set_root_limit(&root->root_item, 0);
+ btrfs_set_root_bytenr(&root->root_item, leaf->start);
+ btrfs_set_root_generation(&root->root_item, trans->transid);
+ btrfs_set_root_level(&root->root_item, 0);
+ btrfs_set_root_refs(&root->root_item, 1);
+ btrfs_set_root_used(&root->root_item, leaf->len);
+ btrfs_set_root_last_snapshot(&root->root_item, 0);
+ btrfs_set_root_dirid(&root->root_item, 0);
+ if (is_fstree(objectid))
+ generate_random_guid(root->root_item.uuid);
+ else
+ export_guid(root->root_item.uuid, &guid_null);
+ btrfs_set_root_drop_level(&root->root_item, 0);
+
+ btrfs_tree_unlock(leaf);
+
+ key.objectid = objectid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = 0;
+ ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
+ if (ret)
+ goto fail;
+
+ return root;
+
+fail_unlock:
+ if (leaf)
+ btrfs_tree_unlock(leaf);
+fail:
+ btrfs_put_root(root);
+
+ return ERR_PTR(ret);
+}
+
+static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root;
+
+ root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS);
+ if (!root)
+ return ERR_PTR(-ENOMEM);
+
+ root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
+ root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
+
+ return root;
+}
+
+int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct extent_buffer *leaf;
+
+ /*
+ * DON'T set SHAREABLE bit for log trees.
+ *
+ * Log trees are not exposed to user space thus can't be snapshotted,
+ * and they go away before a real commit is actually done.
+ *
+ * They do store pointers to file data extents, and those reference
+ * counts still get updated (along with back refs to the log tree).
+ */
+
+ leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
+ NULL, 0, 0, 0, BTRFS_NESTING_NORMAL);
+ if (IS_ERR(leaf))
+ return PTR_ERR(leaf);
+
+ root->node = leaf;
+
+ btrfs_mark_buffer_dirty(root->node);
+ btrfs_tree_unlock(root->node);
+
+ return 0;
+}
+
+int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *log_root;
+
+ log_root = alloc_log_tree(trans, fs_info);
+ if (IS_ERR(log_root))
+ return PTR_ERR(log_root);
+
+ if (!btrfs_is_zoned(fs_info)) {
+ int ret = btrfs_alloc_log_tree_node(trans, log_root);
+
+ if (ret) {
+ btrfs_put_root(log_root);
+ return ret;
+ }
+ }
+
+ WARN_ON(fs_info->log_root_tree);
+ fs_info->log_root_tree = log_root;
+ return 0;
+}
+
+int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_root *log_root;
+ struct btrfs_inode_item *inode_item;
+ int ret;
+
+ log_root = alloc_log_tree(trans, fs_info);
+ if (IS_ERR(log_root))
+ return PTR_ERR(log_root);
+
+ ret = btrfs_alloc_log_tree_node(trans, log_root);
+ if (ret) {
+ btrfs_put_root(log_root);
+ return ret;
+ }
+
+ log_root->last_trans = trans->transid;
+ log_root->root_key.offset = root->root_key.objectid;
+
+ inode_item = &log_root->root_item.inode;
+ btrfs_set_stack_inode_generation(inode_item, 1);
+ btrfs_set_stack_inode_size(inode_item, 3);
+ btrfs_set_stack_inode_nlink(inode_item, 1);
+ btrfs_set_stack_inode_nbytes(inode_item,
+ fs_info->nodesize);
+ btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
+
+ btrfs_set_root_node(&log_root->root_item, log_root->node);
+
+ WARN_ON(root->log_root);
+ root->log_root = log_root;
+ root->log_transid = 0;
+ root->log_transid_committed = -1;
+ root->last_log_commit = 0;
+ return 0;
+}
+
+static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
+ struct btrfs_path *path,
+ struct btrfs_key *key)
+{
+ struct btrfs_root *root;
+ struct btrfs_fs_info *fs_info = tree_root->fs_info;
+ u64 generation;
+ int ret;
+ int level;
+
+ root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS);
+ if (!root)
+ return ERR_PTR(-ENOMEM);
+
+ ret = btrfs_find_root(tree_root, key, path,
+ &root->root_item, &root->root_key);
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+ goto fail;
+ }
+
+ generation = btrfs_root_generation(&root->root_item);
+ level = btrfs_root_level(&root->root_item);
+ root->node = read_tree_block(fs_info,
+ btrfs_root_bytenr(&root->root_item),
+ key->objectid, generation, level, NULL);
+ if (IS_ERR(root->node)) {
+ ret = PTR_ERR(root->node);
+ root->node = NULL;
+ goto fail;
+ }
+ if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
+ ret = -EIO;
+ goto fail;
+ }
+
+ /*
+ * For real fs, and not log/reloc trees, root owner must
+ * match its root node owner
+ */
+ if (!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state) &&
+ root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
+ root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
+ root->root_key.objectid != btrfs_header_owner(root->node)) {
+ btrfs_crit(fs_info,
+"root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu",
+ root->root_key.objectid, root->node->start,
+ btrfs_header_owner(root->node),
+ root->root_key.objectid);
+ ret = -EUCLEAN;
+ goto fail;
+ }
+ root->commit_root = btrfs_root_node(root);
+ return root;
+fail:
+ btrfs_put_root(root);
+ return ERR_PTR(ret);
+}
+
+struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
+ struct btrfs_key *key)
+{
+ struct btrfs_root *root;
+ struct btrfs_path *path;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return ERR_PTR(-ENOMEM);
+ root = read_tree_root_path(tree_root, path, key);
+ btrfs_free_path(path);
+
+ return root;
+}
+
+/*
+ * Initialize subvolume root in-memory structure
+ *
+ * @anon_dev: anonymous device to attach to the root, if zero, allocate new
+ */
+static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
+{
+ int ret;
+ unsigned int nofs_flag;
+
+ /*
+ * We might be called under a transaction (e.g. indirect backref
+ * resolution) which could deadlock if it triggers memory reclaim
+ */
+ nofs_flag = memalloc_nofs_save();
+ ret = btrfs_drew_lock_init(&root->snapshot_lock);
+ memalloc_nofs_restore(nofs_flag);
+ if (ret)
+ goto fail;
+
+ if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
+ !btrfs_is_data_reloc_root(root) &&
+ is_fstree(root->root_key.objectid)) {
+ set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
+ btrfs_check_and_init_root_item(&root->root_item);
+ }
+
+ /*
+ * Don't assign anonymous block device to roots that are not exposed to
+ * userspace, the id pool is limited to 1M
+ */
+ if (is_fstree(root->root_key.objectid) &&
+ btrfs_root_refs(&root->root_item) > 0) {
+ if (!anon_dev) {
+ ret = get_anon_bdev(&root->anon_dev);
+ if (ret)
+ goto fail;
+ } else {
+ root->anon_dev = anon_dev;
+ }
+ }
+
+ mutex_lock(&root->objectid_mutex);
+ ret = btrfs_init_root_free_objectid(root);
+ if (ret) {
+ mutex_unlock(&root->objectid_mutex);
+ goto fail;
+ }
+
+ ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
+
+ mutex_unlock(&root->objectid_mutex);
+
+ return 0;
+fail:
+ /* The caller is responsible to call btrfs_free_fs_root */
+ return ret;
+}
+
+static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+ u64 root_id)
+{
+ struct btrfs_root *root;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ root = radix_tree_lookup(&fs_info->fs_roots_radix,
+ (unsigned long)root_id);
+ if (root)
+ root = btrfs_grab_root(root);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ return root;
+}
+
+static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
+ u64 objectid)
+{
+ struct btrfs_key key = {
+ .objectid = objectid,
+ .type = BTRFS_ROOT_ITEM_KEY,
+ .offset = 0,
+ };
+
+ if (objectid == BTRFS_ROOT_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->tree_root);
+ if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ return btrfs_grab_root(btrfs_global_root(fs_info, &key));
+ if (objectid == BTRFS_CHUNK_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->chunk_root);
+ if (objectid == BTRFS_DEV_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->dev_root);
+ if (objectid == BTRFS_CSUM_TREE_OBJECTID)
+ return btrfs_grab_root(btrfs_global_root(fs_info, &key));
+ if (objectid == BTRFS_QUOTA_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->quota_root) ?
+ fs_info->quota_root : ERR_PTR(-ENOENT);
+ if (objectid == BTRFS_UUID_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->uuid_root) ?
+ fs_info->uuid_root : ERR_PTR(-ENOENT);
+ if (objectid == BTRFS_BLOCK_GROUP_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->block_group_root) ?
+ fs_info->block_group_root : ERR_PTR(-ENOENT);
+ if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) {
+ struct btrfs_root *root = btrfs_global_root(fs_info, &key);
+
+ return btrfs_grab_root(root) ? root : ERR_PTR(-ENOENT);
+ }
+ return NULL;
+}
+
+int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root)
+{
+ int ret;
+
+ ret = radix_tree_preload(GFP_NOFS);
+ if (ret)
+ return ret;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ ret = radix_tree_insert(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ root);
+ if (ret == 0) {
+ btrfs_grab_root(root);
+ set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
+ }
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ radix_tree_preload_end();
+
+ return ret;
+}
+
+void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
+{
+#ifdef CONFIG_BTRFS_DEBUG
+ struct btrfs_root *root;
+
+ while (!list_empty(&fs_info->allocated_roots)) {
+ char buf[BTRFS_ROOT_NAME_BUF_LEN];
+
+ root = list_first_entry(&fs_info->allocated_roots,
+ struct btrfs_root, leak_list);
+ btrfs_err(fs_info, "leaked root %s refcount %d",
+ btrfs_root_name(&root->root_key, buf),
+ refcount_read(&root->refs));
+ while (refcount_read(&root->refs) > 1)
+ btrfs_put_root(root);
+ btrfs_put_root(root);
+ }
+#endif
+}
+
+static void free_global_roots(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root;
+ struct rb_node *node;
+
+ while ((node = rb_first_postorder(&fs_info->global_root_tree)) != NULL) {
+ root = rb_entry(node, struct btrfs_root, rb_node);
+ rb_erase(&root->rb_node, &fs_info->global_root_tree);
+ btrfs_put_root(root);
+ }
+}
+
+void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
+{
+ percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
+ percpu_counter_destroy(&fs_info->delalloc_bytes);
+ percpu_counter_destroy(&fs_info->ordered_bytes);
+ percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
+ btrfs_free_csum_hash(fs_info);
+ btrfs_free_stripe_hash_table(fs_info);
+ btrfs_free_ref_cache(fs_info);
+ kfree(fs_info->balance_ctl);
+ kfree(fs_info->delayed_root);
+ free_global_roots(fs_info);
+ btrfs_put_root(fs_info->tree_root);
+ btrfs_put_root(fs_info->chunk_root);
+ btrfs_put_root(fs_info->dev_root);
+ btrfs_put_root(fs_info->quota_root);
+ btrfs_put_root(fs_info->uuid_root);
+ btrfs_put_root(fs_info->fs_root);
+ btrfs_put_root(fs_info->data_reloc_root);
+ btrfs_put_root(fs_info->block_group_root);
+ btrfs_check_leaked_roots(fs_info);
+ btrfs_extent_buffer_leak_debug_check(fs_info);
+ kfree(fs_info->super_copy);
+ kfree(fs_info->super_for_commit);
+ kfree(fs_info->subpage_info);
+ kvfree(fs_info);
+}
+
+
+/*
+ * Get an in-memory reference of a root structure.
+ *
+ * For essential trees like root/extent tree, we grab it from fs_info directly.
+ * For subvolume trees, we check the cached filesystem roots first. If not
+ * found, then read it from disk and add it to cached fs roots.
+ *
+ * Caller should release the root by calling btrfs_put_root() after the usage.
+ *
+ * NOTE: Reloc and log trees can't be read by this function as they share the
+ * same root objectid.
+ *
+ * @objectid: root id
+ * @anon_dev: preallocated anonymous block device number for new roots,
+ * pass 0 for new allocation.
+ * @check_ref: whether to check root item references, If true, return -ENOENT
+ * for orphan roots
+ */
+static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
+ u64 objectid, dev_t anon_dev,
+ bool check_ref)
+{
+ struct btrfs_root *root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret;
+
+ root = btrfs_get_global_root(fs_info, objectid);
+ if (root)
+ return root;
+again:
+ root = btrfs_lookup_fs_root(fs_info, objectid);
+ if (root) {
+ /* Shouldn't get preallocated anon_dev for cached roots */
+ ASSERT(!anon_dev);
+ if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
+ btrfs_put_root(root);
+ return ERR_PTR(-ENOENT);
+ }
+ return root;
+ }
+
+ key.objectid = objectid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+ root = btrfs_read_tree_root(fs_info->tree_root, &key);
+ if (IS_ERR(root))
+ return root;
+
+ if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
+ ret = -ENOENT;
+ goto fail;
+ }
+
+ ret = btrfs_init_fs_root(root, anon_dev);
+ if (ret)
+ goto fail;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ key.objectid = BTRFS_ORPHAN_OBJECTID;
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
+ key.offset = objectid;
+
+ ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
+ btrfs_free_path(path);
+ if (ret < 0)
+ goto fail;
+ if (ret == 0)
+ set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
+
+ ret = btrfs_insert_fs_root(fs_info, root);
+ if (ret) {
+ if (ret == -EEXIST) {
+ btrfs_put_root(root);
+ goto again;
+ }
+ goto fail;
+ }
+ return root;
+fail:
+ /*
+ * If our caller provided us an anonymous device, then it's his
+ * responsibility to free it in case we fail. So we have to set our
+ * root's anon_dev to 0 to avoid a double free, once by btrfs_put_root()
+ * and once again by our caller.
+ */
+ if (anon_dev)
+ root->anon_dev = 0;
+ btrfs_put_root(root);
+ return ERR_PTR(ret);
+}
+
+/*
+ * Get in-memory reference of a root structure
+ *
+ * @objectid: tree objectid
+ * @check_ref: if set, verify that the tree exists and the item has at least
+ * one reference
+ */
+struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
+ u64 objectid, bool check_ref)
+{
+ return btrfs_get_root_ref(fs_info, objectid, 0, check_ref);
+}
+
+/*
+ * Get in-memory reference of a root structure, created as new, optionally pass
+ * the anonymous block device id
+ *
+ * @objectid: tree objectid
+ * @anon_dev: if zero, allocate a new anonymous block device or use the
+ * parameter value
+ */
+struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
+ u64 objectid, dev_t anon_dev)
+{
+ return btrfs_get_root_ref(fs_info, objectid, anon_dev, true);
+}
+
+/*
+ * btrfs_get_fs_root_commit_root - return a root for the given objectid
+ * @fs_info: the fs_info
+ * @objectid: the objectid we need to lookup
+ *
+ * This is exclusively used for backref walking, and exists specifically because
+ * of how qgroups does lookups. Qgroups will do a backref lookup at delayed ref
+ * creation time, which means we may have to read the tree_root in order to look
+ * up a fs root that is not in memory. If the root is not in memory we will
+ * read the tree root commit root and look up the fs root from there. This is a
+ * temporary root, it will not be inserted into the radix tree as it doesn't
+ * have the most uptodate information, it'll simply be discarded once the
+ * backref code is finished using the root.
+ */
+struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ u64 objectid)
+{
+ struct btrfs_root *root;
+ struct btrfs_key key;
+
+ ASSERT(path->search_commit_root && path->skip_locking);
+
+ /*
+ * This can return -ENOENT if we ask for a root that doesn't exist, but
+ * since this is called via the backref walking code we won't be looking
+ * up a root that doesn't exist, unless there's corruption. So if root
+ * != NULL just return it.
+ */
+ root = btrfs_get_global_root(fs_info, objectid);
+ if (root)
+ return root;
+
+ root = btrfs_lookup_fs_root(fs_info, objectid);
+ if (root)
+ return root;
+
+ key.objectid = objectid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+ root = read_tree_root_path(fs_info->tree_root, path, &key);
+ btrfs_release_path(path);
+
+ return root;
+}
+
+static int cleaner_kthread(void *arg)
+{
+ struct btrfs_fs_info *fs_info = arg;
+ int again;
+
+ while (1) {
+ again = 0;
+
+ set_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
+
+ /* Make the cleaner go to sleep early. */
+ if (btrfs_need_cleaner_sleep(fs_info))
+ goto sleep;
+
+ /*
+ * Do not do anything if we might cause open_ctree() to block
+ * before we have finished mounting the filesystem.
+ */
+ if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
+ goto sleep;
+
+ if (!mutex_trylock(&fs_info->cleaner_mutex))
+ goto sleep;
+
+ /*
+ * Avoid the problem that we change the status of the fs
+ * during the above check and trylock.
+ */
+ if (btrfs_need_cleaner_sleep(fs_info)) {
+ mutex_unlock(&fs_info->cleaner_mutex);
+ goto sleep;
+ }
+
+ btrfs_run_delayed_iputs(fs_info);
+
+ again = btrfs_clean_one_deleted_snapshot(fs_info);
+ mutex_unlock(&fs_info->cleaner_mutex);
+
+ /*
+ * The defragger has dealt with the R/O remount and umount,
+ * needn't do anything special here.
+ */
+ btrfs_run_defrag_inodes(fs_info);
+
+ /*
+ * Acquires fs_info->reclaim_bgs_lock to avoid racing
+ * with relocation (btrfs_relocate_chunk) and relocation
+ * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
+ * after acquiring fs_info->reclaim_bgs_lock. So we
+ * can't hold, nor need to, fs_info->cleaner_mutex when deleting
+ * unused block groups.
+ */
+ btrfs_delete_unused_bgs(fs_info);
+
+ /*
+ * Reclaim block groups in the reclaim_bgs list after we deleted
+ * all unused block_groups. This possibly gives us some more free
+ * space.
+ */
+ btrfs_reclaim_bgs(fs_info);
+sleep:
+ clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
+ if (kthread_should_park())
+ kthread_parkme();
+ if (kthread_should_stop())
+ return 0;
+ if (!again) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ __set_current_state(TASK_RUNNING);
+ }
+ }
+}
+
+static int transaction_kthread(void *arg)
+{
+ struct btrfs_root *root = arg;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_transaction *cur;
+ u64 transid;
+ time64_t delta;
+ unsigned long delay;
+ bool cannot_commit;
+
+ do {
+ cannot_commit = false;
+ delay = msecs_to_jiffies(fs_info->commit_interval * 1000);
+ mutex_lock(&fs_info->transaction_kthread_mutex);
+
+ spin_lock(&fs_info->trans_lock);
+ cur = fs_info->running_transaction;
+ if (!cur) {
+ spin_unlock(&fs_info->trans_lock);
+ goto sleep;
+ }
+
+ delta = ktime_get_seconds() - cur->start_time;
+ if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) &&
+ cur->state < TRANS_STATE_COMMIT_START &&
+ delta < fs_info->commit_interval) {
+ spin_unlock(&fs_info->trans_lock);
+ delay -= msecs_to_jiffies((delta - 1) * 1000);
+ delay = min(delay,
+ msecs_to_jiffies(fs_info->commit_interval * 1000));
+ goto sleep;
+ }
+ transid = cur->transid;
+ spin_unlock(&fs_info->trans_lock);
+
+ /* If the file system is aborted, this will always fail. */
+ trans = btrfs_attach_transaction(root);
+ if (IS_ERR(trans)) {
+ if (PTR_ERR(trans) != -ENOENT)
+ cannot_commit = true;
+ goto sleep;
+ }
+ if (transid == trans->transid) {
+ btrfs_commit_transaction(trans);
+ } else {
+ btrfs_end_transaction(trans);
+ }
+sleep:
+ wake_up_process(fs_info->cleaner_kthread);
+ mutex_unlock(&fs_info->transaction_kthread_mutex);
+
+ if (BTRFS_FS_ERROR(fs_info))
+ btrfs_cleanup_transaction(fs_info);
+ if (!kthread_should_stop() &&
+ (!btrfs_transaction_blocked(fs_info) ||
+ cannot_commit))
+ schedule_timeout_interruptible(delay);
+ } while (!kthread_should_stop());
+ return 0;
+}
+
+/*
+ * This will find the highest generation in the array of root backups. The
+ * index of the highest array is returned, or -EINVAL if we can't find
+ * anything.
+ *
+ * We check to make sure the array is valid by comparing the
+ * generation of the latest root in the array with the generation
+ * in the super block. If they don't match we pitch it.
+ */
+static int find_newest_super_backup(struct btrfs_fs_info *info)
+{
+ const u64 newest_gen = btrfs_super_generation(info->super_copy);
+ u64 cur;
+ struct btrfs_root_backup *root_backup;
+ int i;
+
+ for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
+ root_backup = info->super_copy->super_roots + i;
+ cur = btrfs_backup_tree_root_gen(root_backup);
+ if (cur == newest_gen)
+ return i;
+ }
+
+ return -EINVAL;
+}
+
+/*
+ * copy all the root pointers into the super backup array.
+ * this will bump the backup pointer by one when it is
+ * done
+ */
+static void backup_super_roots(struct btrfs_fs_info *info)
+{
+ const int next_backup = info->backup_root_index;
+ struct btrfs_root_backup *root_backup;
+
+ root_backup = info->super_for_commit->super_roots + next_backup;
+
+ /*
+ * make sure all of our padding and empty slots get zero filled
+ * regardless of which ones we use today
+ */
+ memset(root_backup, 0, sizeof(*root_backup));
+
+ info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
+
+ btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
+ btrfs_set_backup_tree_root_gen(root_backup,
+ btrfs_header_generation(info->tree_root->node));
+
+ btrfs_set_backup_tree_root_level(root_backup,
+ btrfs_header_level(info->tree_root->node));
+
+ btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
+ btrfs_set_backup_chunk_root_gen(root_backup,
+ btrfs_header_generation(info->chunk_root->node));
+ btrfs_set_backup_chunk_root_level(root_backup,
+ btrfs_header_level(info->chunk_root->node));
+
+ if (!btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE)) {
+ struct btrfs_root *extent_root = btrfs_extent_root(info, 0);
+ struct btrfs_root *csum_root = btrfs_csum_root(info, 0);
+
+ btrfs_set_backup_extent_root(root_backup,
+ extent_root->node->start);
+ btrfs_set_backup_extent_root_gen(root_backup,
+ btrfs_header_generation(extent_root->node));
+ btrfs_set_backup_extent_root_level(root_backup,
+ btrfs_header_level(extent_root->node));
+
+ btrfs_set_backup_csum_root(root_backup, csum_root->node->start);
+ btrfs_set_backup_csum_root_gen(root_backup,
+ btrfs_header_generation(csum_root->node));
+ btrfs_set_backup_csum_root_level(root_backup,
+ btrfs_header_level(csum_root->node));
+ }
+
+ /*
+ * we might commit during log recovery, which happens before we set
+ * the fs_root. Make sure it is valid before we fill it in.
+ */
+ if (info->fs_root && info->fs_root->node) {
+ btrfs_set_backup_fs_root(root_backup,
+ info->fs_root->node->start);
+ btrfs_set_backup_fs_root_gen(root_backup,
+ btrfs_header_generation(info->fs_root->node));
+ btrfs_set_backup_fs_root_level(root_backup,
+ btrfs_header_level(info->fs_root->node));
+ }
+
+ btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
+ btrfs_set_backup_dev_root_gen(root_backup,
+ btrfs_header_generation(info->dev_root->node));
+ btrfs_set_backup_dev_root_level(root_backup,
+ btrfs_header_level(info->dev_root->node));
+
+ btrfs_set_backup_total_bytes(root_backup,
+ btrfs_super_total_bytes(info->super_copy));
+ btrfs_set_backup_bytes_used(root_backup,
+ btrfs_super_bytes_used(info->super_copy));
+ btrfs_set_backup_num_devices(root_backup,
+ btrfs_super_num_devices(info->super_copy));
+
+ /*
+ * if we don't copy this out to the super_copy, it won't get remembered
+ * for the next commit
+ */
+ memcpy(&info->super_copy->super_roots,
+ &info->super_for_commit->super_roots,
+ sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
+}
+
+/*
+ * read_backup_root - Reads a backup root based on the passed priority. Prio 0
+ * is the newest, prio 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots
+ *
+ * fs_info - filesystem whose backup roots need to be read
+ * priority - priority of backup root required
+ *
+ * Returns backup root index on success and -EINVAL otherwise.
+ */
+static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority)
+{
+ int backup_index = find_newest_super_backup(fs_info);
+ struct btrfs_super_block *super = fs_info->super_copy;
+ struct btrfs_root_backup *root_backup;
+
+ if (priority < BTRFS_NUM_BACKUP_ROOTS && backup_index >= 0) {
+ if (priority == 0)
+ return backup_index;
+
+ backup_index = backup_index + BTRFS_NUM_BACKUP_ROOTS - priority;
+ backup_index %= BTRFS_NUM_BACKUP_ROOTS;
+ } else {
+ return -EINVAL;
+ }
+
+ root_backup = super->super_roots + backup_index;
+
+ btrfs_set_super_generation(super,
+ btrfs_backup_tree_root_gen(root_backup));
+ btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
+ btrfs_set_super_root_level(super,
+ btrfs_backup_tree_root_level(root_backup));
+ btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
+
+ /*
+ * Fixme: the total bytes and num_devices need to match or we should
+ * need a fsck
+ */
+ btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
+ btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
+
+ return backup_index;
+}
+
+/* helper to cleanup workers */
+static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
+{
+ btrfs_destroy_workqueue(fs_info->fixup_workers);
+ btrfs_destroy_workqueue(fs_info->delalloc_workers);
+ btrfs_destroy_workqueue(fs_info->hipri_workers);
+ btrfs_destroy_workqueue(fs_info->workers);
+ if (fs_info->endio_workers)
+ destroy_workqueue(fs_info->endio_workers);
+ if (fs_info->endio_raid56_workers)
+ destroy_workqueue(fs_info->endio_raid56_workers);
+ if (fs_info->rmw_workers)
+ destroy_workqueue(fs_info->rmw_workers);
+ if (fs_info->compressed_write_workers)
+ destroy_workqueue(fs_info->compressed_write_workers);
+ btrfs_destroy_workqueue(fs_info->endio_write_workers);
+ btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
+ btrfs_destroy_workqueue(fs_info->delayed_workers);
+ btrfs_destroy_workqueue(fs_info->caching_workers);
+ btrfs_destroy_workqueue(fs_info->flush_workers);
+ btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
+ if (fs_info->discard_ctl.discard_workers)
+ destroy_workqueue(fs_info->discard_ctl.discard_workers);
+ /*
+ * Now that all other work queues are destroyed, we can safely destroy
+ * the queues used for metadata I/O, since tasks from those other work
+ * queues can do metadata I/O operations.
+ */
+ if (fs_info->endio_meta_workers)
+ destroy_workqueue(fs_info->endio_meta_workers);
+}
+
+static void free_root_extent_buffers(struct btrfs_root *root)
+{
+ if (root) {
+ free_extent_buffer(root->node);
+ free_extent_buffer(root->commit_root);
+ root->node = NULL;
+ root->commit_root = NULL;
+ }
+}
+
+static void free_global_root_pointers(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root, *tmp;
+
+ rbtree_postorder_for_each_entry_safe(root, tmp,
+ &fs_info->global_root_tree,
+ rb_node)
+ free_root_extent_buffers(root);
+}
+
+/* helper to cleanup tree roots */
+static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
+{
+ free_root_extent_buffers(info->tree_root);
+
+ free_global_root_pointers(info);
+ free_root_extent_buffers(info->dev_root);
+ free_root_extent_buffers(info->quota_root);
+ free_root_extent_buffers(info->uuid_root);
+ free_root_extent_buffers(info->fs_root);
+ free_root_extent_buffers(info->data_reloc_root);
+ free_root_extent_buffers(info->block_group_root);
+ if (free_chunk_root)
+ free_root_extent_buffers(info->chunk_root);
+}
+
+void btrfs_put_root(struct btrfs_root *root)
+{
+ if (!root)
+ return;
+
+ if (refcount_dec_and_test(&root->refs)) {
+ WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
+ WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));
+ if (root->anon_dev)
+ free_anon_bdev(root->anon_dev);
+ btrfs_drew_lock_destroy(&root->snapshot_lock);
+ free_root_extent_buffers(root);
+#ifdef CONFIG_BTRFS_DEBUG
+ spin_lock(&root->fs_info->fs_roots_radix_lock);
+ list_del_init(&root->leak_list);
+ spin_unlock(&root->fs_info->fs_roots_radix_lock);
+#endif
+ kfree(root);
+ }
+}
+
+void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
+{
+ int ret;
+ struct btrfs_root *gang[8];
+ int i;
+
+ while (!list_empty(&fs_info->dead_roots)) {
+ gang[0] = list_entry(fs_info->dead_roots.next,
+ struct btrfs_root, root_list);
+ list_del(&gang[0]->root_list);
+
+ if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state))
+ btrfs_drop_and_free_fs_root(fs_info, gang[0]);
+ btrfs_put_root(gang[0]);
+ }
+
+ while (1) {
+ ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+ (void **)gang, 0,
+ ARRAY_SIZE(gang));
+ if (!ret)
+ break;
+ for (i = 0; i < ret; i++)
+ btrfs_drop_and_free_fs_root(fs_info, gang[i]);
+ }
+}
+
+static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
+{
+ mutex_init(&fs_info->scrub_lock);
+ atomic_set(&fs_info->scrubs_running, 0);
+ atomic_set(&fs_info->scrub_pause_req, 0);
+ atomic_set(&fs_info->scrubs_paused, 0);
+ atomic_set(&fs_info->scrub_cancel_req, 0);
+ init_waitqueue_head(&fs_info->scrub_pause_wait);
+ refcount_set(&fs_info->scrub_workers_refcnt, 0);
+}
+
+static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
+{
+ spin_lock_init(&fs_info->balance_lock);
+ mutex_init(&fs_info->balance_mutex);
+ atomic_set(&fs_info->balance_pause_req, 0);
+ atomic_set(&fs_info->balance_cancel_req, 0);
+ fs_info->balance_ctl = NULL;
+ init_waitqueue_head(&fs_info->balance_wait_q);
+ atomic_set(&fs_info->reloc_cancel_req, 0);
+}
+
+static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
+{
+ struct inode *inode = fs_info->btree_inode;
+ unsigned long hash = btrfs_inode_hash(BTRFS_BTREE_INODE_OBJECTID,
+ fs_info->tree_root);
+
+ inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
+ set_nlink(inode, 1);
+ /*
+ * we set the i_size on the btree inode to the max possible int.
+ * the real end of the address space is determined by all of
+ * the devices in the system
+ */
+ inode->i_size = OFFSET_MAX;
+ inode->i_mapping->a_ops = &btree_aops;
+
+ RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
+ extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
+ IO_TREE_BTREE_INODE_IO, NULL);
+ extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
+
+ BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
+ BTRFS_I(inode)->location.objectid = BTRFS_BTREE_INODE_OBJECTID;
+ BTRFS_I(inode)->location.type = 0;
+ BTRFS_I(inode)->location.offset = 0;
+ set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
+ __insert_inode_hash(inode, hash);
+}
+
+static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
+{
+ mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
+ init_rwsem(&fs_info->dev_replace.rwsem);
+ init_waitqueue_head(&fs_info->dev_replace.replace_wait);
+}
+
+static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
+{
+ spin_lock_init(&fs_info->qgroup_lock);
+ mutex_init(&fs_info->qgroup_ioctl_lock);
+ fs_info->qgroup_tree = RB_ROOT;
+ INIT_LIST_HEAD(&fs_info->dirty_qgroups);
+ fs_info->qgroup_seq = 1;
+ fs_info->qgroup_ulist = NULL;
+ fs_info->qgroup_rescan_running = false;
+ fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
+ mutex_init(&fs_info->qgroup_rescan_lock);
+}
+
+static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
+{
+ u32 max_active = fs_info->thread_pool_size;
+ unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
+
+ fs_info->workers =
+ btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16);
+ fs_info->hipri_workers =
+ btrfs_alloc_workqueue(fs_info, "worker-high",
+ flags | WQ_HIGHPRI, max_active, 16);
+
+ fs_info->delalloc_workers =
+ btrfs_alloc_workqueue(fs_info, "delalloc",
+ flags, max_active, 2);
+
+ fs_info->flush_workers =
+ btrfs_alloc_workqueue(fs_info, "flush_delalloc",
+ flags, max_active, 0);
+
+ fs_info->caching_workers =
+ btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0);
+
+ fs_info->fixup_workers =
+ btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0);
+
+ fs_info->endio_workers =
+ alloc_workqueue("btrfs-endio", flags, max_active);
+ fs_info->endio_meta_workers =
+ alloc_workqueue("btrfs-endio-meta", flags, max_active);
+ fs_info->endio_raid56_workers =
+ alloc_workqueue("btrfs-endio-raid56", flags, max_active);
+ fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
+ fs_info->endio_write_workers =
+ btrfs_alloc_workqueue(fs_info, "endio-write", flags,
+ max_active, 2);
+ fs_info->compressed_write_workers =
+ alloc_workqueue("btrfs-compressed-write", flags, max_active);
+ fs_info->endio_freespace_worker =
+ btrfs_alloc_workqueue(fs_info, "freespace-write", flags,
+ max_active, 0);
+ fs_info->delayed_workers =
+ btrfs_alloc_workqueue(fs_info, "delayed-meta", flags,
+ max_active, 0);
+ fs_info->qgroup_rescan_workers =
+ btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
+ fs_info->discard_ctl.discard_workers =
+ alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1);
+
+ if (!(fs_info->workers && fs_info->hipri_workers &&
+ fs_info->delalloc_workers && fs_info->flush_workers &&
+ fs_info->endio_workers && fs_info->endio_meta_workers &&
+ fs_info->compressed_write_workers &&
+ fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
+ fs_info->endio_freespace_worker && fs_info->rmw_workers &&
+ fs_info->caching_workers && fs_info->fixup_workers &&
+ fs_info->delayed_workers && fs_info->qgroup_rescan_workers &&
+ fs_info->discard_ctl.discard_workers)) {
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
+{
+ struct crypto_shash *csum_shash;
+ const char *csum_driver = btrfs_super_csum_driver(csum_type);
+
+ csum_shash = crypto_alloc_shash(csum_driver, 0, 0);
+
+ if (IS_ERR(csum_shash)) {
+ btrfs_err(fs_info, "error allocating %s hash for checksum",
+ csum_driver);
+ return PTR_ERR(csum_shash);
+ }
+
+ fs_info->csum_shash = csum_shash;
+
+ /*
+ * Check if the checksum implementation is a fast accelerated one.
+ * As-is this is a bit of a hack and should be replaced once the csum
+ * implementations provide that information themselves.
+ */
+ switch (csum_type) {
+ case BTRFS_CSUM_TYPE_CRC32:
+ if (!strstr(crypto_shash_driver_name(csum_shash), "generic"))
+ set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
+ break;
+ default:
+ break;
+ }
+
+ btrfs_info(fs_info, "using %s (%s) checksum algorithm",
+ btrfs_super_csum_name(csum_type),
+ crypto_shash_driver_name(csum_shash));
+ return 0;
+}
+
+static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
+ struct btrfs_fs_devices *fs_devices)
+{
+ int ret;
+ struct btrfs_root *log_tree_root;
+ struct btrfs_super_block *disk_super = fs_info->super_copy;
+ u64 bytenr = btrfs_super_log_root(disk_super);
+ int level = btrfs_super_log_root_level(disk_super);
+
+ if (fs_devices->rw_devices == 0) {
+ btrfs_warn(fs_info, "log replay required on RO media");
+ return -EIO;
+ }
+
+ log_tree_root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID,
+ GFP_KERNEL);
+ if (!log_tree_root)
+ return -ENOMEM;
+
+ log_tree_root->node = read_tree_block(fs_info, bytenr,
+ BTRFS_TREE_LOG_OBJECTID,
+ fs_info->generation + 1, level,
+ NULL);
+ if (IS_ERR(log_tree_root->node)) {
+ btrfs_warn(fs_info, "failed to read log tree");
+ ret = PTR_ERR(log_tree_root->node);
+ log_tree_root->node = NULL;
+ btrfs_put_root(log_tree_root);
+ return ret;
+ }
+ if (!extent_buffer_uptodate(log_tree_root->node)) {
+ btrfs_err(fs_info, "failed to read log tree");
+ btrfs_put_root(log_tree_root);
+ return -EIO;
+ }
+
+ /* returns with log_tree_root freed on success */
+ ret = btrfs_recover_log_trees(log_tree_root);
+ if (ret) {
+ btrfs_handle_fs_error(fs_info, ret,
+ "Failed to recover log tree");
+ btrfs_put_root(log_tree_root);
+ return ret;
+ }
+
+ if (sb_rdonly(fs_info->sb)) {
+ ret = btrfs_commit_super(fs_info);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int load_global_roots_objectid(struct btrfs_root *tree_root,
+ struct btrfs_path *path, u64 objectid,
+ const char *name)
+{
+ struct btrfs_fs_info *fs_info = tree_root->fs_info;
+ struct btrfs_root *root;
+ u64 max_global_id = 0;
+ int ret;
+ struct btrfs_key key = {
+ .objectid = objectid,
+ .type = BTRFS_ROOT_ITEM_KEY,
+ .offset = 0,
+ };
+ bool found = false;
+
+ /* If we have IGNOREDATACSUMS skip loading these roots. */
+ if (objectid == BTRFS_CSUM_TREE_OBJECTID &&
+ btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
+ set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
+ return 0;
+ }
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
+ if (ret < 0)
+ break;
+
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(tree_root, path);
+ if (ret) {
+ if (ret > 0)
+ ret = 0;
+ break;
+ }
+ }
+ ret = 0;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid != objectid)
+ break;
+ btrfs_release_path(path);
+
+ /*
+ * Just worry about this for extent tree, it'll be the same for
+ * everybody.
+ */
+ if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ max_global_id = max(max_global_id, key.offset);
+
+ found = true;
+ root = read_tree_root_path(tree_root, path, &key);
+ if (IS_ERR(root)) {
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
+ ret = PTR_ERR(root);
+ break;
+ }
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ ret = btrfs_global_root_insert(root);
+ if (ret) {
+ btrfs_put_root(root);
+ break;
+ }
+ key.offset++;
+ }
+ btrfs_release_path(path);
+
+ if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ fs_info->nr_global_roots = max_global_id + 1;
+
+ if (!found || ret) {
+ if (objectid == BTRFS_CSUM_TREE_OBJECTID)
+ set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
+
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
+ ret = ret ? ret : -ENOENT;
+ else
+ ret = 0;
+ btrfs_err(fs_info, "failed to load root %s", name);
+ }
+ return ret;
+}
+
+static int load_global_roots(struct btrfs_root *tree_root)
+{
+ struct btrfs_path *path;
+ int ret = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = load_global_roots_objectid(tree_root, path,
+ BTRFS_EXTENT_TREE_OBJECTID, "extent");
+ if (ret)
+ goto out;
+ ret = load_global_roots_objectid(tree_root, path,
+ BTRFS_CSUM_TREE_OBJECTID, "csum");
+ if (ret)
+ goto out;
+ if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE))
+ goto out;
+ ret = load_global_roots_objectid(tree_root, path,
+ BTRFS_FREE_SPACE_TREE_OBJECTID,
+ "free space");
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_root *root;
+ struct btrfs_key location;
+ int ret;
+
+ BUG_ON(!fs_info->tree_root);
+
+ ret = load_global_roots(tree_root);
+ if (ret)
+ return ret;
+
+ location.type = BTRFS_ROOT_ITEM_KEY;
+ location.offset = 0;
+
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) {
+ location.objectid = BTRFS_BLOCK_GROUP_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root)) {
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->block_group_root = root;
+ }
+ }
+
+ location.objectid = BTRFS_DEV_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root)) {
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->dev_root = root;
+ }
+ /* Initialize fs_info for all devices in any case */
+ ret = btrfs_init_devices_late(fs_info);
+ if (ret)
+ goto out;
+
+ /*
+ * This tree can share blocks with some other fs tree during relocation
+ * and we need a proper setup by btrfs_get_fs_root
+ */
+ root = btrfs_get_fs_root(tree_root->fs_info,
+ BTRFS_DATA_RELOC_TREE_OBJECTID, true);
+ if (IS_ERR(root)) {
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->data_reloc_root = root;
+ }
+
+ location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (!IS_ERR(root)) {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+ fs_info->quota_root = root;
+ }
+
+ location.objectid = BTRFS_UUID_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root)) {
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ if (ret != -ENOENT)
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->uuid_root = root;
+ }
+
+ return 0;
+out:
+ btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
+ location.objectid, ret);
+ return ret;
+}
+
+/*
+ * Real super block validation
+ * NOTE: super csum type and incompat features will not be checked here.
+ *
+ * @sb: super block to check
+ * @mirror_num: the super block number to check its bytenr:
+ * 0 the primary (1st) sb
+ * 1, 2 2nd and 3rd backup copy
+ * -1 skip bytenr check
+ */
+int btrfs_validate_super(struct btrfs_fs_info *fs_info,
+ struct btrfs_super_block *sb, int mirror_num)
+{
+ u64 nodesize = btrfs_super_nodesize(sb);
+ u64 sectorsize = btrfs_super_sectorsize(sb);
+ int ret = 0;
+
+ if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
+ btrfs_err(fs_info, "no valid FS found");
+ ret = -EINVAL;
+ }
+ if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) {
+ btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu",
+ btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
+ ret = -EINVAL;
+ }
+ if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
+ btrfs_err(fs_info, "tree_root level too big: %d >= %d",
+ btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
+ ret = -EINVAL;
+ }
+ if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
+ btrfs_err(fs_info, "chunk_root level too big: %d >= %d",
+ btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
+ ret = -EINVAL;
+ }
+ if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
+ btrfs_err(fs_info, "log_root level too big: %d >= %d",
+ btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
+ ret = -EINVAL;
+ }
+
+ /*
+ * Check sectorsize and nodesize first, other check will need it.
+ * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here.
+ */
+ if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
+ sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
+ btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
+ ret = -EINVAL;
+ }
+
+ /*
+ * We only support at most two sectorsizes: 4K and PAGE_SIZE.
+ *
+ * We can support 16K sectorsize with 64K page size without problem,
+ * but such sectorsize/pagesize combination doesn't make much sense.
+ * 4K will be our future standard, PAGE_SIZE is supported from the very
+ * beginning.
+ */
+ if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) {
+ btrfs_err(fs_info,
+ "sectorsize %llu not yet supported for page size %lu",
+ sectorsize, PAGE_SIZE);
+ ret = -EINVAL;
+ }
+
+ if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
+ nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
+ btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
+ ret = -EINVAL;
+ }
+ if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
+ btrfs_err(fs_info, "invalid leafsize %u, should be %llu",
+ le32_to_cpu(sb->__unused_leafsize), nodesize);
+ ret = -EINVAL;
+ }
+
+ /* Root alignment check */
+ if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
+ btrfs_warn(fs_info, "tree_root block unaligned: %llu",
+ btrfs_super_root(sb));
+ ret = -EINVAL;
+ }
+ if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
+ btrfs_warn(fs_info, "chunk_root block unaligned: %llu",
+ btrfs_super_chunk_root(sb));
+ ret = -EINVAL;
+ }
+ if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
+ btrfs_warn(fs_info, "log_root block unaligned: %llu",
+ btrfs_super_log_root(sb));
+ ret = -EINVAL;
+ }
+
+ if (memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) {
+ btrfs_err(fs_info,
+ "superblock fsid doesn't match fsid of fs_devices: %pU != %pU",
+ sb->fsid, fs_info->fs_devices->fsid);
+ ret = -EINVAL;
+ }
+
+ if (memcmp(fs_info->fs_devices->metadata_uuid, btrfs_sb_fsid_ptr(sb),
+ BTRFS_FSID_SIZE) != 0) {
+ btrfs_err(fs_info,
+"superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU",
+ btrfs_sb_fsid_ptr(sb), fs_info->fs_devices->metadata_uuid);
+ ret = -EINVAL;
+ }
+
+ /*
+ * Artificial requirement for block-group-tree to force newer features
+ * (free-space-tree, no-holes) so the test matrix is smaller.
+ */
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) &&
+ (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) ||
+ !btrfs_fs_incompat(fs_info, NO_HOLES))) {
+ btrfs_err(fs_info,
+ "block-group-tree feature requires fres-space-tree and no-holes");
+ ret = -EINVAL;
+ }
+
+ if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
+ BTRFS_FSID_SIZE) != 0) {
+ btrfs_err(fs_info,
+ "dev_item UUID does not match metadata fsid: %pU != %pU",
+ fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid);
+ ret = -EINVAL;
+ }
+
+ /*
+ * Hint to catch really bogus numbers, bitflips or so, more exact checks are
+ * done later
+ */
+ if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) {
+ btrfs_err(fs_info, "bytes_used is too small %llu",
+ btrfs_super_bytes_used(sb));
+ ret = -EINVAL;
+ }
+ if (!is_power_of_2(btrfs_super_stripesize(sb))) {
+ btrfs_err(fs_info, "invalid stripesize %u",
+ btrfs_super_stripesize(sb));
+ ret = -EINVAL;
+ }
+ if (btrfs_super_num_devices(sb) > (1UL << 31))
+ btrfs_warn(fs_info, "suspicious number of devices: %llu",
+ btrfs_super_num_devices(sb));
+ if (btrfs_super_num_devices(sb) == 0) {
+ btrfs_err(fs_info, "number of devices is 0");
+ ret = -EINVAL;
+ }
+
+ if (mirror_num >= 0 &&
+ btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) {
+ btrfs_err(fs_info, "super offset mismatch %llu != %u",
+ btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
+ ret = -EINVAL;
+ }
+
+ /*
+ * Obvious sys_chunk_array corruptions, it must hold at least one key
+ * and one chunk
+ */
+ if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
+ btrfs_err(fs_info, "system chunk array too big %u > %u",
+ btrfs_super_sys_array_size(sb),
+ BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
+ ret = -EINVAL;
+ }
+ if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
+ + sizeof(struct btrfs_chunk)) {
+ btrfs_err(fs_info, "system chunk array too small %u < %zu",
+ btrfs_super_sys_array_size(sb),
+ sizeof(struct btrfs_disk_key)
+ + sizeof(struct btrfs_chunk));
+ ret = -EINVAL;
+ }
+
+ /*
+ * The generation is a global counter, we'll trust it more than the others
+ * but it's still possible that it's the one that's wrong.
+ */
+ if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
+ btrfs_warn(fs_info,
+ "suspicious: generation < chunk_root_generation: %llu < %llu",
+ btrfs_super_generation(sb),
+ btrfs_super_chunk_root_generation(sb));
+ if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
+ && btrfs_super_cache_generation(sb) != (u64)-1)
+ btrfs_warn(fs_info,
+ "suspicious: generation < cache_generation: %llu < %llu",
+ btrfs_super_generation(sb),
+ btrfs_super_cache_generation(sb));
+
+ return ret;
+}
+
+/*
+ * Validation of super block at mount time.
+ * Some checks already done early at mount time, like csum type and incompat
+ * flags will be skipped.
+ */
+static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info)
+{
+ return btrfs_validate_super(fs_info, fs_info->super_copy, 0);
+}
+
+/*
+ * Validation of super block at write time.
+ * Some checks like bytenr check will be skipped as their values will be
+ * overwritten soon.
+ * Extra checks like csum type and incompat flags will be done here.
+ */
+static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
+ struct btrfs_super_block *sb)
+{
+ int ret;
+
+ ret = btrfs_validate_super(fs_info, sb, -1);
+ if (ret < 0)
+ goto out;
+ if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) {
+ ret = -EUCLEAN;
+ btrfs_err(fs_info, "invalid csum type, has %u want %u",
+ btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32);
+ goto out;
+ }
+ if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
+ ret = -EUCLEAN;
+ btrfs_err(fs_info,
+ "invalid incompat flags, has 0x%llx valid mask 0x%llx",
+ btrfs_super_incompat_flags(sb),
+ (unsigned long long)BTRFS_FEATURE_INCOMPAT_SUPP);
+ goto out;
+ }
+out:
+ if (ret < 0)
+ btrfs_err(fs_info,
+ "super block corruption detected before writing it to disk");
+ return ret;
+}
+
+static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level)
+{
+ int ret = 0;
+
+ root->node = read_tree_block(root->fs_info, bytenr,
+ root->root_key.objectid, gen, level, NULL);
+ if (IS_ERR(root->node)) {
+ ret = PTR_ERR(root->node);
+ root->node = NULL;
+ return ret;
+ }
+ if (!extent_buffer_uptodate(root->node)) {
+ free_extent_buffer(root->node);
+ root->node = NULL;
+ return -EIO;
+ }
+
+ btrfs_set_root_node(&root->root_item, root->node);
+ root->commit_root = btrfs_root_node(root);
+ btrfs_set_root_refs(&root->root_item, 1);
+ return ret;
+}
+
+static int load_important_roots(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_super_block *sb = fs_info->super_copy;
+ u64 gen, bytenr;
+ int level, ret;
+
+ bytenr = btrfs_super_root(sb);
+ gen = btrfs_super_generation(sb);
+ level = btrfs_super_root_level(sb);
+ ret = load_super_root(fs_info->tree_root, bytenr, gen, level);
+ if (ret) {
+ btrfs_warn(fs_info, "couldn't read tree root");
+ return ret;
+ }
+ return 0;
+}
+
+static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
+{
+ int backup_index = find_newest_super_backup(fs_info);
+ struct btrfs_super_block *sb = fs_info->super_copy;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ bool handle_error = false;
+ int ret = 0;
+ int i;
+
+ for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
+ if (handle_error) {
+ if (!IS_ERR(tree_root->node))
+ free_extent_buffer(tree_root->node);
+ tree_root->node = NULL;
+
+ if (!btrfs_test_opt(fs_info, USEBACKUPROOT))
+ break;
+
+ free_root_pointers(fs_info, 0);
+
+ /*
+ * Don't use the log in recovery mode, it won't be
+ * valid
+ */
+ btrfs_set_super_log_root(sb, 0);
+
+ /* We can't trust the free space cache either */
+ btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
+
+ btrfs_warn(fs_info, "try to load backup roots slot %d", i);
+ ret = read_backup_root(fs_info, i);
+ backup_index = ret;
+ if (ret < 0)
+ return ret;
+ }
+
+ ret = load_important_roots(fs_info);
+ if (ret) {
+ handle_error = true;
+ continue;
+ }
+
+ /*
+ * No need to hold btrfs_root::objectid_mutex since the fs
+ * hasn't been fully initialised and we are the only user
+ */
+ ret = btrfs_init_root_free_objectid(tree_root);
+ if (ret < 0) {
+ handle_error = true;
+ continue;
+ }
+
+ ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
+
+ ret = btrfs_read_roots(fs_info);
+ if (ret < 0) {
+ handle_error = true;
+ continue;
+ }
+
+ /* All successful */
+ fs_info->generation = btrfs_header_generation(tree_root->node);
+ fs_info->last_trans_committed = fs_info->generation;
+ fs_info->last_reloc_trans = 0;
+
+ /* Always begin writing backup roots after the one being used */
+ if (backup_index < 0) {
+ fs_info->backup_root_index = 0;
+ } else {
+ fs_info->backup_root_index = backup_index + 1;
+ fs_info->backup_root_index %= BTRFS_NUM_BACKUP_ROOTS;
+ }
+ break;
+ }
+
+ return ret;
+}
+
+void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
+{
+ INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
+ INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
+ INIT_LIST_HEAD(&fs_info->trans_list);
+ INIT_LIST_HEAD(&fs_info->dead_roots);
+ INIT_LIST_HEAD(&fs_info->delayed_iputs);
+ INIT_LIST_HEAD(&fs_info->delalloc_roots);
+ INIT_LIST_HEAD(&fs_info->caching_block_groups);
+ spin_lock_init(&fs_info->delalloc_root_lock);
+ spin_lock_init(&fs_info->trans_lock);
+ spin_lock_init(&fs_info->fs_roots_radix_lock);
+ spin_lock_init(&fs_info->delayed_iput_lock);
+ spin_lock_init(&fs_info->defrag_inodes_lock);
+ spin_lock_init(&fs_info->super_lock);
+ spin_lock_init(&fs_info->buffer_lock);
+ spin_lock_init(&fs_info->unused_bgs_lock);
+ spin_lock_init(&fs_info->treelog_bg_lock);
+ spin_lock_init(&fs_info->zone_active_bgs_lock);
+ spin_lock_init(&fs_info->relocation_bg_lock);
+ rwlock_init(&fs_info->tree_mod_log_lock);
+ rwlock_init(&fs_info->global_root_lock);
+ mutex_init(&fs_info->unused_bg_unpin_mutex);
+ mutex_init(&fs_info->reclaim_bgs_lock);
+ mutex_init(&fs_info->reloc_mutex);
+ mutex_init(&fs_info->delalloc_root_mutex);
+ mutex_init(&fs_info->zoned_meta_io_lock);
+ mutex_init(&fs_info->zoned_data_reloc_io_lock);
+ seqlock_init(&fs_info->profiles_lock);
+
+ btrfs_lockdep_init_map(fs_info, btrfs_trans_num_writers);
+ btrfs_lockdep_init_map(fs_info, btrfs_trans_num_extwriters);
+ btrfs_lockdep_init_map(fs_info, btrfs_trans_pending_ordered);
+ btrfs_lockdep_init_map(fs_info, btrfs_ordered_extent);
+ btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_start,
+ BTRFS_LOCKDEP_TRANS_COMMIT_START);
+ btrfs_state_lockdep_init_map(fs_info, btrfs_trans_unblocked,
+ BTRFS_LOCKDEP_TRANS_UNBLOCKED);
+ btrfs_state_lockdep_init_map(fs_info, btrfs_trans_super_committed,
+ BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
+ btrfs_state_lockdep_init_map(fs_info, btrfs_trans_completed,
+ BTRFS_LOCKDEP_TRANS_COMPLETED);
+
+ INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
+ INIT_LIST_HEAD(&fs_info->space_info);
+ INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
+ INIT_LIST_HEAD(&fs_info->unused_bgs);
+ INIT_LIST_HEAD(&fs_info->reclaim_bgs);
+ INIT_LIST_HEAD(&fs_info->zone_active_bgs);
+#ifdef CONFIG_BTRFS_DEBUG
+ INIT_LIST_HEAD(&fs_info->allocated_roots);
+ INIT_LIST_HEAD(&fs_info->allocated_ebs);
+ spin_lock_init(&fs_info->eb_leak_lock);
+#endif
+ extent_map_tree_init(&fs_info->mapping_tree);
+ btrfs_init_block_rsv(&fs_info->global_block_rsv,
+ BTRFS_BLOCK_RSV_GLOBAL);
+ btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
+ btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
+ btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
+ btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
+ BTRFS_BLOCK_RSV_DELOPS);
+ btrfs_init_block_rsv(&fs_info->delayed_refs_rsv,
+ BTRFS_BLOCK_RSV_DELREFS);
+
+ atomic_set(&fs_info->async_delalloc_pages, 0);
+ atomic_set(&fs_info->defrag_running, 0);
+ atomic_set(&fs_info->nr_delayed_iputs, 0);
+ atomic64_set(&fs_info->tree_mod_seq, 0);
+ fs_info->global_root_tree = RB_ROOT;
+ fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
+ fs_info->metadata_ratio = 0;
+ fs_info->defrag_inodes = RB_ROOT;
+ atomic64_set(&fs_info->free_chunk_space, 0);
+ fs_info->tree_mod_log = RB_ROOT;
+ fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
+ fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
+ btrfs_init_ref_verify(fs_info);
+
+ fs_info->thread_pool_size = min_t(unsigned long,
+ num_online_cpus() + 2, 8);
+
+ INIT_LIST_HEAD(&fs_info->ordered_roots);
+ spin_lock_init(&fs_info->ordered_root_lock);
+
+ btrfs_init_scrub(fs_info);
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ fs_info->check_integrity_print_mask = 0;
+#endif
+ btrfs_init_balance(fs_info);
+ btrfs_init_async_reclaim_work(fs_info);
+
+ rwlock_init(&fs_info->block_group_cache_lock);
+ fs_info->block_group_cache_tree = RB_ROOT_CACHED;
+
+ extent_io_tree_init(fs_info, &fs_info->excluded_extents,
+ IO_TREE_FS_EXCLUDED_EXTENTS, NULL);
+
+ mutex_init(&fs_info->ordered_operations_mutex);
+ mutex_init(&fs_info->tree_log_mutex);
+ mutex_init(&fs_info->chunk_mutex);
+ mutex_init(&fs_info->transaction_kthread_mutex);
+ mutex_init(&fs_info->cleaner_mutex);
+ mutex_init(&fs_info->ro_block_group_mutex);
+ init_rwsem(&fs_info->commit_root_sem);
+ init_rwsem(&fs_info->cleanup_work_sem);
+ init_rwsem(&fs_info->subvol_sem);
+ sema_init(&fs_info->uuid_tree_rescan_sem, 1);
+
+ btrfs_init_dev_replace_locks(fs_info);
+ btrfs_init_qgroup(fs_info);
+ btrfs_discard_init(fs_info);
+
+ btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
+ btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
+
+ init_waitqueue_head(&fs_info->transaction_throttle);
+ init_waitqueue_head(&fs_info->transaction_wait);
+ init_waitqueue_head(&fs_info->transaction_blocked_wait);
+ init_waitqueue_head(&fs_info->async_submit_wait);
+ init_waitqueue_head(&fs_info->delayed_iputs_wait);
+
+ /* Usable values until the real ones are cached from the superblock */
+ fs_info->nodesize = 4096;
+ fs_info->sectorsize = 4096;
+ fs_info->sectorsize_bits = ilog2(4096);
+ fs_info->stripesize = 4096;
+
+ fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE;
+
+ spin_lock_init(&fs_info->swapfile_pins_lock);
+ fs_info->swapfile_pins = RB_ROOT;
+
+ fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH;
+ INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work);
+}
+
+static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)
+{
+ int ret;
+
+ fs_info->sb = sb;
+ sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
+ sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);
+
+ ret = percpu_counter_init(&fs_info->ordered_bytes, 0, GFP_KERNEL);
+ if (ret)
+ return ret;
+
+ ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
+ if (ret)
+ return ret;
+
+ fs_info->dirty_metadata_batch = PAGE_SIZE *
+ (1 + ilog2(nr_cpu_ids));
+
+ ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
+ if (ret)
+ return ret;
+
+ ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0,
+ GFP_KERNEL);
+ if (ret)
+ return ret;
+
+ fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
+ GFP_KERNEL);
+ if (!fs_info->delayed_root)
+ return -ENOMEM;
+ btrfs_init_delayed_root(fs_info->delayed_root);
+
+ if (sb_rdonly(sb))
+ set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state);
+
+ return btrfs_alloc_stripe_hash_table(fs_info);
+}
+
+static int btrfs_uuid_rescan_kthread(void *data)
+{
+ struct btrfs_fs_info *fs_info = data;
+ int ret;
+
+ /*
+ * 1st step is to iterate through the existing UUID tree and
+ * to delete all entries that contain outdated data.
+ * 2nd step is to add all missing entries to the UUID tree.
+ */
+ ret = btrfs_uuid_tree_iterate(fs_info);
+ if (ret < 0) {
+ if (ret != -EINTR)
+ btrfs_warn(fs_info, "iterating uuid_tree failed %d",
+ ret);
+ up(&fs_info->uuid_tree_rescan_sem);
+ return ret;
+ }
+ return btrfs_uuid_scan_kthread(data);
+}
+
+static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
+{
+ struct task_struct *task;
+
+ down(&fs_info->uuid_tree_rescan_sem);
+ task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
+ if (IS_ERR(task)) {
+ /* fs_info->update_uuid_tree_gen remains 0 in all error case */
+ btrfs_warn(fs_info, "failed to start uuid_rescan task");
+ up(&fs_info->uuid_tree_rescan_sem);
+ return PTR_ERR(task);
+ }
+
+ return 0;
+}
+
+/*
+ * Some options only have meaning at mount time and shouldn't persist across
+ * remounts, or be displayed. Clear these at the end of mount and remount
+ * code paths.
+ */
+void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info)
+{
+ btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
+ btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE);
+}
+
+/*
+ * Mounting logic specific to read-write file systems. Shared by open_ctree
+ * and btrfs_remount when remounting from read-only to read-write.
+ */
+int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
+{
+ int ret;
+ const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE);
+ bool rebuild_free_space_tree = false;
+
+ if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
+ btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ rebuild_free_space_tree = true;
+ } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
+ !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
+ btrfs_warn(fs_info, "free space tree is invalid");
+ rebuild_free_space_tree = true;
+ }
+
+ if (rebuild_free_space_tree) {
+ btrfs_info(fs_info, "rebuilding free space tree");
+ ret = btrfs_rebuild_free_space_tree(fs_info);
+ if (ret) {
+ btrfs_warn(fs_info,
+ "failed to rebuild free space tree: %d", ret);
+ goto out;
+ }
+ }
+
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
+ !btrfs_test_opt(fs_info, FREE_SPACE_TREE)) {
+ btrfs_info(fs_info, "disabling free space tree");
+ ret = btrfs_delete_free_space_tree(fs_info);
+ if (ret) {
+ btrfs_warn(fs_info,
+ "failed to disable free space tree: %d", ret);
+ goto out;
+ }
+ }
+
+ /*
+ * btrfs_find_orphan_roots() is responsible for finding all the dead
+ * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load
+ * them into the fs_info->fs_roots_radix tree. This must be done before
+ * calling btrfs_orphan_cleanup() on the tree root. If we don't do it
+ * first, then btrfs_orphan_cleanup() will delete a dead root's orphan
+ * item before the root's tree is deleted - this means that if we unmount
+ * or crash before the deletion completes, on the next mount we will not
+ * delete what remains of the tree because the orphan item does not
+ * exists anymore, which is what tells us we have a pending deletion.
+ */
+ ret = btrfs_find_orphan_roots(fs_info);
+ if (ret)
+ goto out;
+
+ ret = btrfs_cleanup_fs_roots(fs_info);
+ if (ret)
+ goto out;
+
+ down_read(&fs_info->cleanup_work_sem);
+ if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
+ (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
+ up_read(&fs_info->cleanup_work_sem);
+ goto out;
+ }
+ up_read(&fs_info->cleanup_work_sem);
+
+ mutex_lock(&fs_info->cleaner_mutex);
+ ret = btrfs_recover_relocation(fs_info);
+ mutex_unlock(&fs_info->cleaner_mutex);
+ if (ret < 0) {
+ btrfs_warn(fs_info, "failed to recover relocation: %d", ret);
+ goto out;
+ }
+
+ if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) &&
+ !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ btrfs_info(fs_info, "creating free space tree");
+ ret = btrfs_create_free_space_tree(fs_info);
+ if (ret) {
+ btrfs_warn(fs_info,
+ "failed to create free space tree: %d", ret);
+ goto out;
+ }
+ }
+
+ if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) {
+ ret = btrfs_set_free_space_cache_v1_active(fs_info, cache_opt);
+ if (ret)
+ goto out;
+ }
+
+ ret = btrfs_resume_balance_async(fs_info);
+ if (ret)
+ goto out;
+
+ ret = btrfs_resume_dev_replace_async(fs_info);
+ if (ret) {
+ btrfs_warn(fs_info, "failed to resume dev_replace");
+ goto out;
+ }
+
+ btrfs_qgroup_rescan_resume(fs_info);
+
+ if (!fs_info->uuid_root) {
+ btrfs_info(fs_info, "creating UUID tree");
+ ret = btrfs_create_uuid_tree(fs_info);
+ if (ret) {
+ btrfs_warn(fs_info,
+ "failed to create the UUID tree %d", ret);
+ goto out;
+ }
+ }
+
+out:
+ return ret;
+}
+
+/*
+ * Do various sanity and dependency checks of different features.
+ *
+ * @is_rw_mount: If the mount is read-write.
+ *
+ * This is the place for less strict checks (like for subpage or artificial
+ * feature dependencies).
+ *
+ * For strict checks or possible corruption detection, see
+ * btrfs_validate_super().
+ *
+ * This should be called after btrfs_parse_options(), as some mount options
+ * (space cache related) can modify on-disk format like free space tree and
+ * screw up certain feature dependencies.
+ */
+int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount)
+{
+ struct btrfs_super_block *disk_super = fs_info->super_copy;
+ u64 incompat = btrfs_super_incompat_flags(disk_super);
+ const u64 compat_ro = btrfs_super_compat_ro_flags(disk_super);
+ const u64 compat_ro_unsupp = (compat_ro & ~BTRFS_FEATURE_COMPAT_RO_SUPP);
+
+ if (incompat & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
+ btrfs_err(fs_info,
+ "cannot mount because of unknown incompat features (0x%llx)",
+ incompat);
+ return -EINVAL;
+ }
+
+ /* Runtime limitation for mixed block groups. */
+ if ((incompat & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
+ (fs_info->sectorsize != fs_info->nodesize)) {
+ btrfs_err(fs_info,
+"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
+ fs_info->nodesize, fs_info->sectorsize);
+ return -EINVAL;
+ }
+
+ /* Mixed backref is an always-enabled feature. */
+ incompat |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
+
+ /* Set compression related flags just in case. */
+ if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
+ incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+ else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
+ incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;
+
+ /*
+ * An ancient flag, which should really be marked deprecated.
+ * Such runtime limitation doesn't really need a incompat flag.
+ */
+ if (btrfs_super_nodesize(disk_super) > PAGE_SIZE)
+ incompat |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
+
+ if (compat_ro_unsupp && is_rw_mount) {
+ btrfs_err(fs_info,
+ "cannot mount read-write because of unknown compat_ro features (0x%llx)",
+ compat_ro);
+ return -EINVAL;
+ }
+
+ /*
+ * We have unsupported RO compat features, although RO mounted, we
+ * should not cause any metadata writes, including log replay.
+ * Or we could screw up whatever the new feature requires.
+ */
+ if (compat_ro_unsupp && btrfs_super_log_root(disk_super) &&
+ !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
+ btrfs_err(fs_info,
+"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay",
+ compat_ro);
+ return -EINVAL;
+ }
+
+ /*
+ * Artificial limitations for block group tree, to force
+ * block-group-tree to rely on no-holes and free-space-tree.
+ */
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) &&
+ (!btrfs_fs_incompat(fs_info, NO_HOLES) ||
+ !btrfs_test_opt(fs_info, FREE_SPACE_TREE))) {
+ btrfs_err(fs_info,
+"block-group-tree feature requires no-holes and free-space-tree features");
+ return -EINVAL;
+ }
+
+ /*
+ * Subpage runtime limitation on v1 cache.
+ *
+ * V1 space cache still has some hard codeed PAGE_SIZE usage, while
+ * we're already defaulting to v2 cache, no need to bother v1 as it's
+ * going to be deprecated anyway.
+ */
+ if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
+ btrfs_warn(fs_info,
+ "v1 space cache is not supported for page size %lu with sectorsize %u",
+ PAGE_SIZE, fs_info->sectorsize);
+ return -EINVAL;
+ }
+
+ /* This can be called by remount, we need to protect the super block. */
+ spin_lock(&fs_info->super_lock);
+ btrfs_set_super_incompat_flags(disk_super, incompat);
+ spin_unlock(&fs_info->super_lock);
+
+ return 0;
+}
+
+int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
+ char *options)
+{
+ u32 sectorsize;
+ u32 nodesize;
+ u32 stripesize;
+ u64 generation;
+ u64 features;
+ u16 csum_type;
+ struct btrfs_super_block *disk_super;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_root *tree_root;
+ struct btrfs_root *chunk_root;
+ int ret;
+ int err = -EINVAL;
+ int level;
+
+ ret = init_mount_fs_info(fs_info, sb);
+ if (ret) {
+ err = ret;
+ goto fail;
+ }
+
+ /* These need to be init'ed before we start creating inodes and such. */
+ tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID,
+ GFP_KERNEL);
+ fs_info->tree_root = tree_root;
+ chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID,
+ GFP_KERNEL);
+ fs_info->chunk_root = chunk_root;
+ if (!tree_root || !chunk_root) {
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ fs_info->btree_inode = new_inode(sb);
+ if (!fs_info->btree_inode) {
+ err = -ENOMEM;
+ goto fail;
+ }
+ mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
+ btrfs_init_btree_inode(fs_info);
+
+ invalidate_bdev(fs_devices->latest_dev->bdev);
+
+ /*
+ * Read super block and check the signature bytes only
+ */
+ disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev);
+ if (IS_ERR(disk_super)) {
+ err = PTR_ERR(disk_super);
+ goto fail_alloc;
+ }
+
+ btrfs_info(fs_info, "first mount of filesystem %pU", disk_super->fsid);
+ /*
+ * Verify the type first, if that or the checksum value are
+ * corrupted, we'll find out
+ */
+ csum_type = btrfs_super_csum_type(disk_super);
+ if (!btrfs_supported_super_csum(csum_type)) {
+ btrfs_err(fs_info, "unsupported checksum algorithm: %u",
+ csum_type);
+ err = -EINVAL;
+ btrfs_release_disk_super(disk_super);
+ goto fail_alloc;
+ }
+
+ fs_info->csum_size = btrfs_super_csum_size(disk_super);
+
+ ret = btrfs_init_csum_hash(fs_info, csum_type);
+ if (ret) {
+ err = ret;
+ btrfs_release_disk_super(disk_super);
+ goto fail_alloc;
+ }
+
+ /*
+ * We want to check superblock checksum, the type is stored inside.
+ * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
+ */
+ if (btrfs_check_super_csum(fs_info, disk_super)) {
+ btrfs_err(fs_info, "superblock checksum mismatch");
+ err = -EINVAL;
+ btrfs_release_disk_super(disk_super);
+ goto fail_alloc;
+ }
+
+ /*
+ * super_copy is zeroed at allocation time and we never touch the
+ * following bytes up to INFO_SIZE, the checksum is calculated from
+ * the whole block of INFO_SIZE
+ */
+ memcpy(fs_info->super_copy, disk_super, sizeof(*fs_info->super_copy));
+ btrfs_release_disk_super(disk_super);
+
+ disk_super = fs_info->super_copy;
+
+
+ features = btrfs_super_flags(disk_super);
+ if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
+ features &= ~BTRFS_SUPER_FLAG_CHANGING_FSID_V2;
+ btrfs_set_super_flags(disk_super, features);
+ btrfs_info(fs_info,
+ "found metadata UUID change in progress flag, clearing");
+ }
+
+ memcpy(fs_info->super_for_commit, fs_info->super_copy,
+ sizeof(*fs_info->super_for_commit));
+
+ ret = btrfs_validate_mount_super(fs_info);
+ if (ret) {
+ btrfs_err(fs_info, "superblock contains fatal errors");
+ err = -EINVAL;
+ goto fail_alloc;
+ }
+
+ if (!btrfs_super_root(disk_super))
+ goto fail_alloc;
+
+ /* check FS state, whether FS is broken. */
+ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
+ set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
+
+ /*
+ * In the long term, we'll store the compression type in the super
+ * block, and it'll be used for per file compression control.
+ */
+ fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
+
+
+ /* Set up fs_info before parsing mount options */
+ nodesize = btrfs_super_nodesize(disk_super);
+ sectorsize = btrfs_super_sectorsize(disk_super);
+ stripesize = sectorsize;
+ fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
+ fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
+
+ fs_info->nodesize = nodesize;
+ fs_info->sectorsize = sectorsize;
+ fs_info->sectorsize_bits = ilog2(sectorsize);
+ fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
+ fs_info->stripesize = stripesize;
+
+ ret = btrfs_parse_options(fs_info, options, sb->s_flags);
+ if (ret) {
+ err = ret;
+ goto fail_alloc;
+ }
+
+ ret = btrfs_check_features(fs_info, !sb_rdonly(sb));
+ if (ret < 0) {
+ err = ret;
+ goto fail_alloc;
+ }
+
+ if (sectorsize < PAGE_SIZE) {
+ struct btrfs_subpage_info *subpage_info;
+
+ /*
+ * V1 space cache has some hardcoded PAGE_SIZE usage, and is
+ * going to be deprecated.
+ *
+ * Force to use v2 cache for subpage case.
+ */
+ btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
+ btrfs_set_and_info(fs_info, FREE_SPACE_TREE,
+ "forcing free space tree for sector size %u with page size %lu",
+ sectorsize, PAGE_SIZE);
+
+ btrfs_warn(fs_info,
+ "read-write for sector size %u with page size %lu is experimental",
+ sectorsize, PAGE_SIZE);
+ subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL);
+ if (!subpage_info)
+ goto fail_alloc;
+ btrfs_init_subpage_info(subpage_info, sectorsize);
+ fs_info->subpage_info = subpage_info;
+ }
+
+ ret = btrfs_init_workqueues(fs_info);
+ if (ret) {
+ err = ret;
+ goto fail_sb_buffer;
+ }
+
+ sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
+ sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
+
+ sb->s_blocksize = sectorsize;
+ sb->s_blocksize_bits = blksize_bits(sectorsize);
+ memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE);
+
+ mutex_lock(&fs_info->chunk_mutex);
+ ret = btrfs_read_sys_array(fs_info);
+ mutex_unlock(&fs_info->chunk_mutex);
+ if (ret) {
+ btrfs_err(fs_info, "failed to read the system array: %d", ret);
+ goto fail_sb_buffer;
+ }
+
+ generation = btrfs_super_chunk_root_generation(disk_super);
+ level = btrfs_super_chunk_root_level(disk_super);
+ ret = load_super_root(chunk_root, btrfs_super_chunk_root(disk_super),
+ generation, level);
+ if (ret) {
+ btrfs_err(fs_info, "failed to read chunk root");
+ goto fail_tree_roots;
+ }
+
+ read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
+ offsetof(struct btrfs_header, chunk_tree_uuid),
+ BTRFS_UUID_SIZE);
+
+ ret = btrfs_read_chunk_tree(fs_info);
+ if (ret) {
+ btrfs_err(fs_info, "failed to read chunk tree: %d", ret);
+ goto fail_tree_roots;
+ }
+
+ /*
+ * At this point we know all the devices that make this filesystem,
+ * including the seed devices but we don't know yet if the replace
+ * target is required. So free devices that are not part of this
+ * filesystem but skip the replace target device which is checked
+ * below in btrfs_init_dev_replace().
+ */
+ btrfs_free_extra_devids(fs_devices);
+ if (!fs_devices->latest_dev->bdev) {
+ btrfs_err(fs_info, "failed to read devices");
+ goto fail_tree_roots;
+ }
+
+ ret = init_tree_roots(fs_info);
+ if (ret)
+ goto fail_tree_roots;
+
+ /*
+ * Get zone type information of zoned block devices. This will also
+ * handle emulation of a zoned filesystem if a regular device has the
+ * zoned incompat feature flag set.
+ */
+ ret = btrfs_get_dev_zone_info_all_devices(fs_info);
+ if (ret) {
+ btrfs_err(fs_info,
+ "zoned: failed to read device zone info: %d",
+ ret);
+ goto fail_block_groups;
+ }
+
+ /*
+ * If we have a uuid root and we're not being told to rescan we need to
+ * check the generation here so we can set the
+ * BTRFS_FS_UPDATE_UUID_TREE_GEN bit. Otherwise we could commit the
+ * transaction during a balance or the log replay without updating the
+ * uuid generation, and then if we crash we would rescan the uuid tree,
+ * even though it was perfectly fine.
+ */
+ if (fs_info->uuid_root && !btrfs_test_opt(fs_info, RESCAN_UUID_TREE) &&
+ fs_info->generation == btrfs_super_uuid_tree_generation(disk_super))
+ set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
+
+ ret = btrfs_verify_dev_extents(fs_info);
+ if (ret) {
+ btrfs_err(fs_info,
+ "failed to verify dev extents against chunks: %d",
+ ret);
+ goto fail_block_groups;
+ }
+ ret = btrfs_recover_balance(fs_info);
+ if (ret) {
+ btrfs_err(fs_info, "failed to recover balance: %d", ret);
+ goto fail_block_groups;
+ }
+
+ ret = btrfs_init_dev_stats(fs_info);
+ if (ret) {
+ btrfs_err(fs_info, "failed to init dev_stats: %d", ret);
+ goto fail_block_groups;
+ }
+
+ ret = btrfs_init_dev_replace(fs_info);
+ if (ret) {
+ btrfs_err(fs_info, "failed to init dev_replace: %d", ret);
+ goto fail_block_groups;
+ }
+
+ ret = btrfs_check_zoned_mode(fs_info);
+ if (ret) {
+ btrfs_err(fs_info, "failed to initialize zoned mode: %d",
+ ret);
+ goto fail_block_groups;
+ }
+
+ ret = btrfs_sysfs_add_fsid(fs_devices);
+ if (ret) {
+ btrfs_err(fs_info, "failed to init sysfs fsid interface: %d",
+ ret);
+ goto fail_block_groups;
+ }
+
+ ret = btrfs_sysfs_add_mounted(fs_info);
+ if (ret) {
+ btrfs_err(fs_info, "failed to init sysfs interface: %d", ret);
+ goto fail_fsdev_sysfs;
+ }
+
+ ret = btrfs_init_space_info(fs_info);
+ if (ret) {
+ btrfs_err(fs_info, "failed to initialize space info: %d", ret);
+ goto fail_sysfs;
+ }
+
+ ret = btrfs_read_block_groups(fs_info);
+ if (ret) {
+ btrfs_err(fs_info, "failed to read block groups: %d", ret);
+ goto fail_sysfs;
+ }
+
+ btrfs_free_zone_cache(fs_info);
+
+ if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
+ !btrfs_check_rw_degradable(fs_info, NULL)) {
+ btrfs_warn(fs_info,
+ "writable mount is not allowed due to too many missing devices");
+ goto fail_sysfs;
+ }
+
+ fs_info->cleaner_kthread = kthread_run(cleaner_kthread, fs_info,
+ "btrfs-cleaner");
+ if (IS_ERR(fs_info->cleaner_kthread))
+ goto fail_sysfs;
+
+ fs_info->transaction_kthread = kthread_run(transaction_kthread,
+ tree_root,
+ "btrfs-transaction");
+ if (IS_ERR(fs_info->transaction_kthread))
+ goto fail_cleaner;
+
+ if (!btrfs_test_opt(fs_info, NOSSD) &&
+ !fs_info->fs_devices->rotating) {
+ btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations");
+ }
+
+ /*
+ * Mount does not set all options immediately, we can do it now and do
+ * not have to wait for transaction commit
+ */
+ btrfs_apply_pending_changes(fs_info);
+
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) {
+ ret = btrfsic_mount(fs_info, fs_devices,
+ btrfs_test_opt(fs_info,
+ CHECK_INTEGRITY_DATA) ? 1 : 0,
+ fs_info->check_integrity_print_mask);
+ if (ret)
+ btrfs_warn(fs_info,
+ "failed to initialize integrity check module: %d",
+ ret);
+ }
+#endif
+ ret = btrfs_read_qgroup_config(fs_info);
+ if (ret)
+ goto fail_trans_kthread;
+
+ if (btrfs_build_ref_tree(fs_info))
+ btrfs_err(fs_info, "couldn't build ref tree");
+
+ /* do not make disk changes in broken FS or nologreplay is given */
+ if (btrfs_super_log_root(disk_super) != 0 &&
+ !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
+ btrfs_info(fs_info, "start tree-log replay");
+ ret = btrfs_replay_log(fs_info, fs_devices);
+ if (ret) {
+ err = ret;
+ goto fail_qgroup;
+ }
+ }
+
+ fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true);
+ if (IS_ERR(fs_info->fs_root)) {
+ err = PTR_ERR(fs_info->fs_root);
+ btrfs_warn(fs_info, "failed to read fs tree: %d", err);
+ fs_info->fs_root = NULL;
+ goto fail_qgroup;
+ }
+
+ if (sb_rdonly(sb))
+ goto clear_oneshot;
+
+ ret = btrfs_start_pre_rw_mount(fs_info);
+ if (ret) {
+ close_ctree(fs_info);
+ return ret;
+ }
+ btrfs_discard_resume(fs_info);
+
+ if (fs_info->uuid_root &&
+ (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) ||
+ fs_info->generation != btrfs_super_uuid_tree_generation(disk_super))) {
+ btrfs_info(fs_info, "checking UUID tree");
+ ret = btrfs_check_uuid_tree(fs_info);
+ if (ret) {
+ btrfs_warn(fs_info,
+ "failed to check the UUID tree: %d", ret);
+ close_ctree(fs_info);
+ return ret;
+ }
+ }
+
+ set_bit(BTRFS_FS_OPEN, &fs_info->flags);
+
+ /* Kick the cleaner thread so it'll start deleting snapshots. */
+ if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags))
+ wake_up_process(fs_info->cleaner_kthread);
+
+clear_oneshot:
+ btrfs_clear_oneshot_options(fs_info);
+ return 0;
+
+fail_qgroup:
+ btrfs_free_qgroup_config(fs_info);
+fail_trans_kthread:
+ kthread_stop(fs_info->transaction_kthread);
+ btrfs_cleanup_transaction(fs_info);
+ btrfs_free_fs_roots(fs_info);
+fail_cleaner:
+ kthread_stop(fs_info->cleaner_kthread);
+
+ /*
+ * make sure we're done with the btree inode before we stop our
+ * kthreads
+ */
+ filemap_write_and_wait(fs_info->btree_inode->i_mapping);
+
+fail_sysfs:
+ btrfs_sysfs_remove_mounted(fs_info);
+
+fail_fsdev_sysfs:
+ btrfs_sysfs_remove_fsid(fs_info->fs_devices);
+
+fail_block_groups:
+ btrfs_put_block_group_cache(fs_info);
+
+fail_tree_roots:
+ if (fs_info->data_reloc_root)
+ btrfs_drop_and_free_fs_root(fs_info, fs_info->data_reloc_root);
+ free_root_pointers(fs_info, true);
+ invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
+
+fail_sb_buffer:
+ btrfs_stop_all_workers(fs_info);
+ btrfs_free_block_groups(fs_info);
+fail_alloc:
+ btrfs_mapping_tree_free(&fs_info->mapping_tree);
+
+ iput(fs_info->btree_inode);
+fail:
+ btrfs_close_devices(fs_info->fs_devices);
+ return err;
+}
+ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
+
+static void btrfs_end_super_write(struct bio *bio)
+{
+ struct btrfs_device *device = bio->bi_private;
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+ struct page *page;
+
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ page = bvec->bv_page;
+
+ if (bio->bi_status) {
+ btrfs_warn_rl_in_rcu(device->fs_info,
+ "lost page write due to IO error on %s (%d)",
+ rcu_str_deref(device->name),
+ blk_status_to_errno(bio->bi_status));
+ ClearPageUptodate(page);
+ SetPageError(page);
+ btrfs_dev_stat_inc_and_print(device,
+ BTRFS_DEV_STAT_WRITE_ERRS);
+ } else {
+ SetPageUptodate(page);
+ }
+
+ put_page(page);
+ unlock_page(page);
+ }
+
+ bio_put(bio);
+}
+
+struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
+ int copy_num, bool drop_cache)
+{
+ struct btrfs_super_block *super;
+ struct page *page;
+ u64 bytenr, bytenr_orig;
+ struct address_space *mapping = bdev->bd_inode->i_mapping;
+ int ret;
+
+ bytenr_orig = btrfs_sb_offset(copy_num);
+ ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr);
+ if (ret == -ENOENT)
+ return ERR_PTR(-EINVAL);
+ else if (ret)
+ return ERR_PTR(ret);
+
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
+ return ERR_PTR(-EINVAL);
+
+ if (drop_cache) {
+ /* This should only be called with the primary sb. */
+ ASSERT(copy_num == 0);
+
+ /*
+ * Drop the page of the primary superblock, so later read will
+ * always read from the device.
+ */
+ invalidate_inode_pages2_range(mapping,
+ bytenr >> PAGE_SHIFT,
+ (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT);
+ }
+
+ page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
+ if (IS_ERR(page))
+ return ERR_CAST(page);
+
+ super = page_address(page);
+ if (btrfs_super_magic(super) != BTRFS_MAGIC) {
+ btrfs_release_disk_super(super);
+ return ERR_PTR(-ENODATA);
+ }
+
+ if (btrfs_super_bytenr(super) != bytenr_orig) {
+ btrfs_release_disk_super(super);
+ return ERR_PTR(-EINVAL);
+ }
+
+ return super;
+}
+
+
+struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
+{
+ struct btrfs_super_block *super, *latest = NULL;
+ int i;
+ u64 transid = 0;
+
+ /* we would like to check all the supers, but that would make
+ * a btrfs mount succeed after a mkfs from a different FS.
+ * So, we need to add a special mount option to scan for
+ * later supers, using BTRFS_SUPER_MIRROR_MAX instead
+ */
+ for (i = 0; i < 1; i++) {
+ super = btrfs_read_dev_one_super(bdev, i, false);
+ if (IS_ERR(super))
+ continue;
+
+ if (!latest || btrfs_super_generation(super) > transid) {
+ if (latest)
+ btrfs_release_disk_super(super);
+
+ latest = super;
+ transid = btrfs_super_generation(super);
+ }
+ }
+
+ return super;
+}
+
+/*
+ * Write superblock @sb to the @device. Do not wait for completion, all the
+ * pages we use for writing are locked.
+ *
+ * Write @max_mirrors copies of the superblock, where 0 means default that fit
+ * the expected device size at commit time. Note that max_mirrors must be
+ * same for write and wait phases.
+ *
+ * Return number of errors when page is not found or submission fails.
+ */
+static int write_dev_supers(struct btrfs_device *device,
+ struct btrfs_super_block *sb, int max_mirrors)
+{
+ struct btrfs_fs_info *fs_info = device->fs_info;
+ struct address_space *mapping = device->bdev->bd_inode->i_mapping;
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
+ int i;
+ int errors = 0;
+ int ret;
+ u64 bytenr, bytenr_orig;
+
+ if (max_mirrors == 0)
+ max_mirrors = BTRFS_SUPER_MIRROR_MAX;
+
+ shash->tfm = fs_info->csum_shash;
+
+ for (i = 0; i < max_mirrors; i++) {
+ struct page *page;
+ struct bio *bio;
+ struct btrfs_super_block *disk_super;
+
+ bytenr_orig = btrfs_sb_offset(i);
+ ret = btrfs_sb_log_location(device, i, WRITE, &bytenr);
+ if (ret == -ENOENT) {
+ continue;
+ } else if (ret < 0) {
+ btrfs_err(device->fs_info,
+ "couldn't get super block location for mirror %d",
+ i);
+ errors++;
+ continue;
+ }
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >=
+ device->commit_total_bytes)
+ break;
+
+ btrfs_set_super_bytenr(sb, bytenr_orig);
+
+ crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE,
+ BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
+ sb->csum);
+
+ page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT,
+ GFP_NOFS);
+ if (!page) {
+ btrfs_err(device->fs_info,
+ "couldn't get super block page for bytenr %llu",
+ bytenr);
+ errors++;
+ continue;
+ }
+
+ /* Bump the refcount for wait_dev_supers() */
+ get_page(page);
+
+ disk_super = page_address(page);
+ memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);
+
+ /*
+ * Directly use bios here instead of relying on the page cache
+ * to do I/O, so we don't lose the ability to do integrity
+ * checking.
+ */
+ bio = bio_alloc(device->bdev, 1,
+ REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO,
+ GFP_NOFS);
+ bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
+ bio->bi_private = device;
+ bio->bi_end_io = btrfs_end_super_write;
+ __bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE,
+ offset_in_page(bytenr));
+
+ /*
+ * We FUA only the first super block. The others we allow to
+ * go down lazy and there's a short window where the on-disk
+ * copies might still contain the older version.
+ */
+ if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
+ bio->bi_opf |= REQ_FUA;
+
+ btrfsic_check_bio(bio);
+ submit_bio(bio);
+
+ if (btrfs_advance_sb_log(device, i))
+ errors++;
+ }
+ return errors < i ? 0 : -1;
+}
+
+/*
+ * Wait for write completion of superblocks done by write_dev_supers,
+ * @max_mirrors same for write and wait phases.
+ *
+ * Return number of errors when page is not found or not marked up to
+ * date.
+ */
+static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
+{
+ int i;
+ int errors = 0;
+ bool primary_failed = false;
+ int ret;
+ u64 bytenr;
+
+ if (max_mirrors == 0)
+ max_mirrors = BTRFS_SUPER_MIRROR_MAX;
+
+ for (i = 0; i < max_mirrors; i++) {
+ struct page *page;
+
+ ret = btrfs_sb_log_location(device, i, READ, &bytenr);
+ if (ret == -ENOENT) {
+ break;
+ } else if (ret < 0) {
+ errors++;
+ if (i == 0)
+ primary_failed = true;
+ continue;
+ }
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >=
+ device->commit_total_bytes)
+ break;
+
+ page = find_get_page(device->bdev->bd_inode->i_mapping,
+ bytenr >> PAGE_SHIFT);
+ if (!page) {
+ errors++;
+ if (i == 0)
+ primary_failed = true;
+ continue;
+ }
+ /* Page is submitted locked and unlocked once the IO completes */
+ wait_on_page_locked(page);
+ if (PageError(page)) {
+ errors++;
+ if (i == 0)
+ primary_failed = true;
+ }
+
+ /* Drop our reference */
+ put_page(page);
+
+ /* Drop the reference from the writing run */
+ put_page(page);
+ }
+
+ /* log error, force error return */
+ if (primary_failed) {
+ btrfs_err(device->fs_info, "error writing primary super block to device %llu",
+ device->devid);
+ return -1;
+ }
+
+ return errors < i ? 0 : -1;
+}
+
+/*
+ * endio for the write_dev_flush, this will wake anyone waiting
+ * for the barrier when it is done
+ */
+static void btrfs_end_empty_barrier(struct bio *bio)
+{
+ bio_uninit(bio);
+ complete(bio->bi_private);
+}
+
+/*
+ * Submit a flush request to the device if it supports it. Error handling is
+ * done in the waiting counterpart.
+ */
+static void write_dev_flush(struct btrfs_device *device)
+{
+ struct bio *bio = &device->flush_bio;
+
+#ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ /*
+ * When a disk has write caching disabled, we skip submission of a bio
+ * with flush and sync requests before writing the superblock, since
+ * it's not needed. However when the integrity checker is enabled, this
+ * results in reports that there are metadata blocks referred by a
+ * superblock that were not properly flushed. So don't skip the bio
+ * submission only when the integrity checker is enabled for the sake
+ * of simplicity, since this is a debug tool and not meant for use in
+ * non-debug builds.
+ */
+ if (!bdev_write_cache(device->bdev))
+ return;
+#endif
+
+ bio_init(bio, device->bdev, NULL, 0,
+ REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
+ bio->bi_end_io = btrfs_end_empty_barrier;
+ init_completion(&device->flush_wait);
+ bio->bi_private = &device->flush_wait;
+
+ btrfsic_check_bio(bio);
+ submit_bio(bio);
+ set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
+}
+
+/*
+ * If the flush bio has been submitted by write_dev_flush, wait for it.
+ */
+static blk_status_t wait_dev_flush(struct btrfs_device *device)
+{
+ struct bio *bio = &device->flush_bio;
+
+ if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
+ return BLK_STS_OK;
+
+ clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
+ wait_for_completion_io(&device->flush_wait);
+
+ return bio->bi_status;
+}
+
+static int check_barrier_error(struct btrfs_fs_info *fs_info)
+{
+ if (!btrfs_check_rw_degradable(fs_info, NULL))
+ return -EIO;
+ return 0;
+}
+
+/*
+ * send an empty flush down to each device in parallel,
+ * then wait for them
+ */
+static int barrier_all_devices(struct btrfs_fs_info *info)
+{
+ struct list_head *head;
+ struct btrfs_device *dev;
+ int errors_wait = 0;
+ blk_status_t ret;
+
+ lockdep_assert_held(&info->fs_devices->device_list_mutex);
+ /* send down all the barriers */
+ head = &info->fs_devices->devices;
+ list_for_each_entry(dev, head, dev_list) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
+ continue;
+ if (!dev->bdev)
+ continue;
+ if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
+ continue;
+
+ write_dev_flush(dev);
+ dev->last_flush_error = BLK_STS_OK;
+ }
+
+ /* wait for all the barriers */
+ list_for_each_entry(dev, head, dev_list) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
+ continue;
+ if (!dev->bdev) {
+ errors_wait++;
+ continue;
+ }
+ if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
+ continue;
+
+ ret = wait_dev_flush(dev);
+ if (ret) {
+ dev->last_flush_error = ret;
+ btrfs_dev_stat_inc_and_print(dev,
+ BTRFS_DEV_STAT_FLUSH_ERRS);
+ errors_wait++;
+ }
+ }
+
+ if (errors_wait) {
+ /*
+ * At some point we need the status of all disks
+ * to arrive at the volume status. So error checking
+ * is being pushed to a separate loop.
+ */
+ return check_barrier_error(info);
+ }
+ return 0;
+}
+
+int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
+{
+ int raid_type;
+ int min_tolerated = INT_MAX;
+
+ if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 ||
+ (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
+ min_tolerated = min_t(int, min_tolerated,
+ btrfs_raid_array[BTRFS_RAID_SINGLE].
+ tolerated_failures);
+
+ for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
+ if (raid_type == BTRFS_RAID_SINGLE)
+ continue;
+ if (!(flags & btrfs_raid_array[raid_type].bg_flag))
+ continue;
+ min_tolerated = min_t(int, min_tolerated,
+ btrfs_raid_array[raid_type].
+ tolerated_failures);
+ }
+
+ if (min_tolerated == INT_MAX) {
+ pr_warn("BTRFS: unknown raid flag: %llu", flags);
+ min_tolerated = 0;
+ }
+
+ return min_tolerated;
+}
+
+int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
+{
+ struct list_head *head;
+ struct btrfs_device *dev;
+ struct btrfs_super_block *sb;
+ struct btrfs_dev_item *dev_item;
+ int ret;
+ int do_barriers;
+ int max_errors;
+ int total_errors = 0;
+ u64 flags;
+
+ do_barriers = !btrfs_test_opt(fs_info, NOBARRIER);
+
+ /*
+ * max_mirrors == 0 indicates we're from commit_transaction,
+ * not from fsync where the tree roots in fs_info have not
+ * been consistent on disk.
+ */
+ if (max_mirrors == 0)
+ backup_super_roots(fs_info);
+
+ sb = fs_info->super_for_commit;
+ dev_item = &sb->dev_item;
+
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ head = &fs_info->fs_devices->devices;
+ max_errors = btrfs_super_num_devices(fs_info->super_copy) - 1;
+
+ if (do_barriers) {
+ ret = barrier_all_devices(fs_info);
+ if (ret) {
+ mutex_unlock(
+ &fs_info->fs_devices->device_list_mutex);
+ btrfs_handle_fs_error(fs_info, ret,
+ "errors while submitting device barriers.");
+ return ret;
+ }
+ }
+
+ list_for_each_entry(dev, head, dev_list) {
+ if (!dev->bdev) {
+ total_errors++;
+ continue;
+ }
+ if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
+ continue;
+
+ btrfs_set_stack_device_generation(dev_item, 0);
+ btrfs_set_stack_device_type(dev_item, dev->type);
+ btrfs_set_stack_device_id(dev_item, dev->devid);
+ btrfs_set_stack_device_total_bytes(dev_item,
+ dev->commit_total_bytes);
+ btrfs_set_stack_device_bytes_used(dev_item,
+ dev->commit_bytes_used);
+ btrfs_set_stack_device_io_align(dev_item, dev->io_align);
+ btrfs_set_stack_device_io_width(dev_item, dev->io_width);
+ btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
+ memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
+ memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid,
+ BTRFS_FSID_SIZE);
+
+ flags = btrfs_super_flags(sb);
+ btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
+
+ ret = btrfs_validate_write_super(fs_info, sb);
+ if (ret < 0) {
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ btrfs_handle_fs_error(fs_info, -EUCLEAN,
+ "unexpected superblock corruption detected");
+ return -EUCLEAN;
+ }
+
+ ret = write_dev_supers(dev, sb, max_mirrors);
+ if (ret)
+ total_errors++;
+ }
+ if (total_errors > max_errors) {
+ btrfs_err(fs_info, "%d errors while writing supers",
+ total_errors);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+ /* FUA is masked off if unsupported and can't be the reason */
+ btrfs_handle_fs_error(fs_info, -EIO,
+ "%d errors while writing supers",
+ total_errors);
+ return -EIO;
+ }
+
+ total_errors = 0;
+ list_for_each_entry(dev, head, dev_list) {
+ if (!dev->bdev)
+ continue;
+ if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
+ continue;
+
+ ret = wait_dev_supers(dev, max_mirrors);
+ if (ret)
+ total_errors++;
+ }
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ if (total_errors > max_errors) {
+ btrfs_handle_fs_error(fs_info, -EIO,
+ "%d errors while writing supers",
+ total_errors);
+ return -EIO;
+ }
+ return 0;
+}
+
+/* Drop a fs root from the radix tree and free it. */
+void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root)
+{
+ bool drop_ref = false;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ radix_tree_delete(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid);
+ if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state))
+ drop_ref = true;
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+
+ if (BTRFS_FS_ERROR(fs_info)) {
+ ASSERT(root->log_root == NULL);
+ if (root->reloc_root) {
+ btrfs_put_root(root->reloc_root);
+ root->reloc_root = NULL;
+ }
+ }
+
+ if (drop_ref)
+ btrfs_put_root(root);
+}
+
+int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
+{
+ u64 root_objectid = 0;
+ struct btrfs_root *gang[8];
+ int i = 0;
+ int err = 0;
+ unsigned int ret = 0;
+
+ while (1) {
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+ (void **)gang, root_objectid,
+ ARRAY_SIZE(gang));
+ if (!ret) {
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ break;
+ }
+ root_objectid = gang[ret - 1]->root_key.objectid + 1;
+
+ for (i = 0; i < ret; i++) {
+ /* Avoid to grab roots in dead_roots */
+ if (btrfs_root_refs(&gang[i]->root_item) == 0) {
+ gang[i] = NULL;
+ continue;
+ }
+ /* grab all the search result for later use */
+ gang[i] = btrfs_grab_root(gang[i]);
+ }
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+
+ for (i = 0; i < ret; i++) {
+ if (!gang[i])
+ continue;
+ root_objectid = gang[i]->root_key.objectid;
+ err = btrfs_orphan_cleanup(gang[i]);
+ if (err)
+ break;
+ btrfs_put_root(gang[i]);
+ }
+ root_objectid++;
+ }
+
+ /* release the uncleaned roots due to error */
+ for (; i < ret; i++) {
+ if (gang[i])
+ btrfs_put_root(gang[i]);
+ }
+ return err;
+}
+
+int btrfs_commit_super(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root = fs_info->tree_root;
+ struct btrfs_trans_handle *trans;
+
+ mutex_lock(&fs_info->cleaner_mutex);
+ btrfs_run_delayed_iputs(fs_info);
+ mutex_unlock(&fs_info->cleaner_mutex);
+ wake_up_process(fs_info->cleaner_kthread);
+
+ /* wait until ongoing cleanup work done */
+ down_write(&fs_info->cleanup_work_sem);
+ up_write(&fs_info->cleanup_work_sem);
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ return btrfs_commit_transaction(trans);
+}
+
+static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_transaction *trans;
+ struct btrfs_transaction *tmp;
+ bool found = false;
+
+ if (list_empty(&fs_info->trans_list))
+ return;
+
+ /*
+ * This function is only called at the very end of close_ctree(),
+ * thus no other running transaction, no need to take trans_lock.
+ */
+ ASSERT(test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags));
+ list_for_each_entry_safe(trans, tmp, &fs_info->trans_list, list) {
+ struct extent_state *cached = NULL;
+ u64 dirty_bytes = 0;
+ u64 cur = 0;
+ u64 found_start;
+ u64 found_end;
+
+ found = true;
+ while (!find_first_extent_bit(&trans->dirty_pages, cur,
+ &found_start, &found_end, EXTENT_DIRTY, &cached)) {
+ dirty_bytes += found_end + 1 - found_start;
+ cur = found_end + 1;
+ }
+ btrfs_warn(fs_info,
+ "transaction %llu (with %llu dirty metadata bytes) is not committed",
+ trans->transid, dirty_bytes);
+ btrfs_cleanup_one_transaction(trans, fs_info);
+
+ if (trans == fs_info->running_transaction)
+ fs_info->running_transaction = NULL;
+ list_del_init(&trans->list);
+
+ btrfs_put_transaction(trans);
+ trace_btrfs_transaction_commit(fs_info);
+ }
+ ASSERT(!found);
+}
+
+void __cold close_ctree(struct btrfs_fs_info *fs_info)
+{
+ int ret;
+
+ set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
+
+ /*
+ * If we had UNFINISHED_DROPS we could still be processing them, so
+ * clear that bit and wake up relocation so it can stop.
+ * We must do this before stopping the block group reclaim task, because
+ * at btrfs_relocate_block_group() we wait for this bit, and after the
+ * wait we stop with -EINTR if btrfs_fs_closing() returns non-zero - we
+ * have just set BTRFS_FS_CLOSING_START, so btrfs_fs_closing() will
+ * return 1.
+ */
+ btrfs_wake_unfinished_drop(fs_info);
+
+ /*
+ * We may have the reclaim task running and relocating a data block group,
+ * in which case it may create delayed iputs. So stop it before we park
+ * the cleaner kthread otherwise we can get new delayed iputs after
+ * parking the cleaner, and that can make the async reclaim task to hang
+ * if it's waiting for delayed iputs to complete, since the cleaner is
+ * parked and can not run delayed iputs - this will make us hang when
+ * trying to stop the async reclaim task.
+ */
+ cancel_work_sync(&fs_info->reclaim_bgs_work);
+ /*
+ * We don't want the cleaner to start new transactions, add more delayed
+ * iputs, etc. while we're closing. We can't use kthread_stop() yet
+ * because that frees the task_struct, and the transaction kthread might
+ * still try to wake up the cleaner.
+ */
+ kthread_park(fs_info->cleaner_kthread);
+
+ /* wait for the qgroup rescan worker to stop */
+ btrfs_qgroup_wait_for_completion(fs_info, false);
+
+ /* wait for the uuid_scan task to finish */
+ down(&fs_info->uuid_tree_rescan_sem);
+ /* avoid complains from lockdep et al., set sem back to initial state */
+ up(&fs_info->uuid_tree_rescan_sem);
+
+ /* pause restriper - we want to resume on mount */
+ btrfs_pause_balance(fs_info);
+
+ btrfs_dev_replace_suspend_for_unmount(fs_info);
+
+ btrfs_scrub_cancel(fs_info);
+
+ /* wait for any defraggers to finish */
+ wait_event(fs_info->transaction_wait,
+ (atomic_read(&fs_info->defrag_running) == 0));
+
+ /* clear out the rbtree of defraggable inodes */
+ btrfs_cleanup_defrag_inodes(fs_info);
+
+ /*
+ * After we parked the cleaner kthread, ordered extents may have
+ * completed and created new delayed iputs. If one of the async reclaim
+ * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we
+ * can hang forever trying to stop it, because if a delayed iput is
+ * added after it ran btrfs_run_delayed_iputs() and before it called
+ * btrfs_wait_on_delayed_iputs(), it will hang forever since there is
+ * no one else to run iputs.
+ *
+ * So wait for all ongoing ordered extents to complete and then run
+ * delayed iputs. This works because once we reach this point no one
+ * can either create new ordered extents nor create delayed iputs
+ * through some other means.
+ *
+ * Also note that btrfs_wait_ordered_roots() is not safe here, because
+ * it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent,
+ * but the delayed iput for the respective inode is made only when doing
+ * the final btrfs_put_ordered_extent() (which must happen at
+ * btrfs_finish_ordered_io() when we are unmounting).
+ */
+ btrfs_flush_workqueue(fs_info->endio_write_workers);
+ /* Ordered extents for free space inodes. */
+ btrfs_flush_workqueue(fs_info->endio_freespace_worker);
+ btrfs_run_delayed_iputs(fs_info);
+
+ cancel_work_sync(&fs_info->async_reclaim_work);
+ cancel_work_sync(&fs_info->async_data_reclaim_work);
+ cancel_work_sync(&fs_info->preempt_reclaim_work);
+
+ /* Cancel or finish ongoing discard work */
+ btrfs_discard_cleanup(fs_info);
+
+ if (!sb_rdonly(fs_info->sb)) {
+ /*
+ * The cleaner kthread is stopped, so do one final pass over
+ * unused block groups.
+ */
+ btrfs_delete_unused_bgs(fs_info);
+
+ /*
+ * There might be existing delayed inode workers still running
+ * and holding an empty delayed inode item. We must wait for
+ * them to complete first because they can create a transaction.
+ * This happens when someone calls btrfs_balance_delayed_items()
+ * and then a transaction commit runs the same delayed nodes
+ * before any delayed worker has done something with the nodes.
+ * We must wait for any worker here and not at transaction
+ * commit time since that could cause a deadlock.
+ * This is a very rare case.
+ */
+ btrfs_flush_workqueue(fs_info->delayed_workers);
+
+ ret = btrfs_commit_super(fs_info);
+ if (ret)
+ btrfs_err(fs_info, "commit super ret %d", ret);
+ }
+
+ if (BTRFS_FS_ERROR(fs_info))
+ btrfs_error_commit_super(fs_info);
+
+ kthread_stop(fs_info->transaction_kthread);
+ kthread_stop(fs_info->cleaner_kthread);
+
+ ASSERT(list_empty(&fs_info->delayed_iputs));
+ set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
+
+ if (btrfs_check_quota_leak(fs_info)) {
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ btrfs_err(fs_info, "qgroup reserved space leaked");
+ }
+
+ btrfs_free_qgroup_config(fs_info);
+ ASSERT(list_empty(&fs_info->delalloc_roots));
+
+ if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
+ btrfs_info(fs_info, "at unmount delalloc count %lld",
+ percpu_counter_sum(&fs_info->delalloc_bytes));
+ }
+
+ if (percpu_counter_sum(&fs_info->ordered_bytes))
+ btrfs_info(fs_info, "at unmount dio bytes count %lld",
+ percpu_counter_sum(&fs_info->ordered_bytes));
+
+ btrfs_sysfs_remove_mounted(fs_info);
+ btrfs_sysfs_remove_fsid(fs_info->fs_devices);
+
+ btrfs_put_block_group_cache(fs_info);
+
+ /*
+ * we must make sure there is not any read request to
+ * submit after we stopping all workers.
+ */
+ invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
+ btrfs_stop_all_workers(fs_info);
+
+ /* We shouldn't have any transaction open at this point */
+ warn_about_uncommitted_trans(fs_info);
+
+ clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
+ free_root_pointers(fs_info, true);
+ btrfs_free_fs_roots(fs_info);
+
+ /*
+ * We must free the block groups after dropping the fs_roots as we could
+ * have had an IO error and have left over tree log blocks that aren't
+ * cleaned up until the fs roots are freed. This makes the block group
+ * accounting appear to be wrong because there's pending reserved bytes,
+ * so make sure we do the block group cleanup afterwards.
+ */
+ btrfs_free_block_groups(fs_info);
+
+ iput(fs_info->btree_inode);
+
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ if (btrfs_test_opt(fs_info, CHECK_INTEGRITY))
+ btrfsic_unmount(fs_info->fs_devices);
+#endif
+
+ btrfs_mapping_tree_free(&fs_info->mapping_tree);
+ btrfs_close_devices(fs_info->fs_devices);
+}
+
+int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
+ int atomic)
+{
+ int ret;
+ struct inode *btree_inode = buf->pages[0]->mapping->host;
+
+ ret = extent_buffer_uptodate(buf);
+ if (!ret)
+ return ret;
+
+ ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
+ parent_transid, atomic);
+ if (ret == -EAGAIN)
+ return ret;
+ return !ret;
+}
+
+void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
+{
+ struct btrfs_fs_info *fs_info = buf->fs_info;
+ u64 transid = btrfs_header_generation(buf);
+ int was_dirty;
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ /*
+ * This is a fast path so only do this check if we have sanity tests
+ * enabled. Normal people shouldn't be using unmapped buffers as dirty
+ * outside of the sanity tests.
+ */
+ if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
+ return;
+#endif
+ btrfs_assert_tree_write_locked(buf);
+ if (transid != fs_info->generation)
+ WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
+ buf->start, transid, fs_info->generation);
+ was_dirty = set_extent_buffer_dirty(buf);
+ if (!was_dirty)
+ percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
+ buf->len,
+ fs_info->dirty_metadata_batch);
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ /*
+ * Since btrfs_mark_buffer_dirty() can be called with item pointer set
+ * but item data not updated.
+ * So here we should only check item pointers, not item data.
+ */
+ if (btrfs_header_level(buf) == 0 &&
+ btrfs_check_leaf_relaxed(buf)) {
+ btrfs_print_leaf(buf);
+ ASSERT(0);
+ }
+#endif
+}
+
+static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info,
+ int flush_delayed)
+{
+ /*
+ * looks as though older kernels can get into trouble with
+ * this code, they end up stuck in balance_dirty_pages forever
+ */
+ int ret;
+
+ if (current->flags & PF_MEMALLOC)
+ return;
+
+ if (flush_delayed)
+ btrfs_balance_delayed_items(fs_info);
+
+ ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
+ BTRFS_DIRTY_METADATA_THRESH,
+ fs_info->dirty_metadata_batch);
+ if (ret > 0) {
+ balance_dirty_pages_ratelimited(fs_info->btree_inode->i_mapping);
+ }
+}
+
+void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info)
+{
+ __btrfs_btree_balance_dirty(fs_info, 1);
+}
+
+void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
+{
+ __btrfs_btree_balance_dirty(fs_info, 0);
+}
+
+static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
+{
+ /* cleanup FS via transaction */
+ btrfs_cleanup_transaction(fs_info);
+
+ mutex_lock(&fs_info->cleaner_mutex);
+ btrfs_run_delayed_iputs(fs_info);
+ mutex_unlock(&fs_info->cleaner_mutex);
+
+ down_write(&fs_info->cleanup_work_sem);
+ up_write(&fs_info->cleanup_work_sem);
+}
+
+static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *gang[8];
+ u64 root_objectid = 0;
+ int ret;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+ (void **)gang, root_objectid,
+ ARRAY_SIZE(gang))) != 0) {
+ int i;
+
+ for (i = 0; i < ret; i++)
+ gang[i] = btrfs_grab_root(gang[i]);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+
+ for (i = 0; i < ret; i++) {
+ if (!gang[i])
+ continue;
+ root_objectid = gang[i]->root_key.objectid;
+ btrfs_free_log(NULL, gang[i]);
+ btrfs_put_root(gang[i]);
+ }
+ root_objectid++;
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ }
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ btrfs_free_log_root_tree(NULL, fs_info);
+}
+
+static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
+{
+ struct btrfs_ordered_extent *ordered;
+
+ spin_lock(&root->ordered_extent_lock);
+ /*
+ * This will just short circuit the ordered completion stuff which will
+ * make sure the ordered extent gets properly cleaned up.
+ */
+ list_for_each_entry(ordered, &root->ordered_extents,
+ root_extent_list)
+ set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
+ spin_unlock(&root->ordered_extent_lock);
+}
+
+static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root;
+ struct list_head splice;
+
+ INIT_LIST_HEAD(&splice);
+
+ spin_lock(&fs_info->ordered_root_lock);
+ list_splice_init(&fs_info->ordered_roots, &splice);
+ while (!list_empty(&splice)) {
+ root = list_first_entry(&splice, struct btrfs_root,
+ ordered_root);
+ list_move_tail(&root->ordered_root,
+ &fs_info->ordered_roots);
+
+ spin_unlock(&fs_info->ordered_root_lock);
+ btrfs_destroy_ordered_extents(root);
+
+ cond_resched();
+ spin_lock(&fs_info->ordered_root_lock);
+ }
+ spin_unlock(&fs_info->ordered_root_lock);
+
+ /*
+ * We need this here because if we've been flipped read-only we won't
+ * get sync() from the umount, so we need to make sure any ordered
+ * extents that haven't had their dirty pages IO start writeout yet
+ * actually get run and error out properly.
+ */
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+}
+
+static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct rb_node *node;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_node *ref;
+ int ret = 0;
+
+ delayed_refs = &trans->delayed_refs;
+
+ spin_lock(&delayed_refs->lock);
+ if (atomic_read(&delayed_refs->num_entries) == 0) {
+ spin_unlock(&delayed_refs->lock);
+ btrfs_debug(fs_info, "delayed_refs has NO entry");
+ return ret;
+ }
+
+ while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) {
+ struct btrfs_delayed_ref_head *head;
+ struct rb_node *n;
+ bool pin_bytes = false;
+
+ head = rb_entry(node, struct btrfs_delayed_ref_head,
+ href_node);
+ if (btrfs_delayed_ref_lock(delayed_refs, head))
+ continue;
+
+ spin_lock(&head->lock);
+ while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
+ ref = rb_entry(n, struct btrfs_delayed_ref_node,
+ ref_node);
+ ref->in_tree = 0;
+ rb_erase_cached(&ref->ref_node, &head->ref_tree);
+ RB_CLEAR_NODE(&ref->ref_node);
+ if (!list_empty(&ref->add_list))
+ list_del(&ref->add_list);
+ atomic_dec(&delayed_refs->num_entries);
+ btrfs_put_delayed_ref(ref);
+ }
+ if (head->must_insert_reserved)
+ pin_bytes = true;
+ btrfs_free_delayed_extent_op(head->extent_op);
+ btrfs_delete_ref_head(delayed_refs, head);
+ spin_unlock(&head->lock);
+ spin_unlock(&delayed_refs->lock);
+ mutex_unlock(&head->mutex);
+
+ if (pin_bytes) {
+ struct btrfs_block_group *cache;
+
+ cache = btrfs_lookup_block_group(fs_info, head->bytenr);
+ BUG_ON(!cache);
+
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ cache->pinned += head->num_bytes;
+ btrfs_space_info_update_bytes_pinned(fs_info,
+ cache->space_info, head->num_bytes);
+ cache->reserved -= head->num_bytes;
+ cache->space_info->bytes_reserved -= head->num_bytes;
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+
+ btrfs_put_block_group(cache);
+
+ btrfs_error_unpin_extent_range(fs_info, head->bytenr,
+ head->bytenr + head->num_bytes - 1);
+ }
+ btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
+ btrfs_put_delayed_ref_head(head);
+ cond_resched();
+ spin_lock(&delayed_refs->lock);
+ }
+ btrfs_qgroup_destroy_extent_records(trans);
+
+ spin_unlock(&delayed_refs->lock);
+
+ return ret;
+}
+
+static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
+{
+ struct btrfs_inode *btrfs_inode;
+ struct list_head splice;
+
+ INIT_LIST_HEAD(&splice);
+
+ spin_lock(&root->delalloc_lock);
+ list_splice_init(&root->delalloc_inodes, &splice);
+
+ while (!list_empty(&splice)) {
+ struct inode *inode = NULL;
+ btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
+ delalloc_inodes);
+ __btrfs_del_delalloc_inode(root, btrfs_inode);
+ spin_unlock(&root->delalloc_lock);
+
+ /*
+ * Make sure we get a live inode and that it'll not disappear
+ * meanwhile.
+ */
+ inode = igrab(&btrfs_inode->vfs_inode);
+ if (inode) {
+ unsigned int nofs_flag;
+
+ nofs_flag = memalloc_nofs_save();
+ invalidate_inode_pages2(inode->i_mapping);
+ memalloc_nofs_restore(nofs_flag);
+ iput(inode);
+ }
+ spin_lock(&root->delalloc_lock);
+ }
+ spin_unlock(&root->delalloc_lock);
+}
+
+static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root;
+ struct list_head splice;
+
+ INIT_LIST_HEAD(&splice);
+
+ spin_lock(&fs_info->delalloc_root_lock);
+ list_splice_init(&fs_info->delalloc_roots, &splice);
+ while (!list_empty(&splice)) {
+ root = list_first_entry(&splice, struct btrfs_root,
+ delalloc_root);
+ root = btrfs_grab_root(root);
+ BUG_ON(!root);
+ spin_unlock(&fs_info->delalloc_root_lock);
+
+ btrfs_destroy_delalloc_inodes(root);
+ btrfs_put_root(root);
+
+ spin_lock(&fs_info->delalloc_root_lock);
+ }
+ spin_unlock(&fs_info->delalloc_root_lock);
+}
+
+static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *dirty_pages,
+ int mark)
+{
+ int ret;
+ struct extent_buffer *eb;
+ u64 start = 0;
+ u64 end;
+
+ while (1) {
+ ret = find_first_extent_bit(dirty_pages, start, &start, &end,
+ mark, NULL);
+ if (ret)
+ break;
+
+ clear_extent_bits(dirty_pages, start, end, mark);
+ while (start <= end) {
+ eb = find_extent_buffer(fs_info, start);
+ start += fs_info->nodesize;
+ if (!eb)
+ continue;
+ wait_on_extent_buffer_writeback(eb);
+
+ if (test_and_clear_bit(EXTENT_BUFFER_DIRTY,
+ &eb->bflags))
+ clear_extent_buffer_dirty(eb);
+ free_extent_buffer_stale(eb);
+ }
+ }
+
+ return ret;
+}
+
+static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *unpin)
+{
+ u64 start;
+ u64 end;
+ int ret;
+
+ while (1) {
+ struct extent_state *cached_state = NULL;
+
+ /*
+ * The btrfs_finish_extent_commit() may get the same range as
+ * ours between find_first_extent_bit and clear_extent_dirty.
+ * Hence, hold the unused_bg_unpin_mutex to avoid double unpin
+ * the same extent range.
+ */
+ mutex_lock(&fs_info->unused_bg_unpin_mutex);
+ ret = find_first_extent_bit(unpin, 0, &start, &end,
+ EXTENT_DIRTY, &cached_state);
+ if (ret) {
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+ break;
+ }
+
+ clear_extent_dirty(unpin, start, end, &cached_state);
+ free_extent_state(cached_state);
+ btrfs_error_unpin_extent_range(fs_info, start, end);
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+ cond_resched();
+ }
+
+ return 0;
+}
+
+static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache)
+{
+ struct inode *inode;
+
+ inode = cache->io_ctl.inode;
+ if (inode) {
+ unsigned int nofs_flag;
+
+ nofs_flag = memalloc_nofs_save();
+ invalidate_inode_pages2(inode->i_mapping);
+ memalloc_nofs_restore(nofs_flag);
+
+ BTRFS_I(inode)->generation = 0;
+ cache->io_ctl.inode = NULL;
+ iput(inode);
+ }
+ ASSERT(cache->io_ctl.pages == NULL);
+ btrfs_put_block_group(cache);
+}
+
+void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_group *cache;
+
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ while (!list_empty(&cur_trans->dirty_bgs)) {
+ cache = list_first_entry(&cur_trans->dirty_bgs,
+ struct btrfs_block_group,
+ dirty_list);
+
+ if (!list_empty(&cache->io_list)) {
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+ list_del_init(&cache->io_list);
+ btrfs_cleanup_bg_io(cache);
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ }
+
+ list_del_init(&cache->dirty_list);
+ spin_lock(&cache->lock);
+ cache->disk_cache_state = BTRFS_DC_ERROR;
+ spin_unlock(&cache->lock);
+
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+ btrfs_put_block_group(cache);
+ btrfs_delayed_refs_rsv_release(fs_info, 1);
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ }
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+
+ /*
+ * Refer to the definition of io_bgs member for details why it's safe
+ * to use it without any locking
+ */
+ while (!list_empty(&cur_trans->io_bgs)) {
+ cache = list_first_entry(&cur_trans->io_bgs,
+ struct btrfs_block_group,
+ io_list);
+
+ list_del_init(&cache->io_list);
+ spin_lock(&cache->lock);
+ cache->disk_cache_state = BTRFS_DC_ERROR;
+ spin_unlock(&cache->lock);
+ btrfs_cleanup_bg_io(cache);
+ }
+}
+
+void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_device *dev, *tmp;
+
+ btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
+ ASSERT(list_empty(&cur_trans->dirty_bgs));
+ ASSERT(list_empty(&cur_trans->io_bgs));
+
+ list_for_each_entry_safe(dev, tmp, &cur_trans->dev_update_list,
+ post_commit_list) {
+ list_del_init(&dev->post_commit_list);
+ }
+
+ btrfs_destroy_delayed_refs(cur_trans, fs_info);
+
+ cur_trans->state = TRANS_STATE_COMMIT_START;
+ wake_up(&fs_info->transaction_blocked_wait);
+
+ cur_trans->state = TRANS_STATE_UNBLOCKED;
+ wake_up(&fs_info->transaction_wait);
+
+ btrfs_destroy_delayed_inodes(fs_info);
+
+ btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
+ EXTENT_DIRTY);
+ btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);
+
+ btrfs_free_redirty_list(cur_trans);
+
+ cur_trans->state =TRANS_STATE_COMPLETED;
+ wake_up(&cur_trans->commit_wait);
+}
+
+static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_transaction *t;
+
+ mutex_lock(&fs_info->transaction_kthread_mutex);
+
+ spin_lock(&fs_info->trans_lock);
+ while (!list_empty(&fs_info->trans_list)) {
+ t = list_first_entry(&fs_info->trans_list,
+ struct btrfs_transaction, list);
+ if (t->state >= TRANS_STATE_COMMIT_START) {
+ refcount_inc(&t->use_count);
+ spin_unlock(&fs_info->trans_lock);
+ btrfs_wait_for_commit(fs_info, t->transid);
+ btrfs_put_transaction(t);
+ spin_lock(&fs_info->trans_lock);
+ continue;
+ }
+ if (t == fs_info->running_transaction) {
+ t->state = TRANS_STATE_COMMIT_DOING;
+ spin_unlock(&fs_info->trans_lock);
+ /*
+ * We wait for 0 num_writers since we don't hold a trans
+ * handle open currently for this transaction.
+ */
+ wait_event(t->writer_wait,
+ atomic_read(&t->num_writers) == 0);
+ } else {
+ spin_unlock(&fs_info->trans_lock);
+ }
+ btrfs_cleanup_one_transaction(t, fs_info);
+
+ spin_lock(&fs_info->trans_lock);
+ if (t == fs_info->running_transaction)
+ fs_info->running_transaction = NULL;
+ list_del_init(&t->list);
+ spin_unlock(&fs_info->trans_lock);
+
+ btrfs_put_transaction(t);
+ trace_btrfs_transaction_commit(fs_info);
+ spin_lock(&fs_info->trans_lock);
+ }
+ spin_unlock(&fs_info->trans_lock);
+ btrfs_destroy_all_ordered_extents(fs_info);
+ btrfs_destroy_delayed_inodes(fs_info);
+ btrfs_assert_delayed_root_empty(fs_info);
+ btrfs_destroy_all_delalloc_inodes(fs_info);
+ btrfs_drop_all_logs(fs_info);
+ mutex_unlock(&fs_info->transaction_kthread_mutex);
+
+ return 0;
+}
+
+int btrfs_init_root_free_objectid(struct btrfs_root *root)
+{
+ struct btrfs_path *path;
+ int ret;
+ struct extent_buffer *l;
+ struct btrfs_key search_key;
+ struct btrfs_key found_key;
+ int slot;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
+ search_key.type = -1;
+ search_key.offset = (u64)-1;
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ if (ret < 0)
+ goto error;
+ BUG_ON(ret == 0); /* Corruption */
+ if (path->slots[0] > 0) {
+ slot = path->slots[0] - 1;
+ l = path->nodes[0];
+ btrfs_item_key_to_cpu(l, &found_key, slot);
+ root->free_objectid = max_t(u64, found_key.objectid + 1,
+ BTRFS_FIRST_FREE_OBJECTID);
+ } else {
+ root->free_objectid = BTRFS_FIRST_FREE_OBJECTID;
+ }
+ ret = 0;
+error:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
+{
+ int ret;
+ mutex_lock(&root->objectid_mutex);
+
+ if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
+ btrfs_warn(root->fs_info,
+ "the objectid of root %llu reaches its highest value",
+ root->root_key.objectid);
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ *objectid = root->free_objectid++;
+ ret = 0;
+out:
+ mutex_unlock(&root->objectid_mutex);
+ return ret;
+}
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
new file mode 100644
index 000000000..7322af63c
--- /dev/null
+++ b/fs/btrfs/disk-io.h
@@ -0,0 +1,145 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ */
+
+#ifndef BTRFS_DISK_IO_H
+#define BTRFS_DISK_IO_H
+
+#define BTRFS_SUPER_MIRROR_MAX 3
+#define BTRFS_SUPER_MIRROR_SHIFT 12
+
+/*
+ * Fixed blocksize for all devices, applies to specific ways of reading
+ * metadata like superblock. Must meet the set_blocksize requirements.
+ *
+ * Do not change.
+ */
+#define BTRFS_BDEV_BLOCKSIZE (4096)
+
+static inline u64 btrfs_sb_offset(int mirror)
+{
+ u64 start = SZ_16K;
+ if (mirror)
+ return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror);
+ return BTRFS_SUPER_INFO_OFFSET;
+}
+
+struct btrfs_device;
+struct btrfs_fs_devices;
+
+void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info);
+void btrfs_init_fs_info(struct btrfs_fs_info *fs_info);
+int btrfs_verify_level_key(struct extent_buffer *eb, int level,
+ struct btrfs_key *first_key, u64 parent_transid);
+struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 owner_root, u64 parent_transid,
+ int level, struct btrfs_key *first_key);
+struct extent_buffer *btrfs_find_create_tree_block(
+ struct btrfs_fs_info *fs_info,
+ u64 bytenr, u64 owner_root,
+ int level);
+void btrfs_clean_tree_block(struct extent_buffer *buf);
+void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info);
+int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info);
+int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
+ const struct btrfs_super_block *disk_sb);
+int __cold open_ctree(struct super_block *sb,
+ struct btrfs_fs_devices *fs_devices,
+ char *options);
+void __cold close_ctree(struct btrfs_fs_info *fs_info);
+int btrfs_validate_super(struct btrfs_fs_info *fs_info,
+ struct btrfs_super_block *sb, int mirror_num);
+int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount);
+int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);
+struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev);
+struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
+ int copy_num, bool drop_cache);
+int btrfs_commit_super(struct btrfs_fs_info *fs_info);
+struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
+ struct btrfs_key *key);
+int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root);
+void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
+
+struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
+ u64 objectid, bool check_ref);
+struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
+ u64 objectid, dev_t anon_dev);
+struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ u64 objectid);
+int btrfs_global_root_insert(struct btrfs_root *root);
+void btrfs_global_root_delete(struct btrfs_root *root);
+struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_key *key);
+struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr);
+struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr);
+
+void btrfs_free_fs_info(struct btrfs_fs_info *fs_info);
+int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
+void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info);
+void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info);
+void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root);
+int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
+ struct page *page, u64 start, u64 end,
+ int mirror);
+void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
+#endif
+
+/*
+ * This function is used to grab the root, and avoid it is freed when we
+ * access it. But it doesn't ensure that the tree is not dropped.
+ *
+ * If you want to ensure the whole tree is safe, you should use
+ * fs_info->subvol_srcu
+ */
+static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
+{
+ if (!root)
+ return NULL;
+ if (refcount_inc_not_zero(&root->refs))
+ return root;
+ return NULL;
+}
+
+static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
+{
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE))
+ return fs_info->block_group_root;
+ return btrfs_extent_root(fs_info, 0);
+}
+
+void btrfs_put_root(struct btrfs_root *root);
+void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
+int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
+ int atomic);
+int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid,
+ int level, struct btrfs_key *first_key);
+bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num,
+ u64 dio_file_offset,
+ extent_submit_bio_start_t *submit_bio_start);
+blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio,
+ int mirror_num);
+int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *trans,
+ struct btrfs_fs_info *fs_info);
+void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
+ struct btrfs_fs_info *fs_info);
+struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
+ u64 objectid);
+int btree_lock_page_hook(struct page *page, void *data,
+ void (*flush_fn)(void *));
+int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
+int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid);
+int btrfs_init_root_free_objectid(struct btrfs_root *root);
+
+#endif
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
new file mode 100644
index 000000000..fab7eb76e
--- /dev/null
+++ b/fs/btrfs/export.c
@@ -0,0 +1,281 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "btrfs_inode.h"
+#include "print-tree.h"
+#include "export.h"
+
+#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \
+ parent_objectid) / 4)
+#define BTRFS_FID_SIZE_CONNECTABLE (offsetof(struct btrfs_fid, \
+ parent_root_objectid) / 4)
+#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4)
+
+static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
+ struct inode *parent)
+{
+ struct btrfs_fid *fid = (struct btrfs_fid *)fh;
+ int len = *max_len;
+ int type;
+
+ if (parent && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
+ *max_len = BTRFS_FID_SIZE_CONNECTABLE;
+ return FILEID_INVALID;
+ } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) {
+ *max_len = BTRFS_FID_SIZE_NON_CONNECTABLE;
+ return FILEID_INVALID;
+ }
+
+ len = BTRFS_FID_SIZE_NON_CONNECTABLE;
+ type = FILEID_BTRFS_WITHOUT_PARENT;
+
+ fid->objectid = btrfs_ino(BTRFS_I(inode));
+ fid->root_objectid = BTRFS_I(inode)->root->root_key.objectid;
+ fid->gen = inode->i_generation;
+
+ if (parent) {
+ u64 parent_root_id;
+
+ fid->parent_objectid = BTRFS_I(parent)->location.objectid;
+ fid->parent_gen = parent->i_generation;
+ parent_root_id = BTRFS_I(parent)->root->root_key.objectid;
+
+ if (parent_root_id != fid->root_objectid) {
+ fid->parent_root_objectid = parent_root_id;
+ len = BTRFS_FID_SIZE_CONNECTABLE_ROOT;
+ type = FILEID_BTRFS_WITH_PARENT_ROOT;
+ } else {
+ len = BTRFS_FID_SIZE_CONNECTABLE;
+ type = FILEID_BTRFS_WITH_PARENT;
+ }
+ }
+
+ *max_len = len;
+ return type;
+}
+
+struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
+ u64 root_objectid, u64 generation,
+ int check_generation)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_root *root;
+ struct inode *inode;
+
+ if (objectid < BTRFS_FIRST_FREE_OBJECTID)
+ return ERR_PTR(-ESTALE);
+
+ root = btrfs_get_fs_root(fs_info, root_objectid, true);
+ if (IS_ERR(root))
+ return ERR_CAST(root);
+
+ inode = btrfs_iget(sb, objectid, root);
+ btrfs_put_root(root);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ if (check_generation && generation != inode->i_generation) {
+ iput(inode);
+ return ERR_PTR(-ESTALE);
+ }
+
+ return d_obtain_alias(inode);
+}
+
+static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
+ int fh_len, int fh_type)
+{
+ struct btrfs_fid *fid = (struct btrfs_fid *) fh;
+ u64 objectid, root_objectid;
+ u32 generation;
+
+ if (fh_type == FILEID_BTRFS_WITH_PARENT) {
+ if (fh_len < BTRFS_FID_SIZE_CONNECTABLE)
+ return NULL;
+ root_objectid = fid->root_objectid;
+ } else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) {
+ if (fh_len < BTRFS_FID_SIZE_CONNECTABLE_ROOT)
+ return NULL;
+ root_objectid = fid->parent_root_objectid;
+ } else
+ return NULL;
+
+ objectid = fid->parent_objectid;
+ generation = fid->parent_gen;
+
+ return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
+}
+
+static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
+ int fh_len, int fh_type)
+{
+ struct btrfs_fid *fid = (struct btrfs_fid *) fh;
+ u64 objectid, root_objectid;
+ u32 generation;
+
+ if ((fh_type != FILEID_BTRFS_WITH_PARENT ||
+ fh_len < BTRFS_FID_SIZE_CONNECTABLE) &&
+ (fh_type != FILEID_BTRFS_WITH_PARENT_ROOT ||
+ fh_len < BTRFS_FID_SIZE_CONNECTABLE_ROOT) &&
+ (fh_type != FILEID_BTRFS_WITHOUT_PARENT ||
+ fh_len < BTRFS_FID_SIZE_NON_CONNECTABLE))
+ return NULL;
+
+ objectid = fid->objectid;
+ root_objectid = fid->root_objectid;
+ generation = fid->gen;
+
+ return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
+}
+
+struct dentry *btrfs_get_parent(struct dentry *child)
+{
+ struct inode *dir = d_inode(child);
+ struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_root_ref *ref;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return ERR_PTR(-ENOMEM);
+
+ if (btrfs_ino(BTRFS_I(dir)) == BTRFS_FIRST_FREE_OBJECTID) {
+ key.objectid = root->root_key.objectid;
+ key.type = BTRFS_ROOT_BACKREF_KEY;
+ key.offset = (u64)-1;
+ root = fs_info->tree_root;
+ } else {
+ key.objectid = btrfs_ino(BTRFS_I(dir));
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = (u64)-1;
+ }
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto fail;
+
+ BUG_ON(ret == 0); /* Key with offset of -1 found */
+ if (path->slots[0] == 0) {
+ ret = -ENOENT;
+ goto fail;
+ }
+
+ path->slots[0]--;
+ leaf = path->nodes[0];
+
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.objectid != key.objectid || found_key.type != key.type) {
+ ret = -ENOENT;
+ goto fail;
+ }
+
+ if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
+ ref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_root_ref);
+ key.objectid = btrfs_root_ref_dirid(leaf, ref);
+ } else {
+ key.objectid = found_key.offset;
+ }
+ btrfs_free_path(path);
+
+ if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
+ return btrfs_get_dentry(fs_info->sb, key.objectid,
+ found_key.offset, 0, 0);
+ }
+
+ return d_obtain_alias(btrfs_iget(fs_info->sb, key.objectid, root));
+fail:
+ btrfs_free_path(path);
+ return ERR_PTR(ret);
+}
+
+static int btrfs_get_name(struct dentry *parent, char *name,
+ struct dentry *child)
+{
+ struct inode *inode = d_inode(child);
+ struct inode *dir = d_inode(parent);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_path *path;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_inode_ref *iref;
+ struct btrfs_root_ref *rref;
+ struct extent_buffer *leaf;
+ unsigned long name_ptr;
+ struct btrfs_key key;
+ int name_len;
+ int ret;
+ u64 ino;
+
+ if (!S_ISDIR(dir->i_mode))
+ return -EINVAL;
+
+ ino = btrfs_ino(BTRFS_I(inode));
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ if (ino == BTRFS_FIRST_FREE_OBJECTID) {
+ key.objectid = BTRFS_I(inode)->root->root_key.objectid;
+ key.type = BTRFS_ROOT_BACKREF_KEY;
+ key.offset = (u64)-1;
+ root = fs_info->tree_root;
+ } else {
+ key.objectid = ino;
+ key.offset = btrfs_ino(BTRFS_I(dir));
+ key.type = BTRFS_INODE_REF_KEY;
+ }
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ btrfs_free_path(path);
+ return ret;
+ } else if (ret > 0) {
+ if (ino == BTRFS_FIRST_FREE_OBJECTID) {
+ path->slots[0]--;
+ } else {
+ btrfs_free_path(path);
+ return -ENOENT;
+ }
+ }
+ leaf = path->nodes[0];
+
+ if (ino == BTRFS_FIRST_FREE_OBJECTID) {
+ rref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_root_ref);
+ name_ptr = (unsigned long)(rref + 1);
+ name_len = btrfs_root_ref_name_len(leaf, rref);
+ } else {
+ iref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_inode_ref);
+ name_ptr = (unsigned long)(iref + 1);
+ name_len = btrfs_inode_ref_name_len(leaf, iref);
+ }
+
+ read_extent_buffer(leaf, name, name_ptr, name_len);
+ btrfs_free_path(path);
+
+ /*
+ * have to add the null termination to make sure that reconnect_path
+ * gets the right len for strlen
+ */
+ name[name_len] = '\0';
+
+ return 0;
+}
+
+const struct export_operations btrfs_export_ops = {
+ .encode_fh = btrfs_encode_fh,
+ .fh_to_dentry = btrfs_fh_to_dentry,
+ .fh_to_parent = btrfs_fh_to_parent,
+ .get_parent = btrfs_get_parent,
+ .get_name = btrfs_get_name,
+};
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
new file mode 100644
index 000000000..5afb7ca42
--- /dev/null
+++ b/fs/btrfs/export.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_EXPORT_H
+#define BTRFS_EXPORT_H
+
+#include <linux/exportfs.h>
+
+extern const struct export_operations btrfs_export_ops;
+
+struct btrfs_fid {
+ u64 objectid;
+ u64 root_objectid;
+ u32 gen;
+
+ u64 parent_objectid;
+ u32 parent_gen;
+
+ u64 parent_root_objectid;
+} __attribute__ ((packed));
+
+struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
+ u64 root_objectid, u64 generation,
+ int check_generation);
+struct dentry *btrfs_get_parent(struct dentry *child);
+
+#endif
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
new file mode 100644
index 000000000..09ae0e73e
--- /dev/null
+++ b/fs/btrfs/extent-io-tree.c
@@ -0,0 +1,1680 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/slab.h>
+#include <trace/events/btrfs.h>
+#include "ctree.h"
+#include "extent-io-tree.h"
+#include "btrfs_inode.h"
+#include "misc.h"
+
+static struct kmem_cache *extent_state_cache;
+
+static inline bool extent_state_in_tree(const struct extent_state *state)
+{
+ return !RB_EMPTY_NODE(&state->rb_node);
+}
+
+#ifdef CONFIG_BTRFS_DEBUG
+static LIST_HEAD(states);
+static DEFINE_SPINLOCK(leak_lock);
+
+static inline void btrfs_leak_debug_add_state(struct extent_state *state)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&leak_lock, flags);
+ list_add(&state->leak_list, &states);
+ spin_unlock_irqrestore(&leak_lock, flags);
+}
+
+static inline void btrfs_leak_debug_del_state(struct extent_state *state)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&leak_lock, flags);
+ list_del(&state->leak_list);
+ spin_unlock_irqrestore(&leak_lock, flags);
+}
+
+static inline void btrfs_extent_state_leak_debug_check(void)
+{
+ struct extent_state *state;
+
+ while (!list_empty(&states)) {
+ state = list_entry(states.next, struct extent_state, leak_list);
+ pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
+ state->start, state->end, state->state,
+ extent_state_in_tree(state),
+ refcount_read(&state->refs));
+ list_del(&state->leak_list);
+ kmem_cache_free(extent_state_cache, state);
+ }
+}
+
+#define btrfs_debug_check_extent_io_range(tree, start, end) \
+ __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
+static inline void __btrfs_debug_check_extent_io_range(const char *caller,
+ struct extent_io_tree *tree,
+ u64 start, u64 end)
+{
+ struct inode *inode = tree->private_data;
+ u64 isize;
+
+ if (!inode)
+ return;
+
+ isize = i_size_read(inode);
+ if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
+ btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
+ "%s: ino %llu isize %llu odd range [%llu,%llu]",
+ caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
+ }
+}
+#else
+#define btrfs_leak_debug_add_state(state) do {} while (0)
+#define btrfs_leak_debug_del_state(state) do {} while (0)
+#define btrfs_extent_state_leak_debug_check() do {} while (0)
+#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
+#endif
+
+/*
+ * For the file_extent_tree, we want to hold the inode lock when we lookup and
+ * update the disk_i_size, but lockdep will complain because our io_tree we hold
+ * the tree lock and get the inode lock when setting delalloc. These two things
+ * are unrelated, so make a class for the file_extent_tree so we don't get the
+ * two locking patterns mixed up.
+ */
+static struct lock_class_key file_extent_tree_class;
+
+struct tree_entry {
+ u64 start;
+ u64 end;
+ struct rb_node rb_node;
+};
+
+void extent_io_tree_init(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *tree, unsigned int owner,
+ void *private_data)
+{
+ tree->fs_info = fs_info;
+ tree->state = RB_ROOT;
+ spin_lock_init(&tree->lock);
+ tree->private_data = private_data;
+ tree->owner = owner;
+ if (owner == IO_TREE_INODE_FILE_EXTENT)
+ lockdep_set_class(&tree->lock, &file_extent_tree_class);
+}
+
+void extent_io_tree_release(struct extent_io_tree *tree)
+{
+ spin_lock(&tree->lock);
+ /*
+ * Do a single barrier for the waitqueue_active check here, the state
+ * of the waitqueue should not change once extent_io_tree_release is
+ * called.
+ */
+ smp_mb();
+ while (!RB_EMPTY_ROOT(&tree->state)) {
+ struct rb_node *node;
+ struct extent_state *state;
+
+ node = rb_first(&tree->state);
+ state = rb_entry(node, struct extent_state, rb_node);
+ rb_erase(&state->rb_node, &tree->state);
+ RB_CLEAR_NODE(&state->rb_node);
+ /*
+ * btree io trees aren't supposed to have tasks waiting for
+ * changes in the flags of extent states ever.
+ */
+ ASSERT(!waitqueue_active(&state->wq));
+ free_extent_state(state);
+
+ cond_resched_lock(&tree->lock);
+ }
+ spin_unlock(&tree->lock);
+}
+
+static struct extent_state *alloc_extent_state(gfp_t mask)
+{
+ struct extent_state *state;
+
+ /*
+ * The given mask might be not appropriate for the slab allocator,
+ * drop the unsupported bits
+ */
+ mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
+ state = kmem_cache_alloc(extent_state_cache, mask);
+ if (!state)
+ return state;
+ state->state = 0;
+ RB_CLEAR_NODE(&state->rb_node);
+ btrfs_leak_debug_add_state(state);
+ refcount_set(&state->refs, 1);
+ init_waitqueue_head(&state->wq);
+ trace_alloc_extent_state(state, mask, _RET_IP_);
+ return state;
+}
+
+static struct extent_state *alloc_extent_state_atomic(struct extent_state *prealloc)
+{
+ if (!prealloc)
+ prealloc = alloc_extent_state(GFP_ATOMIC);
+
+ return prealloc;
+}
+
+void free_extent_state(struct extent_state *state)
+{
+ if (!state)
+ return;
+ if (refcount_dec_and_test(&state->refs)) {
+ WARN_ON(extent_state_in_tree(state));
+ btrfs_leak_debug_del_state(state);
+ trace_free_extent_state(state, _RET_IP_);
+ kmem_cache_free(extent_state_cache, state);
+ }
+}
+
+static int add_extent_changeset(struct extent_state *state, u32 bits,
+ struct extent_changeset *changeset,
+ int set)
+{
+ int ret;
+
+ if (!changeset)
+ return 0;
+ if (set && (state->state & bits) == bits)
+ return 0;
+ if (!set && (state->state & bits) == 0)
+ return 0;
+ changeset->bytes_changed += state->end - state->start + 1;
+ ret = ulist_add(&changeset->range_changed, state->start, state->end,
+ GFP_ATOMIC);
+ return ret;
+}
+
+static inline struct extent_state *next_state(struct extent_state *state)
+{
+ struct rb_node *next = rb_next(&state->rb_node);
+
+ if (next)
+ return rb_entry(next, struct extent_state, rb_node);
+ else
+ return NULL;
+}
+
+static inline struct extent_state *prev_state(struct extent_state *state)
+{
+ struct rb_node *next = rb_prev(&state->rb_node);
+
+ if (next)
+ return rb_entry(next, struct extent_state, rb_node);
+ else
+ return NULL;
+}
+
+/*
+ * Search @tree for an entry that contains @offset. Such entry would have
+ * entry->start <= offset && entry->end >= offset.
+ *
+ * @tree: the tree to search
+ * @offset: offset that should fall within an entry in @tree
+ * @node_ret: pointer where new node should be anchored (used when inserting an
+ * entry in the tree)
+ * @parent_ret: points to entry which would have been the parent of the entry,
+ * containing @offset
+ *
+ * Return a pointer to the entry that contains @offset byte address and don't change
+ * @node_ret and @parent_ret.
+ *
+ * If no such entry exists, return pointer to entry that ends before @offset
+ * and fill parameters @node_ret and @parent_ret, ie. does not return NULL.
+ */
+static inline struct extent_state *tree_search_for_insert(struct extent_io_tree *tree,
+ u64 offset,
+ struct rb_node ***node_ret,
+ struct rb_node **parent_ret)
+{
+ struct rb_root *root = &tree->state;
+ struct rb_node **node = &root->rb_node;
+ struct rb_node *prev = NULL;
+ struct extent_state *entry = NULL;
+
+ while (*node) {
+ prev = *node;
+ entry = rb_entry(prev, struct extent_state, rb_node);
+
+ if (offset < entry->start)
+ node = &(*node)->rb_left;
+ else if (offset > entry->end)
+ node = &(*node)->rb_right;
+ else
+ return entry;
+ }
+
+ if (node_ret)
+ *node_ret = node;
+ if (parent_ret)
+ *parent_ret = prev;
+
+ /* Search neighbors until we find the first one past the end */
+ while (entry && offset > entry->end)
+ entry = next_state(entry);
+
+ return entry;
+}
+
+/*
+ * Search offset in the tree or fill neighbor rbtree node pointers.
+ *
+ * @tree: the tree to search
+ * @offset: offset that should fall within an entry in @tree
+ * @next_ret: pointer to the first entry whose range ends after @offset
+ * @prev_ret: pointer to the first entry whose range begins before @offset
+ *
+ * Return a pointer to the entry that contains @offset byte address. If no
+ * such entry exists, then return NULL and fill @prev_ret and @next_ret.
+ * Otherwise return the found entry and other pointers are left untouched.
+ */
+static struct extent_state *tree_search_prev_next(struct extent_io_tree *tree,
+ u64 offset,
+ struct extent_state **prev_ret,
+ struct extent_state **next_ret)
+{
+ struct rb_root *root = &tree->state;
+ struct rb_node **node = &root->rb_node;
+ struct extent_state *orig_prev;
+ struct extent_state *entry = NULL;
+
+ ASSERT(prev_ret);
+ ASSERT(next_ret);
+
+ while (*node) {
+ entry = rb_entry(*node, struct extent_state, rb_node);
+
+ if (offset < entry->start)
+ node = &(*node)->rb_left;
+ else if (offset > entry->end)
+ node = &(*node)->rb_right;
+ else
+ return entry;
+ }
+
+ orig_prev = entry;
+ while (entry && offset > entry->end)
+ entry = next_state(entry);
+ *next_ret = entry;
+ entry = orig_prev;
+
+ while (entry && offset < entry->start)
+ entry = prev_state(entry);
+ *prev_ret = entry;
+
+ return NULL;
+}
+
+/*
+ * Inexact rb-tree search, return the next entry if @offset is not found
+ */
+static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64 offset)
+{
+ return tree_search_for_insert(tree, offset, NULL, NULL);
+}
+
+static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
+{
+ btrfs_panic(tree->fs_info, err,
+ "locking error: extent tree was modified by another thread while locked");
+}
+
+/*
+ * Utility function to look for merge candidates inside a given range. Any
+ * extents with matching state are merged together into a single extent in the
+ * tree. Extents with EXTENT_IO in their state field are not merged because
+ * the end_io handlers need to be able to do operations on them without
+ * sleeping (or doing allocations/splits).
+ *
+ * This should be called with the tree lock held.
+ */
+static void merge_state(struct extent_io_tree *tree, struct extent_state *state)
+{
+ struct extent_state *other;
+
+ if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
+ return;
+
+ other = prev_state(state);
+ if (other && other->end == state->start - 1 &&
+ other->state == state->state) {
+ if (tree->private_data)
+ btrfs_merge_delalloc_extent(tree->private_data,
+ state, other);
+ state->start = other->start;
+ rb_erase(&other->rb_node, &tree->state);
+ RB_CLEAR_NODE(&other->rb_node);
+ free_extent_state(other);
+ }
+ other = next_state(state);
+ if (other && other->start == state->end + 1 &&
+ other->state == state->state) {
+ if (tree->private_data)
+ btrfs_merge_delalloc_extent(tree->private_data, state,
+ other);
+ state->end = other->end;
+ rb_erase(&other->rb_node, &tree->state);
+ RB_CLEAR_NODE(&other->rb_node);
+ free_extent_state(other);
+ }
+}
+
+static void set_state_bits(struct extent_io_tree *tree,
+ struct extent_state *state,
+ u32 bits, struct extent_changeset *changeset)
+{
+ u32 bits_to_set = bits & ~EXTENT_CTLBITS;
+ int ret;
+
+ if (tree->private_data)
+ btrfs_set_delalloc_extent(tree->private_data, state, bits);
+
+ ret = add_extent_changeset(state, bits_to_set, changeset, 1);
+ BUG_ON(ret < 0);
+ state->state |= bits_to_set;
+}
+
+/*
+ * Insert an extent_state struct into the tree. 'bits' are set on the
+ * struct before it is inserted.
+ *
+ * This may return -EEXIST if the extent is already there, in which case the
+ * state struct is freed.
+ *
+ * The tree lock is not taken internally. This is a utility function and
+ * probably isn't what you want to call (see set/clear_extent_bit).
+ */
+static int insert_state(struct extent_io_tree *tree,
+ struct extent_state *state,
+ u32 bits, struct extent_changeset *changeset)
+{
+ struct rb_node **node;
+ struct rb_node *parent = NULL;
+ const u64 end = state->end;
+
+ set_state_bits(tree, state, bits, changeset);
+
+ node = &tree->state.rb_node;
+ while (*node) {
+ struct extent_state *entry;
+
+ parent = *node;
+ entry = rb_entry(parent, struct extent_state, rb_node);
+
+ if (end < entry->start) {
+ node = &(*node)->rb_left;
+ } else if (end > entry->end) {
+ node = &(*node)->rb_right;
+ } else {
+ btrfs_err(tree->fs_info,
+ "found node %llu %llu on insert of %llu %llu",
+ entry->start, entry->end, state->start, end);
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(&state->rb_node, parent, node);
+ rb_insert_color(&state->rb_node, &tree->state);
+
+ merge_state(tree, state);
+ return 0;
+}
+
+/*
+ * Insert state to @tree to the location given by @node and @parent.
+ */
+static void insert_state_fast(struct extent_io_tree *tree,
+ struct extent_state *state, struct rb_node **node,
+ struct rb_node *parent, unsigned bits,
+ struct extent_changeset *changeset)
+{
+ set_state_bits(tree, state, bits, changeset);
+ rb_link_node(&state->rb_node, parent, node);
+ rb_insert_color(&state->rb_node, &tree->state);
+ merge_state(tree, state);
+}
+
+/*
+ * Split a given extent state struct in two, inserting the preallocated
+ * struct 'prealloc' as the newly created second half. 'split' indicates an
+ * offset inside 'orig' where it should be split.
+ *
+ * Before calling,
+ * the tree has 'orig' at [orig->start, orig->end]. After calling, there
+ * are two extent state structs in the tree:
+ * prealloc: [orig->start, split - 1]
+ * orig: [ split, orig->end ]
+ *
+ * The tree locks are not taken by this function. They need to be held
+ * by the caller.
+ */
+static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
+ struct extent_state *prealloc, u64 split)
+{
+ struct rb_node *parent = NULL;
+ struct rb_node **node;
+
+ if (tree->private_data)
+ btrfs_split_delalloc_extent(tree->private_data, orig, split);
+
+ prealloc->start = orig->start;
+ prealloc->end = split - 1;
+ prealloc->state = orig->state;
+ orig->start = split;
+
+ parent = &orig->rb_node;
+ node = &parent;
+ while (*node) {
+ struct extent_state *entry;
+
+ parent = *node;
+ entry = rb_entry(parent, struct extent_state, rb_node);
+
+ if (prealloc->end < entry->start) {
+ node = &(*node)->rb_left;
+ } else if (prealloc->end > entry->end) {
+ node = &(*node)->rb_right;
+ } else {
+ free_extent_state(prealloc);
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(&prealloc->rb_node, parent, node);
+ rb_insert_color(&prealloc->rb_node, &tree->state);
+
+ return 0;
+}
+
+/*
+ * Utility function to clear some bits in an extent state struct. It will
+ * optionally wake up anyone waiting on this state (wake == 1).
+ *
+ * If no bits are set on the state struct after clearing things, the
+ * struct is freed and removed from the tree
+ */
+static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
+ struct extent_state *state,
+ u32 bits, int wake,
+ struct extent_changeset *changeset)
+{
+ struct extent_state *next;
+ u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
+ int ret;
+
+ if (tree->private_data)
+ btrfs_clear_delalloc_extent(tree->private_data, state, bits);
+
+ ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
+ BUG_ON(ret < 0);
+ state->state &= ~bits_to_clear;
+ if (wake)
+ wake_up(&state->wq);
+ if (state->state == 0) {
+ next = next_state(state);
+ if (extent_state_in_tree(state)) {
+ rb_erase(&state->rb_node, &tree->state);
+ RB_CLEAR_NODE(&state->rb_node);
+ free_extent_state(state);
+ } else {
+ WARN_ON(1);
+ }
+ } else {
+ merge_state(tree, state);
+ next = next_state(state);
+ }
+ return next;
+}
+
+/*
+ * Clear some bits on a range in the tree. This may require splitting or
+ * inserting elements in the tree, so the gfp mask is used to indicate which
+ * allocations or sleeping are allowed.
+ *
+ * Pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove the given
+ * range from the tree regardless of state (ie for truncate).
+ *
+ * The range [start, end] is inclusive.
+ *
+ * This takes the tree lock, and returns 0 on success and < 0 on error.
+ */
+int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached_state,
+ gfp_t mask, struct extent_changeset *changeset)
+{
+ struct extent_state *state;
+ struct extent_state *cached;
+ struct extent_state *prealloc = NULL;
+ u64 last_end;
+ int err;
+ int clear = 0;
+ int wake;
+ int delete = (bits & EXTENT_CLEAR_ALL_BITS);
+
+ btrfs_debug_check_extent_io_range(tree, start, end);
+ trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);
+
+ if (delete)
+ bits |= ~EXTENT_CTLBITS;
+
+ if (bits & EXTENT_DELALLOC)
+ bits |= EXTENT_NORESERVE;
+
+ wake = (bits & EXTENT_LOCKED) ? 1 : 0;
+ if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
+ clear = 1;
+again:
+ if (!prealloc) {
+ /*
+ * Don't care for allocation failure here because we might end
+ * up not needing the pre-allocated extent state at all, which
+ * is the case if we only have in the tree extent states that
+ * cover our input range and don't cover too any other range.
+ * If we end up needing a new extent state we allocate it later.
+ */
+ prealloc = alloc_extent_state(mask);
+ }
+
+ spin_lock(&tree->lock);
+ if (cached_state) {
+ cached = *cached_state;
+
+ if (clear) {
+ *cached_state = NULL;
+ cached_state = NULL;
+ }
+
+ if (cached && extent_state_in_tree(cached) &&
+ cached->start <= start && cached->end > start) {
+ if (clear)
+ refcount_dec(&cached->refs);
+ state = cached;
+ goto hit_next;
+ }
+ if (clear)
+ free_extent_state(cached);
+ }
+
+ /* This search will find the extents that end after our range starts. */
+ state = tree_search(tree, start);
+ if (!state)
+ goto out;
+hit_next:
+ if (state->start > end)
+ goto out;
+ WARN_ON(state->end < start);
+ last_end = state->end;
+
+ /* The state doesn't have the wanted bits, go ahead. */
+ if (!(state->state & bits)) {
+ state = next_state(state);
+ goto next;
+ }
+
+ /*
+ * | ---- desired range ---- |
+ * | state | or
+ * | ------------- state -------------- |
+ *
+ * We need to split the extent we found, and may flip bits on second
+ * half.
+ *
+ * If the extent we found extends past our range, we just split and
+ * search again. It'll get split again the next time though.
+ *
+ * If the extent we found is inside our range, we clear the desired bit
+ * on it.
+ */
+
+ if (state->start < start) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc)
+ goto search_again;
+ err = split_state(tree, state, prealloc, start);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ prealloc = NULL;
+ if (err)
+ goto out;
+ if (state->end <= end) {
+ state = clear_state_bit(tree, state, bits, wake, changeset);
+ goto next;
+ }
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * We need to split the extent, and clear the bit on the first half.
+ */
+ if (state->start <= end && state->end > end) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc)
+ goto search_again;
+ err = split_state(tree, state, prealloc, end + 1);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ if (wake)
+ wake_up(&state->wq);
+
+ clear_state_bit(tree, prealloc, bits, wake, changeset);
+
+ prealloc = NULL;
+ goto out;
+ }
+
+ state = clear_state_bit(tree, state, bits, wake, changeset);
+next:
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ if (start <= end && state && !need_resched())
+ goto hit_next;
+
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ if (gfpflags_allow_blocking(mask))
+ cond_resched();
+ goto again;
+
+out:
+ spin_unlock(&tree->lock);
+ if (prealloc)
+ free_extent_state(prealloc);
+
+ return 0;
+
+}
+
+static void wait_on_state(struct extent_io_tree *tree,
+ struct extent_state *state)
+ __releases(tree->lock)
+ __acquires(tree->lock)
+{
+ DEFINE_WAIT(wait);
+ prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&tree->lock);
+ schedule();
+ spin_lock(&tree->lock);
+ finish_wait(&state->wq, &wait);
+}
+
+/*
+ * Wait for one or more bits to clear on a range in the state tree.
+ * The range [start, end] is inclusive.
+ * The tree lock is taken by this function
+ */
+void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits)
+{
+ struct extent_state *state;
+
+ btrfs_debug_check_extent_io_range(tree, start, end);
+
+ spin_lock(&tree->lock);
+again:
+ while (1) {
+ /*
+ * This search will find all the extents that end after our
+ * range starts.
+ */
+ state = tree_search(tree, start);
+process_node:
+ if (!state)
+ break;
+ if (state->start > end)
+ goto out;
+
+ if (state->state & bits) {
+ start = state->start;
+ refcount_inc(&state->refs);
+ wait_on_state(tree, state);
+ free_extent_state(state);
+ goto again;
+ }
+ start = state->end + 1;
+
+ if (start > end)
+ break;
+
+ if (!cond_resched_lock(&tree->lock)) {
+ state = next_state(state);
+ goto process_node;
+ }
+ }
+out:
+ spin_unlock(&tree->lock);
+}
+
+static void cache_state_if_flags(struct extent_state *state,
+ struct extent_state **cached_ptr,
+ unsigned flags)
+{
+ if (cached_ptr && !(*cached_ptr)) {
+ if (!flags || (state->state & flags)) {
+ *cached_ptr = state;
+ refcount_inc(&state->refs);
+ }
+ }
+}
+
+static void cache_state(struct extent_state *state,
+ struct extent_state **cached_ptr)
+{
+ return cache_state_if_flags(state, cached_ptr,
+ EXTENT_LOCKED | EXTENT_BOUNDARY);
+}
+
+/*
+ * Find the first state struct with 'bits' set after 'start', and return it.
+ * tree->lock must be held. NULL will returned if nothing was found after
+ * 'start'.
+ */
+static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
+ u64 start, u32 bits)
+{
+ struct extent_state *state;
+
+ /*
+ * This search will find all the extents that end after our range
+ * starts.
+ */
+ state = tree_search(tree, start);
+ while (state) {
+ if (state->end >= start && (state->state & bits))
+ return state;
+ state = next_state(state);
+ }
+ return NULL;
+}
+
+/*
+ * Find the first offset in the io tree with one or more @bits set.
+ *
+ * Note: If there are multiple bits set in @bits, any of them will match.
+ *
+ * Return 0 if we find something, and update @start_ret and @end_ret.
+ * Return 1 if we found nothing.
+ */
+int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits,
+ struct extent_state **cached_state)
+{
+ struct extent_state *state;
+ int ret = 1;
+
+ spin_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->end == start - 1 && extent_state_in_tree(state)) {
+ while ((state = next_state(state)) != NULL) {
+ if (state->state & bits)
+ goto got_it;
+ }
+ free_extent_state(*cached_state);
+ *cached_state = NULL;
+ goto out;
+ }
+ free_extent_state(*cached_state);
+ *cached_state = NULL;
+ }
+
+ state = find_first_extent_bit_state(tree, start, bits);
+got_it:
+ if (state) {
+ cache_state_if_flags(state, cached_state, 0);
+ *start_ret = state->start;
+ *end_ret = state->end;
+ ret = 0;
+ }
+out:
+ spin_unlock(&tree->lock);
+ return ret;
+}
+
+/*
+ * Find a contiguous area of bits
+ *
+ * @tree: io tree to check
+ * @start: offset to start the search from
+ * @start_ret: the first offset we found with the bits set
+ * @end_ret: the final contiguous range of the bits that were set
+ * @bits: bits to look for
+ *
+ * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
+ * to set bits appropriately, and then merge them again. During this time it
+ * will drop the tree->lock, so use this helper if you want to find the actual
+ * contiguous area for given bits. We will search to the first bit we find, and
+ * then walk down the tree until we find a non-contiguous area. The area
+ * returned will be the full contiguous area with the bits set.
+ */
+int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits)
+{
+ struct extent_state *state;
+ int ret = 1;
+
+ spin_lock(&tree->lock);
+ state = find_first_extent_bit_state(tree, start, bits);
+ if (state) {
+ *start_ret = state->start;
+ *end_ret = state->end;
+ while ((state = next_state(state)) != NULL) {
+ if (state->start > (*end_ret + 1))
+ break;
+ *end_ret = state->end;
+ }
+ ret = 0;
+ }
+ spin_unlock(&tree->lock);
+ return ret;
+}
+
+/*
+ * Find a contiguous range of bytes in the file marked as delalloc, not more
+ * than 'max_bytes'. start and end are used to return the range,
+ *
+ * True is returned if we find something, false if nothing was in the tree.
+ */
+bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
+ u64 *end, u64 max_bytes,
+ struct extent_state **cached_state)
+{
+ struct extent_state *state;
+ u64 cur_start = *start;
+ bool found = false;
+ u64 total_bytes = 0;
+
+ spin_lock(&tree->lock);
+
+ /*
+ * This search will find all the extents that end after our range
+ * starts.
+ */
+ state = tree_search(tree, cur_start);
+ if (!state) {
+ *end = (u64)-1;
+ goto out;
+ }
+
+ while (state) {
+ if (found && (state->start != cur_start ||
+ (state->state & EXTENT_BOUNDARY))) {
+ goto out;
+ }
+ if (!(state->state & EXTENT_DELALLOC)) {
+ if (!found)
+ *end = state->end;
+ goto out;
+ }
+ if (!found) {
+ *start = state->start;
+ *cached_state = state;
+ refcount_inc(&state->refs);
+ }
+ found = true;
+ *end = state->end;
+ cur_start = state->end + 1;
+ total_bytes += state->end - state->start + 1;
+ if (total_bytes >= max_bytes)
+ break;
+ state = next_state(state);
+ }
+out:
+ spin_unlock(&tree->lock);
+ return found;
+}
+
+/*
+ * Set some bits on a range in the tree. This may require allocations or
+ * sleeping, so the gfp mask is used to indicate what is allowed.
+ *
+ * If any of the exclusive bits are set, this will fail with -EEXIST if some
+ * part of the range already has the desired bits set. The start of the
+ * existing range is returned in failed_start in this case.
+ *
+ * [start, end] is inclusive This takes the tree lock.
+ */
+static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, u64 *failed_start,
+ struct extent_state **cached_state,
+ struct extent_changeset *changeset, gfp_t mask)
+{
+ struct extent_state *state;
+ struct extent_state *prealloc = NULL;
+ struct rb_node **p;
+ struct rb_node *parent;
+ int err = 0;
+ u64 last_start;
+ u64 last_end;
+ u32 exclusive_bits = (bits & EXTENT_LOCKED);
+
+ btrfs_debug_check_extent_io_range(tree, start, end);
+ trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
+
+ if (exclusive_bits)
+ ASSERT(failed_start);
+ else
+ ASSERT(failed_start == NULL);
+again:
+ if (!prealloc) {
+ /*
+ * Don't care for allocation failure here because we might end
+ * up not needing the pre-allocated extent state at all, which
+ * is the case if we only have in the tree extent states that
+ * cover our input range and don't cover too any other range.
+ * If we end up needing a new extent state we allocate it later.
+ */
+ prealloc = alloc_extent_state(mask);
+ }
+
+ spin_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->start <= start && state->end > start &&
+ extent_state_in_tree(state))
+ goto hit_next;
+ }
+ /*
+ * This search will find all the extents that end after our range
+ * starts.
+ */
+ state = tree_search_for_insert(tree, start, &p, &parent);
+ if (!state) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc)
+ goto search_again;
+ prealloc->start = start;
+ prealloc->end = end;
+ insert_state_fast(tree, prealloc, p, parent, bits, changeset);
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
+ goto out;
+ }
+hit_next:
+ last_start = state->start;
+ last_end = state->end;
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * Just lock what we found and keep going
+ */
+ if (state->start == start && state->end <= end) {
+ if (state->state & exclusive_bits) {
+ *failed_start = state->start;
+ err = -EEXIST;
+ goto out;
+ }
+
+ set_state_bits(tree, state, bits, changeset);
+ cache_state(state, cached_state);
+ merge_state(tree, state);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ state = next_state(state);
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
+ goto search_again;
+ }
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * or
+ * | ------------- state -------------- |
+ *
+ * We need to split the extent we found, and may flip bits on second
+ * half.
+ *
+ * If the extent we found extends past our range, we just split and
+ * search again. It'll get split again the next time though.
+ *
+ * If the extent we found is inside our range, we set the desired bit
+ * on it.
+ */
+ if (state->start < start) {
+ if (state->state & exclusive_bits) {
+ *failed_start = start;
+ err = -EEXIST;
+ goto out;
+ }
+
+ /*
+ * If this extent already has all the bits we want set, then
+ * skip it, not necessary to split it or do anything with it.
+ */
+ if ((state->state & bits) == bits) {
+ start = state->end + 1;
+ cache_state(state, cached_state);
+ goto search_again;
+ }
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc)
+ goto search_again;
+ err = split_state(tree, state, prealloc, start);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ prealloc = NULL;
+ if (err)
+ goto out;
+ if (state->end <= end) {
+ set_state_bits(tree, state, bits, changeset);
+ cache_state(state, cached_state);
+ merge_state(tree, state);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ state = next_state(state);
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
+ }
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state | or | state |
+ *
+ * There's a hole, we need to insert something in it and ignore the
+ * extent we found.
+ */
+ if (state->start > start) {
+ u64 this_end;
+ if (end < last_start)
+ this_end = end;
+ else
+ this_end = last_start - 1;
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc)
+ goto search_again;
+
+ /*
+ * Avoid to free 'prealloc' if it can be merged with the later
+ * extent.
+ */
+ prealloc->start = start;
+ prealloc->end = this_end;
+ err = insert_state(tree, prealloc, bits, changeset);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
+ start = this_end + 1;
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * We need to split the extent, and set the bit on the first half
+ */
+ if (state->start <= end && state->end > end) {
+ if (state->state & exclusive_bits) {
+ *failed_start = start;
+ err = -EEXIST;
+ goto out;
+ }
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc)
+ goto search_again;
+ err = split_state(tree, state, prealloc, end + 1);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ set_state_bits(tree, prealloc, bits, changeset);
+ cache_state(prealloc, cached_state);
+ merge_state(tree, prealloc);
+ prealloc = NULL;
+ goto out;
+ }
+
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ if (gfpflags_allow_blocking(mask))
+ cond_resched();
+ goto again;
+
+out:
+ spin_unlock(&tree->lock);
+ if (prealloc)
+ free_extent_state(prealloc);
+
+ return err;
+
+}
+
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached_state, gfp_t mask)
+{
+ return __set_extent_bit(tree, start, end, bits, NULL, cached_state,
+ NULL, mask);
+}
+
+/*
+ * Convert all bits in a given range from one bit to another
+ *
+ * @tree: the io tree to search
+ * @start: the start offset in bytes
+ * @end: the end offset in bytes (inclusive)
+ * @bits: the bits to set in this range
+ * @clear_bits: the bits to clear in this range
+ * @cached_state: state that we're going to cache
+ *
+ * This will go through and set bits for the given range. If any states exist
+ * already in this range they are set with the given bit and cleared of the
+ * clear_bits. This is only meant to be used by things that are mergeable, ie.
+ * converting from say DELALLOC to DIRTY. This is not meant to be used with
+ * boundary bits like LOCK.
+ *
+ * All allocations are done with GFP_NOFS.
+ */
+int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, u32 clear_bits,
+ struct extent_state **cached_state)
+{
+ struct extent_state *state;
+ struct extent_state *prealloc = NULL;
+ struct rb_node **p;
+ struct rb_node *parent;
+ int err = 0;
+ u64 last_start;
+ u64 last_end;
+ bool first_iteration = true;
+
+ btrfs_debug_check_extent_io_range(tree, start, end);
+ trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
+ clear_bits);
+
+again:
+ if (!prealloc) {
+ /*
+ * Best effort, don't worry if extent state allocation fails
+ * here for the first iteration. We might have a cached state
+ * that matches exactly the target range, in which case no
+ * extent state allocations are needed. We'll only know this
+ * after locking the tree.
+ */
+ prealloc = alloc_extent_state(GFP_NOFS);
+ if (!prealloc && !first_iteration)
+ return -ENOMEM;
+ }
+
+ spin_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->start <= start && state->end > start &&
+ extent_state_in_tree(state))
+ goto hit_next;
+ }
+
+ /*
+ * This search will find all the extents that end after our range
+ * starts.
+ */
+ state = tree_search_for_insert(tree, start, &p, &parent);
+ if (!state) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+ prealloc->start = start;
+ prealloc->end = end;
+ insert_state_fast(tree, prealloc, p, parent, bits, NULL);
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
+ goto out;
+ }
+hit_next:
+ last_start = state->start;
+ last_end = state->end;
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * Just lock what we found and keep going.
+ */
+ if (state->start == start && state->end <= end) {
+ set_state_bits(tree, state, bits, NULL);
+ cache_state(state, cached_state);
+ state = clear_state_bit(tree, state, clear_bits, 0, NULL);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
+ goto search_again;
+ }
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * or
+ * | ------------- state -------------- |
+ *
+ * We need to split the extent we found, and may flip bits on second
+ * half.
+ *
+ * If the extent we found extends past our range, we just split and
+ * search again. It'll get split again the next time though.
+ *
+ * If the extent we found is inside our range, we set the desired bit
+ * on it.
+ */
+ if (state->start < start) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+ err = split_state(tree, state, prealloc, start);
+ if (err)
+ extent_io_tree_panic(tree, err);
+ prealloc = NULL;
+ if (err)
+ goto out;
+ if (state->end <= end) {
+ set_state_bits(tree, state, bits, NULL);
+ cache_state(state, cached_state);
+ state = clear_state_bit(tree, state, clear_bits, 0, NULL);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
+ }
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state | or | state |
+ *
+ * There's a hole, we need to insert something in it and ignore the
+ * extent we found.
+ */
+ if (state->start > start) {
+ u64 this_end;
+ if (end < last_start)
+ this_end = end;
+ else
+ this_end = last_start - 1;
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Avoid to free 'prealloc' if it can be merged with the later
+ * extent.
+ */
+ prealloc->start = start;
+ prealloc->end = this_end;
+ err = insert_state(tree, prealloc, bits, NULL);
+ if (err)
+ extent_io_tree_panic(tree, err);
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
+ start = this_end + 1;
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * We need to split the extent, and set the bit on the first half.
+ */
+ if (state->start <= end && state->end > end) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = split_state(tree, state, prealloc, end + 1);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ set_state_bits(tree, prealloc, bits, NULL);
+ cache_state(prealloc, cached_state);
+ clear_state_bit(tree, prealloc, clear_bits, 0, NULL);
+ prealloc = NULL;
+ goto out;
+ }
+
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ cond_resched();
+ first_iteration = false;
+ goto again;
+
+out:
+ spin_unlock(&tree->lock);
+ if (prealloc)
+ free_extent_state(prealloc);
+
+ return err;
+}
+
+/*
+ * Find the first range that has @bits not set. This range could start before
+ * @start.
+ *
+ * @tree: the tree to search
+ * @start: offset at/after which the found extent should start
+ * @start_ret: records the beginning of the range
+ * @end_ret: records the end of the range (inclusive)
+ * @bits: the set of bits which must be unset
+ *
+ * Since unallocated range is also considered one which doesn't have the bits
+ * set it's possible that @end_ret contains -1, this happens in case the range
+ * spans (last_range_end, end of device]. In this case it's up to the caller to
+ * trim @end_ret to the appropriate size.
+ */
+void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits)
+{
+ struct extent_state *state;
+ struct extent_state *prev = NULL, *next;
+
+ spin_lock(&tree->lock);
+
+ /* Find first extent with bits cleared */
+ while (1) {
+ state = tree_search_prev_next(tree, start, &prev, &next);
+ if (!state && !next && !prev) {
+ /*
+ * Tree is completely empty, send full range and let
+ * caller deal with it
+ */
+ *start_ret = 0;
+ *end_ret = -1;
+ goto out;
+ } else if (!state && !next) {
+ /*
+ * We are past the last allocated chunk, set start at
+ * the end of the last extent.
+ */
+ *start_ret = prev->end + 1;
+ *end_ret = -1;
+ goto out;
+ } else if (!state) {
+ state = next;
+ }
+
+ /*
+ * At this point 'state' either contains 'start' or start is
+ * before 'state'
+ */
+ if (in_range(start, state->start, state->end - state->start + 1)) {
+ if (state->state & bits) {
+ /*
+ * |--range with bits sets--|
+ * |
+ * start
+ */
+ start = state->end + 1;
+ } else {
+ /*
+ * 'start' falls within a range that doesn't
+ * have the bits set, so take its start as the
+ * beginning of the desired range
+ *
+ * |--range with bits cleared----|
+ * |
+ * start
+ */
+ *start_ret = state->start;
+ break;
+ }
+ } else {
+ /*
+ * |---prev range---|---hole/unset---|---node range---|
+ * |
+ * start
+ *
+ * or
+ *
+ * |---hole/unset--||--first node--|
+ * 0 |
+ * start
+ */
+ if (prev)
+ *start_ret = prev->end + 1;
+ else
+ *start_ret = 0;
+ break;
+ }
+ }
+
+ /*
+ * Find the longest stretch from start until an entry which has the
+ * bits set
+ */
+ while (state) {
+ if (state->end >= start && !(state->state & bits)) {
+ *end_ret = state->end;
+ } else {
+ *end_ret = state->start - 1;
+ break;
+ }
+ state = next_state(state);
+ }
+out:
+ spin_unlock(&tree->lock);
+}
+
+/*
+ * Count the number of bytes in the tree that have a given bit(s) set. This
+ * can be fairly slow, except for EXTENT_DIRTY which is cached. The total
+ * number found is returned.
+ */
+u64 count_range_bits(struct extent_io_tree *tree,
+ u64 *start, u64 search_end, u64 max_bytes,
+ u32 bits, int contig)
+{
+ struct extent_state *state;
+ u64 cur_start = *start;
+ u64 total_bytes = 0;
+ u64 last = 0;
+ int found = 0;
+
+ if (WARN_ON(search_end < cur_start))
+ return 0;
+
+ spin_lock(&tree->lock);
+
+ /*
+ * This search will find all the extents that end after our range
+ * starts.
+ */
+ state = tree_search(tree, cur_start);
+ while (state) {
+ if (state->start > search_end)
+ break;
+ if (contig && found && state->start > last + 1)
+ break;
+ if (state->end >= cur_start && (state->state & bits) == bits) {
+ total_bytes += min(search_end, state->end) + 1 -
+ max(cur_start, state->start);
+ if (total_bytes >= max_bytes)
+ break;
+ if (!found) {
+ *start = max(cur_start, state->start);
+ found = 1;
+ }
+ last = state->end;
+ } else if (contig && found) {
+ break;
+ }
+ state = next_state(state);
+ }
+ spin_unlock(&tree->lock);
+ return total_bytes;
+}
+
+/*
+ * Searche a range in the state tree for a given mask. If 'filled' == 1, this
+ * returns 1 only if every extent in the tree has the bits set. Otherwise, 1
+ * is returned if any bit in the range is found set.
+ */
+int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, int filled, struct extent_state *cached)
+{
+ struct extent_state *state = NULL;
+ int bitset = 0;
+
+ spin_lock(&tree->lock);
+ if (cached && extent_state_in_tree(cached) && cached->start <= start &&
+ cached->end > start)
+ state = cached;
+ else
+ state = tree_search(tree, start);
+ while (state && start <= end) {
+ if (filled && state->start > start) {
+ bitset = 0;
+ break;
+ }
+
+ if (state->start > end)
+ break;
+
+ if (state->state & bits) {
+ bitset = 1;
+ if (!filled)
+ break;
+ } else if (filled) {
+ bitset = 0;
+ break;
+ }
+
+ if (state->end == (u64)-1)
+ break;
+
+ start = state->end + 1;
+ if (start > end)
+ break;
+ state = next_state(state);
+ }
+
+ /* We ran out of states and were still inside of our range. */
+ if (filled && !state)
+ bitset = 0;
+ spin_unlock(&tree->lock);
+ return bitset;
+}
+
+/* Wrappers around set/clear extent bit */
+int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_changeset *changeset)
+{
+ /*
+ * We don't support EXTENT_LOCKED yet, as current changeset will
+ * record any bits changed, so for EXTENT_LOCKED case, it will
+ * either fail with -EEXIST or changeset will record the whole
+ * range.
+ */
+ ASSERT(!(bits & EXTENT_LOCKED));
+
+ return __set_extent_bit(tree, start, end, bits, NULL, NULL, changeset,
+ GFP_NOFS);
+}
+
+int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_changeset *changeset)
+{
+ /*
+ * Don't support EXTENT_LOCKED case, same reason as
+ * set_record_extent_bits().
+ */
+ ASSERT(!(bits & EXTENT_LOCKED));
+
+ return __clear_extent_bit(tree, start, end, bits, NULL, GFP_NOFS,
+ changeset);
+}
+
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+{
+ int err;
+ u64 failed_start;
+
+ err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
+ NULL, NULL, GFP_NOFS);
+ if (err == -EEXIST) {
+ if (failed_start > start)
+ clear_extent_bit(tree, start, failed_start - 1,
+ EXTENT_LOCKED, NULL);
+ return 0;
+ }
+ return 1;
+}
+
+/*
+ * Either insert or lock state struct between start and end use mask to tell
+ * us if waiting is desired.
+ */
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached_state)
+{
+ int err;
+ u64 failed_start;
+
+ err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
+ cached_state, NULL, GFP_NOFS);
+ while (err == -EEXIST) {
+ if (failed_start != start)
+ clear_extent_bit(tree, start, failed_start - 1,
+ EXTENT_LOCKED, cached_state);
+
+ wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
+ err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
+ &failed_start, cached_state, NULL,
+ GFP_NOFS);
+ }
+ return err;
+}
+
+void __cold extent_state_free_cachep(void)
+{
+ btrfs_extent_state_leak_debug_check();
+ kmem_cache_destroy(extent_state_cache);
+}
+
+int __init extent_state_init_cachep(void)
+{
+ extent_state_cache = kmem_cache_create("btrfs_extent_state",
+ sizeof(struct extent_state), 0,
+ SLAB_MEM_SPREAD, NULL);
+ if (!extent_state_cache)
+ return -ENOMEM;
+
+ return 0;
+}
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
new file mode 100644
index 000000000..a855f40dd
--- /dev/null
+++ b/fs/btrfs/extent-io-tree.h
@@ -0,0 +1,239 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_EXTENT_IO_TREE_H
+#define BTRFS_EXTENT_IO_TREE_H
+
+struct extent_changeset;
+struct io_failure_record;
+
+/* Bits for the extent state */
+#define EXTENT_DIRTY (1U << 0)
+#define EXTENT_UPTODATE (1U << 1)
+#define EXTENT_LOCKED (1U << 2)
+#define EXTENT_NEW (1U << 3)
+#define EXTENT_DELALLOC (1U << 4)
+#define EXTENT_DEFRAG (1U << 5)
+#define EXTENT_BOUNDARY (1U << 6)
+#define EXTENT_NODATASUM (1U << 7)
+#define EXTENT_CLEAR_META_RESV (1U << 8)
+#define EXTENT_NEED_WAIT (1U << 9)
+#define EXTENT_NORESERVE (1U << 11)
+#define EXTENT_QGROUP_RESERVED (1U << 12)
+#define EXTENT_CLEAR_DATA_RESV (1U << 13)
+/*
+ * Must be cleared only during ordered extent completion or on error paths if we
+ * did not manage to submit bios and create the ordered extents for the range.
+ * Should not be cleared during page release and page invalidation (if there is
+ * an ordered extent in flight), that is left for the ordered extent completion.
+ */
+#define EXTENT_DELALLOC_NEW (1U << 14)
+/*
+ * When an ordered extent successfully completes for a region marked as a new
+ * delalloc range, use this flag when clearing a new delalloc range to indicate
+ * that the VFS' inode number of bytes should be incremented and the inode's new
+ * delalloc bytes decremented, in an atomic way to prevent races with stat(2).
+ */
+#define EXTENT_ADD_INODE_BYTES (1U << 15)
+
+/*
+ * Set during truncate when we're clearing an entire range and we just want the
+ * extent states to go away.
+ */
+#define EXTENT_CLEAR_ALL_BITS (1U << 16)
+
+#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \
+ EXTENT_CLEAR_DATA_RESV)
+#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | \
+ EXTENT_ADD_INODE_BYTES | \
+ EXTENT_CLEAR_ALL_BITS)
+
+/*
+ * Redefined bits above which are used only in the device allocation tree,
+ * shouldn't be using EXTENT_LOCKED / EXTENT_BOUNDARY / EXTENT_CLEAR_META_RESV
+ * / EXTENT_CLEAR_DATA_RESV because they have special meaning to the bit
+ * manipulation functions
+ */
+#define CHUNK_ALLOCATED EXTENT_DIRTY
+#define CHUNK_TRIMMED EXTENT_DEFRAG
+#define CHUNK_STATE_MASK (CHUNK_ALLOCATED | \
+ CHUNK_TRIMMED)
+
+enum {
+ IO_TREE_FS_PINNED_EXTENTS,
+ IO_TREE_FS_EXCLUDED_EXTENTS,
+ IO_TREE_BTREE_INODE_IO,
+ IO_TREE_INODE_IO,
+ IO_TREE_RELOC_BLOCKS,
+ IO_TREE_TRANS_DIRTY_PAGES,
+ IO_TREE_ROOT_DIRTY_LOG_PAGES,
+ IO_TREE_INODE_FILE_EXTENT,
+ IO_TREE_LOG_CSUM_RANGE,
+ IO_TREE_SELFTEST,
+ IO_TREE_DEVICE_ALLOC_STATE,
+};
+
+struct extent_io_tree {
+ struct rb_root state;
+ struct btrfs_fs_info *fs_info;
+ void *private_data;
+
+ /* Who owns this io tree, should be one of IO_TREE_* */
+ u8 owner;
+
+ spinlock_t lock;
+};
+
+struct extent_state {
+ u64 start;
+ u64 end; /* inclusive */
+ struct rb_node rb_node;
+
+ /* ADD NEW ELEMENTS AFTER THIS */
+ wait_queue_head_t wq;
+ refcount_t refs;
+ u32 state;
+
+#ifdef CONFIG_BTRFS_DEBUG
+ struct list_head leak_list;
+#endif
+};
+
+void extent_io_tree_init(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *tree, unsigned int owner,
+ void *private_data);
+void extent_io_tree_release(struct extent_io_tree *tree);
+
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached);
+
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
+
+int __init extent_state_init_cachep(void);
+void __cold extent_state_free_cachep(void);
+
+u64 count_range_bits(struct extent_io_tree *tree,
+ u64 *start, u64 search_end,
+ u64 max_bytes, u32 bits, int contig);
+
+void free_extent_state(struct extent_state *state);
+int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, int filled, struct extent_state *cached_state);
+int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_changeset *changeset);
+int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached, gfp_t mask,
+ struct extent_changeset *changeset);
+
+static inline int clear_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 end, u32 bits,
+ struct extent_state **cached)
+{
+ return __clear_extent_bit(tree, start, end, bits, cached,
+ GFP_NOFS, NULL);
+}
+
+static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached)
+{
+ return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached,
+ GFP_NOFS, NULL);
+}
+
+static inline int unlock_extent_atomic(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
+{
+ return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached,
+ GFP_ATOMIC, NULL);
+}
+
+static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
+ u64 end, u32 bits)
+{
+ return clear_extent_bit(tree, start, end, bits, NULL);
+}
+
+int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_changeset *changeset);
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached_state, gfp_t mask);
+
+static inline int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start,
+ u64 end, u32 bits)
+{
+ return set_extent_bit(tree, start, end, bits, NULL, GFP_NOWAIT);
+}
+
+static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
+ u64 end, u32 bits)
+{
+ return set_extent_bit(tree, start, end, bits, NULL, GFP_NOFS);
+}
+
+static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached_state)
+{
+ return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
+ cached_state, GFP_NOFS, NULL);
+}
+
+static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
+ u64 end, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, mask);
+}
+
+static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
+{
+ return clear_extent_bit(tree, start, end,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING, cached);
+}
+
+int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, u32 clear_bits,
+ struct extent_state **cached_state);
+
+static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start,
+ u64 end, u32 extra_bits,
+ struct extent_state **cached_state)
+{
+ return set_extent_bit(tree, start, end,
+ EXTENT_DELALLOC | extra_bits,
+ cached_state, GFP_NOFS);
+}
+
+static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached_state)
+{
+ return set_extent_bit(tree, start, end,
+ EXTENT_DELALLOC | EXTENT_DEFRAG,
+ cached_state, GFP_NOFS);
+}
+
+static inline int set_extent_new(struct extent_io_tree *tree, u64 start,
+ u64 end)
+{
+ return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, GFP_NOFS);
+}
+
+static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached_state, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, EXTENT_UPTODATE,
+ cached_state, mask);
+}
+
+int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits,
+ struct extent_state **cached_state);
+void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits);
+int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits);
+bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
+ u64 *end, u64 max_bytes,
+ struct extent_state **cached_state);
+void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits);
+
+#endif /* BTRFS_EXTENT_IO_TREE_H */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
new file mode 100644
index 000000000..0d2cc1869
--- /dev/null
+++ b/fs/btrfs/extent-tree.c
@@ -0,0 +1,6264 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ */
+
+#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/sort.h>
+#include <linux/rcupdate.h>
+#include <linux/kthread.h>
+#include <linux/slab.h>
+#include <linux/ratelimit.h>
+#include <linux/percpu_counter.h>
+#include <linux/lockdep.h>
+#include <linux/crc32c.h>
+#include "misc.h"
+#include "tree-log.h"
+#include "disk-io.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "raid56.h"
+#include "locking.h"
+#include "free-space-cache.h"
+#include "free-space-tree.h"
+#include "sysfs.h"
+#include "qgroup.h"
+#include "ref-verify.h"
+#include "space-info.h"
+#include "block-rsv.h"
+#include "delalloc-space.h"
+#include "block-group.h"
+#include "discard.h"
+#include "rcu-string.h"
+#include "zoned.h"
+#include "dev-replace.h"
+
+#undef SCRAMBLE_DELAYED_REFS
+
+
+static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_node *node, u64 parent,
+ u64 root_objectid, u64 owner_objectid,
+ u64 owner_offset, int refs_to_drop,
+ struct btrfs_delayed_extent_op *extra_op);
+static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
+ struct extent_buffer *leaf,
+ struct btrfs_extent_item *ei);
+static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
+ u64 parent, u64 root_objectid,
+ u64 flags, u64 owner, u64 offset,
+ struct btrfs_key *ins, int ref_mod);
+static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_node *node,
+ struct btrfs_delayed_extent_op *extent_op);
+static int find_next_key(struct btrfs_path *path, int level,
+ struct btrfs_key *key);
+
+static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
+{
+ return (cache->flags & bits) == bits;
+}
+
+int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
+ u64 start, u64 num_bytes)
+{
+ u64 end = start + num_bytes - 1;
+ set_extent_bits(&fs_info->excluded_extents, start, end,
+ EXTENT_UPTODATE);
+ return 0;
+}
+
+void btrfs_free_excluded_extents(struct btrfs_block_group *cache)
+{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ u64 start, end;
+
+ start = cache->start;
+ end = start + cache->length - 1;
+
+ clear_extent_bits(&fs_info->excluded_extents, start, end,
+ EXTENT_UPTODATE);
+}
+
+/* simple helper to search for an existing data extent at a given offset */
+int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
+{
+ struct btrfs_root *root = btrfs_extent_root(fs_info, start);
+ int ret;
+ struct btrfs_key key;
+ struct btrfs_path *path;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = start;
+ key.offset = len;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * helper function to lookup reference count and flags of a tree block.
+ *
+ * the head node for delayed ref is used to store the sum of all the
+ * reference count modifications queued up in the rbtree. the head
+ * node may also store the extent flags to set. This way you can check
+ * to see what the reference count and extent flags would be if all of
+ * the delayed refs are not processed.
+ */
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 offset, int metadata, u64 *refs, u64 *flags)
+{
+ struct btrfs_root *extent_root;
+ struct btrfs_delayed_ref_head *head;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_path *path;
+ struct btrfs_extent_item *ei;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ u32 item_size;
+ u64 num_refs;
+ u64 extent_flags;
+ int ret;
+
+ /*
+ * If we don't have skinny metadata, don't bother doing anything
+ * different
+ */
+ if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
+ offset = fs_info->nodesize;
+ metadata = 0;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ if (!trans) {
+ path->skip_locking = 1;
+ path->search_commit_root = 1;
+ }
+
+search_again:
+ key.objectid = bytenr;
+ key.offset = offset;
+ if (metadata)
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ else
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+
+ extent_root = btrfs_extent_root(fs_info, bytenr);
+ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out_free;
+
+ if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
+ if (path->slots[0]) {
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0]);
+ if (key.objectid == bytenr &&
+ key.type == BTRFS_EXTENT_ITEM_KEY &&
+ key.offset == fs_info->nodesize)
+ ret = 0;
+ }
+ }
+
+ if (ret == 0) {
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size(leaf, path->slots[0]);
+ if (item_size >= sizeof(*ei)) {
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_item);
+ num_refs = btrfs_extent_refs(leaf, ei);
+ extent_flags = btrfs_extent_flags(leaf, ei);
+ } else {
+ ret = -EINVAL;
+ btrfs_print_v0_err(fs_info);
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(fs_info, ret, NULL);
+
+ goto out_free;
+ }
+
+ BUG_ON(num_refs == 0);
+ } else {
+ num_refs = 0;
+ extent_flags = 0;
+ ret = 0;
+ }
+
+ if (!trans)
+ goto out;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+ head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
+ if (head) {
+ if (!mutex_trylock(&head->mutex)) {
+ refcount_inc(&head->refs);
+ spin_unlock(&delayed_refs->lock);
+
+ btrfs_release_path(path);
+
+ /*
+ * Mutex was contended, block until it's released and try
+ * again
+ */
+ mutex_lock(&head->mutex);
+ mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref_head(head);
+ goto search_again;
+ }
+ spin_lock(&head->lock);
+ if (head->extent_op && head->extent_op->update_flags)
+ extent_flags |= head->extent_op->flags_to_set;
+ else
+ BUG_ON(num_refs == 0);
+
+ num_refs += head->ref_mod;
+ spin_unlock(&head->lock);
+ mutex_unlock(&head->mutex);
+ }
+ spin_unlock(&delayed_refs->lock);
+out:
+ WARN_ON(num_refs == 0);
+ if (refs)
+ *refs = num_refs;
+ if (flags)
+ *flags = extent_flags;
+out_free:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Back reference rules. Back refs have three main goals:
+ *
+ * 1) differentiate between all holders of references to an extent so that
+ * when a reference is dropped we can make sure it was a valid reference
+ * before freeing the extent.
+ *
+ * 2) Provide enough information to quickly find the holders of an extent
+ * if we notice a given block is corrupted or bad.
+ *
+ * 3) Make it easy to migrate blocks for FS shrinking or storage pool
+ * maintenance. This is actually the same as #2, but with a slightly
+ * different use case.
+ *
+ * There are two kinds of back refs. The implicit back refs is optimized
+ * for pointers in non-shared tree blocks. For a given pointer in a block,
+ * back refs of this kind provide information about the block's owner tree
+ * and the pointer's key. These information allow us to find the block by
+ * b-tree searching. The full back refs is for pointers in tree blocks not
+ * referenced by their owner trees. The location of tree block is recorded
+ * in the back refs. Actually the full back refs is generic, and can be
+ * used in all cases the implicit back refs is used. The major shortcoming
+ * of the full back refs is its overhead. Every time a tree block gets
+ * COWed, we have to update back refs entry for all pointers in it.
+ *
+ * For a newly allocated tree block, we use implicit back refs for
+ * pointers in it. This means most tree related operations only involve
+ * implicit back refs. For a tree block created in old transaction, the
+ * only way to drop a reference to it is COW it. So we can detect the
+ * event that tree block loses its owner tree's reference and do the
+ * back refs conversion.
+ *
+ * When a tree block is COWed through a tree, there are four cases:
+ *
+ * The reference count of the block is one and the tree is the block's
+ * owner tree. Nothing to do in this case.
+ *
+ * The reference count of the block is one and the tree is not the
+ * block's owner tree. In this case, full back refs is used for pointers
+ * in the block. Remove these full back refs, add implicit back refs for
+ * every pointers in the new block.
+ *
+ * The reference count of the block is greater than one and the tree is
+ * the block's owner tree. In this case, implicit back refs is used for
+ * pointers in the block. Add full back refs for every pointers in the
+ * block, increase lower level extents' reference counts. The original
+ * implicit back refs are entailed to the new block.
+ *
+ * The reference count of the block is greater than one and the tree is
+ * not the block's owner tree. Add implicit back refs for every pointer in
+ * the new block, increase lower level extents' reference count.
+ *
+ * Back Reference Key composing:
+ *
+ * The key objectid corresponds to the first byte in the extent,
+ * The key type is used to differentiate between types of back refs.
+ * There are different meanings of the key offset for different types
+ * of back refs.
+ *
+ * File extents can be referenced by:
+ *
+ * - multiple snapshots, subvolumes, or different generations in one subvol
+ * - different files inside a single subvolume
+ * - different offsets inside a file (bookend extents in file.c)
+ *
+ * The extent ref structure for the implicit back refs has fields for:
+ *
+ * - Objectid of the subvolume root
+ * - objectid of the file holding the reference
+ * - original offset in the file
+ * - how many bookend extents
+ *
+ * The key offset for the implicit back refs is hash of the first
+ * three fields.
+ *
+ * The extent ref structure for the full back refs has field for:
+ *
+ * - number of pointers in the tree leaf
+ *
+ * The key offset for the implicit back refs is the first byte of
+ * the tree leaf
+ *
+ * When a file extent is allocated, The implicit back refs is used.
+ * the fields are filled in:
+ *
+ * (root_key.objectid, inode objectid, offset in file, 1)
+ *
+ * When a file extent is removed file truncation, we find the
+ * corresponding implicit back refs and check the following fields:
+ *
+ * (btrfs_header_owner(leaf), inode objectid, offset in file)
+ *
+ * Btree extents can be referenced by:
+ *
+ * - Different subvolumes
+ *
+ * Both the implicit back refs and the full back refs for tree blocks
+ * only consist of key. The key offset for the implicit back refs is
+ * objectid of block's owner tree. The key offset for the full back refs
+ * is the first byte of parent block.
+ *
+ * When implicit back refs is used, information about the lowest key and
+ * level of the tree block are required. These information are stored in
+ * tree block info structure.
+ */
+
+/*
+ * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
+ * is_data == BTRFS_REF_TYPE_DATA, data type is requiried,
+ * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
+ */
+int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
+ struct btrfs_extent_inline_ref *iref,
+ enum btrfs_inline_ref_type is_data)
+{
+ int type = btrfs_extent_inline_ref_type(eb, iref);
+ u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
+
+ if (type == BTRFS_TREE_BLOCK_REF_KEY ||
+ type == BTRFS_SHARED_BLOCK_REF_KEY ||
+ type == BTRFS_SHARED_DATA_REF_KEY ||
+ type == BTRFS_EXTENT_DATA_REF_KEY) {
+ if (is_data == BTRFS_REF_TYPE_BLOCK) {
+ if (type == BTRFS_TREE_BLOCK_REF_KEY)
+ return type;
+ if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
+ ASSERT(eb->fs_info);
+ /*
+ * Every shared one has parent tree block,
+ * which must be aligned to sector size.
+ */
+ if (offset &&
+ IS_ALIGNED(offset, eb->fs_info->sectorsize))
+ return type;
+ }
+ } else if (is_data == BTRFS_REF_TYPE_DATA) {
+ if (type == BTRFS_EXTENT_DATA_REF_KEY)
+ return type;
+ if (type == BTRFS_SHARED_DATA_REF_KEY) {
+ ASSERT(eb->fs_info);
+ /*
+ * Every shared one has parent tree block,
+ * which must be aligned to sector size.
+ */
+ if (offset &&
+ IS_ALIGNED(offset, eb->fs_info->sectorsize))
+ return type;
+ }
+ } else {
+ ASSERT(is_data == BTRFS_REF_TYPE_ANY);
+ return type;
+ }
+ }
+
+ btrfs_print_leaf((struct extent_buffer *)eb);
+ btrfs_err(eb->fs_info,
+ "eb %llu iref 0x%lx invalid extent inline ref type %d",
+ eb->start, (unsigned long)iref, type);
+ WARN_ON(1);
+
+ return BTRFS_REF_TYPE_INVALID;
+}
+
+u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
+{
+ u32 high_crc = ~(u32)0;
+ u32 low_crc = ~(u32)0;
+ __le64 lenum;
+
+ lenum = cpu_to_le64(root_objectid);
+ high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
+ lenum = cpu_to_le64(owner);
+ low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+ lenum = cpu_to_le64(offset);
+ low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+
+ return ((u64)high_crc << 31) ^ (u64)low_crc;
+}
+
+static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
+ struct btrfs_extent_data_ref *ref)
+{
+ return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
+ btrfs_extent_data_ref_objectid(leaf, ref),
+ btrfs_extent_data_ref_offset(leaf, ref));
+}
+
+static int match_extent_data_ref(struct extent_buffer *leaf,
+ struct btrfs_extent_data_ref *ref,
+ u64 root_objectid, u64 owner, u64 offset)
+{
+ if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
+ btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
+ btrfs_extent_data_ref_offset(leaf, ref) != offset)
+ return 0;
+ return 1;
+}
+
+static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ u64 bytenr, u64 parent,
+ u64 root_objectid,
+ u64 owner, u64 offset)
+{
+ struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
+ struct btrfs_key key;
+ struct btrfs_extent_data_ref *ref;
+ struct extent_buffer *leaf;
+ u32 nritems;
+ int ret;
+ int recow;
+ int err = -ENOENT;
+
+ key.objectid = bytenr;
+ if (parent) {
+ key.type = BTRFS_SHARED_DATA_REF_KEY;
+ key.offset = parent;
+ } else {
+ key.type = BTRFS_EXTENT_DATA_REF_KEY;
+ key.offset = hash_extent_data_ref(root_objectid,
+ owner, offset);
+ }
+again:
+ recow = 0;
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0) {
+ err = ret;
+ goto fail;
+ }
+
+ if (parent) {
+ if (!ret)
+ return 0;
+ goto fail;
+ }
+
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ while (1) {
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ err = ret;
+ if (ret)
+ goto fail;
+
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ recow = 1;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != bytenr ||
+ key.type != BTRFS_EXTENT_DATA_REF_KEY)
+ goto fail;
+
+ ref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_data_ref);
+
+ if (match_extent_data_ref(leaf, ref, root_objectid,
+ owner, offset)) {
+ if (recow) {
+ btrfs_release_path(path);
+ goto again;
+ }
+ err = 0;
+ break;
+ }
+ path->slots[0]++;
+ }
+fail:
+ return err;
+}
+
+static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ u64 bytenr, u64 parent,
+ u64 root_objectid, u64 owner,
+ u64 offset, int refs_to_add)
+{
+ struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ u32 size;
+ u32 num_refs;
+ int ret;
+
+ key.objectid = bytenr;
+ if (parent) {
+ key.type = BTRFS_SHARED_DATA_REF_KEY;
+ key.offset = parent;
+ size = sizeof(struct btrfs_shared_data_ref);
+ } else {
+ key.type = BTRFS_EXTENT_DATA_REF_KEY;
+ key.offset = hash_extent_data_ref(root_objectid,
+ owner, offset);
+ size = sizeof(struct btrfs_extent_data_ref);
+ }
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key, size);
+ if (ret && ret != -EEXIST)
+ goto fail;
+
+ leaf = path->nodes[0];
+ if (parent) {
+ struct btrfs_shared_data_ref *ref;
+ ref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_shared_data_ref);
+ if (ret == 0) {
+ btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
+ } else {
+ num_refs = btrfs_shared_data_ref_count(leaf, ref);
+ num_refs += refs_to_add;
+ btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
+ }
+ } else {
+ struct btrfs_extent_data_ref *ref;
+ while (ret == -EEXIST) {
+ ref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_data_ref);
+ if (match_extent_data_ref(leaf, ref, root_objectid,
+ owner, offset))
+ break;
+ btrfs_release_path(path);
+ key.offset++;
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ size);
+ if (ret && ret != -EEXIST)
+ goto fail;
+
+ leaf = path->nodes[0];
+ }
+ ref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_data_ref);
+ if (ret == 0) {
+ btrfs_set_extent_data_ref_root(leaf, ref,
+ root_objectid);
+ btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
+ btrfs_set_extent_data_ref_offset(leaf, ref, offset);
+ btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
+ } else {
+ num_refs = btrfs_extent_data_ref_count(leaf, ref);
+ num_refs += refs_to_add;
+ btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
+ }
+ }
+ btrfs_mark_buffer_dirty(leaf);
+ ret = 0;
+fail:
+ btrfs_release_path(path);
+ return ret;
+}
+
+static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ int refs_to_drop)
+{
+ struct btrfs_key key;
+ struct btrfs_extent_data_ref *ref1 = NULL;
+ struct btrfs_shared_data_ref *ref2 = NULL;
+ struct extent_buffer *leaf;
+ u32 num_refs = 0;
+ int ret = 0;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+ if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
+ ref1 = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_data_ref);
+ num_refs = btrfs_extent_data_ref_count(leaf, ref1);
+ } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
+ ref2 = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_shared_data_ref);
+ num_refs = btrfs_shared_data_ref_count(leaf, ref2);
+ } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
+ btrfs_print_v0_err(trans->fs_info);
+ btrfs_abort_transaction(trans, -EINVAL);
+ return -EINVAL;
+ } else {
+ BUG();
+ }
+
+ BUG_ON(num_refs < refs_to_drop);
+ num_refs -= refs_to_drop;
+
+ if (num_refs == 0) {
+ ret = btrfs_del_item(trans, root, path);
+ } else {
+ if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
+ btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
+ else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
+ btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
+ btrfs_mark_buffer_dirty(leaf);
+ }
+ return ret;
+}
+
+static noinline u32 extent_data_ref_count(struct btrfs_path *path,
+ struct btrfs_extent_inline_ref *iref)
+{
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct btrfs_extent_data_ref *ref1;
+ struct btrfs_shared_data_ref *ref2;
+ u32 num_refs = 0;
+ int type;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+ BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
+ if (iref) {
+ /*
+ * If type is invalid, we should have bailed out earlier than
+ * this call.
+ */
+ type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
+ ASSERT(type != BTRFS_REF_TYPE_INVALID);
+ if (type == BTRFS_EXTENT_DATA_REF_KEY) {
+ ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
+ num_refs = btrfs_extent_data_ref_count(leaf, ref1);
+ } else {
+ ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
+ num_refs = btrfs_shared_data_ref_count(leaf, ref2);
+ }
+ } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
+ ref1 = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_data_ref);
+ num_refs = btrfs_extent_data_ref_count(leaf, ref1);
+ } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
+ ref2 = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_shared_data_ref);
+ num_refs = btrfs_shared_data_ref_count(leaf, ref2);
+ } else {
+ WARN_ON(1);
+ }
+ return num_refs;
+}
+
+static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ u64 bytenr, u64 parent,
+ u64 root_objectid)
+{
+ struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = bytenr;
+ if (parent) {
+ key.type = BTRFS_SHARED_BLOCK_REF_KEY;
+ key.offset = parent;
+ } else {
+ key.type = BTRFS_TREE_BLOCK_REF_KEY;
+ key.offset = root_objectid;
+ }
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0)
+ ret = -ENOENT;
+ return ret;
+}
+
+static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ u64 bytenr, u64 parent,
+ u64 root_objectid)
+{
+ struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = bytenr;
+ if (parent) {
+ key.type = BTRFS_SHARED_BLOCK_REF_KEY;
+ key.offset = parent;
+ } else {
+ key.type = BTRFS_TREE_BLOCK_REF_KEY;
+ key.offset = root_objectid;
+ }
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+ btrfs_release_path(path);
+ return ret;
+}
+
+static inline int extent_ref_type(u64 parent, u64 owner)
+{
+ int type;
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ if (parent > 0)
+ type = BTRFS_SHARED_BLOCK_REF_KEY;
+ else
+ type = BTRFS_TREE_BLOCK_REF_KEY;
+ } else {
+ if (parent > 0)
+ type = BTRFS_SHARED_DATA_REF_KEY;
+ else
+ type = BTRFS_EXTENT_DATA_REF_KEY;
+ }
+ return type;
+}
+
+static int find_next_key(struct btrfs_path *path, int level,
+ struct btrfs_key *key)
+
+{
+ for (; level < BTRFS_MAX_LEVEL; level++) {
+ if (!path->nodes[level])
+ break;
+ if (path->slots[level] + 1 >=
+ btrfs_header_nritems(path->nodes[level]))
+ continue;
+ if (level == 0)
+ btrfs_item_key_to_cpu(path->nodes[level], key,
+ path->slots[level] + 1);
+ else
+ btrfs_node_key_to_cpu(path->nodes[level], key,
+ path->slots[level] + 1);
+ return 0;
+ }
+ return 1;
+}
+
+/*
+ * look for inline back ref. if back ref is found, *ref_ret is set
+ * to the address of inline back ref, and 0 is returned.
+ *
+ * if back ref isn't found, *ref_ret is set to the address where it
+ * should be inserted, and -ENOENT is returned.
+ *
+ * if insert is true and there are too many inline back refs, the path
+ * points to the extent item, and -EAGAIN is returned.
+ *
+ * NOTE: inline back refs are ordered in the same way that back ref
+ * items in the tree are ordered.
+ */
+static noinline_for_stack
+int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_extent_inline_ref **ref_ret,
+ u64 bytenr, u64 num_bytes,
+ u64 parent, u64 root_objectid,
+ u64 owner, u64 offset, int insert)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr);
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct btrfs_extent_item *ei;
+ struct btrfs_extent_inline_ref *iref;
+ u64 flags;
+ u64 item_size;
+ unsigned long ptr;
+ unsigned long end;
+ int extra_size;
+ int type;
+ int want;
+ int ret;
+ int err = 0;
+ bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
+ int needed;
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+
+ want = extent_ref_type(parent, owner);
+ if (insert) {
+ extra_size = btrfs_extent_inline_ref_size(want);
+ path->search_for_extension = 1;
+ path->keep_locks = 1;
+ } else
+ extra_size = -1;
+
+ /*
+ * Owner is our level, so we can just add one to get the level for the
+ * block we are interested in.
+ */
+ if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ key.offset = owner;
+ }
+
+again:
+ ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+
+ /*
+ * We may be a newly converted file system which still has the old fat
+ * extent entries for metadata, so try and see if we have one of those.
+ */
+ if (ret > 0 && skinny_metadata) {
+ skinny_metadata = false;
+ if (path->slots[0]) {
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0]);
+ if (key.objectid == bytenr &&
+ key.type == BTRFS_EXTENT_ITEM_KEY &&
+ key.offset == num_bytes)
+ ret = 0;
+ }
+ if (ret) {
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+ btrfs_release_path(path);
+ goto again;
+ }
+ }
+
+ if (ret && !insert) {
+ err = -ENOENT;
+ goto out;
+ } else if (WARN_ON(ret)) {
+ btrfs_print_leaf(path->nodes[0]);
+ btrfs_err(fs_info,
+"extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu",
+ bytenr, num_bytes, parent, root_objectid, owner,
+ offset);
+ err = -EIO;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size(leaf, path->slots[0]);
+ if (unlikely(item_size < sizeof(*ei))) {
+ err = -EINVAL;
+ btrfs_print_v0_err(fs_info);
+ btrfs_abort_transaction(trans, err);
+ goto out;
+ }
+
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ flags = btrfs_extent_flags(leaf, ei);
+
+ ptr = (unsigned long)(ei + 1);
+ end = (unsigned long)ei + item_size;
+
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
+ ptr += sizeof(struct btrfs_tree_block_info);
+ BUG_ON(ptr > end);
+ }
+
+ if (owner >= BTRFS_FIRST_FREE_OBJECTID)
+ needed = BTRFS_REF_TYPE_DATA;
+ else
+ needed = BTRFS_REF_TYPE_BLOCK;
+
+ err = -ENOENT;
+ while (1) {
+ if (ptr >= end) {
+ if (ptr > end) {
+ err = -EUCLEAN;
+ btrfs_print_leaf(path->nodes[0]);
+ btrfs_crit(fs_info,
+"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
+ path->slots[0], root_objectid, owner, offset, parent);
+ }
+ break;
+ }
+ iref = (struct btrfs_extent_inline_ref *)ptr;
+ type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
+ if (type == BTRFS_REF_TYPE_INVALID) {
+ err = -EUCLEAN;
+ goto out;
+ }
+
+ if (want < type)
+ break;
+ if (want > type) {
+ ptr += btrfs_extent_inline_ref_size(type);
+ continue;
+ }
+
+ if (type == BTRFS_EXTENT_DATA_REF_KEY) {
+ struct btrfs_extent_data_ref *dref;
+ dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+ if (match_extent_data_ref(leaf, dref, root_objectid,
+ owner, offset)) {
+ err = 0;
+ break;
+ }
+ if (hash_extent_data_ref_item(leaf, dref) <
+ hash_extent_data_ref(root_objectid, owner, offset))
+ break;
+ } else {
+ u64 ref_offset;
+ ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
+ if (parent > 0) {
+ if (parent == ref_offset) {
+ err = 0;
+ break;
+ }
+ if (ref_offset < parent)
+ break;
+ } else {
+ if (root_objectid == ref_offset) {
+ err = 0;
+ break;
+ }
+ if (ref_offset < root_objectid)
+ break;
+ }
+ }
+ ptr += btrfs_extent_inline_ref_size(type);
+ }
+ if (err == -ENOENT && insert) {
+ if (item_size + extra_size >=
+ BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
+ err = -EAGAIN;
+ goto out;
+ }
+ /*
+ * To add new inline back ref, we have to make sure
+ * there is no corresponding back ref item.
+ * For simplicity, we just do not add new inline back
+ * ref if there is any kind of item for this block
+ */
+ if (find_next_key(path, 0, &key) == 0 &&
+ key.objectid == bytenr &&
+ key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
+ err = -EAGAIN;
+ goto out;
+ }
+ }
+ *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
+out:
+ if (insert) {
+ path->keep_locks = 0;
+ path->search_for_extension = 0;
+ btrfs_unlock_up_safe(path, 1);
+ }
+ return err;
+}
+
+/*
+ * helper to add new inline back ref
+ */
+static noinline_for_stack
+void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ struct btrfs_extent_inline_ref *iref,
+ u64 parent, u64 root_objectid,
+ u64 owner, u64 offset, int refs_to_add,
+ struct btrfs_delayed_extent_op *extent_op)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_extent_item *ei;
+ unsigned long ptr;
+ unsigned long end;
+ unsigned long item_offset;
+ u64 refs;
+ int size;
+ int type;
+
+ leaf = path->nodes[0];
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ item_offset = (unsigned long)iref - (unsigned long)ei;
+
+ type = extent_ref_type(parent, owner);
+ size = btrfs_extent_inline_ref_size(type);
+
+ btrfs_extend_item(path, size);
+
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ refs = btrfs_extent_refs(leaf, ei);
+ refs += refs_to_add;
+ btrfs_set_extent_refs(leaf, ei, refs);
+ if (extent_op)
+ __run_delayed_extent_op(extent_op, leaf, ei);
+
+ ptr = (unsigned long)ei + item_offset;
+ end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]);
+ if (ptr < end - size)
+ memmove_extent_buffer(leaf, ptr + size, ptr,
+ end - size - ptr);
+
+ iref = (struct btrfs_extent_inline_ref *)ptr;
+ btrfs_set_extent_inline_ref_type(leaf, iref, type);
+ if (type == BTRFS_EXTENT_DATA_REF_KEY) {
+ struct btrfs_extent_data_ref *dref;
+ dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+ btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
+ btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
+ btrfs_set_extent_data_ref_offset(leaf, dref, offset);
+ btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
+ } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
+ struct btrfs_shared_data_ref *sref;
+ sref = (struct btrfs_shared_data_ref *)(iref + 1);
+ btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+ } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
+ btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+ } else {
+ btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
+ }
+ btrfs_mark_buffer_dirty(leaf);
+}
+
+static int lookup_extent_backref(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_extent_inline_ref **ref_ret,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 root_objectid, u64 owner, u64 offset)
+{
+ int ret;
+
+ ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
+ num_bytes, parent, root_objectid,
+ owner, offset, 0);
+ if (ret != -ENOENT)
+ return ret;
+
+ btrfs_release_path(path);
+ *ref_ret = NULL;
+
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ ret = lookup_tree_block_ref(trans, path, bytenr, parent,
+ root_objectid);
+ } else {
+ ret = lookup_extent_data_ref(trans, path, bytenr, parent,
+ root_objectid, owner, offset);
+ }
+ return ret;
+}
+
+/*
+ * helper to update/remove inline back ref
+ */
+static noinline_for_stack
+void update_inline_extent_backref(struct btrfs_path *path,
+ struct btrfs_extent_inline_ref *iref,
+ int refs_to_mod,
+ struct btrfs_delayed_extent_op *extent_op)
+{
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_extent_item *ei;
+ struct btrfs_extent_data_ref *dref = NULL;
+ struct btrfs_shared_data_ref *sref = NULL;
+ unsigned long ptr;
+ unsigned long end;
+ u32 item_size;
+ int size;
+ int type;
+ u64 refs;
+
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ refs = btrfs_extent_refs(leaf, ei);
+ WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
+ refs += refs_to_mod;
+ btrfs_set_extent_refs(leaf, ei, refs);
+ if (extent_op)
+ __run_delayed_extent_op(extent_op, leaf, ei);
+
+ /*
+ * If type is invalid, we should have bailed out after
+ * lookup_inline_extent_backref().
+ */
+ type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
+ ASSERT(type != BTRFS_REF_TYPE_INVALID);
+
+ if (type == BTRFS_EXTENT_DATA_REF_KEY) {
+ dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+ refs = btrfs_extent_data_ref_count(leaf, dref);
+ } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
+ sref = (struct btrfs_shared_data_ref *)(iref + 1);
+ refs = btrfs_shared_data_ref_count(leaf, sref);
+ } else {
+ refs = 1;
+ BUG_ON(refs_to_mod != -1);
+ }
+
+ BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
+ refs += refs_to_mod;
+
+ if (refs > 0) {
+ if (type == BTRFS_EXTENT_DATA_REF_KEY)
+ btrfs_set_extent_data_ref_count(leaf, dref, refs);
+ else
+ btrfs_set_shared_data_ref_count(leaf, sref, refs);
+ } else {
+ size = btrfs_extent_inline_ref_size(type);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
+ ptr = (unsigned long)iref;
+ end = (unsigned long)ei + item_size;
+ if (ptr + size < end)
+ memmove_extent_buffer(leaf, ptr, ptr + size,
+ end - ptr - size);
+ item_size -= size;
+ btrfs_truncate_item(path, item_size, 1);
+ }
+ btrfs_mark_buffer_dirty(leaf);
+}
+
+static noinline_for_stack
+int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 root_objectid, u64 owner,
+ u64 offset, int refs_to_add,
+ struct btrfs_delayed_extent_op *extent_op)
+{
+ struct btrfs_extent_inline_ref *iref;
+ int ret;
+
+ ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
+ num_bytes, parent, root_objectid,
+ owner, offset, 1);
+ if (ret == 0) {
+ /*
+ * We're adding refs to a tree block we already own, this
+ * should not happen at all.
+ */
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_crit(trans->fs_info,
+"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu",
+ bytenr, num_bytes, root_objectid);
+ if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
+ WARN_ON(1);
+ btrfs_crit(trans->fs_info,
+ "path->slots[0]=%d path->nodes[0]:", path->slots[0]);
+ btrfs_print_leaf(path->nodes[0]);
+ }
+ return -EUCLEAN;
+ }
+ update_inline_extent_backref(path, iref, refs_to_add, extent_op);
+ } else if (ret == -ENOENT) {
+ setup_inline_extent_backref(trans->fs_info, path, iref, parent,
+ root_objectid, owner, offset,
+ refs_to_add, extent_op);
+ ret = 0;
+ }
+ return ret;
+}
+
+static int remove_extent_backref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_extent_inline_ref *iref,
+ int refs_to_drop, int is_data)
+{
+ int ret = 0;
+
+ BUG_ON(!is_data && refs_to_drop != 1);
+ if (iref)
+ update_inline_extent_backref(path, iref, -refs_to_drop, NULL);
+ else if (is_data)
+ ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
+ else
+ ret = btrfs_del_item(trans, root, path);
+ return ret;
+}
+
+static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
+ u64 *discarded_bytes)
+{
+ int j, ret = 0;
+ u64 bytes_left, end;
+ u64 aligned_start = ALIGN(start, 1 << 9);
+
+ /* Adjust the range to be aligned to 512B sectors if necessary. */
+ if (start != aligned_start) {
+ len -= aligned_start - start;
+ len = round_down(len, 1 << 9);
+ start = aligned_start;
+ }
+
+ *discarded_bytes = 0;
+
+ if (!len)
+ return 0;
+
+ end = start + len;
+ bytes_left = len;
+
+ /* Skip any superblocks on this device. */
+ for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
+ u64 sb_start = btrfs_sb_offset(j);
+ u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
+ u64 size = sb_start - start;
+
+ if (!in_range(sb_start, start, bytes_left) &&
+ !in_range(sb_end, start, bytes_left) &&
+ !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
+ continue;
+
+ /*
+ * Superblock spans beginning of range. Adjust start and
+ * try again.
+ */
+ if (sb_start <= start) {
+ start += sb_end - start;
+ if (start > end) {
+ bytes_left = 0;
+ break;
+ }
+ bytes_left = end - start;
+ continue;
+ }
+
+ if (size) {
+ ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
+ GFP_NOFS);
+ if (!ret)
+ *discarded_bytes += size;
+ else if (ret != -EOPNOTSUPP)
+ return ret;
+ }
+
+ start = sb_end;
+ if (start > end) {
+ bytes_left = 0;
+ break;
+ }
+ bytes_left = end - start;
+ }
+
+ if (bytes_left) {
+ ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
+ GFP_NOFS);
+ if (!ret)
+ *discarded_bytes += bytes_left;
+ }
+ return ret;
+}
+
+static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes)
+{
+ struct btrfs_device *dev = stripe->dev;
+ struct btrfs_fs_info *fs_info = dev->fs_info;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ u64 phys = stripe->physical;
+ u64 len = stripe->length;
+ u64 discarded = 0;
+ int ret = 0;
+
+ /* Zone reset on a zoned filesystem */
+ if (btrfs_can_zone_reset(dev, phys, len)) {
+ u64 src_disc;
+
+ ret = btrfs_reset_device_zone(dev, phys, len, &discarded);
+ if (ret)
+ goto out;
+
+ if (!btrfs_dev_replace_is_ongoing(dev_replace) ||
+ dev != dev_replace->srcdev)
+ goto out;
+
+ src_disc = discarded;
+
+ /* Send to replace target as well */
+ ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len,
+ &discarded);
+ discarded += src_disc;
+ } else if (bdev_max_discard_sectors(stripe->dev->bdev)) {
+ ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded);
+ } else {
+ ret = 0;
+ *bytes = 0;
+ }
+
+out:
+ *bytes = discarded;
+ return ret;
+}
+
+int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 num_bytes, u64 *actual_bytes)
+{
+ int ret = 0;
+ u64 discarded_bytes = 0;
+ u64 end = bytenr + num_bytes;
+ u64 cur = bytenr;
+
+ /*
+ * Avoid races with device replace and make sure the devices in the
+ * stripes don't go away while we are discarding.
+ */
+ btrfs_bio_counter_inc_blocked(fs_info);
+ while (cur < end) {
+ struct btrfs_discard_stripe *stripes;
+ unsigned int num_stripes;
+ int i;
+
+ num_bytes = end - cur;
+ stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes);
+ if (IS_ERR(stripes)) {
+ ret = PTR_ERR(stripes);
+ if (ret == -EOPNOTSUPP)
+ ret = 0;
+ break;
+ }
+
+ for (i = 0; i < num_stripes; i++) {
+ struct btrfs_discard_stripe *stripe = stripes + i;
+ u64 bytes;
+
+ if (!stripe->dev->bdev) {
+ ASSERT(btrfs_test_opt(fs_info, DEGRADED));
+ continue;
+ }
+
+ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
+ &stripe->dev->dev_state))
+ continue;
+
+ ret = do_discard_extent(stripe, &bytes);
+ if (ret) {
+ /*
+ * Keep going if discard is not supported by the
+ * device.
+ */
+ if (ret != -EOPNOTSUPP)
+ break;
+ ret = 0;
+ } else {
+ discarded_bytes += bytes;
+ }
+ }
+ kfree(stripes);
+ if (ret)
+ break;
+ cur += num_bytes;
+ }
+ btrfs_bio_counter_dec(fs_info);
+ if (actual_bytes)
+ *actual_bytes = discarded_bytes;
+ return ret;
+}
+
+/* Can return -ENOMEM */
+int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_ref *generic_ref)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ int ret;
+
+ ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
+ generic_ref->action);
+ BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
+ generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID);
+
+ if (generic_ref->type == BTRFS_REF_METADATA)
+ ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
+ else
+ ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0);
+
+ btrfs_ref_tree_mod(fs_info, generic_ref);
+
+ return ret;
+}
+
+/*
+ * __btrfs_inc_extent_ref - insert backreference for a given extent
+ *
+ * The counterpart is in __btrfs_free_extent(), with examples and more details
+ * how it works.
+ *
+ * @trans: Handle of transaction
+ *
+ * @node: The delayed ref node used to get the bytenr/length for
+ * extent whose references are incremented.
+ *
+ * @parent: If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
+ * BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
+ * bytenr of the parent block. Since new extents are always
+ * created with indirect references, this will only be the case
+ * when relocating a shared extent. In that case, root_objectid
+ * will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
+ * be 0
+ *
+ * @root_objectid: The id of the root where this modification has originated,
+ * this can be either one of the well-known metadata trees or
+ * the subvolume id which references this extent.
+ *
+ * @owner: For data extents it is the inode number of the owning file.
+ * For metadata extents this parameter holds the level in the
+ * tree of the extent.
+ *
+ * @offset: For metadata extents the offset is ignored and is currently
+ * always passed as 0. For data extents it is the fileoffset
+ * this extent belongs to.
+ *
+ * @refs_to_add Number of references to add
+ *
+ * @extent_op Pointer to a structure, holding information necessary when
+ * updating a tree block's flags
+ *
+ */
+static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_node *node,
+ u64 parent, u64 root_objectid,
+ u64 owner, u64 offset, int refs_to_add,
+ struct btrfs_delayed_extent_op *extent_op)
+{
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_extent_item *item;
+ struct btrfs_key key;
+ u64 bytenr = node->bytenr;
+ u64 num_bytes = node->num_bytes;
+ u64 refs;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /* this will setup the path even if it fails to insert the back ref */
+ ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
+ parent, root_objectid, owner,
+ offset, refs_to_add, extent_op);
+ if ((ret < 0 && ret != -EAGAIN) || !ret)
+ goto out;
+
+ /*
+ * Ok we had -EAGAIN which means we didn't have space to insert and
+ * inline extent ref, so just update the reference count and add a
+ * normal backref.
+ */
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ refs = btrfs_extent_refs(leaf, item);
+ btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
+ if (extent_op)
+ __run_delayed_extent_op(extent_op, leaf, item);
+
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
+ /* now insert the actual backref */
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ BUG_ON(refs_to_add != 1);
+ ret = insert_tree_block_ref(trans, path, bytenr, parent,
+ root_objectid);
+ } else {
+ ret = insert_extent_data_ref(trans, path, bytenr, parent,
+ root_objectid, owner, offset,
+ refs_to_add);
+ }
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_node *node,
+ struct btrfs_delayed_extent_op *extent_op,
+ int insert_reserved)
+{
+ int ret = 0;
+ struct btrfs_delayed_data_ref *ref;
+ struct btrfs_key ins;
+ u64 parent = 0;
+ u64 ref_root = 0;
+ u64 flags = 0;
+
+ ins.objectid = node->bytenr;
+ ins.offset = node->num_bytes;
+ ins.type = BTRFS_EXTENT_ITEM_KEY;
+
+ ref = btrfs_delayed_node_to_data_ref(node);
+ trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
+
+ if (node->type == BTRFS_SHARED_DATA_REF_KEY)
+ parent = ref->parent;
+ ref_root = ref->root;
+
+ if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
+ if (extent_op)
+ flags |= extent_op->flags_to_set;
+ ret = alloc_reserved_file_extent(trans, parent, ref_root,
+ flags, ref->objectid,
+ ref->offset, &ins,
+ node->ref_mod);
+ } else if (node->action == BTRFS_ADD_DELAYED_REF) {
+ ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
+ ref->objectid, ref->offset,
+ node->ref_mod, extent_op);
+ } else if (node->action == BTRFS_DROP_DELAYED_REF) {
+ ret = __btrfs_free_extent(trans, node, parent,
+ ref_root, ref->objectid,
+ ref->offset, node->ref_mod,
+ extent_op);
+ } else {
+ BUG();
+ }
+ return ret;
+}
+
+static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
+ struct extent_buffer *leaf,
+ struct btrfs_extent_item *ei)
+{
+ u64 flags = btrfs_extent_flags(leaf, ei);
+ if (extent_op->update_flags) {
+ flags |= extent_op->flags_to_set;
+ btrfs_set_extent_flags(leaf, ei, flags);
+ }
+
+ if (extent_op->update_key) {
+ struct btrfs_tree_block_info *bi;
+ BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
+ bi = (struct btrfs_tree_block_info *)(ei + 1);
+ btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
+ }
+}
+
+static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *head,
+ struct btrfs_delayed_extent_op *extent_op)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *root;
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ struct btrfs_extent_item *ei;
+ struct extent_buffer *leaf;
+ u32 item_size;
+ int ret;
+ int err = 0;
+ int metadata = 1;
+
+ if (TRANS_ABORTED(trans))
+ return 0;
+
+ if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+ metadata = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = head->bytenr;
+
+ if (metadata) {
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ key.offset = extent_op->level;
+ } else {
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = head->num_bytes;
+ }
+
+ root = btrfs_extent_root(fs_info, key.objectid);
+again:
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ if (ret > 0) {
+ if (metadata) {
+ if (path->slots[0] > 0) {
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0]);
+ if (key.objectid == head->bytenr &&
+ key.type == BTRFS_EXTENT_ITEM_KEY &&
+ key.offset == head->num_bytes)
+ ret = 0;
+ }
+ if (ret > 0) {
+ btrfs_release_path(path);
+ metadata = 0;
+
+ key.objectid = head->bytenr;
+ key.offset = head->num_bytes;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ goto again;
+ }
+ } else {
+ err = -EIO;
+ goto out;
+ }
+ }
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size(leaf, path->slots[0]);
+
+ if (unlikely(item_size < sizeof(*ei))) {
+ err = -EINVAL;
+ btrfs_print_v0_err(fs_info);
+ btrfs_abort_transaction(trans, err);
+ goto out;
+ }
+
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ __run_delayed_extent_op(extent_op, leaf, ei);
+
+ btrfs_mark_buffer_dirty(leaf);
+out:
+ btrfs_free_path(path);
+ return err;
+}
+
+static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_node *node,
+ struct btrfs_delayed_extent_op *extent_op,
+ int insert_reserved)
+{
+ int ret = 0;
+ struct btrfs_delayed_tree_ref *ref;
+ u64 parent = 0;
+ u64 ref_root = 0;
+
+ ref = btrfs_delayed_node_to_tree_ref(node);
+ trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
+
+ if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
+ parent = ref->parent;
+ ref_root = ref->root;
+
+ if (unlikely(node->ref_mod != 1)) {
+ btrfs_err(trans->fs_info,
+ "btree block %llu has %d references rather than 1: action %d ref_root %llu parent %llu",
+ node->bytenr, node->ref_mod, node->action, ref_root,
+ parent);
+ return -EUCLEAN;
+ }
+ if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
+ BUG_ON(!extent_op || !extent_op->update_flags);
+ ret = alloc_reserved_tree_block(trans, node, extent_op);
+ } else if (node->action == BTRFS_ADD_DELAYED_REF) {
+ ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
+ ref->level, 0, 1, extent_op);
+ } else if (node->action == BTRFS_DROP_DELAYED_REF) {
+ ret = __btrfs_free_extent(trans, node, parent, ref_root,
+ ref->level, 0, 1, extent_op);
+ } else {
+ BUG();
+ }
+ return ret;
+}
+
+/* helper function to actually process a single delayed ref entry */
+static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_node *node,
+ struct btrfs_delayed_extent_op *extent_op,
+ int insert_reserved)
+{
+ int ret = 0;
+
+ if (TRANS_ABORTED(trans)) {
+ if (insert_reserved)
+ btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
+ return 0;
+ }
+
+ if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
+ node->type == BTRFS_SHARED_BLOCK_REF_KEY)
+ ret = run_delayed_tree_ref(trans, node, extent_op,
+ insert_reserved);
+ else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
+ node->type == BTRFS_SHARED_DATA_REF_KEY)
+ ret = run_delayed_data_ref(trans, node, extent_op,
+ insert_reserved);
+ else
+ BUG();
+ if (ret && insert_reserved)
+ btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
+ if (ret < 0)
+ btrfs_err(trans->fs_info,
+"failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d",
+ node->bytenr, node->num_bytes, node->type,
+ node->action, node->ref_mod, ret);
+ return ret;
+}
+
+static inline struct btrfs_delayed_ref_node *
+select_delayed_ref(struct btrfs_delayed_ref_head *head)
+{
+ struct btrfs_delayed_ref_node *ref;
+
+ if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
+ return NULL;
+
+ /*
+ * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
+ * This is to prevent a ref count from going down to zero, which deletes
+ * the extent item from the extent tree, when there still are references
+ * to add, which would fail because they would not find the extent item.
+ */
+ if (!list_empty(&head->ref_add_list))
+ return list_first_entry(&head->ref_add_list,
+ struct btrfs_delayed_ref_node, add_list);
+
+ ref = rb_entry(rb_first_cached(&head->ref_tree),
+ struct btrfs_delayed_ref_node, ref_node);
+ ASSERT(list_empty(&ref->add_list));
+ return ref;
+}
+
+static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head)
+{
+ spin_lock(&delayed_refs->lock);
+ head->processing = 0;
+ delayed_refs->num_heads_ready++;
+ spin_unlock(&delayed_refs->lock);
+ btrfs_delayed_ref_unlock(head);
+}
+
+static struct btrfs_delayed_extent_op *cleanup_extent_op(
+ struct btrfs_delayed_ref_head *head)
+{
+ struct btrfs_delayed_extent_op *extent_op = head->extent_op;
+
+ if (!extent_op)
+ return NULL;
+
+ if (head->must_insert_reserved) {
+ head->extent_op = NULL;
+ btrfs_free_delayed_extent_op(extent_op);
+ return NULL;
+ }
+ return extent_op;
+}
+
+static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *head)
+{
+ struct btrfs_delayed_extent_op *extent_op;
+ int ret;
+
+ extent_op = cleanup_extent_op(head);
+ if (!extent_op)
+ return 0;
+ head->extent_op = NULL;
+ spin_unlock(&head->lock);
+ ret = run_delayed_extent_op(trans, head, extent_op);
+ btrfs_free_delayed_extent_op(extent_op);
+ return ret ? ret : 1;
+}
+
+void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head)
+{
+ int nr_items = 1; /* Dropping this ref head update. */
+
+ /*
+ * We had csum deletions accounted for in our delayed refs rsv, we need
+ * to drop the csum leaves for this update from our delayed_refs_rsv.
+ */
+ if (head->total_ref_mod < 0 && head->is_data) {
+ spin_lock(&delayed_refs->lock);
+ delayed_refs->pending_csums -= head->num_bytes;
+ spin_unlock(&delayed_refs->lock);
+ nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
+ }
+
+ btrfs_delayed_refs_rsv_release(fs_info, nr_items);
+}
+
+static int cleanup_ref_head(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *head)
+{
+
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ int ret;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+
+ ret = run_and_cleanup_extent_op(trans, head);
+ if (ret < 0) {
+ unselect_delayed_ref_head(delayed_refs, head);
+ btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
+ return ret;
+ } else if (ret) {
+ return ret;
+ }
+
+ /*
+ * Need to drop our head ref lock and re-acquire the delayed ref lock
+ * and then re-check to make sure nobody got added.
+ */
+ spin_unlock(&head->lock);
+ spin_lock(&delayed_refs->lock);
+ spin_lock(&head->lock);
+ if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
+ spin_unlock(&head->lock);
+ spin_unlock(&delayed_refs->lock);
+ return 1;
+ }
+ btrfs_delete_ref_head(delayed_refs, head);
+ spin_unlock(&head->lock);
+ spin_unlock(&delayed_refs->lock);
+
+ if (head->must_insert_reserved) {
+ btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
+ if (head->is_data) {
+ struct btrfs_root *csum_root;
+
+ csum_root = btrfs_csum_root(fs_info, head->bytenr);
+ ret = btrfs_del_csums(trans, csum_root, head->bytenr,
+ head->num_bytes);
+ }
+ }
+
+ btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
+
+ trace_run_delayed_ref_head(fs_info, head, 0);
+ btrfs_delayed_ref_unlock(head);
+ btrfs_put_delayed_ref_head(head);
+ return ret;
+}
+
+static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
+ struct btrfs_trans_handle *trans)
+{
+ struct btrfs_delayed_ref_root *delayed_refs =
+ &trans->transaction->delayed_refs;
+ struct btrfs_delayed_ref_head *head = NULL;
+ int ret;
+
+ spin_lock(&delayed_refs->lock);
+ head = btrfs_select_ref_head(delayed_refs);
+ if (!head) {
+ spin_unlock(&delayed_refs->lock);
+ return head;
+ }
+
+ /*
+ * Grab the lock that says we are going to process all the refs for
+ * this head
+ */
+ ret = btrfs_delayed_ref_lock(delayed_refs, head);
+ spin_unlock(&delayed_refs->lock);
+
+ /*
+ * We may have dropped the spin lock to get the head mutex lock, and
+ * that might have given someone else time to free the head. If that's
+ * true, it has been removed from our list and we can move on.
+ */
+ if (ret == -EAGAIN)
+ head = ERR_PTR(-EAGAIN);
+
+ return head;
+}
+
+static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *locked_ref,
+ unsigned long *run_refs)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_extent_op *extent_op;
+ struct btrfs_delayed_ref_node *ref;
+ int must_insert_reserved = 0;
+ int ret;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+
+ lockdep_assert_held(&locked_ref->mutex);
+ lockdep_assert_held(&locked_ref->lock);
+
+ while ((ref = select_delayed_ref(locked_ref))) {
+ if (ref->seq &&
+ btrfs_check_delayed_seq(fs_info, ref->seq)) {
+ spin_unlock(&locked_ref->lock);
+ unselect_delayed_ref_head(delayed_refs, locked_ref);
+ return -EAGAIN;
+ }
+
+ (*run_refs)++;
+ ref->in_tree = 0;
+ rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
+ RB_CLEAR_NODE(&ref->ref_node);
+ if (!list_empty(&ref->add_list))
+ list_del(&ref->add_list);
+ /*
+ * When we play the delayed ref, also correct the ref_mod on
+ * head
+ */
+ switch (ref->action) {
+ case BTRFS_ADD_DELAYED_REF:
+ case BTRFS_ADD_DELAYED_EXTENT:
+ locked_ref->ref_mod -= ref->ref_mod;
+ break;
+ case BTRFS_DROP_DELAYED_REF:
+ locked_ref->ref_mod += ref->ref_mod;
+ break;
+ default:
+ WARN_ON(1);
+ }
+ atomic_dec(&delayed_refs->num_entries);
+
+ /*
+ * Record the must_insert_reserved flag before we drop the
+ * spin lock.
+ */
+ must_insert_reserved = locked_ref->must_insert_reserved;
+ locked_ref->must_insert_reserved = 0;
+
+ extent_op = locked_ref->extent_op;
+ locked_ref->extent_op = NULL;
+ spin_unlock(&locked_ref->lock);
+
+ ret = run_one_delayed_ref(trans, ref, extent_op,
+ must_insert_reserved);
+
+ btrfs_free_delayed_extent_op(extent_op);
+ if (ret) {
+ unselect_delayed_ref_head(delayed_refs, locked_ref);
+ btrfs_put_delayed_ref(ref);
+ return ret;
+ }
+
+ btrfs_put_delayed_ref(ref);
+ cond_resched();
+
+ spin_lock(&locked_ref->lock);
+ btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
+ }
+
+ return 0;
+}
+
+/*
+ * Returns 0 on success or if called with an already aborted transaction.
+ * Returns -ENOMEM or -EIO on failure and will abort the transaction.
+ */
+static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+ unsigned long nr)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_head *locked_ref = NULL;
+ ktime_t start = ktime_get();
+ int ret;
+ unsigned long count = 0;
+ unsigned long actual_count = 0;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ do {
+ if (!locked_ref) {
+ locked_ref = btrfs_obtain_ref_head(trans);
+ if (IS_ERR_OR_NULL(locked_ref)) {
+ if (PTR_ERR(locked_ref) == -EAGAIN) {
+ continue;
+ } else {
+ break;
+ }
+ }
+ count++;
+ }
+ /*
+ * We need to try and merge add/drops of the same ref since we
+ * can run into issues with relocate dropping the implicit ref
+ * and then it being added back again before the drop can
+ * finish. If we merged anything we need to re-loop so we can
+ * get a good ref.
+ * Or we can get node references of the same type that weren't
+ * merged when created due to bumps in the tree mod seq, and
+ * we need to merge them to prevent adding an inline extent
+ * backref before dropping it (triggering a BUG_ON at
+ * insert_inline_extent_backref()).
+ */
+ spin_lock(&locked_ref->lock);
+ btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
+
+ ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
+ &actual_count);
+ if (ret < 0 && ret != -EAGAIN) {
+ /*
+ * Error, btrfs_run_delayed_refs_for_head already
+ * unlocked everything so just bail out
+ */
+ return ret;
+ } else if (!ret) {
+ /*
+ * Success, perform the usual cleanup of a processed
+ * head
+ */
+ ret = cleanup_ref_head(trans, locked_ref);
+ if (ret > 0 ) {
+ /* We dropped our lock, we need to loop. */
+ ret = 0;
+ continue;
+ } else if (ret) {
+ return ret;
+ }
+ }
+
+ /*
+ * Either success case or btrfs_run_delayed_refs_for_head
+ * returned -EAGAIN, meaning we need to select another head
+ */
+
+ locked_ref = NULL;
+ cond_resched();
+ } while ((nr != -1 && count < nr) || locked_ref);
+
+ /*
+ * We don't want to include ref heads since we can have empty ref heads
+ * and those will drastically skew our runtime down since we just do
+ * accounting, no actual extent tree updates.
+ */
+ if (actual_count > 0) {
+ u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
+ u64 avg;
+
+ /*
+ * We weigh the current average higher than our current runtime
+ * to avoid large swings in the average.
+ */
+ spin_lock(&delayed_refs->lock);
+ avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
+ fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */
+ spin_unlock(&delayed_refs->lock);
+ }
+ return 0;
+}
+
+#ifdef SCRAMBLE_DELAYED_REFS
+/*
+ * Normally delayed refs get processed in ascending bytenr order. This
+ * correlates in most cases to the order added. To expose dependencies on this
+ * order, we start to process the tree in the middle instead of the beginning
+ */
+static u64 find_middle(struct rb_root *root)
+{
+ struct rb_node *n = root->rb_node;
+ struct btrfs_delayed_ref_node *entry;
+ int alt = 1;
+ u64 middle;
+ u64 first = 0, last = 0;
+
+ n = rb_first(root);
+ if (n) {
+ entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+ first = entry->bytenr;
+ }
+ n = rb_last(root);
+ if (n) {
+ entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+ last = entry->bytenr;
+ }
+ n = root->rb_node;
+
+ while (n) {
+ entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+ WARN_ON(!entry->in_tree);
+
+ middle = entry->bytenr;
+
+ if (alt)
+ n = n->rb_left;
+ else
+ n = n->rb_right;
+
+ alt = 1 - alt;
+ }
+ return middle;
+}
+#endif
+
+/*
+ * this starts processing the delayed reference count updates and
+ * extent insertions we have queued up so far. count can be
+ * 0, which means to process everything in the tree at the start
+ * of the run (but not newly added entries), or it can be some target
+ * number you'd like to process.
+ *
+ * Returns 0 on success or if called with an aborted transaction
+ * Returns <0 on error and aborts the transaction
+ */
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+ unsigned long count)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct rb_node *node;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_head *head;
+ int ret;
+ int run_all = count == (unsigned long)-1;
+
+ /* We'll clean this up in btrfs_cleanup_transaction */
+ if (TRANS_ABORTED(trans))
+ return 0;
+
+ if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
+ return 0;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ if (count == 0)
+ count = delayed_refs->num_heads_ready;
+
+again:
+#ifdef SCRAMBLE_DELAYED_REFS
+ delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
+#endif
+ ret = __btrfs_run_delayed_refs(trans, count);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+
+ if (run_all) {
+ btrfs_create_pending_block_groups(trans);
+
+ spin_lock(&delayed_refs->lock);
+ node = rb_first_cached(&delayed_refs->href_root);
+ if (!node) {
+ spin_unlock(&delayed_refs->lock);
+ goto out;
+ }
+ head = rb_entry(node, struct btrfs_delayed_ref_head,
+ href_node);
+ refcount_inc(&head->refs);
+ spin_unlock(&delayed_refs->lock);
+
+ /* Mutex was contended, block until it's released and retry. */
+ mutex_lock(&head->mutex);
+ mutex_unlock(&head->mutex);
+
+ btrfs_put_delayed_ref_head(head);
+ cond_resched();
+ goto again;
+ }
+out:
+ return 0;
+}
+
+int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
+ struct extent_buffer *eb, u64 flags,
+ int level)
+{
+ struct btrfs_delayed_extent_op *extent_op;
+ int ret;
+
+ extent_op = btrfs_alloc_delayed_extent_op();
+ if (!extent_op)
+ return -ENOMEM;
+
+ extent_op->flags_to_set = flags;
+ extent_op->update_flags = true;
+ extent_op->update_key = false;
+ extent_op->level = level;
+
+ ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
+ if (ret)
+ btrfs_free_delayed_extent_op(extent_op);
+ return ret;
+}
+
+static noinline int check_delayed_ref(struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 objectid, u64 offset, u64 bytenr)
+{
+ struct btrfs_delayed_ref_head *head;
+ struct btrfs_delayed_ref_node *ref;
+ struct btrfs_delayed_data_ref *data_ref;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_transaction *cur_trans;
+ struct rb_node *node;
+ int ret = 0;
+
+ spin_lock(&root->fs_info->trans_lock);
+ cur_trans = root->fs_info->running_transaction;
+ if (cur_trans)
+ refcount_inc(&cur_trans->use_count);
+ spin_unlock(&root->fs_info->trans_lock);
+ if (!cur_trans)
+ return 0;
+
+ delayed_refs = &cur_trans->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+ head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
+ if (!head) {
+ spin_unlock(&delayed_refs->lock);
+ btrfs_put_transaction(cur_trans);
+ return 0;
+ }
+
+ if (!mutex_trylock(&head->mutex)) {
+ if (path->nowait) {
+ spin_unlock(&delayed_refs->lock);
+ btrfs_put_transaction(cur_trans);
+ return -EAGAIN;
+ }
+
+ refcount_inc(&head->refs);
+ spin_unlock(&delayed_refs->lock);
+
+ btrfs_release_path(path);
+
+ /*
+ * Mutex was contended, block until it's released and let
+ * caller try again
+ */
+ mutex_lock(&head->mutex);
+ mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref_head(head);
+ btrfs_put_transaction(cur_trans);
+ return -EAGAIN;
+ }
+ spin_unlock(&delayed_refs->lock);
+
+ spin_lock(&head->lock);
+ /*
+ * XXX: We should replace this with a proper search function in the
+ * future.
+ */
+ for (node = rb_first_cached(&head->ref_tree); node;
+ node = rb_next(node)) {
+ ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
+ /* If it's a shared ref we know a cross reference exists */
+ if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
+ ret = 1;
+ break;
+ }
+
+ data_ref = btrfs_delayed_node_to_data_ref(ref);
+
+ /*
+ * If our ref doesn't match the one we're currently looking at
+ * then we have a cross reference.
+ */
+ if (data_ref->root != root->root_key.objectid ||
+ data_ref->objectid != objectid ||
+ data_ref->offset != offset) {
+ ret = 1;
+ break;
+ }
+ }
+ spin_unlock(&head->lock);
+ mutex_unlock(&head->mutex);
+ btrfs_put_transaction(cur_trans);
+ return ret;
+}
+
+static noinline int check_committed_ref(struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 objectid, u64 offset, u64 bytenr,
+ bool strict)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
+ struct extent_buffer *leaf;
+ struct btrfs_extent_data_ref *ref;
+ struct btrfs_extent_inline_ref *iref;
+ struct btrfs_extent_item *ei;
+ struct btrfs_key key;
+ u32 item_size;
+ int type;
+ int ret;
+
+ key.objectid = bytenr;
+ key.offset = (u64)-1;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+
+ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ BUG_ON(ret == 0); /* Corruption */
+
+ ret = -ENOENT;
+ if (path->slots[0] == 0)
+ goto out;
+
+ path->slots[0]--;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+ if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
+ goto out;
+
+ ret = 1;
+ item_size = btrfs_item_size(leaf, path->slots[0]);
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+
+ /* If extent item has more than 1 inline ref then it's shared */
+ if (item_size != sizeof(*ei) +
+ btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
+ goto out;
+
+ /*
+ * If extent created before last snapshot => it's shared unless the
+ * snapshot has been deleted. Use the heuristic if strict is false.
+ */
+ if (!strict &&
+ (btrfs_extent_generation(leaf, ei) <=
+ btrfs_root_last_snapshot(&root->root_item)))
+ goto out;
+
+ iref = (struct btrfs_extent_inline_ref *)(ei + 1);
+
+ /* If this extent has SHARED_DATA_REF then it's shared */
+ type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
+ if (type != BTRFS_EXTENT_DATA_REF_KEY)
+ goto out;
+
+ ref = (struct btrfs_extent_data_ref *)(&iref->offset);
+ if (btrfs_extent_refs(leaf, ei) !=
+ btrfs_extent_data_ref_count(leaf, ref) ||
+ btrfs_extent_data_ref_root(leaf, ref) !=
+ root->root_key.objectid ||
+ btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
+ btrfs_extent_data_ref_offset(leaf, ref) != offset)
+ goto out;
+
+ ret = 0;
+out:
+ return ret;
+}
+
+int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
+ u64 bytenr, bool strict, struct btrfs_path *path)
+{
+ int ret;
+
+ do {
+ ret = check_committed_ref(root, path, objectid,
+ offset, bytenr, strict);
+ if (ret && ret != -ENOENT)
+ goto out;
+
+ ret = check_delayed_ref(root, path, objectid, offset, bytenr);
+ } while (ret == -EAGAIN);
+
+out:
+ btrfs_release_path(path);
+ if (btrfs_is_data_reloc_root(root))
+ WARN_ON(ret > 0);
+ return ret;
+}
+
+static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ int full_backref, int inc)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 bytenr;
+ u64 num_bytes;
+ u64 parent;
+ u64 ref_root;
+ u32 nritems;
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_ref generic_ref = { 0 };
+ bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
+ int i;
+ int action;
+ int level;
+ int ret = 0;
+
+ if (btrfs_is_testing(fs_info))
+ return 0;
+
+ ref_root = btrfs_header_owner(buf);
+ nritems = btrfs_header_nritems(buf);
+ level = btrfs_header_level(buf);
+
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0)
+ return 0;
+
+ if (full_backref)
+ parent = buf->start;
+ else
+ parent = 0;
+ if (inc)
+ action = BTRFS_ADD_DELAYED_REF;
+ else
+ action = BTRFS_DROP_DELAYED_REF;
+
+ for (i = 0; i < nritems; i++) {
+ if (level == 0) {
+ btrfs_item_key_to_cpu(buf, &key, i);
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+ fi = btrfs_item_ptr(buf, i,
+ struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(buf, fi) ==
+ BTRFS_FILE_EXTENT_INLINE)
+ continue;
+ bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+ if (bytenr == 0)
+ continue;
+
+ num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
+ key.offset -= btrfs_file_extent_offset(buf, fi);
+ btrfs_init_generic_ref(&generic_ref, action, bytenr,
+ num_bytes, parent);
+ btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
+ key.offset, root->root_key.objectid,
+ for_reloc);
+ if (inc)
+ ret = btrfs_inc_extent_ref(trans, &generic_ref);
+ else
+ ret = btrfs_free_extent(trans, &generic_ref);
+ if (ret)
+ goto fail;
+ } else {
+ bytenr = btrfs_node_blockptr(buf, i);
+ num_bytes = fs_info->nodesize;
+ btrfs_init_generic_ref(&generic_ref, action, bytenr,
+ num_bytes, parent);
+ btrfs_init_tree_ref(&generic_ref, level - 1, ref_root,
+ root->root_key.objectid, for_reloc);
+ if (inc)
+ ret = btrfs_inc_extent_ref(trans, &generic_ref);
+ else
+ ret = btrfs_free_extent(trans, &generic_ref);
+ if (ret)
+ goto fail;
+ }
+ }
+ return 0;
+fail:
+ return ret;
+}
+
+int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct extent_buffer *buf, int full_backref)
+{
+ return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
+}
+
+int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct extent_buffer *buf, int full_backref)
+{
+ return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
+}
+
+static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 flags;
+ u64 ret;
+
+ if (data)
+ flags = BTRFS_BLOCK_GROUP_DATA;
+ else if (root == fs_info->chunk_root)
+ flags = BTRFS_BLOCK_GROUP_SYSTEM;
+ else
+ flags = BTRFS_BLOCK_GROUP_METADATA;
+
+ ret = btrfs_get_alloc_profile(fs_info, flags);
+ return ret;
+}
+
+static u64 first_logical_byte(struct btrfs_fs_info *fs_info)
+{
+ struct rb_node *leftmost;
+ u64 bytenr = 0;
+
+ read_lock(&fs_info->block_group_cache_lock);
+ /* Get the block group with the lowest logical start address. */
+ leftmost = rb_first_cached(&fs_info->block_group_cache_tree);
+ if (leftmost) {
+ struct btrfs_block_group *bg;
+
+ bg = rb_entry(leftmost, struct btrfs_block_group, cache_node);
+ bytenr = bg->start;
+ }
+ read_unlock(&fs_info->block_group_cache_lock);
+
+ return bytenr;
+}
+
+static int pin_down_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *cache,
+ u64 bytenr, u64 num_bytes, int reserved)
+{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ cache->pinned += num_bytes;
+ btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
+ num_bytes);
+ if (reserved) {
+ cache->reserved -= num_bytes;
+ cache->space_info->bytes_reserved -= num_bytes;
+ }
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+
+ set_extent_dirty(&trans->transaction->pinned_extents, bytenr,
+ bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
+ return 0;
+}
+
+int btrfs_pin_extent(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes, int reserved)
+{
+ struct btrfs_block_group *cache;
+
+ cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
+ BUG_ON(!cache); /* Logic error */
+
+ pin_down_extent(trans, cache, bytenr, num_bytes, reserved);
+
+ btrfs_put_block_group(cache);
+ return 0;
+}
+
+/*
+ * this function must be called within transaction
+ */
+int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes)
+{
+ struct btrfs_block_group *cache;
+ int ret;
+
+ cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
+ if (!cache)
+ return -EINVAL;
+
+ /*
+ * Fully cache the free space first so that our pin removes the free space
+ * from the cache.
+ */
+ ret = btrfs_cache_block_group(cache, true);
+ if (ret)
+ goto out;
+
+ pin_down_extent(trans, cache, bytenr, num_bytes, 0);
+
+ /* remove us from the free space cache (if we're there at all) */
+ ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
+out:
+ btrfs_put_block_group(cache);
+ return ret;
+}
+
+static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
+ u64 start, u64 num_bytes)
+{
+ int ret;
+ struct btrfs_block_group *block_group;
+
+ block_group = btrfs_lookup_block_group(fs_info, start);
+ if (!block_group)
+ return -EINVAL;
+
+ ret = btrfs_cache_block_group(block_group, true);
+ if (ret)
+ goto out;
+
+ ret = btrfs_remove_free_space(block_group, start, num_bytes);
+out:
+ btrfs_put_block_group(block_group);
+ return ret;
+}
+
+int btrfs_exclude_logged_extents(struct extent_buffer *eb)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct btrfs_file_extent_item *item;
+ struct btrfs_key key;
+ int found_type;
+ int i;
+ int ret = 0;
+
+ if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
+ return 0;
+
+ for (i = 0; i < btrfs_header_nritems(eb); i++) {
+ btrfs_item_key_to_cpu(eb, &key, i);
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+ item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
+ found_type = btrfs_file_extent_type(eb, item);
+ if (found_type == BTRFS_FILE_EXTENT_INLINE)
+ continue;
+ if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
+ continue;
+ key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
+ key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
+ ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static void
+btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
+{
+ atomic_inc(&bg->reservations);
+}
+
+/*
+ * Returns the free cluster for the given space info and sets empty_cluster to
+ * what it should be based on the mount options.
+ */
+static struct btrfs_free_cluster *
+fetch_cluster_info(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info, u64 *empty_cluster)
+{
+ struct btrfs_free_cluster *ret = NULL;
+
+ *empty_cluster = 0;
+ if (btrfs_mixed_space_info(space_info))
+ return ret;
+
+ if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
+ ret = &fs_info->meta_alloc_cluster;
+ if (btrfs_test_opt(fs_info, SSD))
+ *empty_cluster = SZ_2M;
+ else
+ *empty_cluster = SZ_64K;
+ } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
+ btrfs_test_opt(fs_info, SSD_SPREAD)) {
+ *empty_cluster = SZ_2M;
+ ret = &fs_info->data_alloc_cluster;
+ }
+
+ return ret;
+}
+
+static int unpin_extent_range(struct btrfs_fs_info *fs_info,
+ u64 start, u64 end,
+ const bool return_free_space)
+{
+ struct btrfs_block_group *cache = NULL;
+ struct btrfs_space_info *space_info;
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ struct btrfs_free_cluster *cluster = NULL;
+ u64 len;
+ u64 total_unpinned = 0;
+ u64 empty_cluster = 0;
+ bool readonly;
+
+ while (start <= end) {
+ readonly = false;
+ if (!cache ||
+ start >= cache->start + cache->length) {
+ if (cache)
+ btrfs_put_block_group(cache);
+ total_unpinned = 0;
+ cache = btrfs_lookup_block_group(fs_info, start);
+ BUG_ON(!cache); /* Logic error */
+
+ cluster = fetch_cluster_info(fs_info,
+ cache->space_info,
+ &empty_cluster);
+ empty_cluster <<= 1;
+ }
+
+ len = cache->start + cache->length - start;
+ len = min(len, end + 1 - start);
+
+ if (return_free_space)
+ btrfs_add_free_space(cache, start, len);
+
+ start += len;
+ total_unpinned += len;
+ space_info = cache->space_info;
+
+ /*
+ * If this space cluster has been marked as fragmented and we've
+ * unpinned enough in this block group to potentially allow a
+ * cluster to be created inside of it go ahead and clear the
+ * fragmented check.
+ */
+ if (cluster && cluster->fragmented &&
+ total_unpinned > empty_cluster) {
+ spin_lock(&cluster->lock);
+ cluster->fragmented = 0;
+ spin_unlock(&cluster->lock);
+ }
+
+ spin_lock(&space_info->lock);
+ spin_lock(&cache->lock);
+ cache->pinned -= len;
+ btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
+ space_info->max_extent_size = 0;
+ if (cache->ro) {
+ space_info->bytes_readonly += len;
+ readonly = true;
+ } else if (btrfs_is_zoned(fs_info)) {
+ /* Need reset before reusing in a zoned block group */
+ space_info->bytes_zone_unusable += len;
+ readonly = true;
+ }
+ spin_unlock(&cache->lock);
+ if (!readonly && return_free_space &&
+ global_rsv->space_info == space_info) {
+ spin_lock(&global_rsv->lock);
+ if (!global_rsv->full) {
+ u64 to_add = min(len, global_rsv->size -
+ global_rsv->reserved);
+
+ global_rsv->reserved += to_add;
+ btrfs_space_info_update_bytes_may_use(fs_info,
+ space_info, to_add);
+ if (global_rsv->reserved >= global_rsv->size)
+ global_rsv->full = 1;
+ len -= to_add;
+ }
+ spin_unlock(&global_rsv->lock);
+ }
+ /* Add to any tickets we may have */
+ if (!readonly && return_free_space && len)
+ btrfs_try_granting_tickets(fs_info, space_info);
+ spin_unlock(&space_info->lock);
+ }
+
+ if (cache)
+ btrfs_put_block_group(cache);
+ return 0;
+}
+
+int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_block_group *block_group, *tmp;
+ struct list_head *deleted_bgs;
+ struct extent_io_tree *unpin;
+ u64 start;
+ u64 end;
+ int ret;
+
+ unpin = &trans->transaction->pinned_extents;
+
+ while (!TRANS_ABORTED(trans)) {
+ struct extent_state *cached_state = NULL;
+
+ mutex_lock(&fs_info->unused_bg_unpin_mutex);
+ ret = find_first_extent_bit(unpin, 0, &start, &end,
+ EXTENT_DIRTY, &cached_state);
+ if (ret) {
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+ break;
+ }
+
+ if (btrfs_test_opt(fs_info, DISCARD_SYNC))
+ ret = btrfs_discard_extent(fs_info, start,
+ end + 1 - start, NULL);
+
+ clear_extent_dirty(unpin, start, end, &cached_state);
+ unpin_extent_range(fs_info, start, end, true);
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+ free_extent_state(cached_state);
+ cond_resched();
+ }
+
+ if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
+ btrfs_discard_calc_delay(&fs_info->discard_ctl);
+ btrfs_discard_schedule_work(&fs_info->discard_ctl, true);
+ }
+
+ /*
+ * Transaction is finished. We don't need the lock anymore. We
+ * do need to clean up the block groups in case of a transaction
+ * abort.
+ */
+ deleted_bgs = &trans->transaction->deleted_bgs;
+ list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
+ u64 trimmed = 0;
+
+ ret = -EROFS;
+ if (!TRANS_ABORTED(trans))
+ ret = btrfs_discard_extent(fs_info,
+ block_group->start,
+ block_group->length,
+ &trimmed);
+
+ list_del_init(&block_group->bg_list);
+ btrfs_unfreeze_block_group(block_group);
+ btrfs_put_block_group(block_group);
+
+ if (ret) {
+ const char *errstr = btrfs_decode_error(ret);
+ btrfs_warn(fs_info,
+ "discard failed while removing blockgroup: errno=%d %s",
+ ret, errstr);
+ }
+ }
+
+ return 0;
+}
+
+static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes, bool is_data)
+{
+ int ret;
+
+ if (is_data) {
+ struct btrfs_root *csum_root;
+
+ csum_root = btrfs_csum_root(trans->fs_info, bytenr);
+ ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+ }
+
+ ret = add_to_free_space_tree(trans, bytenr, num_bytes);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+
+ ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+
+ return ret;
+}
+
+/*
+ * Drop one or more refs of @node.
+ *
+ * 1. Locate the extent refs.
+ * It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item.
+ * Locate it, then reduce the refs number or remove the ref line completely.
+ *
+ * 2. Update the refs count in EXTENT/METADATA_ITEM
+ *
+ * Inline backref case:
+ *
+ * in extent tree we have:
+ *
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
+ * refs 2 gen 6 flags DATA
+ * extent data backref root FS_TREE objectid 258 offset 0 count 1
+ * extent data backref root FS_TREE objectid 257 offset 0 count 1
+ *
+ * This function gets called with:
+ *
+ * node->bytenr = 13631488
+ * node->num_bytes = 1048576
+ * root_objectid = FS_TREE
+ * owner_objectid = 257
+ * owner_offset = 0
+ * refs_to_drop = 1
+ *
+ * Then we should get some like:
+ *
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
+ * refs 1 gen 6 flags DATA
+ * extent data backref root FS_TREE objectid 258 offset 0 count 1
+ *
+ * Keyed backref case:
+ *
+ * in extent tree we have:
+ *
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
+ * refs 754 gen 6 flags DATA
+ * [...]
+ * item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
+ * extent data backref root FS_TREE objectid 866 offset 0 count 1
+ *
+ * This function get called with:
+ *
+ * node->bytenr = 13631488
+ * node->num_bytes = 1048576
+ * root_objectid = FS_TREE
+ * owner_objectid = 866
+ * owner_offset = 0
+ * refs_to_drop = 1
+ *
+ * Then we should get some like:
+ *
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
+ * refs 753 gen 6 flags DATA
+ *
+ * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
+ */
+static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_node *node, u64 parent,
+ u64 root_objectid, u64 owner_objectid,
+ u64 owner_offset, int refs_to_drop,
+ struct btrfs_delayed_extent_op *extent_op)
+{
+ struct btrfs_fs_info *info = trans->fs_info;
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ struct btrfs_root *extent_root;
+ struct extent_buffer *leaf;
+ struct btrfs_extent_item *ei;
+ struct btrfs_extent_inline_ref *iref;
+ int ret;
+ int is_data;
+ int extent_slot = 0;
+ int found_extent = 0;
+ int num_to_del = 1;
+ u32 item_size;
+ u64 refs;
+ u64 bytenr = node->bytenr;
+ u64 num_bytes = node->num_bytes;
+ bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
+
+ extent_root = btrfs_extent_root(info, bytenr);
+ ASSERT(extent_root);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
+
+ if (!is_data && refs_to_drop != 1) {
+ btrfs_crit(info,
+"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
+ node->bytenr, refs_to_drop);
+ ret = -EINVAL;
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ if (is_data)
+ skinny_metadata = false;
+
+ ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
+ parent, root_objectid, owner_objectid,
+ owner_offset);
+ if (ret == 0) {
+ /*
+ * Either the inline backref or the SHARED_DATA_REF/
+ * SHARED_BLOCK_REF is found
+ *
+ * Here is a quick path to locate EXTENT/METADATA_ITEM.
+ * It's possible the EXTENT/METADATA_ITEM is near current slot.
+ */
+ extent_slot = path->slots[0];
+ while (extent_slot >= 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ extent_slot);
+ if (key.objectid != bytenr)
+ break;
+ if (key.type == BTRFS_EXTENT_ITEM_KEY &&
+ key.offset == num_bytes) {
+ found_extent = 1;
+ break;
+ }
+ if (key.type == BTRFS_METADATA_ITEM_KEY &&
+ key.offset == owner_objectid) {
+ found_extent = 1;
+ break;
+ }
+
+ /* Quick path didn't find the EXTEMT/METADATA_ITEM */
+ if (path->slots[0] - extent_slot > 5)
+ break;
+ extent_slot--;
+ }
+
+ if (!found_extent) {
+ if (iref) {
+ btrfs_crit(info,
+"invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref");
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
+ /* Must be SHARED_* item, remove the backref first */
+ ret = remove_extent_backref(trans, extent_root, path,
+ NULL, refs_to_drop, is_data);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ btrfs_release_path(path);
+
+ /* Slow path to locate EXTENT/METADATA_ITEM */
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+
+ if (!is_data && skinny_metadata) {
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ key.offset = owner_objectid;
+ }
+
+ ret = btrfs_search_slot(trans, extent_root,
+ &key, path, -1, 1);
+ if (ret > 0 && skinny_metadata && path->slots[0]) {
+ /*
+ * Couldn't find our skinny metadata item,
+ * see if we have ye olde extent item.
+ */
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0]);
+ if (key.objectid == bytenr &&
+ key.type == BTRFS_EXTENT_ITEM_KEY &&
+ key.offset == num_bytes)
+ ret = 0;
+ }
+
+ if (ret > 0 && skinny_metadata) {
+ skinny_metadata = false;
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+ btrfs_release_path(path);
+ ret = btrfs_search_slot(trans, extent_root,
+ &key, path, -1, 1);
+ }
+
+ if (ret) {
+ btrfs_err(info,
+ "umm, got %d back from search, was looking for %llu",
+ ret, bytenr);
+ if (ret > 0)
+ btrfs_print_leaf(path->nodes[0]);
+ }
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ extent_slot = path->slots[0];
+ }
+ } else if (WARN_ON(ret == -ENOENT)) {
+ btrfs_print_leaf(path->nodes[0]);
+ btrfs_err(info,
+ "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu",
+ bytenr, parent, root_objectid, owner_objectid,
+ owner_offset);
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ } else {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size(leaf, extent_slot);
+ if (unlikely(item_size < sizeof(*ei))) {
+ ret = -EINVAL;
+ btrfs_print_v0_err(info);
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ ei = btrfs_item_ptr(leaf, extent_slot,
+ struct btrfs_extent_item);
+ if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
+ key.type == BTRFS_EXTENT_ITEM_KEY) {
+ struct btrfs_tree_block_info *bi;
+ if (item_size < sizeof(*ei) + sizeof(*bi)) {
+ btrfs_crit(info,
+"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %zu",
+ key.objectid, key.type, key.offset,
+ owner_objectid, item_size,
+ sizeof(*ei) + sizeof(*bi));
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
+ bi = (struct btrfs_tree_block_info *)(ei + 1);
+ WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
+ }
+
+ refs = btrfs_extent_refs(leaf, ei);
+ if (refs < refs_to_drop) {
+ btrfs_crit(info,
+ "trying to drop %d refs but we only have %llu for bytenr %llu",
+ refs_to_drop, refs, bytenr);
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
+ refs -= refs_to_drop;
+
+ if (refs > 0) {
+ if (extent_op)
+ __run_delayed_extent_op(extent_op, leaf, ei);
+ /*
+ * In the case of inline back ref, reference count will
+ * be updated by remove_extent_backref
+ */
+ if (iref) {
+ if (!found_extent) {
+ btrfs_crit(info,
+"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found");
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
+ } else {
+ btrfs_set_extent_refs(leaf, ei, refs);
+ btrfs_mark_buffer_dirty(leaf);
+ }
+ if (found_extent) {
+ ret = remove_extent_backref(trans, extent_root, path,
+ iref, refs_to_drop, is_data);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ }
+ } else {
+ /* In this branch refs == 1 */
+ if (found_extent) {
+ if (is_data && refs_to_drop !=
+ extent_data_ref_count(path, iref)) {
+ btrfs_crit(info,
+ "invalid refs_to_drop, current refs %u refs_to_drop %u",
+ extent_data_ref_count(path, iref),
+ refs_to_drop);
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
+ if (iref) {
+ if (path->slots[0] != extent_slot) {
+ btrfs_crit(info,
+"invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref",
+ key.objectid, key.type,
+ key.offset);
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
+ } else {
+ /*
+ * No inline ref, we must be at SHARED_* item,
+ * And it's single ref, it must be:
+ * | extent_slot ||extent_slot + 1|
+ * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
+ */
+ if (path->slots[0] != extent_slot + 1) {
+ btrfs_crit(info,
+ "invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM");
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
+ path->slots[0] = extent_slot;
+ num_to_del = 2;
+ }
+ }
+
+ ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
+ num_to_del);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ btrfs_release_path(path);
+
+ ret = do_free_extent_accounting(trans, bytenr, num_bytes, is_data);
+ }
+ btrfs_release_path(path);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+err_dump:
+ /*
+ * Leaf dump can take up a lot of log buffer, so we only do full leaf
+ * dump for debug build.
+ */
+ if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
+ btrfs_crit(info, "path->slots[0]=%d extent_slot=%d",
+ path->slots[0], extent_slot);
+ btrfs_print_leaf(path->nodes[0]);
+ }
+
+ btrfs_free_path(path);
+ return -EUCLEAN;
+}
+
+/*
+ * when we free an block, it is possible (and likely) that we free the last
+ * delayed ref for that extent as well. This searches the delayed ref tree for
+ * a given extent, and if there are no other delayed refs to be processed, it
+ * removes it from the tree.
+ */
+static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
+ u64 bytenr)
+{
+ struct btrfs_delayed_ref_head *head;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ int ret = 0;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+ head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
+ if (!head)
+ goto out_delayed_unlock;
+
+ spin_lock(&head->lock);
+ if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
+ goto out;
+
+ if (cleanup_extent_op(head) != NULL)
+ goto out;
+
+ /*
+ * waiting for the lock here would deadlock. If someone else has it
+ * locked they are already in the process of dropping it anyway
+ */
+ if (!mutex_trylock(&head->mutex))
+ goto out;
+
+ btrfs_delete_ref_head(delayed_refs, head);
+ head->processing = 0;
+
+ spin_unlock(&head->lock);
+ spin_unlock(&delayed_refs->lock);
+
+ BUG_ON(head->extent_op);
+ if (head->must_insert_reserved)
+ ret = 1;
+
+ btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
+ mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref_head(head);
+ return ret;
+out:
+ spin_unlock(&head->lock);
+
+out_delayed_unlock:
+ spin_unlock(&delayed_refs->lock);
+ return 0;
+}
+
+void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+ u64 root_id,
+ struct extent_buffer *buf,
+ u64 parent, int last_ref)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_ref generic_ref = { 0 };
+ int ret;
+
+ btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
+ buf->start, buf->len, parent);
+ btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
+ root_id, 0, false);
+
+ if (root_id != BTRFS_TREE_LOG_OBJECTID) {
+ btrfs_ref_tree_mod(fs_info, &generic_ref);
+ ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
+ BUG_ON(ret); /* -ENOMEM */
+ }
+
+ if (last_ref && btrfs_header_generation(buf) == trans->transid) {
+ struct btrfs_block_group *cache;
+ bool must_pin = false;
+
+ if (root_id != BTRFS_TREE_LOG_OBJECTID) {
+ ret = check_ref_cleanup(trans, buf->start);
+ if (!ret) {
+ btrfs_redirty_list_add(trans->transaction, buf);
+ goto out;
+ }
+ }
+
+ cache = btrfs_lookup_block_group(fs_info, buf->start);
+
+ if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
+ pin_down_extent(trans, cache, buf->start, buf->len, 1);
+ btrfs_put_block_group(cache);
+ goto out;
+ }
+
+ /*
+ * If there are tree mod log users we may have recorded mod log
+ * operations for this node. If we re-allocate this node we
+ * could replay operations on this node that happened when it
+ * existed in a completely different root. For example if it
+ * was part of root A, then was reallocated to root B, and we
+ * are doing a btrfs_old_search_slot(root b), we could replay
+ * operations that happened when the block was part of root A,
+ * giving us an inconsistent view of the btree.
+ *
+ * We are safe from races here because at this point no other
+ * node or root points to this extent buffer, so if after this
+ * check a new tree mod log user joins we will not have an
+ * existing log of operations on this node that we have to
+ * contend with.
+ */
+ if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
+ must_pin = true;
+
+ if (must_pin || btrfs_is_zoned(fs_info)) {
+ btrfs_redirty_list_add(trans->transaction, buf);
+ pin_down_extent(trans, cache, buf->start, buf->len, 1);
+ btrfs_put_block_group(cache);
+ goto out;
+ }
+
+ WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
+
+ btrfs_add_free_space(cache, buf->start, buf->len);
+ btrfs_free_reserved_bytes(cache, buf->len, 0);
+ btrfs_put_block_group(cache);
+ trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
+ }
+out:
+ if (last_ref) {
+ /*
+ * Deleting the buffer, clear the corrupt flag since it doesn't
+ * matter anymore.
+ */
+ clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
+ }
+}
+
+/* Can return -ENOMEM */
+int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ int ret;
+
+ if (btrfs_is_testing(fs_info))
+ return 0;
+
+ /*
+ * tree log blocks never actually go into the extent allocation
+ * tree, just update pinning info and exit early.
+ */
+ if ((ref->type == BTRFS_REF_METADATA &&
+ ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
+ (ref->type == BTRFS_REF_DATA &&
+ ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) {
+ /* unlocks the pinned mutex */
+ btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
+ ret = 0;
+ } else if (ref->type == BTRFS_REF_METADATA) {
+ ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
+ } else {
+ ret = btrfs_add_delayed_data_ref(trans, ref, 0);
+ }
+
+ if (!((ref->type == BTRFS_REF_METADATA &&
+ ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
+ (ref->type == BTRFS_REF_DATA &&
+ ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)))
+ btrfs_ref_tree_mod(fs_info, ref);
+
+ return ret;
+}
+
+enum btrfs_loop_type {
+ LOOP_CACHING_NOWAIT,
+ LOOP_CACHING_WAIT,
+ LOOP_ALLOC_CHUNK,
+ LOOP_NO_EMPTY_SIZE,
+};
+
+static inline void
+btrfs_lock_block_group(struct btrfs_block_group *cache,
+ int delalloc)
+{
+ if (delalloc)
+ down_read(&cache->data_rwsem);
+}
+
+static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
+ int delalloc)
+{
+ btrfs_get_block_group(cache);
+ if (delalloc)
+ down_read(&cache->data_rwsem);
+}
+
+static struct btrfs_block_group *btrfs_lock_cluster(
+ struct btrfs_block_group *block_group,
+ struct btrfs_free_cluster *cluster,
+ int delalloc)
+ __acquires(&cluster->refill_lock)
+{
+ struct btrfs_block_group *used_bg = NULL;
+
+ spin_lock(&cluster->refill_lock);
+ while (1) {
+ used_bg = cluster->block_group;
+ if (!used_bg)
+ return NULL;
+
+ if (used_bg == block_group)
+ return used_bg;
+
+ btrfs_get_block_group(used_bg);
+
+ if (!delalloc)
+ return used_bg;
+
+ if (down_read_trylock(&used_bg->data_rwsem))
+ return used_bg;
+
+ spin_unlock(&cluster->refill_lock);
+
+ /* We should only have one-level nested. */
+ down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
+
+ spin_lock(&cluster->refill_lock);
+ if (used_bg == cluster->block_group)
+ return used_bg;
+
+ up_read(&used_bg->data_rwsem);
+ btrfs_put_block_group(used_bg);
+ }
+}
+
+static inline void
+btrfs_release_block_group(struct btrfs_block_group *cache,
+ int delalloc)
+{
+ if (delalloc)
+ up_read(&cache->data_rwsem);
+ btrfs_put_block_group(cache);
+}
+
+enum btrfs_extent_allocation_policy {
+ BTRFS_EXTENT_ALLOC_CLUSTERED,
+ BTRFS_EXTENT_ALLOC_ZONED,
+};
+
+/*
+ * Structure used internally for find_free_extent() function. Wraps needed
+ * parameters.
+ */
+struct find_free_extent_ctl {
+ /* Basic allocation info */
+ u64 ram_bytes;
+ u64 num_bytes;
+ u64 min_alloc_size;
+ u64 empty_size;
+ u64 flags;
+ int delalloc;
+
+ /* Where to start the search inside the bg */
+ u64 search_start;
+
+ /* For clustered allocation */
+ u64 empty_cluster;
+ struct btrfs_free_cluster *last_ptr;
+ bool use_cluster;
+
+ bool have_caching_bg;
+ bool orig_have_caching_bg;
+
+ /* Allocation is called for tree-log */
+ bool for_treelog;
+
+ /* Allocation is called for data relocation */
+ bool for_data_reloc;
+
+ /* RAID index, converted from flags */
+ int index;
+
+ /*
+ * Current loop number, check find_free_extent_update_loop() for details
+ */
+ int loop;
+
+ /*
+ * Whether we're refilling a cluster, if true we need to re-search
+ * current block group but don't try to refill the cluster again.
+ */
+ bool retry_clustered;
+
+ /*
+ * Whether we're updating free space cache, if true we need to re-search
+ * current block group but don't try updating free space cache again.
+ */
+ bool retry_unclustered;
+
+ /* If current block group is cached */
+ int cached;
+
+ /* Max contiguous hole found */
+ u64 max_extent_size;
+
+ /* Total free space from free space cache, not always contiguous */
+ u64 total_free_space;
+
+ /* Found result */
+ u64 found_offset;
+
+ /* Hint where to start looking for an empty space */
+ u64 hint_byte;
+
+ /* Allocation policy */
+ enum btrfs_extent_allocation_policy policy;
+};
+
+
+/*
+ * Helper function for find_free_extent().
+ *
+ * Return -ENOENT to inform caller that we need fallback to unclustered mode.
+ * Return -EAGAIN to inform caller that we need to re-search this block group
+ * Return >0 to inform caller that we find nothing
+ * Return 0 means we have found a location and set ffe_ctl->found_offset.
+ */
+static int find_free_extent_clustered(struct btrfs_block_group *bg,
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_block_group **cluster_bg_ret)
+{
+ struct btrfs_block_group *cluster_bg;
+ struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
+ u64 aligned_cluster;
+ u64 offset;
+ int ret;
+
+ cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
+ if (!cluster_bg)
+ goto refill_cluster;
+ if (cluster_bg != bg && (cluster_bg->ro ||
+ !block_group_bits(cluster_bg, ffe_ctl->flags)))
+ goto release_cluster;
+
+ offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
+ ffe_ctl->num_bytes, cluster_bg->start,
+ &ffe_ctl->max_extent_size);
+ if (offset) {
+ /* We have a block, we're done */
+ spin_unlock(&last_ptr->refill_lock);
+ trace_btrfs_reserve_extent_cluster(cluster_bg,
+ ffe_ctl->search_start, ffe_ctl->num_bytes);
+ *cluster_bg_ret = cluster_bg;
+ ffe_ctl->found_offset = offset;
+ return 0;
+ }
+ WARN_ON(last_ptr->block_group != cluster_bg);
+
+release_cluster:
+ /*
+ * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new clusters, so
+ * lets just skip it and let the allocator find whatever block it can
+ * find. If we reach this point, we will have tried the cluster
+ * allocator plenty of times and not have found anything, so we are
+ * likely way too fragmented for the clustering stuff to find anything.
+ *
+ * However, if the cluster is taken from the current block group,
+ * release the cluster first, so that we stand a better chance of
+ * succeeding in the unclustered allocation.
+ */
+ if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
+ spin_unlock(&last_ptr->refill_lock);
+ btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
+ return -ENOENT;
+ }
+
+ /* This cluster didn't work out, free it and start over */
+ btrfs_return_cluster_to_free_space(NULL, last_ptr);
+
+ if (cluster_bg != bg)
+ btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
+
+refill_cluster:
+ if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
+ spin_unlock(&last_ptr->refill_lock);
+ return -ENOENT;
+ }
+
+ aligned_cluster = max_t(u64,
+ ffe_ctl->empty_cluster + ffe_ctl->empty_size,
+ bg->full_stripe_len);
+ ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
+ ffe_ctl->num_bytes, aligned_cluster);
+ if (ret == 0) {
+ /* Now pull our allocation out of this cluster */
+ offset = btrfs_alloc_from_cluster(bg, last_ptr,
+ ffe_ctl->num_bytes, ffe_ctl->search_start,
+ &ffe_ctl->max_extent_size);
+ if (offset) {
+ /* We found one, proceed */
+ spin_unlock(&last_ptr->refill_lock);
+ trace_btrfs_reserve_extent_cluster(bg,
+ ffe_ctl->search_start,
+ ffe_ctl->num_bytes);
+ ffe_ctl->found_offset = offset;
+ return 0;
+ }
+ } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
+ !ffe_ctl->retry_clustered) {
+ spin_unlock(&last_ptr->refill_lock);
+
+ ffe_ctl->retry_clustered = true;
+ btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
+ ffe_ctl->empty_cluster + ffe_ctl->empty_size);
+ return -EAGAIN;
+ }
+ /*
+ * At this point we either didn't find a cluster or we weren't able to
+ * allocate a block from our cluster. Free the cluster we've been
+ * trying to use, and go to the next block group.
+ */
+ btrfs_return_cluster_to_free_space(NULL, last_ptr);
+ spin_unlock(&last_ptr->refill_lock);
+ return 1;
+}
+
+/*
+ * Return >0 to inform caller that we find nothing
+ * Return 0 when we found an free extent and set ffe_ctrl->found_offset
+ * Return -EAGAIN to inform caller that we need to re-search this block group
+ */
+static int find_free_extent_unclustered(struct btrfs_block_group *bg,
+ struct find_free_extent_ctl *ffe_ctl)
+{
+ struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
+ u64 offset;
+
+ /*
+ * We are doing an unclustered allocation, set the fragmented flag so
+ * we don't bother trying to setup a cluster again until we get more
+ * space.
+ */
+ if (unlikely(last_ptr)) {
+ spin_lock(&last_ptr->lock);
+ last_ptr->fragmented = 1;
+ spin_unlock(&last_ptr->lock);
+ }
+ if (ffe_ctl->cached) {
+ struct btrfs_free_space_ctl *free_space_ctl;
+
+ free_space_ctl = bg->free_space_ctl;
+ spin_lock(&free_space_ctl->tree_lock);
+ if (free_space_ctl->free_space <
+ ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
+ ffe_ctl->empty_size) {
+ ffe_ctl->total_free_space = max_t(u64,
+ ffe_ctl->total_free_space,
+ free_space_ctl->free_space);
+ spin_unlock(&free_space_ctl->tree_lock);
+ return 1;
+ }
+ spin_unlock(&free_space_ctl->tree_lock);
+ }
+
+ offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
+ ffe_ctl->num_bytes, ffe_ctl->empty_size,
+ &ffe_ctl->max_extent_size);
+
+ /*
+ * If we didn't find a chunk, and we haven't failed on this block group
+ * before, and this block group is in the middle of caching and we are
+ * ok with waiting, then go ahead and wait for progress to be made, and
+ * set @retry_unclustered to true.
+ *
+ * If @retry_unclustered is true then we've already waited on this
+ * block group once and should move on to the next block group.
+ */
+ if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
+ ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
+ btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
+ ffe_ctl->empty_size);
+ ffe_ctl->retry_unclustered = true;
+ return -EAGAIN;
+ } else if (!offset) {
+ return 1;
+ }
+ ffe_ctl->found_offset = offset;
+ return 0;
+}
+
+static int do_allocation_clustered(struct btrfs_block_group *block_group,
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_block_group **bg_ret)
+{
+ int ret;
+
+ /* We want to try and use the cluster allocator, so lets look there */
+ if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
+ ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
+ if (ret >= 0 || ret == -EAGAIN)
+ return ret;
+ /* ret == -ENOENT case falls through */
+ }
+
+ return find_free_extent_unclustered(block_group, ffe_ctl);
+}
+
+/*
+ * Tree-log block group locking
+ * ============================
+ *
+ * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which
+ * indicates the starting address of a block group, which is reserved only
+ * for tree-log metadata.
+ *
+ * Lock nesting
+ * ============
+ *
+ * space_info::lock
+ * block_group::lock
+ * fs_info::treelog_bg_lock
+ */
+
+/*
+ * Simple allocator for sequential-only block group. It only allows sequential
+ * allocation. No need to play with trees. This function also reserves the
+ * bytes as in btrfs_add_reserved_bytes.
+ */
+static int do_allocation_zoned(struct btrfs_block_group *block_group,
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_block_group **bg_ret)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_space_info *space_info = block_group->space_info;
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ u64 start = block_group->start;
+ u64 num_bytes = ffe_ctl->num_bytes;
+ u64 avail;
+ u64 bytenr = block_group->start;
+ u64 log_bytenr;
+ u64 data_reloc_bytenr;
+ int ret = 0;
+ bool skip = false;
+
+ ASSERT(btrfs_is_zoned(block_group->fs_info));
+
+ /*
+ * Do not allow non-tree-log blocks in the dedicated tree-log block
+ * group, and vice versa.
+ */
+ spin_lock(&fs_info->treelog_bg_lock);
+ log_bytenr = fs_info->treelog_bg;
+ if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
+ (!ffe_ctl->for_treelog && bytenr == log_bytenr)))
+ skip = true;
+ spin_unlock(&fs_info->treelog_bg_lock);
+ if (skip)
+ return 1;
+
+ /*
+ * Do not allow non-relocation blocks in the dedicated relocation block
+ * group, and vice versa.
+ */
+ spin_lock(&fs_info->relocation_bg_lock);
+ data_reloc_bytenr = fs_info->data_reloc_bg;
+ if (data_reloc_bytenr &&
+ ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) ||
+ (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr)))
+ skip = true;
+ spin_unlock(&fs_info->relocation_bg_lock);
+ if (skip)
+ return 1;
+
+ /* Check RO and no space case before trying to activate it */
+ spin_lock(&block_group->lock);
+ if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) {
+ ret = 1;
+ /*
+ * May need to clear fs_info->{treelog,data_reloc}_bg.
+ * Return the error after taking the locks.
+ */
+ }
+ spin_unlock(&block_group->lock);
+
+ if (!ret && !btrfs_zone_activate(block_group)) {
+ ret = 1;
+ /*
+ * May need to clear fs_info->{treelog,data_reloc}_bg.
+ * Return the error after taking the locks.
+ */
+ }
+
+ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+ spin_lock(&fs_info->treelog_bg_lock);
+ spin_lock(&fs_info->relocation_bg_lock);
+
+ if (ret)
+ goto out;
+
+ ASSERT(!ffe_ctl->for_treelog ||
+ block_group->start == fs_info->treelog_bg ||
+ fs_info->treelog_bg == 0);
+ ASSERT(!ffe_ctl->for_data_reloc ||
+ block_group->start == fs_info->data_reloc_bg ||
+ fs_info->data_reloc_bg == 0);
+
+ if (block_group->ro ||
+ (!ffe_ctl->for_data_reloc &&
+ test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))) {
+ ret = 1;
+ goto out;
+ }
+
+ /*
+ * Do not allow currently using block group to be tree-log dedicated
+ * block group.
+ */
+ if (ffe_ctl->for_treelog && !fs_info->treelog_bg &&
+ (block_group->used || block_group->reserved)) {
+ ret = 1;
+ goto out;
+ }
+
+ /*
+ * Do not allow currently used block group to be the data relocation
+ * dedicated block group.
+ */
+ if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg &&
+ (block_group->used || block_group->reserved)) {
+ ret = 1;
+ goto out;
+ }
+
+ WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity);
+ avail = block_group->zone_capacity - block_group->alloc_offset;
+ if (avail < num_bytes) {
+ if (ffe_ctl->max_extent_size < avail) {
+ /*
+ * With sequential allocator, free space is always
+ * contiguous
+ */
+ ffe_ctl->max_extent_size = avail;
+ ffe_ctl->total_free_space = avail;
+ }
+ ret = 1;
+ goto out;
+ }
+
+ if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
+ fs_info->treelog_bg = block_group->start;
+
+ if (ffe_ctl->for_data_reloc) {
+ if (!fs_info->data_reloc_bg)
+ fs_info->data_reloc_bg = block_group->start;
+ /*
+ * Do not allow allocations from this block group, unless it is
+ * for data relocation. Compared to increasing the ->ro, setting
+ * the ->zoned_data_reloc_ongoing flag still allows nocow
+ * writers to come in. See btrfs_inc_nocow_writers().
+ *
+ * We need to disable an allocation to avoid an allocation of
+ * regular (non-relocation data) extent. With mix of relocation
+ * extents and regular extents, we can dispatch WRITE commands
+ * (for relocation extents) and ZONE APPEND commands (for
+ * regular extents) at the same time to the same zone, which
+ * easily break the write pointer.
+ *
+ * Also, this flag avoids this block group to be zone finished.
+ */
+ set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags);
+ }
+
+ ffe_ctl->found_offset = start + block_group->alloc_offset;
+ block_group->alloc_offset += num_bytes;
+ spin_lock(&ctl->tree_lock);
+ ctl->free_space -= num_bytes;
+ spin_unlock(&ctl->tree_lock);
+
+ /*
+ * We do not check if found_offset is aligned to stripesize. The
+ * address is anyway rewritten when using zone append writing.
+ */
+
+ ffe_ctl->search_start = ffe_ctl->found_offset;
+
+out:
+ if (ret && ffe_ctl->for_treelog)
+ fs_info->treelog_bg = 0;
+ if (ret && ffe_ctl->for_data_reloc)
+ fs_info->data_reloc_bg = 0;
+ spin_unlock(&fs_info->relocation_bg_lock);
+ spin_unlock(&fs_info->treelog_bg_lock);
+ spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
+ return ret;
+}
+
+static int do_allocation(struct btrfs_block_group *block_group,
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_block_group **bg_ret)
+{
+ switch (ffe_ctl->policy) {
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+ return do_allocation_clustered(block_group, ffe_ctl, bg_ret);
+ case BTRFS_EXTENT_ALLOC_ZONED:
+ return do_allocation_zoned(block_group, ffe_ctl, bg_ret);
+ default:
+ BUG();
+ }
+}
+
+static void release_block_group(struct btrfs_block_group *block_group,
+ struct find_free_extent_ctl *ffe_ctl,
+ int delalloc)
+{
+ switch (ffe_ctl->policy) {
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+ ffe_ctl->retry_clustered = false;
+ ffe_ctl->retry_unclustered = false;
+ break;
+ case BTRFS_EXTENT_ALLOC_ZONED:
+ /* Nothing to do */
+ break;
+ default:
+ BUG();
+ }
+
+ BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
+ ffe_ctl->index);
+ btrfs_release_block_group(block_group, delalloc);
+}
+
+static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_key *ins)
+{
+ struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
+
+ if (!ffe_ctl->use_cluster && last_ptr) {
+ spin_lock(&last_ptr->lock);
+ last_ptr->window_start = ins->objectid;
+ spin_unlock(&last_ptr->lock);
+ }
+}
+
+static void found_extent(struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_key *ins)
+{
+ switch (ffe_ctl->policy) {
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+ found_extent_clustered(ffe_ctl, ins);
+ break;
+ case BTRFS_EXTENT_ALLOC_ZONED:
+ /* Nothing to do */
+ break;
+ default:
+ BUG();
+ }
+}
+
+static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info,
+ struct find_free_extent_ctl *ffe_ctl)
+{
+ /* If we can activate new zone, just allocate a chunk and use it */
+ if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
+ return 0;
+
+ /*
+ * We already reached the max active zones. Try to finish one block
+ * group to make a room for a new block group. This is only possible
+ * for a data block group because btrfs_zone_finish() may need to wait
+ * for a running transaction which can cause a deadlock for metadata
+ * allocation.
+ */
+ if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
+ int ret = btrfs_zone_finish_one_bg(fs_info);
+
+ if (ret == 1)
+ return 0;
+ else if (ret < 0)
+ return ret;
+ }
+
+ /*
+ * If we have enough free space left in an already active block group
+ * and we can't activate any other zone now, do not allow allocating a
+ * new chunk and let find_free_extent() retry with a smaller size.
+ */
+ if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size)
+ return -ENOSPC;
+
+ /*
+ * Even min_alloc_size is not left in any block groups. Since we cannot
+ * activate a new block group, allocating it may not help. Let's tell a
+ * caller to try again and hope it progress something by writing some
+ * parts of the region. That is only possible for data block groups,
+ * where a part of the region can be written.
+ */
+ if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)
+ return -EAGAIN;
+
+ /*
+ * We cannot activate a new block group and no enough space left in any
+ * block groups. So, allocating a new block group may not help. But,
+ * there is nothing to do anyway, so let's go with it.
+ */
+ return 0;
+}
+
+static int can_allocate_chunk(struct btrfs_fs_info *fs_info,
+ struct find_free_extent_ctl *ffe_ctl)
+{
+ switch (ffe_ctl->policy) {
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+ return 0;
+ case BTRFS_EXTENT_ALLOC_ZONED:
+ return can_allocate_chunk_zoned(fs_info, ffe_ctl);
+ default:
+ BUG();
+ }
+}
+
+static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl)
+{
+ switch (ffe_ctl->policy) {
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+ /*
+ * If we can't allocate a new chunk we've already looped through
+ * at least once, move on to the NO_EMPTY_SIZE case.
+ */
+ ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
+ return 0;
+ case BTRFS_EXTENT_ALLOC_ZONED:
+ /* Give up here */
+ return -ENOSPC;
+ default:
+ BUG();
+ }
+}
+
+/*
+ * Return >0 means caller needs to re-search for free extent
+ * Return 0 means we have the needed free extent.
+ * Return <0 means we failed to locate any free extent.
+ */
+static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
+ struct btrfs_key *ins,
+ struct find_free_extent_ctl *ffe_ctl,
+ bool full_search)
+{
+ struct btrfs_root *root = fs_info->chunk_root;
+ int ret;
+
+ if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
+ ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
+ ffe_ctl->orig_have_caching_bg = true;
+
+ if (ins->objectid) {
+ found_extent(ffe_ctl, ins);
+ return 0;
+ }
+
+ if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
+ return 1;
+
+ ffe_ctl->index++;
+ if (ffe_ctl->index < BTRFS_NR_RAID_TYPES)
+ return 1;
+
+ /*
+ * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
+ * caching kthreads as we move along
+ * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
+ * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
+ * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
+ * again
+ */
+ if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
+ ffe_ctl->index = 0;
+ if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
+ /*
+ * We want to skip the LOOP_CACHING_WAIT step if we
+ * don't have any uncached bgs and we've already done a
+ * full search through.
+ */
+ if (ffe_ctl->orig_have_caching_bg || !full_search)
+ ffe_ctl->loop = LOOP_CACHING_WAIT;
+ else
+ ffe_ctl->loop = LOOP_ALLOC_CHUNK;
+ } else {
+ ffe_ctl->loop++;
+ }
+
+ if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
+ struct btrfs_trans_handle *trans;
+ int exist = 0;
+
+ /*Check if allocation policy allows to create a new chunk */
+ ret = can_allocate_chunk(fs_info, ffe_ctl);
+ if (ret)
+ return ret;
+
+ trans = current->journal_info;
+ if (trans)
+ exist = 1;
+ else
+ trans = btrfs_join_transaction(root);
+
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ return ret;
+ }
+
+ ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
+ CHUNK_ALLOC_FORCE_FOR_EXTENT);
+
+ /* Do not bail out on ENOSPC since we can do more. */
+ if (ret == -ENOSPC)
+ ret = chunk_allocation_failed(ffe_ctl);
+ else if (ret < 0)
+ btrfs_abort_transaction(trans, ret);
+ else
+ ret = 0;
+ if (!exist)
+ btrfs_end_transaction(trans);
+ if (ret)
+ return ret;
+ }
+
+ if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
+ if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED)
+ return -ENOSPC;
+
+ /*
+ * Don't loop again if we already have no empty_size and
+ * no empty_cluster.
+ */
+ if (ffe_ctl->empty_size == 0 &&
+ ffe_ctl->empty_cluster == 0)
+ return -ENOSPC;
+ ffe_ctl->empty_size = 0;
+ ffe_ctl->empty_cluster = 0;
+ }
+ return 1;
+ }
+ return -ENOSPC;
+}
+
+static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_space_info *space_info,
+ struct btrfs_key *ins)
+{
+ /*
+ * If our free space is heavily fragmented we may not be able to make
+ * big contiguous allocations, so instead of doing the expensive search
+ * for free space, simply return ENOSPC with our max_extent_size so we
+ * can go ahead and search for a more manageable chunk.
+ *
+ * If our max_extent_size is large enough for our allocation simply
+ * disable clustering since we will likely not be able to find enough
+ * space to create a cluster and induce latency trying.
+ */
+ if (space_info->max_extent_size) {
+ spin_lock(&space_info->lock);
+ if (space_info->max_extent_size &&
+ ffe_ctl->num_bytes > space_info->max_extent_size) {
+ ins->offset = space_info->max_extent_size;
+ spin_unlock(&space_info->lock);
+ return -ENOSPC;
+ } else if (space_info->max_extent_size) {
+ ffe_ctl->use_cluster = false;
+ }
+ spin_unlock(&space_info->lock);
+ }
+
+ ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info,
+ &ffe_ctl->empty_cluster);
+ if (ffe_ctl->last_ptr) {
+ struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
+
+ spin_lock(&last_ptr->lock);
+ if (last_ptr->block_group)
+ ffe_ctl->hint_byte = last_ptr->window_start;
+ if (last_ptr->fragmented) {
+ /*
+ * We still set window_start so we can keep track of the
+ * last place we found an allocation to try and save
+ * some time.
+ */
+ ffe_ctl->hint_byte = last_ptr->window_start;
+ ffe_ctl->use_cluster = false;
+ }
+ spin_unlock(&last_ptr->lock);
+ }
+
+ return 0;
+}
+
+static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
+ struct find_free_extent_ctl *ffe_ctl)
+{
+ if (ffe_ctl->for_treelog) {
+ spin_lock(&fs_info->treelog_bg_lock);
+ if (fs_info->treelog_bg)
+ ffe_ctl->hint_byte = fs_info->treelog_bg;
+ spin_unlock(&fs_info->treelog_bg_lock);
+ } else if (ffe_ctl->for_data_reloc) {
+ spin_lock(&fs_info->relocation_bg_lock);
+ if (fs_info->data_reloc_bg)
+ ffe_ctl->hint_byte = fs_info->data_reloc_bg;
+ spin_unlock(&fs_info->relocation_bg_lock);
+ } else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
+ struct btrfs_block_group *block_group;
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
+ /*
+ * No lock is OK here because avail is monotinically
+ * decreasing, and this is just a hint.
+ */
+ u64 avail = block_group->zone_capacity - block_group->alloc_offset;
+
+ if (block_group_bits(block_group, ffe_ctl->flags) &&
+ avail >= ffe_ctl->num_bytes) {
+ ffe_ctl->hint_byte = block_group->start;
+ break;
+ }
+ }
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+ }
+
+ return 0;
+}
+
+static int prepare_allocation(struct btrfs_fs_info *fs_info,
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_space_info *space_info,
+ struct btrfs_key *ins)
+{
+ switch (ffe_ctl->policy) {
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+ return prepare_allocation_clustered(fs_info, ffe_ctl,
+ space_info, ins);
+ case BTRFS_EXTENT_ALLOC_ZONED:
+ return prepare_allocation_zoned(fs_info, ffe_ctl);
+ default:
+ BUG();
+ }
+}
+
+/*
+ * walks the btree of allocated extents and find a hole of a given size.
+ * The key ins is changed to record the hole:
+ * ins->objectid == start position
+ * ins->flags = BTRFS_EXTENT_ITEM_KEY
+ * ins->offset == the size of the hole.
+ * Any available blocks before search_start are skipped.
+ *
+ * If there is no suitable free space, we will record the max size of
+ * the free space extent currently.
+ *
+ * The overall logic and call chain:
+ *
+ * find_free_extent()
+ * |- Iterate through all block groups
+ * | |- Get a valid block group
+ * | |- Try to do clustered allocation in that block group
+ * | |- Try to do unclustered allocation in that block group
+ * | |- Check if the result is valid
+ * | | |- If valid, then exit
+ * | |- Jump to next block group
+ * |
+ * |- Push harder to find free extents
+ * |- If not found, re-iterate all block groups
+ */
+static noinline int find_free_extent(struct btrfs_root *root,
+ struct btrfs_key *ins,
+ struct find_free_extent_ctl *ffe_ctl)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret = 0;
+ int cache_block_group_error = 0;
+ struct btrfs_block_group *block_group = NULL;
+ struct btrfs_space_info *space_info;
+ bool full_search = false;
+
+ WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize);
+
+ ffe_ctl->search_start = 0;
+ /* For clustered allocation */
+ ffe_ctl->empty_cluster = 0;
+ ffe_ctl->last_ptr = NULL;
+ ffe_ctl->use_cluster = true;
+ ffe_ctl->have_caching_bg = false;
+ ffe_ctl->orig_have_caching_bg = false;
+ ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags);
+ ffe_ctl->loop = 0;
+ /* For clustered allocation */
+ ffe_ctl->retry_clustered = false;
+ ffe_ctl->retry_unclustered = false;
+ ffe_ctl->cached = 0;
+ ffe_ctl->max_extent_size = 0;
+ ffe_ctl->total_free_space = 0;
+ ffe_ctl->found_offset = 0;
+ ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
+
+ if (btrfs_is_zoned(fs_info))
+ ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED;
+
+ ins->type = BTRFS_EXTENT_ITEM_KEY;
+ ins->objectid = 0;
+ ins->offset = 0;
+
+ trace_find_free_extent(root, ffe_ctl->num_bytes, ffe_ctl->empty_size,
+ ffe_ctl->flags);
+
+ space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags);
+ if (!space_info) {
+ btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags);
+ return -ENOSPC;
+ }
+
+ ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins);
+ if (ret < 0)
+ return ret;
+
+ ffe_ctl->search_start = max(ffe_ctl->search_start,
+ first_logical_byte(fs_info));
+ ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte);
+ if (ffe_ctl->search_start == ffe_ctl->hint_byte) {
+ block_group = btrfs_lookup_block_group(fs_info,
+ ffe_ctl->search_start);
+ /*
+ * we don't want to use the block group if it doesn't match our
+ * allocation bits, or if its not cached.
+ *
+ * However if we are re-searching with an ideal block group
+ * picked out then we don't care that the block group is cached.
+ */
+ if (block_group && block_group_bits(block_group, ffe_ctl->flags) &&
+ block_group->cached != BTRFS_CACHE_NO) {
+ down_read(&space_info->groups_sem);
+ if (list_empty(&block_group->list) ||
+ block_group->ro) {
+ /*
+ * someone is removing this block group,
+ * we can't jump into the have_block_group
+ * target because our list pointers are not
+ * valid
+ */
+ btrfs_put_block_group(block_group);
+ up_read(&space_info->groups_sem);
+ } else {
+ ffe_ctl->index = btrfs_bg_flags_to_raid_index(
+ block_group->flags);
+ btrfs_lock_block_group(block_group,
+ ffe_ctl->delalloc);
+ goto have_block_group;
+ }
+ } else if (block_group) {
+ btrfs_put_block_group(block_group);
+ }
+ }
+search:
+ ffe_ctl->have_caching_bg = false;
+ if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) ||
+ ffe_ctl->index == 0)
+ full_search = true;
+ down_read(&space_info->groups_sem);
+ list_for_each_entry(block_group,
+ &space_info->block_groups[ffe_ctl->index], list) {
+ struct btrfs_block_group *bg_ret;
+
+ /* If the block group is read-only, we can skip it entirely. */
+ if (unlikely(block_group->ro)) {
+ if (ffe_ctl->for_treelog)
+ btrfs_clear_treelog_bg(block_group);
+ if (ffe_ctl->for_data_reloc)
+ btrfs_clear_data_reloc_bg(block_group);
+ continue;
+ }
+
+ btrfs_grab_block_group(block_group, ffe_ctl->delalloc);
+ ffe_ctl->search_start = block_group->start;
+
+ /*
+ * this can happen if we end up cycling through all the
+ * raid types, but we want to make sure we only allocate
+ * for the proper type.
+ */
+ if (!block_group_bits(block_group, ffe_ctl->flags)) {
+ u64 extra = BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID1_MASK |
+ BTRFS_BLOCK_GROUP_RAID56_MASK |
+ BTRFS_BLOCK_GROUP_RAID10;
+
+ /*
+ * if they asked for extra copies and this block group
+ * doesn't provide them, bail. This does allow us to
+ * fill raid0 from raid1.
+ */
+ if ((ffe_ctl->flags & extra) && !(block_group->flags & extra))
+ goto loop;
+
+ /*
+ * This block group has different flags than we want.
+ * It's possible that we have MIXED_GROUP flag but no
+ * block group is mixed. Just skip such block group.
+ */
+ btrfs_release_block_group(block_group, ffe_ctl->delalloc);
+ continue;
+ }
+
+have_block_group:
+ ffe_ctl->cached = btrfs_block_group_done(block_group);
+ if (unlikely(!ffe_ctl->cached)) {
+ ffe_ctl->have_caching_bg = true;
+ ret = btrfs_cache_block_group(block_group, false);
+
+ /*
+ * If we get ENOMEM here or something else we want to
+ * try other block groups, because it may not be fatal.
+ * However if we can't find anything else we need to
+ * save our return here so that we return the actual
+ * error that caused problems, not ENOSPC.
+ */
+ if (ret < 0) {
+ if (!cache_block_group_error)
+ cache_block_group_error = ret;
+ ret = 0;
+ goto loop;
+ }
+ ret = 0;
+ }
+
+ if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) {
+ if (!cache_block_group_error)
+ cache_block_group_error = -EIO;
+ goto loop;
+ }
+
+ bg_ret = NULL;
+ ret = do_allocation(block_group, ffe_ctl, &bg_ret);
+ if (ret == 0) {
+ if (bg_ret && bg_ret != block_group) {
+ btrfs_release_block_group(block_group,
+ ffe_ctl->delalloc);
+ block_group = bg_ret;
+ }
+ } else if (ret == -EAGAIN) {
+ goto have_block_group;
+ } else if (ret > 0) {
+ goto loop;
+ }
+
+ /* Checks */
+ ffe_ctl->search_start = round_up(ffe_ctl->found_offset,
+ fs_info->stripesize);
+
+ /* move on to the next group */
+ if (ffe_ctl->search_start + ffe_ctl->num_bytes >
+ block_group->start + block_group->length) {
+ btrfs_add_free_space_unused(block_group,
+ ffe_ctl->found_offset,
+ ffe_ctl->num_bytes);
+ goto loop;
+ }
+
+ if (ffe_ctl->found_offset < ffe_ctl->search_start)
+ btrfs_add_free_space_unused(block_group,
+ ffe_ctl->found_offset,
+ ffe_ctl->search_start - ffe_ctl->found_offset);
+
+ ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes,
+ ffe_ctl->num_bytes,
+ ffe_ctl->delalloc);
+ if (ret == -EAGAIN) {
+ btrfs_add_free_space_unused(block_group,
+ ffe_ctl->found_offset,
+ ffe_ctl->num_bytes);
+ goto loop;
+ }
+ btrfs_inc_block_group_reservations(block_group);
+
+ /* we are all good, lets return */
+ ins->objectid = ffe_ctl->search_start;
+ ins->offset = ffe_ctl->num_bytes;
+
+ trace_btrfs_reserve_extent(block_group, ffe_ctl->search_start,
+ ffe_ctl->num_bytes);
+ btrfs_release_block_group(block_group, ffe_ctl->delalloc);
+ break;
+loop:
+ release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc);
+ cond_resched();
+ }
+ up_read(&space_info->groups_sem);
+
+ ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search);
+ if (ret > 0)
+ goto search;
+
+ if (ret == -ENOSPC && !cache_block_group_error) {
+ /*
+ * Use ffe_ctl->total_free_space as fallback if we can't find
+ * any contiguous hole.
+ */
+ if (!ffe_ctl->max_extent_size)
+ ffe_ctl->max_extent_size = ffe_ctl->total_free_space;
+ spin_lock(&space_info->lock);
+ space_info->max_extent_size = ffe_ctl->max_extent_size;
+ spin_unlock(&space_info->lock);
+ ins->offset = ffe_ctl->max_extent_size;
+ } else if (ret == -ENOSPC) {
+ ret = cache_block_group_error;
+ }
+ return ret;
+}
+
+/*
+ * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
+ * hole that is at least as big as @num_bytes.
+ *
+ * @root - The root that will contain this extent
+ *
+ * @ram_bytes - The amount of space in ram that @num_bytes take. This
+ * is used for accounting purposes. This value differs
+ * from @num_bytes only in the case of compressed extents.
+ *
+ * @num_bytes - Number of bytes to allocate on-disk.
+ *
+ * @min_alloc_size - Indicates the minimum amount of space that the
+ * allocator should try to satisfy. In some cases
+ * @num_bytes may be larger than what is required and if
+ * the filesystem is fragmented then allocation fails.
+ * However, the presence of @min_alloc_size gives a
+ * chance to try and satisfy the smaller allocation.
+ *
+ * @empty_size - A hint that you plan on doing more COW. This is the
+ * size in bytes the allocator should try to find free
+ * next to the block it returns. This is just a hint and
+ * may be ignored by the allocator.
+ *
+ * @hint_byte - Hint to the allocator to start searching above the byte
+ * address passed. It might be ignored.
+ *
+ * @ins - This key is modified to record the found hole. It will
+ * have the following values:
+ * ins->objectid == start position
+ * ins->flags = BTRFS_EXTENT_ITEM_KEY
+ * ins->offset == the size of the hole.
+ *
+ * @is_data - Boolean flag indicating whether an extent is
+ * allocated for data (true) or metadata (false)
+ *
+ * @delalloc - Boolean flag indicating whether this allocation is for
+ * delalloc or not. If 'true' data_rwsem of block groups
+ * is going to be acquired.
+ *
+ *
+ * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
+ * case -ENOSPC is returned then @ins->offset will contain the size of the
+ * largest available hole the allocator managed to find.
+ */
+int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
+ u64 num_bytes, u64 min_alloc_size,
+ u64 empty_size, u64 hint_byte,
+ struct btrfs_key *ins, int is_data, int delalloc)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct find_free_extent_ctl ffe_ctl = {};
+ bool final_tried = num_bytes == min_alloc_size;
+ u64 flags;
+ int ret;
+ bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
+ bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data);
+
+ flags = get_alloc_profile_by_root(root, is_data);
+again:
+ WARN_ON(num_bytes < fs_info->sectorsize);
+
+ ffe_ctl.ram_bytes = ram_bytes;
+ ffe_ctl.num_bytes = num_bytes;
+ ffe_ctl.min_alloc_size = min_alloc_size;
+ ffe_ctl.empty_size = empty_size;
+ ffe_ctl.flags = flags;
+ ffe_ctl.delalloc = delalloc;
+ ffe_ctl.hint_byte = hint_byte;
+ ffe_ctl.for_treelog = for_treelog;
+ ffe_ctl.for_data_reloc = for_data_reloc;
+
+ ret = find_free_extent(root, ins, &ffe_ctl);
+ if (!ret && !is_data) {
+ btrfs_dec_block_group_reservations(fs_info, ins->objectid);
+ } else if (ret == -ENOSPC) {
+ if (!final_tried && ins->offset) {
+ num_bytes = min(num_bytes >> 1, ins->offset);
+ num_bytes = round_down(num_bytes,
+ fs_info->sectorsize);
+ num_bytes = max(num_bytes, min_alloc_size);
+ ram_bytes = num_bytes;
+ if (num_bytes == min_alloc_size)
+ final_tried = true;
+ goto again;
+ } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
+ struct btrfs_space_info *sinfo;
+
+ sinfo = btrfs_find_space_info(fs_info, flags);
+ btrfs_err(fs_info,
+ "allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d",
+ flags, num_bytes, for_treelog, for_data_reloc);
+ if (sinfo)
+ btrfs_dump_space_info(fs_info, sinfo,
+ num_bytes, 1);
+ }
+ }
+
+ return ret;
+}
+
+int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
+ u64 start, u64 len, int delalloc)
+{
+ struct btrfs_block_group *cache;
+
+ cache = btrfs_lookup_block_group(fs_info, start);
+ if (!cache) {
+ btrfs_err(fs_info, "Unable to find block group for %llu",
+ start);
+ return -ENOSPC;
+ }
+
+ btrfs_add_free_space(cache, start, len);
+ btrfs_free_reserved_bytes(cache, len, delalloc);
+ trace_btrfs_reserved_extent_free(fs_info, start, len);
+
+ btrfs_put_block_group(cache);
+ return 0;
+}
+
+int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
+ u64 len)
+{
+ struct btrfs_block_group *cache;
+ int ret = 0;
+
+ cache = btrfs_lookup_block_group(trans->fs_info, start);
+ if (!cache) {
+ btrfs_err(trans->fs_info, "unable to find block group for %llu",
+ start);
+ return -ENOSPC;
+ }
+
+ ret = pin_down_extent(trans, cache, start, len, 1);
+ btrfs_put_block_group(cache);
+ return ret;
+}
+
+static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr,
+ u64 num_bytes)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ int ret;
+
+ ret = remove_from_free_space_tree(trans, bytenr, num_bytes);
+ if (ret)
+ return ret;
+
+ ret = btrfs_update_block_group(trans, bytenr, num_bytes, true);
+ if (ret) {
+ ASSERT(!ret);
+ btrfs_err(fs_info, "update block group failed for %llu %llu",
+ bytenr, num_bytes);
+ return ret;
+ }
+
+ trace_btrfs_reserved_extent_alloc(fs_info, bytenr, num_bytes);
+ return 0;
+}
+
+static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
+ u64 parent, u64 root_objectid,
+ u64 flags, u64 owner, u64 offset,
+ struct btrfs_key *ins, int ref_mod)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *extent_root;
+ int ret;
+ struct btrfs_extent_item *extent_item;
+ struct btrfs_extent_inline_ref *iref;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ int type;
+ u32 size;
+
+ if (parent > 0)
+ type = BTRFS_SHARED_DATA_REF_KEY;
+ else
+ type = BTRFS_EXTENT_DATA_REF_KEY;
+
+ size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ extent_root = btrfs_extent_root(fs_info, ins->objectid);
+ ret = btrfs_insert_empty_item(trans, extent_root, path, ins, size);
+ if (ret) {
+ btrfs_free_path(path);
+ return ret;
+ }
+
+ leaf = path->nodes[0];
+ extent_item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_item);
+ btrfs_set_extent_refs(leaf, extent_item, ref_mod);
+ btrfs_set_extent_generation(leaf, extent_item, trans->transid);
+ btrfs_set_extent_flags(leaf, extent_item,
+ flags | BTRFS_EXTENT_FLAG_DATA);
+
+ iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
+ btrfs_set_extent_inline_ref_type(leaf, iref, type);
+ if (parent > 0) {
+ struct btrfs_shared_data_ref *ref;
+ ref = (struct btrfs_shared_data_ref *)(iref + 1);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+ btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
+ } else {
+ struct btrfs_extent_data_ref *ref;
+ ref = (struct btrfs_extent_data_ref *)(&iref->offset);
+ btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
+ btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
+ btrfs_set_extent_data_ref_offset(leaf, ref, offset);
+ btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
+ }
+
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_free_path(path);
+
+ return alloc_reserved_extent(trans, ins->objectid, ins->offset);
+}
+
+static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_node *node,
+ struct btrfs_delayed_extent_op *extent_op)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *extent_root;
+ int ret;
+ struct btrfs_extent_item *extent_item;
+ struct btrfs_key extent_key;
+ struct btrfs_tree_block_info *block_info;
+ struct btrfs_extent_inline_ref *iref;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_delayed_tree_ref *ref;
+ u32 size = sizeof(*extent_item) + sizeof(*iref);
+ u64 flags = extent_op->flags_to_set;
+ bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
+
+ ref = btrfs_delayed_node_to_tree_ref(node);
+
+ extent_key.objectid = node->bytenr;
+ if (skinny_metadata) {
+ extent_key.offset = ref->level;
+ extent_key.type = BTRFS_METADATA_ITEM_KEY;
+ } else {
+ extent_key.offset = node->num_bytes;
+ extent_key.type = BTRFS_EXTENT_ITEM_KEY;
+ size += sizeof(*block_info);
+ }
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ extent_root = btrfs_extent_root(fs_info, extent_key.objectid);
+ ret = btrfs_insert_empty_item(trans, extent_root, path, &extent_key,
+ size);
+ if (ret) {
+ btrfs_free_path(path);
+ return ret;
+ }
+
+ leaf = path->nodes[0];
+ extent_item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_item);
+ btrfs_set_extent_refs(leaf, extent_item, 1);
+ btrfs_set_extent_generation(leaf, extent_item, trans->transid);
+ btrfs_set_extent_flags(leaf, extent_item,
+ flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
+
+ if (skinny_metadata) {
+ iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
+ } else {
+ block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
+ btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
+ btrfs_set_tree_block_level(leaf, block_info, ref->level);
+ iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
+ }
+
+ if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
+ btrfs_set_extent_inline_ref_type(leaf, iref,
+ BTRFS_SHARED_BLOCK_REF_KEY);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
+ } else {
+ btrfs_set_extent_inline_ref_type(leaf, iref,
+ BTRFS_TREE_BLOCK_REF_KEY);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
+ }
+
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_free_path(path);
+
+ return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize);
+}
+
+int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 owner,
+ u64 offset, u64 ram_bytes,
+ struct btrfs_key *ins)
+{
+ struct btrfs_ref generic_ref = { 0 };
+
+ BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
+
+ btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
+ ins->objectid, ins->offset, 0);
+ btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner,
+ offset, 0, false);
+ btrfs_ref_tree_mod(root->fs_info, &generic_ref);
+
+ return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
+}
+
+/*
+ * this is used by the tree logging recovery code. It records that
+ * an extent has been allocated and makes sure to clear the free
+ * space cache bits as well
+ */
+int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
+ u64 root_objectid, u64 owner, u64 offset,
+ struct btrfs_key *ins)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ int ret;
+ struct btrfs_block_group *block_group;
+ struct btrfs_space_info *space_info;
+
+ /*
+ * Mixed block groups will exclude before processing the log so we only
+ * need to do the exclude dance if this fs isn't mixed.
+ */
+ if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
+ ret = __exclude_logged_extent(fs_info, ins->objectid,
+ ins->offset);
+ if (ret)
+ return ret;
+ }
+
+ block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
+ if (!block_group)
+ return -EINVAL;
+
+ space_info = block_group->space_info;
+ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+ space_info->bytes_reserved += ins->offset;
+ block_group->reserved += ins->offset;
+ spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
+
+ ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
+ offset, ins, 1);
+ if (ret)
+ btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
+ btrfs_put_block_group(block_group);
+ return ret;
+}
+
+static struct extent_buffer *
+btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ u64 bytenr, int level, u64 owner,
+ enum btrfs_lock_nesting nest)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_buffer *buf;
+ u64 lockdep_owner = owner;
+
+ buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
+ if (IS_ERR(buf))
+ return buf;
+
+ /*
+ * Extra safety check in case the extent tree is corrupted and extent
+ * allocator chooses to use a tree block which is already used and
+ * locked.
+ */
+ if (buf->lock_owner == current->pid) {
+ btrfs_err_rl(fs_info,
+"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
+ buf->start, btrfs_header_owner(buf), current->pid);
+ free_extent_buffer(buf);
+ return ERR_PTR(-EUCLEAN);
+ }
+
+ /*
+ * The reloc trees are just snapshots, so we need them to appear to be
+ * just like any other fs tree WRT lockdep.
+ *
+ * The exception however is in replace_path() in relocation, where we
+ * hold the lock on the original fs root and then search for the reloc
+ * root. At that point we need to make sure any reloc root buffers are
+ * set to the BTRFS_TREE_RELOC_OBJECTID lockdep class in order to make
+ * lockdep happy.
+ */
+ if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID &&
+ !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
+ lockdep_owner = BTRFS_FS_TREE_OBJECTID;
+
+ /* btrfs_clean_tree_block() accesses generation field. */
+ btrfs_set_header_generation(buf, trans->transid);
+
+ /*
+ * This needs to stay, because we could allocate a freed block from an
+ * old tree into a new tree, so we need to make sure this new block is
+ * set to the appropriate level and owner.
+ */
+ btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level);
+
+ __btrfs_tree_lock(buf, nest);
+ btrfs_clean_tree_block(buf);
+ clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
+ clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags);
+
+ set_extent_buffer_uptodate(buf);
+
+ memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
+ btrfs_set_header_level(buf, level);
+ btrfs_set_header_bytenr(buf, buf->start);
+ btrfs_set_header_generation(buf, trans->transid);
+ btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
+ btrfs_set_header_owner(buf, owner);
+ write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
+ write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
+ if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+ buf->log_index = root->log_transid % 2;
+ /*
+ * we allow two log transactions at a time, use different
+ * EXTENT bit to differentiate dirty pages.
+ */
+ if (buf->log_index == 0)
+ set_extent_dirty(&root->dirty_log_pages, buf->start,
+ buf->start + buf->len - 1, GFP_NOFS);
+ else
+ set_extent_new(&root->dirty_log_pages, buf->start,
+ buf->start + buf->len - 1);
+ } else {
+ buf->log_index = -1;
+ set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
+ buf->start + buf->len - 1, GFP_NOFS);
+ }
+ /* this returns a buffer locked for blocking */
+ return buf;
+}
+
+/*
+ * finds a free extent and does all the dirty work required for allocation
+ * returns the tree buffer or an ERR_PTR on error.
+ */
+struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 parent, u64 root_objectid,
+ const struct btrfs_disk_key *key,
+ int level, u64 hint,
+ u64 empty_size,
+ enum btrfs_lock_nesting nest)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_key ins;
+ struct btrfs_block_rsv *block_rsv;
+ struct extent_buffer *buf;
+ struct btrfs_delayed_extent_op *extent_op;
+ struct btrfs_ref generic_ref = { 0 };
+ u64 flags = 0;
+ int ret;
+ u32 blocksize = fs_info->nodesize;
+ bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (btrfs_is_testing(fs_info)) {
+ buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
+ level, root_objectid, nest);
+ if (!IS_ERR(buf))
+ root->alloc_bytenr += blocksize;
+ return buf;
+ }
+#endif
+
+ block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
+ if (IS_ERR(block_rsv))
+ return ERR_CAST(block_rsv);
+
+ ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
+ empty_size, hint, &ins, 0, 0);
+ if (ret)
+ goto out_unuse;
+
+ buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
+ root_objectid, nest);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto out_free_reserved;
+ }
+
+ if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ if (parent == 0)
+ parent = ins.objectid;
+ flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
+ } else
+ BUG_ON(parent > 0);
+
+ if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
+ extent_op = btrfs_alloc_delayed_extent_op();
+ if (!extent_op) {
+ ret = -ENOMEM;
+ goto out_free_buf;
+ }
+ if (key)
+ memcpy(&extent_op->key, key, sizeof(extent_op->key));
+ else
+ memset(&extent_op->key, 0, sizeof(extent_op->key));
+ extent_op->flags_to_set = flags;
+ extent_op->update_key = skinny_metadata ? false : true;
+ extent_op->update_flags = true;
+ extent_op->level = level;
+
+ btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
+ ins.objectid, ins.offset, parent);
+ btrfs_init_tree_ref(&generic_ref, level, root_objectid,
+ root->root_key.objectid, false);
+ btrfs_ref_tree_mod(fs_info, &generic_ref);
+ ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
+ if (ret)
+ goto out_free_delayed;
+ }
+ return buf;
+
+out_free_delayed:
+ btrfs_free_delayed_extent_op(extent_op);
+out_free_buf:
+ btrfs_tree_unlock(buf);
+ free_extent_buffer(buf);
+out_free_reserved:
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
+out_unuse:
+ btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
+ return ERR_PTR(ret);
+}
+
+struct walk_control {
+ u64 refs[BTRFS_MAX_LEVEL];
+ u64 flags[BTRFS_MAX_LEVEL];
+ struct btrfs_key update_progress;
+ struct btrfs_key drop_progress;
+ int drop_level;
+ int stage;
+ int level;
+ int shared_level;
+ int update_ref;
+ int keep_locks;
+ int reada_slot;
+ int reada_count;
+ int restarted;
+};
+
+#define DROP_REFERENCE 1
+#define UPDATE_BACKREF 2
+
+static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct walk_control *wc,
+ struct btrfs_path *path)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 bytenr;
+ u64 generation;
+ u64 refs;
+ u64 flags;
+ u32 nritems;
+ struct btrfs_key key;
+ struct extent_buffer *eb;
+ int ret;
+ int slot;
+ int nread = 0;
+
+ if (path->slots[wc->level] < wc->reada_slot) {
+ wc->reada_count = wc->reada_count * 2 / 3;
+ wc->reada_count = max(wc->reada_count, 2);
+ } else {
+ wc->reada_count = wc->reada_count * 3 / 2;
+ wc->reada_count = min_t(int, wc->reada_count,
+ BTRFS_NODEPTRS_PER_BLOCK(fs_info));
+ }
+
+ eb = path->nodes[wc->level];
+ nritems = btrfs_header_nritems(eb);
+
+ for (slot = path->slots[wc->level]; slot < nritems; slot++) {
+ if (nread >= wc->reada_count)
+ break;
+
+ cond_resched();
+ bytenr = btrfs_node_blockptr(eb, slot);
+ generation = btrfs_node_ptr_generation(eb, slot);
+
+ if (slot == path->slots[wc->level])
+ goto reada;
+
+ if (wc->stage == UPDATE_BACKREF &&
+ generation <= root->root_key.offset)
+ continue;
+
+ /* We don't lock the tree block, it's OK to be racy here */
+ ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
+ wc->level - 1, 1, &refs,
+ &flags);
+ /* We don't care about errors in readahead. */
+ if (ret < 0)
+ continue;
+ BUG_ON(refs == 0);
+
+ if (wc->stage == DROP_REFERENCE) {
+ if (refs == 1)
+ goto reada;
+
+ if (wc->level == 1 &&
+ (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+ continue;
+ if (!wc->update_ref ||
+ generation <= root->root_key.offset)
+ continue;
+ btrfs_node_key_to_cpu(eb, &key, slot);
+ ret = btrfs_comp_cpu_keys(&key,
+ &wc->update_progress);
+ if (ret < 0)
+ continue;
+ } else {
+ if (wc->level == 1 &&
+ (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+ continue;
+ }
+reada:
+ btrfs_readahead_node_child(eb, slot);
+ nread++;
+ }
+ wc->reada_slot = slot;
+}
+
+/*
+ * helper to process tree block while walking down the tree.
+ *
+ * when wc->stage == UPDATE_BACKREF, this function updates
+ * back refs for pointers in the block.
+ *
+ * NOTE: return value 1 means we should stop walking down.
+ */
+static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct walk_control *wc, int lookup_info)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int level = wc->level;
+ struct extent_buffer *eb = path->nodes[level];
+ u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+ int ret;
+
+ if (wc->stage == UPDATE_BACKREF &&
+ btrfs_header_owner(eb) != root->root_key.objectid)
+ return 1;
+
+ /*
+ * when reference count of tree block is 1, it won't increase
+ * again. once full backref flag is set, we never clear it.
+ */
+ if (lookup_info &&
+ ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
+ (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
+ BUG_ON(!path->locks[level]);
+ ret = btrfs_lookup_extent_info(trans, fs_info,
+ eb->start, level, 1,
+ &wc->refs[level],
+ &wc->flags[level]);
+ BUG_ON(ret == -ENOMEM);
+ if (ret)
+ return ret;
+ BUG_ON(wc->refs[level] == 0);
+ }
+
+ if (wc->stage == DROP_REFERENCE) {
+ if (wc->refs[level] > 1)
+ return 1;
+
+ if (path->locks[level] && !wc->keep_locks) {
+ btrfs_tree_unlock_rw(eb, path->locks[level]);
+ path->locks[level] = 0;
+ }
+ return 0;
+ }
+
+ /* wc->stage == UPDATE_BACKREF */
+ if (!(wc->flags[level] & flag)) {
+ BUG_ON(!path->locks[level]);
+ ret = btrfs_inc_ref(trans, root, eb, 1);
+ BUG_ON(ret); /* -ENOMEM */
+ ret = btrfs_dec_ref(trans, root, eb, 0);
+ BUG_ON(ret); /* -ENOMEM */
+ ret = btrfs_set_disk_extent_flags(trans, eb, flag,
+ btrfs_header_level(eb));
+ BUG_ON(ret); /* -ENOMEM */
+ wc->flags[level] |= flag;
+ }
+
+ /*
+ * the block is shared by multiple trees, so it's not good to
+ * keep the tree lock
+ */
+ if (path->locks[level] && level > 0) {
+ btrfs_tree_unlock_rw(eb, path->locks[level]);
+ path->locks[level] = 0;
+ }
+ return 0;
+}
+
+/*
+ * This is used to verify a ref exists for this root to deal with a bug where we
+ * would have a drop_progress key that hadn't been updated properly.
+ */
+static int check_ref_exists(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr, u64 parent,
+ int level)
+{
+ struct btrfs_path *path;
+ struct btrfs_extent_inline_ref *iref;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = lookup_extent_backref(trans, path, &iref, bytenr,
+ root->fs_info->nodesize, parent,
+ root->root_key.objectid, level, 0);
+ btrfs_free_path(path);
+ if (ret == -ENOENT)
+ return 0;
+ if (ret < 0)
+ return ret;
+ return 1;
+}
+
+/*
+ * helper to process tree block pointer.
+ *
+ * when wc->stage == DROP_REFERENCE, this function checks
+ * reference count of the block pointed to. if the block
+ * is shared and we need update back refs for the subtree
+ * rooted at the block, this function changes wc->stage to
+ * UPDATE_BACKREF. if the block is shared and there is no
+ * need to update back, this function drops the reference
+ * to the block.
+ *
+ * NOTE: return value 1 means we should stop walking down.
+ */
+static noinline int do_walk_down(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct walk_control *wc, int *lookup_info)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 bytenr;
+ u64 generation;
+ u64 parent;
+ struct btrfs_key key;
+ struct btrfs_key first_key;
+ struct btrfs_ref ref = { 0 };
+ struct extent_buffer *next;
+ int level = wc->level;
+ int reada = 0;
+ int ret = 0;
+ bool need_account = false;
+
+ generation = btrfs_node_ptr_generation(path->nodes[level],
+ path->slots[level]);
+ /*
+ * if the lower level block was created before the snapshot
+ * was created, we know there is no need to update back refs
+ * for the subtree
+ */
+ if (wc->stage == UPDATE_BACKREF &&
+ generation <= root->root_key.offset) {
+ *lookup_info = 1;
+ return 1;
+ }
+
+ bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
+ btrfs_node_key_to_cpu(path->nodes[level], &first_key,
+ path->slots[level]);
+
+ next = find_extent_buffer(fs_info, bytenr);
+ if (!next) {
+ next = btrfs_find_create_tree_block(fs_info, bytenr,
+ root->root_key.objectid, level - 1);
+ if (IS_ERR(next))
+ return PTR_ERR(next);
+ reada = 1;
+ }
+ btrfs_tree_lock(next);
+
+ ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
+ &wc->refs[level - 1],
+ &wc->flags[level - 1]);
+ if (ret < 0)
+ goto out_unlock;
+
+ if (unlikely(wc->refs[level - 1] == 0)) {
+ btrfs_err(fs_info, "Missing references.");
+ ret = -EIO;
+ goto out_unlock;
+ }
+ *lookup_info = 0;
+
+ if (wc->stage == DROP_REFERENCE) {
+ if (wc->refs[level - 1] > 1) {
+ need_account = true;
+ if (level == 1 &&
+ (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+ goto skip;
+
+ if (!wc->update_ref ||
+ generation <= root->root_key.offset)
+ goto skip;
+
+ btrfs_node_key_to_cpu(path->nodes[level], &key,
+ path->slots[level]);
+ ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
+ if (ret < 0)
+ goto skip;
+
+ wc->stage = UPDATE_BACKREF;
+ wc->shared_level = level - 1;
+ }
+ } else {
+ if (level == 1 &&
+ (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+ goto skip;
+ }
+
+ if (!btrfs_buffer_uptodate(next, generation, 0)) {
+ btrfs_tree_unlock(next);
+ free_extent_buffer(next);
+ next = NULL;
+ *lookup_info = 1;
+ }
+
+ if (!next) {
+ if (reada && level == 1)
+ reada_walk_down(trans, root, wc, path);
+ next = read_tree_block(fs_info, bytenr, root->root_key.objectid,
+ generation, level - 1, &first_key);
+ if (IS_ERR(next)) {
+ return PTR_ERR(next);
+ } else if (!extent_buffer_uptodate(next)) {
+ free_extent_buffer(next);
+ return -EIO;
+ }
+ btrfs_tree_lock(next);
+ }
+
+ level--;
+ ASSERT(level == btrfs_header_level(next));
+ if (level != btrfs_header_level(next)) {
+ btrfs_err(root->fs_info, "mismatched level");
+ ret = -EIO;
+ goto out_unlock;
+ }
+ path->nodes[level] = next;
+ path->slots[level] = 0;
+ path->locks[level] = BTRFS_WRITE_LOCK;
+ wc->level = level;
+ if (wc->level == 1)
+ wc->reada_slot = 0;
+ return 0;
+skip:
+ wc->refs[level - 1] = 0;
+ wc->flags[level - 1] = 0;
+ if (wc->stage == DROP_REFERENCE) {
+ if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
+ parent = path->nodes[level]->start;
+ } else {
+ ASSERT(root->root_key.objectid ==
+ btrfs_header_owner(path->nodes[level]));
+ if (root->root_key.objectid !=
+ btrfs_header_owner(path->nodes[level])) {
+ btrfs_err(root->fs_info,
+ "mismatched block owner");
+ ret = -EIO;
+ goto out_unlock;
+ }
+ parent = 0;
+ }
+
+ /*
+ * If we had a drop_progress we need to verify the refs are set
+ * as expected. If we find our ref then we know that from here
+ * on out everything should be correct, and we can clear the
+ * ->restarted flag.
+ */
+ if (wc->restarted) {
+ ret = check_ref_exists(trans, root, bytenr, parent,
+ level - 1);
+ if (ret < 0)
+ goto out_unlock;
+ if (ret == 0)
+ goto no_delete;
+ ret = 0;
+ wc->restarted = 0;
+ }
+
+ /*
+ * Reloc tree doesn't contribute to qgroup numbers, and we have
+ * already accounted them at merge time (replace_path),
+ * thus we could skip expensive subtree trace here.
+ */
+ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
+ need_account) {
+ ret = btrfs_qgroup_trace_subtree(trans, next,
+ generation, level - 1);
+ if (ret) {
+ btrfs_err_rl(fs_info,
+ "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
+ ret);
+ }
+ }
+
+ /*
+ * We need to update the next key in our walk control so we can
+ * update the drop_progress key accordingly. We don't care if
+ * find_next_key doesn't find a key because that means we're at
+ * the end and are going to clean up now.
+ */
+ wc->drop_level = level;
+ find_next_key(path, level, &wc->drop_progress);
+
+ btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
+ fs_info->nodesize, parent);
+ btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid,
+ 0, false);
+ ret = btrfs_free_extent(trans, &ref);
+ if (ret)
+ goto out_unlock;
+ }
+no_delete:
+ *lookup_info = 1;
+ ret = 1;
+
+out_unlock:
+ btrfs_tree_unlock(next);
+ free_extent_buffer(next);
+
+ return ret;
+}
+
+/*
+ * helper to process tree block while walking up the tree.
+ *
+ * when wc->stage == DROP_REFERENCE, this function drops
+ * reference count on the block.
+ *
+ * when wc->stage == UPDATE_BACKREF, this function changes
+ * wc->stage back to DROP_REFERENCE if we changed wc->stage
+ * to UPDATE_BACKREF previously while processing the block.
+ *
+ * NOTE: return value 1 means we should stop walking up.
+ */
+static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct walk_control *wc)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret;
+ int level = wc->level;
+ struct extent_buffer *eb = path->nodes[level];
+ u64 parent = 0;
+
+ if (wc->stage == UPDATE_BACKREF) {
+ BUG_ON(wc->shared_level < level);
+ if (level < wc->shared_level)
+ goto out;
+
+ ret = find_next_key(path, level + 1, &wc->update_progress);
+ if (ret > 0)
+ wc->update_ref = 0;
+
+ wc->stage = DROP_REFERENCE;
+ wc->shared_level = -1;
+ path->slots[level] = 0;
+
+ /*
+ * check reference count again if the block isn't locked.
+ * we should start walking down the tree again if reference
+ * count is one.
+ */
+ if (!path->locks[level]) {
+ BUG_ON(level == 0);
+ btrfs_tree_lock(eb);
+ path->locks[level] = BTRFS_WRITE_LOCK;
+
+ ret = btrfs_lookup_extent_info(trans, fs_info,
+ eb->start, level, 1,
+ &wc->refs[level],
+ &wc->flags[level]);
+ if (ret < 0) {
+ btrfs_tree_unlock_rw(eb, path->locks[level]);
+ path->locks[level] = 0;
+ return ret;
+ }
+ BUG_ON(wc->refs[level] == 0);
+ if (wc->refs[level] == 1) {
+ btrfs_tree_unlock_rw(eb, path->locks[level]);
+ path->locks[level] = 0;
+ return 1;
+ }
+ }
+ }
+
+ /* wc->stage == DROP_REFERENCE */
+ BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
+
+ if (wc->refs[level] == 1) {
+ if (level == 0) {
+ if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+ ret = btrfs_dec_ref(trans, root, eb, 1);
+ else
+ ret = btrfs_dec_ref(trans, root, eb, 0);
+ BUG_ON(ret); /* -ENOMEM */
+ if (is_fstree(root->root_key.objectid)) {
+ ret = btrfs_qgroup_trace_leaf_items(trans, eb);
+ if (ret) {
+ btrfs_err_rl(fs_info,
+ "error %d accounting leaf items, quota is out of sync, rescan required",
+ ret);
+ }
+ }
+ }
+ /* make block locked assertion in btrfs_clean_tree_block happy */
+ if (!path->locks[level] &&
+ btrfs_header_generation(eb) == trans->transid) {
+ btrfs_tree_lock(eb);
+ path->locks[level] = BTRFS_WRITE_LOCK;
+ }
+ btrfs_clean_tree_block(eb);
+ }
+
+ if (eb == root->node) {
+ if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+ parent = eb->start;
+ else if (root->root_key.objectid != btrfs_header_owner(eb))
+ goto owner_mismatch;
+ } else {
+ if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+ parent = path->nodes[level + 1]->start;
+ else if (root->root_key.objectid !=
+ btrfs_header_owner(path->nodes[level + 1]))
+ goto owner_mismatch;
+ }
+
+ btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent,
+ wc->refs[level] == 1);
+out:
+ wc->refs[level] = 0;
+ wc->flags[level] = 0;
+ return 0;
+
+owner_mismatch:
+ btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
+ btrfs_header_owner(eb), root->root_key.objectid);
+ return -EUCLEAN;
+}
+
+static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct walk_control *wc)
+{
+ int level = wc->level;
+ int lookup_info = 1;
+ int ret;
+
+ while (level >= 0) {
+ ret = walk_down_proc(trans, root, path, wc, lookup_info);
+ if (ret > 0)
+ break;
+
+ if (level == 0)
+ break;
+
+ if (path->slots[level] >=
+ btrfs_header_nritems(path->nodes[level]))
+ break;
+
+ ret = do_walk_down(trans, root, path, wc, &lookup_info);
+ if (ret > 0) {
+ path->slots[level]++;
+ continue;
+ } else if (ret < 0)
+ return ret;
+ level = wc->level;
+ }
+ return 0;
+}
+
+static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct walk_control *wc, int max_level)
+{
+ int level = wc->level;
+ int ret;
+
+ path->slots[level] = btrfs_header_nritems(path->nodes[level]);
+ while (level < max_level && path->nodes[level]) {
+ wc->level = level;
+ if (path->slots[level] + 1 <
+ btrfs_header_nritems(path->nodes[level])) {
+ path->slots[level]++;
+ return 0;
+ } else {
+ ret = walk_up_proc(trans, root, path, wc);
+ if (ret > 0)
+ return 0;
+ if (ret < 0)
+ return ret;
+
+ if (path->locks[level]) {
+ btrfs_tree_unlock_rw(path->nodes[level],
+ path->locks[level]);
+ path->locks[level] = 0;
+ }
+ free_extent_buffer(path->nodes[level]);
+ path->nodes[level] = NULL;
+ level++;
+ }
+ }
+ return 1;
+}
+
+/*
+ * drop a subvolume tree.
+ *
+ * this function traverses the tree freeing any blocks that only
+ * referenced by the tree.
+ *
+ * when a shared tree block is found. this function decreases its
+ * reference count by one. if update_ref is true, this function
+ * also make sure backrefs for the shared block and all lower level
+ * blocks are properly updated.
+ *
+ * If called with for_reloc == 0, may exit early with -EAGAIN
+ */
+int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
+{
+ const bool is_reloc_root = (root->root_key.objectid ==
+ BTRFS_TREE_RELOC_OBJECTID);
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_path *path;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_root_item *root_item = &root->root_item;
+ struct walk_control *wc;
+ struct btrfs_key key;
+ int err = 0;
+ int ret;
+ int level;
+ bool root_dropped = false;
+ bool unfinished_drop = false;
+
+ btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ wc = kzalloc(sizeof(*wc), GFP_NOFS);
+ if (!wc) {
+ btrfs_free_path(path);
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Use join to avoid potential EINTR from transaction start. See
+ * wait_reserve_ticket and the whole reservation callchain.
+ */
+ if (for_reloc)
+ trans = btrfs_join_transaction(tree_root);
+ else
+ trans = btrfs_start_transaction(tree_root, 0);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_free;
+ }
+
+ err = btrfs_run_delayed_items(trans);
+ if (err)
+ goto out_end_trans;
+
+ /*
+ * This will help us catch people modifying the fs tree while we're
+ * dropping it. It is unsafe to mess with the fs tree while it's being
+ * dropped as we unlock the root node and parent nodes as we walk down
+ * the tree, assuming nothing will change. If something does change
+ * then we'll have stale information and drop references to blocks we've
+ * already dropped.
+ */
+ set_bit(BTRFS_ROOT_DELETING, &root->state);
+ unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
+
+ if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
+ level = btrfs_header_level(root->node);
+ path->nodes[level] = btrfs_lock_root_node(root);
+ path->slots[level] = 0;
+ path->locks[level] = BTRFS_WRITE_LOCK;
+ memset(&wc->update_progress, 0,
+ sizeof(wc->update_progress));
+ } else {
+ btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
+ memcpy(&wc->update_progress, &key,
+ sizeof(wc->update_progress));
+
+ level = btrfs_root_drop_level(root_item);
+ BUG_ON(level == 0);
+ path->lowest_level = level;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ path->lowest_level = 0;
+ if (ret < 0) {
+ err = ret;
+ goto out_end_trans;
+ }
+ WARN_ON(ret > 0);
+
+ /*
+ * unlock our path, this is safe because only this
+ * function is allowed to delete this snapshot
+ */
+ btrfs_unlock_up_safe(path, 0);
+
+ level = btrfs_header_level(root->node);
+ while (1) {
+ btrfs_tree_lock(path->nodes[level]);
+ path->locks[level] = BTRFS_WRITE_LOCK;
+
+ ret = btrfs_lookup_extent_info(trans, fs_info,
+ path->nodes[level]->start,
+ level, 1, &wc->refs[level],
+ &wc->flags[level]);
+ if (ret < 0) {
+ err = ret;
+ goto out_end_trans;
+ }
+ BUG_ON(wc->refs[level] == 0);
+
+ if (level == btrfs_root_drop_level(root_item))
+ break;
+
+ btrfs_tree_unlock(path->nodes[level]);
+ path->locks[level] = 0;
+ WARN_ON(wc->refs[level] != 1);
+ level--;
+ }
+ }
+
+ wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
+ wc->level = level;
+ wc->shared_level = -1;
+ wc->stage = DROP_REFERENCE;
+ wc->update_ref = update_ref;
+ wc->keep_locks = 0;
+ wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
+
+ while (1) {
+
+ ret = walk_down_tree(trans, root, path, wc);
+ if (ret < 0) {
+ err = ret;
+ break;
+ }
+
+ ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
+ if (ret < 0) {
+ err = ret;
+ break;
+ }
+
+ if (ret > 0) {
+ BUG_ON(wc->stage != DROP_REFERENCE);
+ break;
+ }
+
+ if (wc->stage == DROP_REFERENCE) {
+ wc->drop_level = wc->level;
+ btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
+ &wc->drop_progress,
+ path->slots[wc->drop_level]);
+ }
+ btrfs_cpu_key_to_disk(&root_item->drop_progress,
+ &wc->drop_progress);
+ btrfs_set_root_drop_level(root_item, wc->drop_level);
+
+ BUG_ON(wc->level == 0);
+ if (btrfs_should_end_transaction(trans) ||
+ (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
+ ret = btrfs_update_root(trans, tree_root,
+ &root->root_key,
+ root_item);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ err = ret;
+ goto out_end_trans;
+ }
+
+ if (!is_reloc_root)
+ btrfs_set_last_root_drop_gen(fs_info, trans->transid);
+
+ btrfs_end_transaction_throttle(trans);
+ if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
+ btrfs_debug(fs_info,
+ "drop snapshot early exit");
+ err = -EAGAIN;
+ goto out_free;
+ }
+
+ /*
+ * Use join to avoid potential EINTR from transaction
+ * start. See wait_reserve_ticket and the whole
+ * reservation callchain.
+ */
+ if (for_reloc)
+ trans = btrfs_join_transaction(tree_root);
+ else
+ trans = btrfs_start_transaction(tree_root, 0);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_free;
+ }
+ }
+ }
+ btrfs_release_path(path);
+ if (err)
+ goto out_end_trans;
+
+ ret = btrfs_del_root(trans, &root->root_key);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ err = ret;
+ goto out_end_trans;
+ }
+
+ if (!is_reloc_root) {
+ ret = btrfs_find_root(tree_root, &root->root_key, path,
+ NULL, NULL);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ err = ret;
+ goto out_end_trans;
+ } else if (ret > 0) {
+ /* if we fail to delete the orphan item this time
+ * around, it'll get picked up the next time.
+ *
+ * The most common failure here is just -ENOENT.
+ */
+ btrfs_del_orphan_item(trans, tree_root,
+ root->root_key.objectid);
+ }
+ }
+
+ /*
+ * This subvolume is going to be completely dropped, and won't be
+ * recorded as dirty roots, thus pertrans meta rsv will not be freed at
+ * commit transaction time. So free it here manually.
+ */
+ btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
+ btrfs_qgroup_free_meta_all_pertrans(root);
+
+ if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
+ btrfs_add_dropped_root(trans, root);
+ else
+ btrfs_put_root(root);
+ root_dropped = true;
+out_end_trans:
+ if (!is_reloc_root)
+ btrfs_set_last_root_drop_gen(fs_info, trans->transid);
+
+ btrfs_end_transaction_throttle(trans);
+out_free:
+ kfree(wc);
+ btrfs_free_path(path);
+out:
+ /*
+ * We were an unfinished drop root, check to see if there are any
+ * pending, and if not clear and wake up any waiters.
+ */
+ if (!err && unfinished_drop)
+ btrfs_maybe_wake_unfinished_drop(fs_info);
+
+ /*
+ * So if we need to stop dropping the snapshot for whatever reason we
+ * need to make sure to add it back to the dead root list so that we
+ * keep trying to do the work later. This also cleans up roots if we
+ * don't have it in the radix (like when we recover after a power fail
+ * or unmount) so we don't leak memory.
+ */
+ if (!for_reloc && !root_dropped)
+ btrfs_add_dead_root(root);
+ return err;
+}
+
+/*
+ * drop subtree rooted at tree block 'node'.
+ *
+ * NOTE: this function will unlock and release tree block 'node'
+ * only used by relocation code
+ */
+int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *node,
+ struct extent_buffer *parent)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_path *path;
+ struct walk_control *wc;
+ int level;
+ int parent_level;
+ int ret = 0;
+ int wret;
+
+ BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ wc = kzalloc(sizeof(*wc), GFP_NOFS);
+ if (!wc) {
+ btrfs_free_path(path);
+ return -ENOMEM;
+ }
+
+ btrfs_assert_tree_write_locked(parent);
+ parent_level = btrfs_header_level(parent);
+ atomic_inc(&parent->refs);
+ path->nodes[parent_level] = parent;
+ path->slots[parent_level] = btrfs_header_nritems(parent);
+
+ btrfs_assert_tree_write_locked(node);
+ level = btrfs_header_level(node);
+ path->nodes[level] = node;
+ path->slots[level] = 0;
+ path->locks[level] = BTRFS_WRITE_LOCK;
+
+ wc->refs[parent_level] = 1;
+ wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+ wc->level = level;
+ wc->shared_level = -1;
+ wc->stage = DROP_REFERENCE;
+ wc->update_ref = 0;
+ wc->keep_locks = 1;
+ wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
+
+ while (1) {
+ wret = walk_down_tree(trans, root, path, wc);
+ if (wret < 0) {
+ ret = wret;
+ break;
+ }
+
+ wret = walk_up_tree(trans, root, path, wc, parent_level);
+ if (wret < 0)
+ ret = wret;
+ if (wret != 0)
+ break;
+ }
+
+ kfree(wc);
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * helper to account the unused space of all the readonly block group in the
+ * space_info. takes mirrors into account.
+ */
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
+{
+ struct btrfs_block_group *block_group;
+ u64 free_bytes = 0;
+ int factor;
+
+ /* It's df, we don't care if it's racy */
+ if (list_empty(&sinfo->ro_bgs))
+ return 0;
+
+ spin_lock(&sinfo->lock);
+ list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
+ spin_lock(&block_group->lock);
+
+ if (!block_group->ro) {
+ spin_unlock(&block_group->lock);
+ continue;
+ }
+
+ factor = btrfs_bg_type_to_factor(block_group->flags);
+ free_bytes += (block_group->length -
+ block_group->used) * factor;
+
+ spin_unlock(&block_group->lock);
+ }
+ spin_unlock(&sinfo->lock);
+
+ return free_bytes;
+}
+
+int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
+ u64 start, u64 end)
+{
+ return unpin_extent_range(fs_info, start, end, false);
+}
+
+/*
+ * It used to be that old block groups would be left around forever.
+ * Iterating over them would be enough to trim unused space. Since we
+ * now automatically remove them, we also need to iterate over unallocated
+ * space.
+ *
+ * We don't want a transaction for this since the discard may take a
+ * substantial amount of time. We don't require that a transaction be
+ * running, but we do need to take a running transaction into account
+ * to ensure that we're not discarding chunks that were released or
+ * allocated in the current transaction.
+ *
+ * Holding the chunks lock will prevent other threads from allocating
+ * or releasing chunks, but it won't prevent a running transaction
+ * from committing and releasing the memory that the pending chunks
+ * list head uses. For that, we need to take a reference to the
+ * transaction and hold the commit root sem. We only need to hold
+ * it while performing the free space search since we have already
+ * held back allocations.
+ */
+static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
+{
+ u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0;
+ int ret;
+
+ *trimmed = 0;
+
+ /* Discard not supported = nothing to do. */
+ if (!bdev_max_discard_sectors(device->bdev))
+ return 0;
+
+ /* Not writable = nothing to do. */
+ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
+ return 0;
+
+ /* No free space = nothing to do. */
+ if (device->total_bytes <= device->bytes_used)
+ return 0;
+
+ ret = 0;
+
+ while (1) {
+ struct btrfs_fs_info *fs_info = device->fs_info;
+ u64 bytes;
+
+ ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
+ if (ret)
+ break;
+
+ find_first_clear_extent_bit(&device->alloc_state, start,
+ &start, &end,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED);
+
+ /* Check if there are any CHUNK_* bits left */
+ if (start > device->total_bytes) {
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ btrfs_warn_in_rcu(fs_info,
+"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
+ start, end - start + 1,
+ rcu_str_deref(device->name),
+ device->total_bytes);
+ mutex_unlock(&fs_info->chunk_mutex);
+ ret = 0;
+ break;
+ }
+
+ /* Ensure we skip the reserved space on each device. */
+ start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);
+
+ /*
+ * If find_first_clear_extent_bit find a range that spans the
+ * end of the device it will set end to -1, in this case it's up
+ * to the caller to trim the value to the size of the device.
+ */
+ end = min(end, device->total_bytes - 1);
+
+ len = end - start + 1;
+
+ /* We didn't find any extents */
+ if (!len) {
+ mutex_unlock(&fs_info->chunk_mutex);
+ ret = 0;
+ break;
+ }
+
+ ret = btrfs_issue_discard(device->bdev, start, len,
+ &bytes);
+ if (!ret)
+ set_extent_bits(&device->alloc_state, start,
+ start + bytes - 1,
+ CHUNK_TRIMMED);
+ mutex_unlock(&fs_info->chunk_mutex);
+
+ if (ret)
+ break;
+
+ start += len;
+ *trimmed += bytes;
+
+ if (fatal_signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+
+ cond_resched();
+ }
+
+ return ret;
+}
+
+/*
+ * Trim the whole filesystem by:
+ * 1) trimming the free space in each block group
+ * 2) trimming the unallocated space on each device
+ *
+ * This will also continue trimming even if a block group or device encounters
+ * an error. The return value will be the last error, or 0 if nothing bad
+ * happens.
+ */
+int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_block_group *cache = NULL;
+ struct btrfs_device *device;
+ u64 group_trimmed;
+ u64 range_end = U64_MAX;
+ u64 start;
+ u64 end;
+ u64 trimmed = 0;
+ u64 bg_failed = 0;
+ u64 dev_failed = 0;
+ int bg_ret = 0;
+ int dev_ret = 0;
+ int ret = 0;
+
+ if (range->start == U64_MAX)
+ return -EINVAL;
+
+ /*
+ * Check range overflow if range->len is set.
+ * The default range->len is U64_MAX.
+ */
+ if (range->len != U64_MAX &&
+ check_add_overflow(range->start, range->len, &range_end))
+ return -EINVAL;
+
+ cache = btrfs_lookup_first_block_group(fs_info, range->start);
+ for (; cache; cache = btrfs_next_block_group(cache)) {
+ if (cache->start >= range_end) {
+ btrfs_put_block_group(cache);
+ break;
+ }
+
+ start = max(range->start, cache->start);
+ end = min(range_end, cache->start + cache->length);
+
+ if (end - start >= range->minlen) {
+ if (!btrfs_block_group_done(cache)) {
+ ret = btrfs_cache_block_group(cache, true);
+ if (ret) {
+ bg_failed++;
+ bg_ret = ret;
+ continue;
+ }
+ }
+ ret = btrfs_trim_block_group(cache,
+ &group_trimmed,
+ start,
+ end,
+ range->minlen);
+
+ trimmed += group_trimmed;
+ if (ret) {
+ bg_failed++;
+ bg_ret = ret;
+ continue;
+ }
+ }
+ }
+
+ if (bg_failed)
+ btrfs_warn(fs_info,
+ "failed to trim %llu block group(s), last error %d",
+ bg_failed, bg_ret);
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+ continue;
+
+ ret = btrfs_trim_free_extents(device, &group_trimmed);
+ if (ret) {
+ dev_failed++;
+ dev_ret = ret;
+ break;
+ }
+
+ trimmed += group_trimmed;
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ if (dev_failed)
+ btrfs_warn(fs_info,
+ "failed to trim %llu device(s), last error %d",
+ dev_failed, dev_ret);
+ range->len = trimmed;
+ if (bg_ret)
+ return bg_ret;
+ return dev_ret;
+}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
new file mode 100644
index 000000000..539bc9bdc
--- /dev/null
+++ b/fs/btrfs/extent_io.c
@@ -0,0 +1,5897 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/bio.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/page-flags.h>
+#include <linux/sched/mm.h>
+#include <linux/spinlock.h>
+#include <linux/blkdev.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include <linux/prefetch.h>
+#include <linux/fsverity.h>
+#include "misc.h"
+#include "extent_io.h"
+#include "extent-io-tree.h"
+#include "extent_map.h"
+#include "ctree.h"
+#include "btrfs_inode.h"
+#include "volumes.h"
+#include "check-integrity.h"
+#include "locking.h"
+#include "rcu-string.h"
+#include "backref.h"
+#include "disk-io.h"
+#include "subpage.h"
+#include "zoned.h"
+#include "block-group.h"
+#include "compression.h"
+
+static struct kmem_cache *extent_buffer_cache;
+
+#ifdef CONFIG_BTRFS_DEBUG
+static inline void btrfs_leak_debug_add_eb(struct extent_buffer *eb)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ unsigned long flags;
+
+ spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
+ list_add(&eb->leak_list, &fs_info->allocated_ebs);
+ spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
+}
+
+static inline void btrfs_leak_debug_del_eb(struct extent_buffer *eb)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ unsigned long flags;
+
+ spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
+ list_del(&eb->leak_list);
+ spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
+}
+
+void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
+{
+ struct extent_buffer *eb;
+ unsigned long flags;
+
+ /*
+ * If we didn't get into open_ctree our allocated_ebs will not be
+ * initialized, so just skip this.
+ */
+ if (!fs_info->allocated_ebs.next)
+ return;
+
+ WARN_ON(!list_empty(&fs_info->allocated_ebs));
+ spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
+ while (!list_empty(&fs_info->allocated_ebs)) {
+ eb = list_first_entry(&fs_info->allocated_ebs,
+ struct extent_buffer, leak_list);
+ pr_err(
+ "BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n",
+ eb->start, eb->len, atomic_read(&eb->refs), eb->bflags,
+ btrfs_header_owner(eb));
+ list_del(&eb->leak_list);
+ kmem_cache_free(extent_buffer_cache, eb);
+ }
+ spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
+}
+#else
+#define btrfs_leak_debug_add_eb(eb) do {} while (0)
+#define btrfs_leak_debug_del_eb(eb) do {} while (0)
+#endif
+
+/*
+ * Structure to record info about the bio being assembled, and other info like
+ * how many bytes are there before stripe/ordered extent boundary.
+ */
+struct btrfs_bio_ctrl {
+ struct bio *bio;
+ int mirror_num;
+ enum btrfs_compression_type compress_type;
+ u32 len_to_stripe_boundary;
+ u32 len_to_oe_boundary;
+ btrfs_bio_end_io_t end_io_func;
+};
+
+struct extent_page_data {
+ struct btrfs_bio_ctrl bio_ctrl;
+ /* tells writepage not to lock the state bits for this range
+ * it still does the unlocking
+ */
+ unsigned int extent_locked:1;
+
+ /* tells the submit_bio code to use REQ_SYNC */
+ unsigned int sync_io:1;
+};
+
+static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
+{
+ struct bio *bio;
+ struct bio_vec *bv;
+ struct inode *inode;
+ int mirror_num;
+
+ if (!bio_ctrl->bio)
+ return;
+
+ bio = bio_ctrl->bio;
+ bv = bio_first_bvec_all(bio);
+ inode = bv->bv_page->mapping->host;
+ mirror_num = bio_ctrl->mirror_num;
+
+ /* Caller should ensure the bio has at least some range added */
+ ASSERT(bio->bi_iter.bi_size);
+
+ btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset;
+
+ if (!is_data_inode(inode))
+ btrfs_submit_metadata_bio(inode, bio, mirror_num);
+ else if (btrfs_op(bio) == BTRFS_MAP_WRITE)
+ btrfs_submit_data_write_bio(inode, bio, mirror_num);
+ else
+ btrfs_submit_data_read_bio(inode, bio, mirror_num,
+ bio_ctrl->compress_type);
+
+ /* The bio is owned by the end_io handler now */
+ bio_ctrl->bio = NULL;
+}
+
+/*
+ * Submit or fail the current bio in an extent_page_data structure.
+ */
+static void submit_write_bio(struct extent_page_data *epd, int ret)
+{
+ struct bio *bio = epd->bio_ctrl.bio;
+
+ if (!bio)
+ return;
+
+ if (ret) {
+ ASSERT(ret < 0);
+ btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
+ /* The bio is owned by the end_io handler now */
+ epd->bio_ctrl.bio = NULL;
+ } else {
+ submit_one_bio(&epd->bio_ctrl);
+ }
+}
+
+int __init extent_buffer_init_cachep(void)
+{
+ extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
+ sizeof(struct extent_buffer), 0,
+ SLAB_MEM_SPREAD, NULL);
+ if (!extent_buffer_cache)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void __cold extent_buffer_free_cachep(void)
+{
+ /*
+ * Make sure all delayed rcu free are flushed before we
+ * destroy caches.
+ */
+ rcu_barrier();
+ kmem_cache_destroy(extent_buffer_cache);
+}
+
+void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
+{
+ unsigned long index = start >> PAGE_SHIFT;
+ unsigned long end_index = end >> PAGE_SHIFT;
+ struct page *page;
+
+ while (index <= end_index) {
+ page = find_get_page(inode->i_mapping, index);
+ BUG_ON(!page); /* Pages should be in the extent_io_tree */
+ clear_page_dirty_for_io(page);
+ put_page(page);
+ index++;
+ }
+}
+
+void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
+{
+ struct address_space *mapping = inode->i_mapping;
+ unsigned long index = start >> PAGE_SHIFT;
+ unsigned long end_index = end >> PAGE_SHIFT;
+ struct folio *folio;
+
+ while (index <= end_index) {
+ folio = filemap_get_folio(mapping, index);
+ filemap_dirty_folio(mapping, folio);
+ folio_account_redirty(folio);
+ index += folio_nr_pages(folio);
+ folio_put(folio);
+ }
+}
+
+/*
+ * Process one page for __process_pages_contig().
+ *
+ * Return >0 if we hit @page == @locked_page.
+ * Return 0 if we updated the page status.
+ * Return -EGAIN if the we need to try again.
+ * (For PAGE_LOCK case but got dirty page or page not belong to mapping)
+ */
+static int process_one_page(struct btrfs_fs_info *fs_info,
+ struct address_space *mapping,
+ struct page *page, struct page *locked_page,
+ unsigned long page_ops, u64 start, u64 end)
+{
+ u32 len;
+
+ ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX);
+ len = end + 1 - start;
+
+ if (page_ops & PAGE_SET_ORDERED)
+ btrfs_page_clamp_set_ordered(fs_info, page, start, len);
+ if (page_ops & PAGE_SET_ERROR)
+ btrfs_page_clamp_set_error(fs_info, page, start, len);
+ if (page_ops & PAGE_START_WRITEBACK) {
+ btrfs_page_clamp_clear_dirty(fs_info, page, start, len);
+ btrfs_page_clamp_set_writeback(fs_info, page, start, len);
+ }
+ if (page_ops & PAGE_END_WRITEBACK)
+ btrfs_page_clamp_clear_writeback(fs_info, page, start, len);
+
+ if (page == locked_page)
+ return 1;
+
+ if (page_ops & PAGE_LOCK) {
+ int ret;
+
+ ret = btrfs_page_start_writer_lock(fs_info, page, start, len);
+ if (ret)
+ return ret;
+ if (!PageDirty(page) || page->mapping != mapping) {
+ btrfs_page_end_writer_lock(fs_info, page, start, len);
+ return -EAGAIN;
+ }
+ }
+ if (page_ops & PAGE_UNLOCK)
+ btrfs_page_end_writer_lock(fs_info, page, start, len);
+ return 0;
+}
+
+static int __process_pages_contig(struct address_space *mapping,
+ struct page *locked_page,
+ u64 start, u64 end, unsigned long page_ops,
+ u64 *processed_end)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
+ pgoff_t start_index = start >> PAGE_SHIFT;
+ pgoff_t end_index = end >> PAGE_SHIFT;
+ pgoff_t index = start_index;
+ unsigned long pages_processed = 0;
+ struct folio_batch fbatch;
+ int err = 0;
+ int i;
+
+ if (page_ops & PAGE_LOCK) {
+ ASSERT(page_ops == PAGE_LOCK);
+ ASSERT(processed_end && *processed_end == start);
+ }
+
+ if ((page_ops & PAGE_SET_ERROR) && start_index <= end_index)
+ mapping_set_error(mapping, -EIO);
+
+ folio_batch_init(&fbatch);
+ while (index <= end_index) {
+ int found_folios;
+
+ found_folios = filemap_get_folios_contig(mapping, &index,
+ end_index, &fbatch);
+
+ if (found_folios == 0) {
+ /*
+ * Only if we're going to lock these pages, we can find
+ * nothing at @index.
+ */
+ ASSERT(page_ops & PAGE_LOCK);
+ err = -EAGAIN;
+ goto out;
+ }
+
+ for (i = 0; i < found_folios; i++) {
+ int process_ret;
+ struct folio *folio = fbatch.folios[i];
+ process_ret = process_one_page(fs_info, mapping,
+ &folio->page, locked_page, page_ops,
+ start, end);
+ if (process_ret < 0) {
+ err = -EAGAIN;
+ folio_batch_release(&fbatch);
+ goto out;
+ }
+ pages_processed += folio_nr_pages(folio);
+ }
+ folio_batch_release(&fbatch);
+ cond_resched();
+ }
+out:
+ if (err && processed_end) {
+ /*
+ * Update @processed_end. I know this is awful since it has
+ * two different return value patterns (inclusive vs exclusive).
+ *
+ * But the exclusive pattern is necessary if @start is 0, or we
+ * underflow and check against processed_end won't work as
+ * expected.
+ */
+ if (pages_processed)
+ *processed_end = min(end,
+ ((u64)(start_index + pages_processed) << PAGE_SHIFT) - 1);
+ else
+ *processed_end = start;
+ }
+ return err;
+}
+
+static noinline void __unlock_for_delalloc(struct inode *inode,
+ struct page *locked_page,
+ u64 start, u64 end)
+{
+ unsigned long index = start >> PAGE_SHIFT;
+ unsigned long end_index = end >> PAGE_SHIFT;
+
+ ASSERT(locked_page);
+ if (index == locked_page->index && end_index == index)
+ return;
+
+ __process_pages_contig(inode->i_mapping, locked_page, start, end,
+ PAGE_UNLOCK, NULL);
+}
+
+static noinline int lock_delalloc_pages(struct inode *inode,
+ struct page *locked_page,
+ u64 delalloc_start,
+ u64 delalloc_end)
+{
+ unsigned long index = delalloc_start >> PAGE_SHIFT;
+ unsigned long end_index = delalloc_end >> PAGE_SHIFT;
+ u64 processed_end = delalloc_start;
+ int ret;
+
+ ASSERT(locked_page);
+ if (index == locked_page->index && index == end_index)
+ return 0;
+
+ ret = __process_pages_contig(inode->i_mapping, locked_page, delalloc_start,
+ delalloc_end, PAGE_LOCK, &processed_end);
+ if (ret == -EAGAIN && processed_end > delalloc_start)
+ __unlock_for_delalloc(inode, locked_page, delalloc_start,
+ processed_end);
+ return ret;
+}
+
+/*
+ * Find and lock a contiguous range of bytes in the file marked as delalloc, no
+ * more than @max_bytes.
+ *
+ * @start: The original start bytenr to search.
+ * Will store the extent range start bytenr.
+ * @end: The original end bytenr of the search range
+ * Will store the extent range end bytenr.
+ *
+ * Return true if we find a delalloc range which starts inside the original
+ * range, and @start/@end will store the delalloc range start/end.
+ *
+ * Return false if we can't find any delalloc range which starts inside the
+ * original range, and @start/@end will be the non-delalloc range start/end.
+ */
+EXPORT_FOR_TESTS
+noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
+ struct page *locked_page, u64 *start,
+ u64 *end)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+ const u64 orig_start = *start;
+ const u64 orig_end = *end;
+ /* The sanity tests may not set a valid fs_info. */
+ u64 max_bytes = fs_info ? fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE;
+ u64 delalloc_start;
+ u64 delalloc_end;
+ bool found;
+ struct extent_state *cached_state = NULL;
+ int ret;
+ int loops = 0;
+
+ /* Caller should pass a valid @end to indicate the search range end */
+ ASSERT(orig_end > orig_start);
+
+ /* The range should at least cover part of the page */
+ ASSERT(!(orig_start >= page_offset(locked_page) + PAGE_SIZE ||
+ orig_end <= page_offset(locked_page)));
+again:
+ /* step one, find a bunch of delalloc bytes starting at start */
+ delalloc_start = *start;
+ delalloc_end = 0;
+ found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
+ max_bytes, &cached_state);
+ if (!found || delalloc_end <= *start || delalloc_start > orig_end) {
+ *start = delalloc_start;
+
+ /* @delalloc_end can be -1, never go beyond @orig_end */
+ *end = min(delalloc_end, orig_end);
+ free_extent_state(cached_state);
+ return false;
+ }
+
+ /*
+ * start comes from the offset of locked_page. We have to lock
+ * pages in order, so we can't process delalloc bytes before
+ * locked_page
+ */
+ if (delalloc_start < *start)
+ delalloc_start = *start;
+
+ /*
+ * make sure to limit the number of pages we try to lock down
+ */
+ if (delalloc_end + 1 - delalloc_start > max_bytes)
+ delalloc_end = delalloc_start + max_bytes - 1;
+
+ /* step two, lock all the pages after the page that has start */
+ ret = lock_delalloc_pages(inode, locked_page,
+ delalloc_start, delalloc_end);
+ ASSERT(!ret || ret == -EAGAIN);
+ if (ret == -EAGAIN) {
+ /* some of the pages are gone, lets avoid looping by
+ * shortening the size of the delalloc range we're searching
+ */
+ free_extent_state(cached_state);
+ cached_state = NULL;
+ if (!loops) {
+ max_bytes = PAGE_SIZE;
+ loops = 1;
+ goto again;
+ } else {
+ found = false;
+ goto out_failed;
+ }
+ }
+
+ /* step three, lock the state bits for the whole range */
+ lock_extent(tree, delalloc_start, delalloc_end, &cached_state);
+
+ /* then test to make sure it is all still delalloc */
+ ret = test_range_bit(tree, delalloc_start, delalloc_end,
+ EXTENT_DELALLOC, 1, cached_state);
+ if (!ret) {
+ unlock_extent(tree, delalloc_start, delalloc_end,
+ &cached_state);
+ __unlock_for_delalloc(inode, locked_page,
+ delalloc_start, delalloc_end);
+ cond_resched();
+ goto again;
+ }
+ free_extent_state(cached_state);
+ *start = delalloc_start;
+ *end = delalloc_end;
+out_failed:
+ return found;
+}
+
+void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
+ struct page *locked_page,
+ u32 clear_bits, unsigned long page_ops)
+{
+ clear_extent_bit(&inode->io_tree, start, end, clear_bits, NULL);
+
+ __process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
+ start, end, page_ops, NULL);
+}
+
+static int insert_failrec(struct btrfs_inode *inode,
+ struct io_failure_record *failrec)
+{
+ struct rb_node *exist;
+
+ spin_lock(&inode->io_failure_lock);
+ exist = rb_simple_insert(&inode->io_failure_tree, failrec->bytenr,
+ &failrec->rb_node);
+ spin_unlock(&inode->io_failure_lock);
+
+ return (exist == NULL) ? 0 : -EEXIST;
+}
+
+static struct io_failure_record *get_failrec(struct btrfs_inode *inode, u64 start)
+{
+ struct rb_node *node;
+ struct io_failure_record *failrec = ERR_PTR(-ENOENT);
+
+ spin_lock(&inode->io_failure_lock);
+ node = rb_simple_search(&inode->io_failure_tree, start);
+ if (node)
+ failrec = rb_entry(node, struct io_failure_record, rb_node);
+ spin_unlock(&inode->io_failure_lock);
+ return failrec;
+}
+
+static void free_io_failure(struct btrfs_inode *inode,
+ struct io_failure_record *rec)
+{
+ spin_lock(&inode->io_failure_lock);
+ rb_erase(&rec->rb_node, &inode->io_failure_tree);
+ spin_unlock(&inode->io_failure_lock);
+
+ kfree(rec);
+}
+
+/*
+ * this bypasses the standard btrfs submit functions deliberately, as
+ * the standard behavior is to write all copies in a raid setup. here we only
+ * want to write the one bad copy. so we do the mapping for ourselves and issue
+ * submit_bio directly.
+ * to avoid any synchronization issues, wait for the data after writing, which
+ * actually prevents the read that triggered the error from finishing.
+ * currently, there can be no more than two copies of every data bit. thus,
+ * exactly one rewrite is required.
+ */
+static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
+ u64 length, u64 logical, struct page *page,
+ unsigned int pg_offset, int mirror_num)
+{
+ struct btrfs_device *dev;
+ struct bio_vec bvec;
+ struct bio bio;
+ u64 map_length = 0;
+ u64 sector;
+ struct btrfs_io_context *bioc = NULL;
+ int ret = 0;
+
+ ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
+ BUG_ON(!mirror_num);
+
+ if (btrfs_repair_one_zone(fs_info, logical))
+ return 0;
+
+ map_length = length;
+
+ /*
+ * Avoid races with device replace and make sure our bioc has devices
+ * associated to its stripes that don't go away while we are doing the
+ * read repair operation.
+ */
+ btrfs_bio_counter_inc_blocked(fs_info);
+ if (btrfs_is_parity_mirror(fs_info, logical, length)) {
+ /*
+ * Note that we don't use BTRFS_MAP_WRITE because it's supposed
+ * to update all raid stripes, but here we just want to correct
+ * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
+ * stripe's dev and sector.
+ */
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
+ &map_length, &bioc, 0);
+ if (ret)
+ goto out_counter_dec;
+ ASSERT(bioc->mirror_num == 1);
+ } else {
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
+ &map_length, &bioc, mirror_num);
+ if (ret)
+ goto out_counter_dec;
+ /*
+ * This happens when dev-replace is also running, and the
+ * mirror_num indicates the dev-replace target.
+ *
+ * In this case, we don't need to do anything, as the read
+ * error just means the replace progress hasn't reached our
+ * read range, and later replace routine would handle it well.
+ */
+ if (mirror_num != bioc->mirror_num)
+ goto out_counter_dec;
+ }
+
+ sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
+ dev = bioc->stripes[bioc->mirror_num - 1].dev;
+ btrfs_put_bioc(bioc);
+
+ if (!dev || !dev->bdev ||
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
+ ret = -EIO;
+ goto out_counter_dec;
+ }
+
+ bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
+ bio.bi_iter.bi_sector = sector;
+ __bio_add_page(&bio, page, length, pg_offset);
+
+ btrfsic_check_bio(&bio);
+ ret = submit_bio_wait(&bio);
+ if (ret) {
+ /* try to remap that extent elsewhere? */
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
+ goto out_bio_uninit;
+ }
+
+ btrfs_info_rl_in_rcu(fs_info,
+ "read error corrected: ino %llu off %llu (dev %s sector %llu)",
+ ino, start,
+ rcu_str_deref(dev->name), sector);
+ ret = 0;
+
+out_bio_uninit:
+ bio_uninit(&bio);
+out_counter_dec:
+ btrfs_bio_counter_dec(fs_info);
+ return ret;
+}
+
+int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ u64 start = eb->start;
+ int i, num_pages = num_extent_pages(eb);
+ int ret = 0;
+
+ if (sb_rdonly(fs_info->sb))
+ return -EROFS;
+
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = eb->pages[i];
+
+ ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p,
+ start - page_offset(p), mirror_num);
+ if (ret)
+ break;
+ start += PAGE_SIZE;
+ }
+
+ return ret;
+}
+
+static int next_mirror(const struct io_failure_record *failrec, int cur_mirror)
+{
+ if (cur_mirror == failrec->num_copies)
+ return cur_mirror + 1 - failrec->num_copies;
+ return cur_mirror + 1;
+}
+
+static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror)
+{
+ if (cur_mirror == 1)
+ return failrec->num_copies;
+ return cur_mirror - 1;
+}
+
+/*
+ * each time an IO finishes, we do a fast check in the IO failure tree
+ * to see if we need to process or clean up an io_failure_record
+ */
+int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start,
+ struct page *page, unsigned int pg_offset)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ u64 ino = btrfs_ino(inode);
+ u64 locked_start, locked_end;
+ struct io_failure_record *failrec;
+ int mirror;
+ int ret;
+
+ failrec = get_failrec(inode, start);
+ if (IS_ERR(failrec))
+ return 0;
+
+ BUG_ON(!failrec->this_mirror);
+
+ if (sb_rdonly(fs_info->sb))
+ goto out;
+
+ ret = find_first_extent_bit(io_tree, failrec->bytenr, &locked_start,
+ &locked_end, EXTENT_LOCKED, NULL);
+ if (ret || locked_start > failrec->bytenr ||
+ locked_end < failrec->bytenr + failrec->len - 1)
+ goto out;
+
+ mirror = failrec->this_mirror;
+ do {
+ mirror = prev_mirror(failrec, mirror);
+ repair_io_failure(fs_info, ino, start, failrec->len,
+ failrec->logical, page, pg_offset, mirror);
+ } while (mirror != failrec->failed_mirror);
+
+out:
+ free_io_failure(inode, failrec);
+ return 0;
+}
+
+/*
+ * Can be called when
+ * - hold extent lock
+ * - under ordered extent
+ * - the inode is freeing
+ */
+void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
+{
+ struct io_failure_record *failrec;
+ struct rb_node *node, *next;
+
+ if (RB_EMPTY_ROOT(&inode->io_failure_tree))
+ return;
+
+ spin_lock(&inode->io_failure_lock);
+ node = rb_simple_search_first(&inode->io_failure_tree, start);
+ while (node) {
+ failrec = rb_entry(node, struct io_failure_record, rb_node);
+ if (failrec->bytenr > end)
+ break;
+
+ next = rb_next(node);
+ rb_erase(&failrec->rb_node, &inode->io_failure_tree);
+ kfree(failrec);
+
+ node = next;
+ }
+ spin_unlock(&inode->io_failure_lock);
+}
+
+static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode,
+ struct btrfs_bio *bbio,
+ unsigned int bio_offset)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ u64 start = bbio->file_offset + bio_offset;
+ struct io_failure_record *failrec;
+ const u32 sectorsize = fs_info->sectorsize;
+ int ret;
+
+ failrec = get_failrec(BTRFS_I(inode), start);
+ if (!IS_ERR(failrec)) {
+ btrfs_debug(fs_info,
+ "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu",
+ failrec->logical, failrec->bytenr, failrec->len);
+ /*
+ * when data can be on disk more than twice, add to failrec here
+ * (e.g. with a list for failed_mirror) to make
+ * clean_io_failure() clean all those errors at once.
+ */
+ ASSERT(failrec->this_mirror == bbio->mirror_num);
+ ASSERT(failrec->len == fs_info->sectorsize);
+ return failrec;
+ }
+
+ failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
+ if (!failrec)
+ return ERR_PTR(-ENOMEM);
+
+ RB_CLEAR_NODE(&failrec->rb_node);
+ failrec->bytenr = start;
+ failrec->len = sectorsize;
+ failrec->failed_mirror = bbio->mirror_num;
+ failrec->this_mirror = bbio->mirror_num;
+ failrec->logical = (bbio->iter.bi_sector << SECTOR_SHIFT) + bio_offset;
+
+ btrfs_debug(fs_info,
+ "new io failure record logical %llu start %llu",
+ failrec->logical, start);
+
+ failrec->num_copies = btrfs_num_copies(fs_info, failrec->logical, sectorsize);
+ if (failrec->num_copies == 1) {
+ /*
+ * We only have a single copy of the data, so don't bother with
+ * all the retry and error correction code that follows. No
+ * matter what the error is, it is very likely to persist.
+ */
+ btrfs_debug(fs_info,
+ "cannot repair logical %llu num_copies %d",
+ failrec->logical, failrec->num_copies);
+ kfree(failrec);
+ return ERR_PTR(-EIO);
+ }
+
+ /* Set the bits in the private failure tree */
+ ret = insert_failrec(BTRFS_I(inode), failrec);
+ if (ret) {
+ kfree(failrec);
+ return ERR_PTR(ret);
+ }
+
+ return failrec;
+}
+
+int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
+ u32 bio_offset, struct page *page, unsigned int pgoff,
+ submit_bio_hook_t *submit_bio_hook)
+{
+ u64 start = failed_bbio->file_offset + bio_offset;
+ struct io_failure_record *failrec;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct bio *failed_bio = &failed_bbio->bio;
+ const int icsum = bio_offset >> fs_info->sectorsize_bits;
+ struct bio *repair_bio;
+ struct btrfs_bio *repair_bbio;
+
+ btrfs_debug(fs_info,
+ "repair read error: read error at %llu", start);
+
+ BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
+
+ failrec = btrfs_get_io_failure_record(inode, failed_bbio, bio_offset);
+ if (IS_ERR(failrec))
+ return PTR_ERR(failrec);
+
+ /*
+ * There are two premises:
+ * a) deliver good data to the caller
+ * b) correct the bad sectors on disk
+ *
+ * Since we're only doing repair for one sector, we only need to get
+ * a good copy of the failed sector and if we succeed, we have setup
+ * everything for repair_io_failure to do the rest for us.
+ */
+ failrec->this_mirror = next_mirror(failrec, failrec->this_mirror);
+ if (failrec->this_mirror == failrec->failed_mirror) {
+ btrfs_debug(fs_info,
+ "failed to repair num_copies %d this_mirror %d failed_mirror %d",
+ failrec->num_copies, failrec->this_mirror, failrec->failed_mirror);
+ free_io_failure(BTRFS_I(inode), failrec);
+ return -EIO;
+ }
+
+ repair_bio = btrfs_bio_alloc(1, REQ_OP_READ, failed_bbio->end_io,
+ failed_bbio->private);
+ repair_bbio = btrfs_bio(repair_bio);
+ repair_bbio->file_offset = start;
+ repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
+
+ if (failed_bbio->csum) {
+ const u32 csum_size = fs_info->csum_size;
+
+ repair_bbio->csum = repair_bbio->csum_inline;
+ memcpy(repair_bbio->csum,
+ failed_bbio->csum + csum_size * icsum, csum_size);
+ }
+
+ bio_add_page(repair_bio, page, failrec->len, pgoff);
+ repair_bbio->iter = repair_bio->bi_iter;
+
+ btrfs_debug(btrfs_sb(inode->i_sb),
+ "repair read error: submitting new read to mirror %d",
+ failrec->this_mirror);
+
+ /*
+ * At this point we have a bio, so any errors from submit_bio_hook()
+ * will be handled by the endio on the repair_bio, so we can't return an
+ * error here.
+ */
+ submit_bio_hook(inode, repair_bio, failrec->this_mirror, 0);
+ return BLK_STS_OK;
+}
+
+static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+
+ ASSERT(page_offset(page) <= start &&
+ start + len <= page_offset(page) + PAGE_SIZE);
+
+ if (uptodate) {
+ if (fsverity_active(page->mapping->host) &&
+ !PageError(page) &&
+ !PageUptodate(page) &&
+ start < i_size_read(page->mapping->host) &&
+ !fsverity_verify_page(page)) {
+ btrfs_page_set_error(fs_info, page, start, len);
+ } else {
+ btrfs_page_set_uptodate(fs_info, page, start, len);
+ }
+ } else {
+ btrfs_page_clear_uptodate(fs_info, page, start, len);
+ btrfs_page_set_error(fs_info, page, start, len);
+ }
+
+ if (!btrfs_is_subpage(fs_info, page))
+ unlock_page(page);
+ else
+ btrfs_subpage_end_reader(fs_info, page, start, len);
+}
+
+static void end_sector_io(struct page *page, u64 offset, bool uptodate)
+{
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ const u32 sectorsize = inode->root->fs_info->sectorsize;
+ struct extent_state *cached = NULL;
+
+ end_page_read(page, uptodate, offset, sectorsize);
+ if (uptodate)
+ set_extent_uptodate(&inode->io_tree, offset,
+ offset + sectorsize - 1, &cached, GFP_ATOMIC);
+ unlock_extent_atomic(&inode->io_tree, offset, offset + sectorsize - 1,
+ &cached);
+}
+
+static void submit_data_read_repair(struct inode *inode,
+ struct btrfs_bio *failed_bbio,
+ u32 bio_offset, const struct bio_vec *bvec,
+ unsigned int error_bitmap)
+{
+ const unsigned int pgoff = bvec->bv_offset;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct page *page = bvec->bv_page;
+ const u64 start = page_offset(bvec->bv_page) + bvec->bv_offset;
+ const u64 end = start + bvec->bv_len - 1;
+ const u32 sectorsize = fs_info->sectorsize;
+ const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits;
+ int i;
+
+ BUG_ON(bio_op(&failed_bbio->bio) == REQ_OP_WRITE);
+
+ /* This repair is only for data */
+ ASSERT(is_data_inode(inode));
+
+ /* We're here because we had some read errors or csum mismatch */
+ ASSERT(error_bitmap);
+
+ /*
+ * We only get called on buffered IO, thus page must be mapped and bio
+ * must not be cloned.
+ */
+ ASSERT(page->mapping && !bio_flagged(&failed_bbio->bio, BIO_CLONED));
+
+ /* Iterate through all the sectors in the range */
+ for (i = 0; i < nr_bits; i++) {
+ const unsigned int offset = i * sectorsize;
+ bool uptodate = false;
+ int ret;
+
+ if (!(error_bitmap & (1U << i))) {
+ /*
+ * This sector has no error, just end the page read
+ * and unlock the range.
+ */
+ uptodate = true;
+ goto next;
+ }
+
+ ret = btrfs_repair_one_sector(inode, failed_bbio,
+ bio_offset + offset, page, pgoff + offset,
+ btrfs_submit_data_read_bio);
+ if (!ret) {
+ /*
+ * We have submitted the read repair, the page release
+ * will be handled by the endio function of the
+ * submitted repair bio.
+ * Thus we don't need to do any thing here.
+ */
+ continue;
+ }
+ /*
+ * Continue on failed repair, otherwise the remaining sectors
+ * will not be properly unlocked.
+ */
+next:
+ end_sector_io(page, start + offset, uptodate);
+ }
+}
+
+/* lots and lots of room for performance fixes in the end_bio funcs */
+
+void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
+{
+ struct btrfs_inode *inode;
+ const bool uptodate = (err == 0);
+ int ret = 0;
+
+ ASSERT(page && page->mapping);
+ inode = BTRFS_I(page->mapping->host);
+ btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate);
+
+ if (!uptodate) {
+ const struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ u32 len;
+
+ ASSERT(end + 1 - start <= U32_MAX);
+ len = end + 1 - start;
+
+ btrfs_page_clear_uptodate(fs_info, page, start, len);
+ btrfs_page_set_error(fs_info, page, start, len);
+ ret = err < 0 ? err : -EIO;
+ mapping_set_error(page->mapping, ret);
+ }
+}
+
+/*
+ * after a writepage IO is done, we need to:
+ * clear the uptodate bits on error
+ * clear the writeback bits in the extent tree for this IO
+ * end_page_writeback if the page has no more pending IO
+ *
+ * Scheduling is not allowed, so the extent state tree is expected
+ * to have one and only one object corresponding to this IO.
+ */
+static void end_bio_extent_writepage(struct btrfs_bio *bbio)
+{
+ struct bio *bio = &bbio->bio;
+ int error = blk_status_to_errno(bio->bi_status);
+ struct bio_vec *bvec;
+ u64 start;
+ u64 end;
+ struct bvec_iter_all iter_all;
+ bool first_bvec = true;
+
+ ASSERT(!bio_flagged(bio, BIO_CLONED));
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ struct page *page = bvec->bv_page;
+ struct inode *inode = page->mapping->host;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ const u32 sectorsize = fs_info->sectorsize;
+
+ /* Our read/write should always be sector aligned. */
+ if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
+ btrfs_err(fs_info,
+ "partial page write in btrfs with offset %u and length %u",
+ bvec->bv_offset, bvec->bv_len);
+ else if (!IS_ALIGNED(bvec->bv_len, sectorsize))
+ btrfs_info(fs_info,
+ "incomplete page write with offset %u and length %u",
+ bvec->bv_offset, bvec->bv_len);
+
+ start = page_offset(page) + bvec->bv_offset;
+ end = start + bvec->bv_len - 1;
+
+ if (first_bvec) {
+ btrfs_record_physical_zoned(inode, start, bio);
+ first_bvec = false;
+ }
+
+ end_extent_writepage(page, error, start, end);
+
+ btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len);
+ }
+
+ bio_put(bio);
+}
+
+/*
+ * Record previously processed extent range
+ *
+ * For endio_readpage_release_extent() to handle a full extent range, reducing
+ * the extent io operations.
+ */
+struct processed_extent {
+ struct btrfs_inode *inode;
+ /* Start of the range in @inode */
+ u64 start;
+ /* End of the range in @inode */
+ u64 end;
+ bool uptodate;
+};
+
+/*
+ * Try to release processed extent range
+ *
+ * May not release the extent range right now if the current range is
+ * contiguous to processed extent.
+ *
+ * Will release processed extent when any of @inode, @uptodate, the range is
+ * no longer contiguous to the processed range.
+ *
+ * Passing @inode == NULL will force processed extent to be released.
+ */
+static void endio_readpage_release_extent(struct processed_extent *processed,
+ struct btrfs_inode *inode, u64 start, u64 end,
+ bool uptodate)
+{
+ struct extent_state *cached = NULL;
+ struct extent_io_tree *tree;
+
+ /* The first extent, initialize @processed */
+ if (!processed->inode)
+ goto update;
+
+ /*
+ * Contiguous to processed extent, just uptodate the end.
+ *
+ * Several things to notice:
+ *
+ * - bio can be merged as long as on-disk bytenr is contiguous
+ * This means we can have page belonging to other inodes, thus need to
+ * check if the inode still matches.
+ * - bvec can contain range beyond current page for multi-page bvec
+ * Thus we need to do processed->end + 1 >= start check
+ */
+ if (processed->inode == inode && processed->uptodate == uptodate &&
+ processed->end + 1 >= start && end >= processed->end) {
+ processed->end = end;
+ return;
+ }
+
+ tree = &processed->inode->io_tree;
+ /*
+ * Now we don't have range contiguous to the processed range, release
+ * the processed range now.
+ */
+ unlock_extent_atomic(tree, processed->start, processed->end, &cached);
+
+update:
+ /* Update processed to current range */
+ processed->inode = inode;
+ processed->start = start;
+ processed->end = end;
+ processed->uptodate = uptodate;
+}
+
+static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
+{
+ ASSERT(PageLocked(page));
+ if (!btrfs_is_subpage(fs_info, page))
+ return;
+
+ ASSERT(PagePrivate(page));
+ btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE);
+}
+
+/*
+ * Find extent buffer for a givne bytenr.
+ *
+ * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking
+ * in endio context.
+ */
+static struct extent_buffer *find_extent_buffer_readpage(
+ struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
+{
+ struct extent_buffer *eb;
+
+ /*
+ * For regular sectorsize, we can use page->private to grab extent
+ * buffer
+ */
+ if (fs_info->nodesize >= PAGE_SIZE) {
+ ASSERT(PagePrivate(page) && page->private);
+ return (struct extent_buffer *)page->private;
+ }
+
+ /* For subpage case, we need to lookup buffer radix tree */
+ rcu_read_lock();
+ eb = radix_tree_lookup(&fs_info->buffer_radix,
+ bytenr >> fs_info->sectorsize_bits);
+ rcu_read_unlock();
+ ASSERT(eb);
+ return eb;
+}
+
+/*
+ * after a readpage IO is done, we need to:
+ * clear the uptodate bits on error
+ * set the uptodate bits if things worked
+ * set the page up to date if all extents in the tree are uptodate
+ * clear the lock bit in the extent tree
+ * unlock the page if there are no other extents locked for it
+ *
+ * Scheduling is not allowed, so the extent state tree is expected
+ * to have one and only one object corresponding to this IO.
+ */
+static void end_bio_extent_readpage(struct btrfs_bio *bbio)
+{
+ struct bio *bio = &bbio->bio;
+ struct bio_vec *bvec;
+ struct processed_extent processed = { 0 };
+ /*
+ * The offset to the beginning of a bio, since one bio can never be
+ * larger than UINT_MAX, u32 here is enough.
+ */
+ u32 bio_offset = 0;
+ int mirror;
+ struct bvec_iter_all iter_all;
+
+ ASSERT(!bio_flagged(bio, BIO_CLONED));
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ bool uptodate = !bio->bi_status;
+ struct page *page = bvec->bv_page;
+ struct inode *inode = page->mapping->host;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ const u32 sectorsize = fs_info->sectorsize;
+ unsigned int error_bitmap = (unsigned int)-1;
+ bool repair = false;
+ u64 start;
+ u64 end;
+ u32 len;
+
+ btrfs_debug(fs_info,
+ "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
+ bio->bi_iter.bi_sector, bio->bi_status,
+ bbio->mirror_num);
+
+ /*
+ * We always issue full-sector reads, but if some block in a
+ * page fails to read, blk_update_request() will advance
+ * bv_offset and adjust bv_len to compensate. Print a warning
+ * for unaligned offsets, and an error if they don't add up to
+ * a full sector.
+ */
+ if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
+ btrfs_err(fs_info,
+ "partial page read in btrfs with offset %u and length %u",
+ bvec->bv_offset, bvec->bv_len);
+ else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len,
+ sectorsize))
+ btrfs_info(fs_info,
+ "incomplete page read with offset %u and length %u",
+ bvec->bv_offset, bvec->bv_len);
+
+ start = page_offset(page) + bvec->bv_offset;
+ end = start + bvec->bv_len - 1;
+ len = bvec->bv_len;
+
+ mirror = bbio->mirror_num;
+ if (likely(uptodate)) {
+ if (is_data_inode(inode)) {
+ error_bitmap = btrfs_verify_data_csum(bbio,
+ bio_offset, page, start, end);
+ if (error_bitmap)
+ uptodate = false;
+ } else {
+ if (btrfs_validate_metadata_buffer(bbio,
+ page, start, end, mirror))
+ uptodate = false;
+ }
+ }
+
+ if (likely(uptodate)) {
+ loff_t i_size = i_size_read(inode);
+ pgoff_t end_index = i_size >> PAGE_SHIFT;
+
+ btrfs_clean_io_failure(BTRFS_I(inode), start, page, 0);
+
+ /*
+ * Zero out the remaining part if this range straddles
+ * i_size.
+ *
+ * Here we should only zero the range inside the bvec,
+ * not touch anything else.
+ *
+ * NOTE: i_size is exclusive while end is inclusive.
+ */
+ if (page->index == end_index && i_size <= end) {
+ u32 zero_start = max(offset_in_page(i_size),
+ offset_in_page(start));
+
+ zero_user_segment(page, zero_start,
+ offset_in_page(end) + 1);
+ }
+ } else if (is_data_inode(inode)) {
+ /*
+ * Only try to repair bios that actually made it to a
+ * device. If the bio failed to be submitted mirror
+ * is 0 and we need to fail it without retrying.
+ *
+ * This also includes the high level bios for compressed
+ * extents - these never make it to a device and repair
+ * is already handled on the lower compressed bio.
+ */
+ if (mirror > 0)
+ repair = true;
+ } else {
+ struct extent_buffer *eb;
+
+ eb = find_extent_buffer_readpage(fs_info, page, start);
+ set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
+ eb->read_mirror = mirror;
+ atomic_dec(&eb->io_pages);
+ }
+
+ if (repair) {
+ /*
+ * submit_data_read_repair() will handle all the good
+ * and bad sectors, we just continue to the next bvec.
+ */
+ submit_data_read_repair(inode, bbio, bio_offset, bvec,
+ error_bitmap);
+ } else {
+ /* Update page status and unlock */
+ end_page_read(page, uptodate, start, len);
+ endio_readpage_release_extent(&processed, BTRFS_I(inode),
+ start, end, PageUptodate(page));
+ }
+
+ ASSERT(bio_offset + len > bio_offset);
+ bio_offset += len;
+
+ }
+ /* Release the last extent */
+ endio_readpage_release_extent(&processed, NULL, 0, 0, false);
+ btrfs_bio_free_csum(bbio);
+ bio_put(bio);
+}
+
+/**
+ * Populate every free slot in a provided array with pages.
+ *
+ * @nr_pages: number of pages to allocate
+ * @page_array: the array to fill with pages; any existing non-null entries in
+ * the array will be skipped
+ *
+ * Return: 0 if all pages were able to be allocated;
+ * -ENOMEM otherwise, and the caller is responsible for freeing all
+ * non-null page pointers in the array.
+ */
+int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
+{
+ unsigned int allocated;
+
+ for (allocated = 0; allocated < nr_pages;) {
+ unsigned int last = allocated;
+
+ allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array);
+
+ if (allocated == nr_pages)
+ return 0;
+
+ /*
+ * During this iteration, no page could be allocated, even
+ * though alloc_pages_bulk_array() falls back to alloc_page()
+ * if it could not bulk-allocate. So we must be out of memory.
+ */
+ if (allocated == last)
+ return -ENOMEM;
+
+ memalloc_retry_wait(GFP_NOFS);
+ }
+ return 0;
+}
+
+/**
+ * Attempt to add a page to bio
+ *
+ * @bio_ctrl: record both the bio, and its bio_flags
+ * @page: page to add to the bio
+ * @disk_bytenr: offset of the new bio or to check whether we are adding
+ * a contiguous page to the previous one
+ * @size: portion of page that we want to write
+ * @pg_offset: starting offset in the page
+ * @compress_type: compression type of the current bio to see if we can merge them
+ *
+ * Attempt to add a page to bio considering stripe alignment etc.
+ *
+ * Return >= 0 for the number of bytes added to the bio.
+ * Can return 0 if the current bio is already at stripe/zone boundary.
+ * Return <0 for error.
+ */
+static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
+ struct page *page,
+ u64 disk_bytenr, unsigned int size,
+ unsigned int pg_offset,
+ enum btrfs_compression_type compress_type)
+{
+ struct bio *bio = bio_ctrl->bio;
+ u32 bio_size = bio->bi_iter.bi_size;
+ u32 real_size;
+ const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
+ bool contig = false;
+ int ret;
+
+ ASSERT(bio);
+ /* The limit should be calculated when bio_ctrl->bio is allocated */
+ ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary);
+ if (bio_ctrl->compress_type != compress_type)
+ return 0;
+
+
+ if (bio->bi_iter.bi_size == 0) {
+ /* We can always add a page into an empty bio. */
+ contig = true;
+ } else if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE) {
+ struct bio_vec *bvec = bio_last_bvec_all(bio);
+
+ /*
+ * The contig check requires the following conditions to be met:
+ * 1) The pages are belonging to the same inode
+ * This is implied by the call chain.
+ *
+ * 2) The range has adjacent logical bytenr
+ *
+ * 3) The range has adjacent file offset
+ * This is required for the usage of btrfs_bio->file_offset.
+ */
+ if (bio_end_sector(bio) == sector &&
+ page_offset(bvec->bv_page) + bvec->bv_offset +
+ bvec->bv_len == page_offset(page) + pg_offset)
+ contig = true;
+ } else {
+ /*
+ * For compression, all IO should have its logical bytenr
+ * set to the starting bytenr of the compressed extent.
+ */
+ contig = bio->bi_iter.bi_sector == sector;
+ }
+
+ if (!contig)
+ return 0;
+
+ real_size = min(bio_ctrl->len_to_oe_boundary,
+ bio_ctrl->len_to_stripe_boundary) - bio_size;
+ real_size = min(real_size, size);
+
+ /*
+ * If real_size is 0, never call bio_add_*_page(), as even size is 0,
+ * bio will still execute its endio function on the page!
+ */
+ if (real_size == 0)
+ return 0;
+
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND)
+ ret = bio_add_zone_append_page(bio, page, real_size, pg_offset);
+ else
+ ret = bio_add_page(bio, page, real_size, pg_offset);
+
+ return ret;
+}
+
+static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
+ struct btrfs_inode *inode, u64 file_offset)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_io_geometry geom;
+ struct btrfs_ordered_extent *ordered;
+ struct extent_map *em;
+ u64 logical = (bio_ctrl->bio->bi_iter.bi_sector << SECTOR_SHIFT);
+ int ret;
+
+ /*
+ * Pages for compressed extent are never submitted to disk directly,
+ * thus it has no real boundary, just set them to U32_MAX.
+ *
+ * The split happens for real compressed bio, which happens in
+ * btrfs_submit_compressed_read/write().
+ */
+ if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) {
+ bio_ctrl->len_to_oe_boundary = U32_MAX;
+ bio_ctrl->len_to_stripe_boundary = U32_MAX;
+ return 0;
+ }
+ em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+ ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio_ctrl->bio),
+ logical, &geom);
+ free_extent_map(em);
+ if (ret < 0) {
+ return ret;
+ }
+ if (geom.len > U32_MAX)
+ bio_ctrl->len_to_stripe_boundary = U32_MAX;
+ else
+ bio_ctrl->len_to_stripe_boundary = (u32)geom.len;
+
+ if (bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) {
+ bio_ctrl->len_to_oe_boundary = U32_MAX;
+ return 0;
+ }
+
+ /* Ordered extent not yet created, so we're good */
+ ordered = btrfs_lookup_ordered_extent(inode, file_offset);
+ if (!ordered) {
+ bio_ctrl->len_to_oe_boundary = U32_MAX;
+ return 0;
+ }
+
+ bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX,
+ ordered->disk_bytenr + ordered->disk_num_bytes - logical);
+ btrfs_put_ordered_extent(ordered);
+ return 0;
+}
+
+static int alloc_new_bio(struct btrfs_inode *inode,
+ struct btrfs_bio_ctrl *bio_ctrl,
+ struct writeback_control *wbc,
+ blk_opf_t opf,
+ u64 disk_bytenr, u32 offset, u64 file_offset,
+ enum btrfs_compression_type compress_type)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct bio *bio;
+ int ret;
+
+ ASSERT(bio_ctrl->end_io_func);
+
+ bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, bio_ctrl->end_io_func, NULL);
+ /*
+ * For compressed page range, its disk_bytenr is always @disk_bytenr
+ * passed in, no matter if we have added any range into previous bio.
+ */
+ if (compress_type != BTRFS_COMPRESS_NONE)
+ bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
+ else
+ bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT;
+ bio_ctrl->bio = bio;
+ bio_ctrl->compress_type = compress_type;
+ ret = calc_bio_boundaries(bio_ctrl, inode, file_offset);
+ if (ret < 0)
+ goto error;
+
+ if (wbc) {
+ /*
+ * For Zone append we need the correct block_device that we are
+ * going to write to set in the bio to be able to respect the
+ * hardware limitation. Look it up here:
+ */
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ struct btrfs_device *dev;
+
+ dev = btrfs_zoned_get_device(fs_info, disk_bytenr,
+ fs_info->sectorsize);
+ if (IS_ERR(dev)) {
+ ret = PTR_ERR(dev);
+ goto error;
+ }
+
+ bio_set_dev(bio, dev->bdev);
+ } else {
+ /*
+ * Otherwise pick the last added device to support
+ * cgroup writeback. For multi-device file systems this
+ * means blk-cgroup policies have to always be set on the
+ * last added/replaced device. This is a bit odd but has
+ * been like that for a long time.
+ */
+ bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev);
+ }
+ wbc_init_bio(wbc, bio);
+ } else {
+ ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND);
+ }
+ return 0;
+error:
+ bio_ctrl->bio = NULL;
+ btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
+ return ret;
+}
+
+/*
+ * @opf: bio REQ_OP_* and REQ_* flags as one value
+ * @wbc: optional writeback control for io accounting
+ * @disk_bytenr: logical bytenr where the write will be
+ * @page: page to add to the bio
+ * @size: portion of page that we want to write to
+ * @pg_offset: offset of the new bio or to check whether we are adding
+ * a contiguous page to the previous one
+ * @compress_type: compress type for current bio
+ *
+ * The will either add the page into the existing @bio_ctrl->bio, or allocate a
+ * new one in @bio_ctrl->bio.
+ * The mirror number for this IO should already be initizlied in
+ * @bio_ctrl->mirror_num.
+ */
+static int submit_extent_page(blk_opf_t opf,
+ struct writeback_control *wbc,
+ struct btrfs_bio_ctrl *bio_ctrl,
+ u64 disk_bytenr, struct page *page,
+ size_t size, unsigned long pg_offset,
+ enum btrfs_compression_type compress_type,
+ bool force_bio_submit)
+{
+ int ret = 0;
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ unsigned int cur = pg_offset;
+
+ ASSERT(bio_ctrl);
+
+ ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
+ pg_offset + size <= PAGE_SIZE);
+
+ ASSERT(bio_ctrl->end_io_func);
+
+ if (force_bio_submit)
+ submit_one_bio(bio_ctrl);
+
+ while (cur < pg_offset + size) {
+ u32 offset = cur - pg_offset;
+ int added;
+
+ /* Allocate new bio if needed */
+ if (!bio_ctrl->bio) {
+ ret = alloc_new_bio(inode, bio_ctrl, wbc, opf,
+ disk_bytenr, offset,
+ page_offset(page) + cur,
+ compress_type);
+ if (ret < 0)
+ return ret;
+ }
+ /*
+ * We must go through btrfs_bio_add_page() to ensure each
+ * page range won't cross various boundaries.
+ */
+ if (compress_type != BTRFS_COMPRESS_NONE)
+ added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr,
+ size - offset, pg_offset + offset,
+ compress_type);
+ else
+ added = btrfs_bio_add_page(bio_ctrl, page,
+ disk_bytenr + offset, size - offset,
+ pg_offset + offset, compress_type);
+
+ /* Metadata page range should never be split */
+ if (!is_data_inode(&inode->vfs_inode))
+ ASSERT(added == 0 || added == size - offset);
+
+ /* At least we added some page, update the account */
+ if (wbc && added)
+ wbc_account_cgroup_owner(wbc, page, added);
+
+ /* We have reached boundary, submit right now */
+ if (added < size - offset) {
+ /* The bio should contain some page(s) */
+ ASSERT(bio_ctrl->bio->bi_iter.bi_size);
+ submit_one_bio(bio_ctrl);
+ }
+ cur += added;
+ }
+ return 0;
+}
+
+static int attach_extent_buffer_page(struct extent_buffer *eb,
+ struct page *page,
+ struct btrfs_subpage *prealloc)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ int ret = 0;
+
+ /*
+ * If the page is mapped to btree inode, we should hold the private
+ * lock to prevent race.
+ * For cloned or dummy extent buffers, their pages are not mapped and
+ * will not race with any other ebs.
+ */
+ if (page->mapping)
+ lockdep_assert_held(&page->mapping->private_lock);
+
+ if (fs_info->nodesize >= PAGE_SIZE) {
+ if (!PagePrivate(page))
+ attach_page_private(page, eb);
+ else
+ WARN_ON(page->private != (unsigned long)eb);
+ return 0;
+ }
+
+ /* Already mapped, just free prealloc */
+ if (PagePrivate(page)) {
+ btrfs_free_subpage(prealloc);
+ return 0;
+ }
+
+ if (prealloc)
+ /* Has preallocated memory for subpage */
+ attach_page_private(page, prealloc);
+ else
+ /* Do new allocation to attach subpage */
+ ret = btrfs_attach_subpage(fs_info, page,
+ BTRFS_SUBPAGE_METADATA);
+ return ret;
+}
+
+int set_page_extent_mapped(struct page *page)
+{
+ struct btrfs_fs_info *fs_info;
+
+ ASSERT(page->mapping);
+
+ if (PagePrivate(page))
+ return 0;
+
+ fs_info = btrfs_sb(page->mapping->host->i_sb);
+
+ if (btrfs_is_subpage(fs_info, page))
+ return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA);
+
+ attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
+ return 0;
+}
+
+void clear_page_extent_mapped(struct page *page)
+{
+ struct btrfs_fs_info *fs_info;
+
+ ASSERT(page->mapping);
+
+ if (!PagePrivate(page))
+ return;
+
+ fs_info = btrfs_sb(page->mapping->host->i_sb);
+ if (btrfs_is_subpage(fs_info, page))
+ return btrfs_detach_subpage(fs_info, page);
+
+ detach_page_private(page);
+}
+
+static struct extent_map *
+__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
+ u64 start, u64 len, struct extent_map **em_cached)
+{
+ struct extent_map *em;
+
+ if (em_cached && *em_cached) {
+ em = *em_cached;
+ if (extent_map_in_tree(em) && start >= em->start &&
+ start < extent_map_end(em)) {
+ refcount_inc(&em->refs);
+ return em;
+ }
+
+ free_extent_map(em);
+ *em_cached = NULL;
+ }
+
+ em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len);
+ if (em_cached && !IS_ERR(em)) {
+ BUG_ON(*em_cached);
+ refcount_inc(&em->refs);
+ *em_cached = em;
+ }
+ return em;
+}
+/*
+ * basic readpage implementation. Locked extent state structs are inserted
+ * into the tree that are removed when the IO is done (by the end_io
+ * handlers)
+ * XXX JDM: This needs looking at to ensure proper page locking
+ * return 0 on success, otherwise return error
+ */
+static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
+ struct btrfs_bio_ctrl *bio_ctrl,
+ blk_opf_t read_flags, u64 *prev_em_start)
+{
+ struct inode *inode = page->mapping->host;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ u64 start = page_offset(page);
+ const u64 end = start + PAGE_SIZE - 1;
+ u64 cur = start;
+ u64 extent_offset;
+ u64 last_byte = i_size_read(inode);
+ u64 block_start;
+ struct extent_map *em;
+ int ret = 0;
+ size_t pg_offset = 0;
+ size_t iosize;
+ size_t blocksize = inode->i_sb->s_blocksize;
+ struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ unlock_extent(tree, start, end, NULL);
+ btrfs_page_set_error(fs_info, page, start, PAGE_SIZE);
+ unlock_page(page);
+ goto out;
+ }
+
+ if (page->index == last_byte >> PAGE_SHIFT) {
+ size_t zero_offset = offset_in_page(last_byte);
+
+ if (zero_offset) {
+ iosize = PAGE_SIZE - zero_offset;
+ memzero_page(page, zero_offset, iosize);
+ }
+ }
+ bio_ctrl->end_io_func = end_bio_extent_readpage;
+ begin_page_read(fs_info, page);
+ while (cur <= end) {
+ unsigned long this_bio_flag = 0;
+ bool force_bio_submit = false;
+ u64 disk_bytenr;
+
+ ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
+ if (cur >= last_byte) {
+ struct extent_state *cached = NULL;
+
+ iosize = PAGE_SIZE - pg_offset;
+ memzero_page(page, pg_offset, iosize);
+ set_extent_uptodate(tree, cur, cur + iosize - 1,
+ &cached, GFP_NOFS);
+ unlock_extent(tree, cur, cur + iosize - 1, &cached);
+ end_page_read(page, true, cur, iosize);
+ break;
+ }
+ em = __get_extent_map(inode, page, pg_offset, cur,
+ end - cur + 1, em_cached);
+ if (IS_ERR(em)) {
+ unlock_extent(tree, cur, end, NULL);
+ end_page_read(page, false, cur, end + 1 - cur);
+ ret = PTR_ERR(em);
+ break;
+ }
+ extent_offset = cur - em->start;
+ BUG_ON(extent_map_end(em) <= cur);
+ BUG_ON(end < cur);
+
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+ this_bio_flag = em->compress_type;
+
+ iosize = min(extent_map_end(em) - cur, end - cur + 1);
+ iosize = ALIGN(iosize, blocksize);
+ if (this_bio_flag != BTRFS_COMPRESS_NONE)
+ disk_bytenr = em->block_start;
+ else
+ disk_bytenr = em->block_start + extent_offset;
+ block_start = em->block_start;
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ block_start = EXTENT_MAP_HOLE;
+
+ /*
+ * If we have a file range that points to a compressed extent
+ * and it's followed by a consecutive file range that points
+ * to the same compressed extent (possibly with a different
+ * offset and/or length, so it either points to the whole extent
+ * or only part of it), we must make sure we do not submit a
+ * single bio to populate the pages for the 2 ranges because
+ * this makes the compressed extent read zero out the pages
+ * belonging to the 2nd range. Imagine the following scenario:
+ *
+ * File layout
+ * [0 - 8K] [8K - 24K]
+ * | |
+ * | |
+ * points to extent X, points to extent X,
+ * offset 4K, length of 8K offset 0, length 16K
+ *
+ * [extent X, compressed length = 4K uncompressed length = 16K]
+ *
+ * If the bio to read the compressed extent covers both ranges,
+ * it will decompress extent X into the pages belonging to the
+ * first range and then it will stop, zeroing out the remaining
+ * pages that belong to the other range that points to extent X.
+ * So here we make sure we submit 2 bios, one for the first
+ * range and another one for the third range. Both will target
+ * the same physical extent from disk, but we can't currently
+ * make the compressed bio endio callback populate the pages
+ * for both ranges because each compressed bio is tightly
+ * coupled with a single extent map, and each range can have
+ * an extent map with a different offset value relative to the
+ * uncompressed data of our extent and different lengths. This
+ * is a corner case so we prioritize correctness over
+ * non-optimal behavior (submitting 2 bios for the same extent).
+ */
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
+ prev_em_start && *prev_em_start != (u64)-1 &&
+ *prev_em_start != em->start)
+ force_bio_submit = true;
+
+ if (prev_em_start)
+ *prev_em_start = em->start;
+
+ free_extent_map(em);
+ em = NULL;
+
+ /* we've found a hole, just zero and go on */
+ if (block_start == EXTENT_MAP_HOLE) {
+ struct extent_state *cached = NULL;
+
+ memzero_page(page, pg_offset, iosize);
+
+ set_extent_uptodate(tree, cur, cur + iosize - 1,
+ &cached, GFP_NOFS);
+ unlock_extent(tree, cur, cur + iosize - 1, &cached);
+ end_page_read(page, true, cur, iosize);
+ cur = cur + iosize;
+ pg_offset += iosize;
+ continue;
+ }
+ /* the get_extent function already copied into the page */
+ if (block_start == EXTENT_MAP_INLINE) {
+ unlock_extent(tree, cur, cur + iosize - 1, NULL);
+ end_page_read(page, true, cur, iosize);
+ cur = cur + iosize;
+ pg_offset += iosize;
+ continue;
+ }
+
+ ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
+ bio_ctrl, disk_bytenr, page, iosize,
+ pg_offset, this_bio_flag,
+ force_bio_submit);
+ if (ret) {
+ /*
+ * We have to unlock the remaining range, or the page
+ * will never be unlocked.
+ */
+ unlock_extent(tree, cur, end, NULL);
+ end_page_read(page, false, cur, end + 1 - cur);
+ goto out;
+ }
+ cur = cur + iosize;
+ pg_offset += iosize;
+ }
+out:
+ return ret;
+}
+
+int btrfs_read_folio(struct file *file, struct folio *folio)
+{
+ struct page *page = &folio->page;
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ u64 start = page_offset(page);
+ u64 end = start + PAGE_SIZE - 1;
+ struct btrfs_bio_ctrl bio_ctrl = { 0 };
+ int ret;
+
+ btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
+
+ ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL);
+ /*
+ * If btrfs_do_readpage() failed we will want to submit the assembled
+ * bio to do the cleanup.
+ */
+ submit_one_bio(&bio_ctrl);
+ return ret;
+}
+
+static inline void contiguous_readpages(struct page *pages[], int nr_pages,
+ u64 start, u64 end,
+ struct extent_map **em_cached,
+ struct btrfs_bio_ctrl *bio_ctrl,
+ u64 *prev_em_start)
+{
+ struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
+ int index;
+
+ btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
+
+ for (index = 0; index < nr_pages; index++) {
+ btrfs_do_readpage(pages[index], em_cached, bio_ctrl,
+ REQ_RAHEAD, prev_em_start);
+ put_page(pages[index]);
+ }
+}
+
+/*
+ * helper for __extent_writepage, doing all of the delayed allocation setup.
+ *
+ * This returns 1 if btrfs_run_delalloc_range function did all the work required
+ * to write the page (copy into inline extent). In this case the IO has
+ * been started and the page is already unlocked.
+ *
+ * This returns 0 if all went well (page still locked)
+ * This returns < 0 if there were errors (page still locked)
+ */
+static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
+ struct page *page, struct writeback_control *wbc)
+{
+ const u64 page_end = page_offset(page) + PAGE_SIZE - 1;
+ u64 delalloc_start = page_offset(page);
+ u64 delalloc_to_write = 0;
+ /* How many pages are started by btrfs_run_delalloc_range() */
+ unsigned long nr_written = 0;
+ int ret;
+ int page_started = 0;
+
+ while (delalloc_start < page_end) {
+ u64 delalloc_end = page_end;
+ bool found;
+
+ found = find_lock_delalloc_range(&inode->vfs_inode, page,
+ &delalloc_start,
+ &delalloc_end);
+ if (!found) {
+ delalloc_start = delalloc_end + 1;
+ continue;
+ }
+ ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
+ delalloc_end, &page_started, &nr_written, wbc);
+ if (ret) {
+ btrfs_page_set_error(inode->root->fs_info, page,
+ page_offset(page), PAGE_SIZE);
+ return ret;
+ }
+ /*
+ * delalloc_end is already one less than the total length, so
+ * we don't subtract one from PAGE_SIZE
+ */
+ delalloc_to_write += (delalloc_end - delalloc_start +
+ PAGE_SIZE) >> PAGE_SHIFT;
+ delalloc_start = delalloc_end + 1;
+ }
+ if (wbc->nr_to_write < delalloc_to_write) {
+ int thresh = 8192;
+
+ if (delalloc_to_write < thresh * 2)
+ thresh = delalloc_to_write;
+ wbc->nr_to_write = min_t(u64, delalloc_to_write,
+ thresh);
+ }
+
+ /* Did btrfs_run_dealloc_range() already unlock and start the IO? */
+ if (page_started) {
+ /*
+ * We've unlocked the page, so we can't update the mapping's
+ * writeback index, just update nr_to_write.
+ */
+ wbc->nr_to_write -= nr_written;
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Find the first byte we need to write.
+ *
+ * For subpage, one page can contain several sectors, and
+ * __extent_writepage_io() will just grab all extent maps in the page
+ * range and try to submit all non-inline/non-compressed extents.
+ *
+ * This is a big problem for subpage, we shouldn't re-submit already written
+ * data at all.
+ * This function will lookup subpage dirty bit to find which range we really
+ * need to submit.
+ *
+ * Return the next dirty range in [@start, @end).
+ * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE.
+ */
+static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
+ struct page *page, u64 *start, u64 *end)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct btrfs_subpage_info *spi = fs_info->subpage_info;
+ u64 orig_start = *start;
+ /* Declare as unsigned long so we can use bitmap ops */
+ unsigned long flags;
+ int range_start_bit;
+ int range_end_bit;
+
+ /*
+ * For regular sector size == page size case, since one page only
+ * contains one sector, we return the page offset directly.
+ */
+ if (!btrfs_is_subpage(fs_info, page)) {
+ *start = page_offset(page);
+ *end = page_offset(page) + PAGE_SIZE;
+ return;
+ }
+
+ range_start_bit = spi->dirty_offset +
+ (offset_in_page(orig_start) >> fs_info->sectorsize_bits);
+
+ /* We should have the page locked, but just in case */
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit,
+ spi->dirty_offset + spi->bitmap_nr_bits);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+
+ range_start_bit -= spi->dirty_offset;
+ range_end_bit -= spi->dirty_offset;
+
+ *start = page_offset(page) + range_start_bit * fs_info->sectorsize;
+ *end = page_offset(page) + range_end_bit * fs_info->sectorsize;
+}
+
+/*
+ * helper for __extent_writepage. This calls the writepage start hooks,
+ * and does the loop to map the page into extents and bios.
+ *
+ * We return 1 if the IO is started and the page is unlocked,
+ * 0 if all went well (page still locked)
+ * < 0 if there were errors (page still locked)
+ */
+static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
+ struct page *page,
+ struct writeback_control *wbc,
+ struct extent_page_data *epd,
+ loff_t i_size,
+ int *nr_ret)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ u64 cur = page_offset(page);
+ u64 end = cur + PAGE_SIZE - 1;
+ u64 extent_offset;
+ u64 block_start;
+ struct extent_map *em;
+ int saved_ret = 0;
+ int ret = 0;
+ int nr = 0;
+ enum req_op op = REQ_OP_WRITE;
+ const blk_opf_t write_flags = wbc_to_write_flags(wbc);
+ bool has_error = false;
+ bool compressed;
+
+ ret = btrfs_writepage_cow_fixup(page);
+ if (ret) {
+ /* Fixup worker will requeue */
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 1;
+ }
+
+ /*
+ * we don't want to touch the inode after unlocking the page,
+ * so we update the mapping writeback index now
+ */
+ wbc->nr_to_write--;
+
+ epd->bio_ctrl.end_io_func = end_bio_extent_writepage;
+ while (cur <= end) {
+ u64 disk_bytenr;
+ u64 em_end;
+ u64 dirty_range_start = cur;
+ u64 dirty_range_end;
+ u32 iosize;
+
+ if (cur >= i_size) {
+ btrfs_writepage_endio_finish_ordered(inode, page, cur,
+ end, true);
+ /*
+ * This range is beyond i_size, thus we don't need to
+ * bother writing back.
+ * But we still need to clear the dirty subpage bit, or
+ * the next time the page gets dirtied, we will try to
+ * writeback the sectors with subpage dirty bits,
+ * causing writeback without ordered extent.
+ */
+ btrfs_page_clear_dirty(fs_info, page, cur, end + 1 - cur);
+ break;
+ }
+
+ find_next_dirty_byte(fs_info, page, &dirty_range_start,
+ &dirty_range_end);
+ if (cur < dirty_range_start) {
+ cur = dirty_range_start;
+ continue;
+ }
+
+ em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
+ if (IS_ERR(em)) {
+ btrfs_page_set_error(fs_info, page, cur, end - cur + 1);
+ ret = PTR_ERR_OR_ZERO(em);
+ has_error = true;
+ if (!saved_ret)
+ saved_ret = ret;
+ break;
+ }
+
+ extent_offset = cur - em->start;
+ em_end = extent_map_end(em);
+ ASSERT(cur <= em_end);
+ ASSERT(cur < end);
+ ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize));
+ ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize));
+ block_start = em->block_start;
+ compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ disk_bytenr = em->block_start + extent_offset;
+
+ /*
+ * Note that em_end from extent_map_end() and dirty_range_end from
+ * find_next_dirty_byte() are all exclusive
+ */
+ iosize = min(min(em_end, end + 1), dirty_range_end) - cur;
+
+ if (btrfs_use_zone_append(inode, em->block_start))
+ op = REQ_OP_ZONE_APPEND;
+
+ free_extent_map(em);
+ em = NULL;
+
+ /*
+ * compressed and inline extents are written through other
+ * paths in the FS
+ */
+ if (compressed || block_start == EXTENT_MAP_HOLE ||
+ block_start == EXTENT_MAP_INLINE) {
+ if (compressed)
+ nr++;
+ else
+ btrfs_writepage_endio_finish_ordered(inode,
+ page, cur, cur + iosize - 1, true);
+ btrfs_page_clear_dirty(fs_info, page, cur, iosize);
+ cur += iosize;
+ continue;
+ }
+
+ btrfs_set_range_writeback(inode, cur, cur + iosize - 1);
+ if (!PageWriteback(page)) {
+ btrfs_err(inode->root->fs_info,
+ "page %lu not writeback, cur %llu end %llu",
+ page->index, cur, end);
+ }
+
+ /*
+ * Although the PageDirty bit is cleared before entering this
+ * function, subpage dirty bit is not cleared.
+ * So clear subpage dirty bit here so next time we won't submit
+ * page for range already written to disk.
+ */
+ btrfs_page_clear_dirty(fs_info, page, cur, iosize);
+
+ ret = submit_extent_page(op | write_flags, wbc,
+ &epd->bio_ctrl, disk_bytenr,
+ page, iosize,
+ cur - page_offset(page),
+ 0, false);
+ if (ret) {
+ has_error = true;
+ if (!saved_ret)
+ saved_ret = ret;
+
+ btrfs_page_set_error(fs_info, page, cur, iosize);
+ if (PageWriteback(page))
+ btrfs_page_clear_writeback(fs_info, page, cur,
+ iosize);
+ }
+
+ cur += iosize;
+ nr++;
+ }
+ /*
+ * If we finish without problem, we should not only clear page dirty,
+ * but also empty subpage dirty bits
+ */
+ if (!has_error)
+ btrfs_page_assert_not_dirty(fs_info, page);
+ else
+ ret = saved_ret;
+ *nr_ret = nr;
+ return ret;
+}
+
+/*
+ * the writepage semantics are similar to regular writepage. extent
+ * records are inserted to lock ranges in the tree, and as dirty areas
+ * are found, they are marked writeback. Then the lock bits are removed
+ * and the end_io handler clears the writeback ranges
+ *
+ * Return 0 if everything goes well.
+ * Return <0 for error.
+ */
+static int __extent_writepage(struct page *page, struct writeback_control *wbc,
+ struct extent_page_data *epd)
+{
+ struct folio *folio = page_folio(page);
+ struct inode *inode = page->mapping->host;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ const u64 page_start = page_offset(page);
+ const u64 page_end = page_start + PAGE_SIZE - 1;
+ int ret;
+ int nr = 0;
+ size_t pg_offset;
+ loff_t i_size = i_size_read(inode);
+ unsigned long end_index = i_size >> PAGE_SHIFT;
+
+ trace___extent_writepage(page, inode, wbc);
+
+ WARN_ON(!PageLocked(page));
+
+ btrfs_page_clear_error(btrfs_sb(inode->i_sb), page,
+ page_offset(page), PAGE_SIZE);
+
+ pg_offset = offset_in_page(i_size);
+ if (page->index > end_index ||
+ (page->index == end_index && !pg_offset)) {
+ folio_invalidate(folio, 0, folio_size(folio));
+ folio_unlock(folio);
+ return 0;
+ }
+
+ if (page->index == end_index)
+ memzero_page(page, pg_offset, PAGE_SIZE - pg_offset);
+
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ SetPageError(page);
+ goto done;
+ }
+
+ if (!epd->extent_locked) {
+ ret = writepage_delalloc(BTRFS_I(inode), page, wbc);
+ if (ret == 1)
+ return 0;
+ if (ret)
+ goto done;
+ }
+
+ ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size,
+ &nr);
+ if (ret == 1)
+ return 0;
+
+done:
+ if (nr == 0) {
+ /* make sure the mapping tag for page dirty gets cleared */
+ set_page_writeback(page);
+ end_page_writeback(page);
+ }
+ /*
+ * Here we used to have a check for PageError() and then set @ret and
+ * call end_extent_writepage().
+ *
+ * But in fact setting @ret here will cause different error paths
+ * between subpage and regular sectorsize.
+ *
+ * For regular page size, we never submit current page, but only add
+ * current page to current bio.
+ * The bio submission can only happen in next page.
+ * Thus if we hit the PageError() branch, @ret is already set to
+ * non-zero value and will not get updated for regular sectorsize.
+ *
+ * But for subpage case, it's possible we submit part of current page,
+ * thus can get PageError() set by submitted bio of the same page,
+ * while our @ret is still 0.
+ *
+ * So here we unify the behavior and don't set @ret.
+ * Error can still be properly passed to higher layer as page will
+ * be set error, here we just don't handle the IO failure.
+ *
+ * NOTE: This is just a hotfix for subpage.
+ * The root fix will be properly ending ordered extent when we hit
+ * an error during writeback.
+ *
+ * But that needs a bigger refactoring, as we not only need to grab the
+ * submitted OE, but also need to know exactly at which bytenr we hit
+ * the error.
+ * Currently the full page based __extent_writepage_io() is not
+ * capable of that.
+ */
+ if (PageError(page))
+ end_extent_writepage(page, ret, page_start, page_end);
+ if (epd->extent_locked) {
+ /*
+ * If epd->extent_locked, it's from extent_write_locked_range(),
+ * the page can either be locked by lock_page() or
+ * process_one_page().
+ * Let btrfs_page_unlock_writer() handle both cases.
+ */
+ ASSERT(wbc);
+ btrfs_page_unlock_writer(fs_info, page, wbc->range_start,
+ wbc->range_end + 1 - wbc->range_start);
+ } else {
+ unlock_page(page);
+ }
+ ASSERT(ret <= 0);
+ return ret;
+}
+
+void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
+{
+ wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
+ TASK_UNINTERRUPTIBLE);
+}
+
+static void end_extent_buffer_writeback(struct extent_buffer *eb)
+{
+ clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
+ smp_mb__after_atomic();
+ wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
+}
+
+/*
+ * Lock extent buffer status and pages for writeback.
+ *
+ * May try to flush write bio if we can't get the lock.
+ *
+ * Return 0 if the extent buffer doesn't need to be submitted.
+ * (E.g. the extent buffer is not dirty)
+ * Return >0 is the extent buffer is submitted to bio.
+ * Return <0 if something went wrong, no page is locked.
+ */
+static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb,
+ struct extent_page_data *epd)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ int i, num_pages;
+ int flush = 0;
+ int ret = 0;
+
+ if (!btrfs_try_tree_write_lock(eb)) {
+ submit_write_bio(epd, 0);
+ flush = 1;
+ btrfs_tree_lock(eb);
+ }
+
+ if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
+ btrfs_tree_unlock(eb);
+ if (!epd->sync_io)
+ return 0;
+ if (!flush) {
+ submit_write_bio(epd, 0);
+ flush = 1;
+ }
+ while (1) {
+ wait_on_extent_buffer_writeback(eb);
+ btrfs_tree_lock(eb);
+ if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
+ break;
+ btrfs_tree_unlock(eb);
+ }
+ }
+
+ /*
+ * We need to do this to prevent races in people who check if the eb is
+ * under IO since we can end up having no IO bits set for a short period
+ * of time.
+ */
+ spin_lock(&eb->refs_lock);
+ if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+ set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
+ spin_unlock(&eb->refs_lock);
+ btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+ percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
+ -eb->len,
+ fs_info->dirty_metadata_batch);
+ ret = 1;
+ } else {
+ spin_unlock(&eb->refs_lock);
+ }
+
+ btrfs_tree_unlock(eb);
+
+ /*
+ * Either we don't need to submit any tree block, or we're submitting
+ * subpage eb.
+ * Subpage metadata doesn't use page locking at all, so we can skip
+ * the page locking.
+ */
+ if (!ret || fs_info->nodesize < PAGE_SIZE)
+ return ret;
+
+ num_pages = num_extent_pages(eb);
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = eb->pages[i];
+
+ if (!trylock_page(p)) {
+ if (!flush) {
+ submit_write_bio(epd, 0);
+ flush = 1;
+ }
+ lock_page(p);
+ }
+ }
+
+ return ret;
+}
+
+static void set_btree_ioerr(struct page *page, struct extent_buffer *eb)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+
+ btrfs_page_set_error(fs_info, page, eb->start, eb->len);
+ if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
+ return;
+
+ /*
+ * A read may stumble upon this buffer later, make sure that it gets an
+ * error and knows there was an error.
+ */
+ clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+
+ /*
+ * We need to set the mapping with the io error as well because a write
+ * error will flip the file system readonly, and then syncfs() will
+ * return a 0 because we are readonly if we don't modify the err seq for
+ * the superblock.
+ */
+ mapping_set_error(page->mapping, -EIO);
+
+ /*
+ * If we error out, we should add back the dirty_metadata_bytes
+ * to make it consistent.
+ */
+ percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
+ eb->len, fs_info->dirty_metadata_batch);
+
+ /*
+ * If writeback for a btree extent that doesn't belong to a log tree
+ * failed, increment the counter transaction->eb_write_errors.
+ * We do this because while the transaction is running and before it's
+ * committing (when we call filemap_fdata[write|wait]_range against
+ * the btree inode), we might have
+ * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
+ * returns an error or an error happens during writeback, when we're
+ * committing the transaction we wouldn't know about it, since the pages
+ * can be no longer dirty nor marked anymore for writeback (if a
+ * subsequent modification to the extent buffer didn't happen before the
+ * transaction commit), which makes filemap_fdata[write|wait]_range not
+ * able to find the pages tagged with SetPageError at transaction
+ * commit time. So if this happens we must abort the transaction,
+ * otherwise we commit a super block with btree roots that point to
+ * btree nodes/leafs whose content on disk is invalid - either garbage
+ * or the content of some node/leaf from a past generation that got
+ * cowed or deleted and is no longer valid.
+ *
+ * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
+ * not be enough - we need to distinguish between log tree extents vs
+ * non-log tree extents, and the next filemap_fdatawait_range() call
+ * will catch and clear such errors in the mapping - and that call might
+ * be from a log sync and not from a transaction commit. Also, checking
+ * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
+ * not done and would not be reliable - the eb might have been released
+ * from memory and reading it back again means that flag would not be
+ * set (since it's a runtime flag, not persisted on disk).
+ *
+ * Using the flags below in the btree inode also makes us achieve the
+ * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started
+ * writeback for all dirty pages and before filemap_fdatawait_range()
+ * is called, the writeback for all dirty pages had already finished
+ * with errors - because we were not using AS_EIO/AS_ENOSPC,
+ * filemap_fdatawait_range() would return success, as it could not know
+ * that writeback errors happened (the pages were no longer tagged for
+ * writeback).
+ */
+ switch (eb->log_index) {
+ case -1:
+ set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags);
+ break;
+ case 0:
+ set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
+ break;
+ case 1:
+ set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
+ break;
+ default:
+ BUG(); /* unexpected, logic error */
+ }
+}
+
+/*
+ * The endio specific version which won't touch any unsafe spinlock in endio
+ * context.
+ */
+static struct extent_buffer *find_extent_buffer_nolock(
+ struct btrfs_fs_info *fs_info, u64 start)
+{
+ struct extent_buffer *eb;
+
+ rcu_read_lock();
+ eb = radix_tree_lookup(&fs_info->buffer_radix,
+ start >> fs_info->sectorsize_bits);
+ if (eb && atomic_inc_not_zero(&eb->refs)) {
+ rcu_read_unlock();
+ return eb;
+ }
+ rcu_read_unlock();
+ return NULL;
+}
+
+/*
+ * The endio function for subpage extent buffer write.
+ *
+ * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback()
+ * after all extent buffers in the page has finished their writeback.
+ */
+static void end_bio_subpage_eb_writepage(struct btrfs_bio *bbio)
+{
+ struct bio *bio = &bbio->bio;
+ struct btrfs_fs_info *fs_info;
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+
+ fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb);
+ ASSERT(fs_info->nodesize < PAGE_SIZE);
+
+ ASSERT(!bio_flagged(bio, BIO_CLONED));
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ struct page *page = bvec->bv_page;
+ u64 bvec_start = page_offset(page) + bvec->bv_offset;
+ u64 bvec_end = bvec_start + bvec->bv_len - 1;
+ u64 cur_bytenr = bvec_start;
+
+ ASSERT(IS_ALIGNED(bvec->bv_len, fs_info->nodesize));
+
+ /* Iterate through all extent buffers in the range */
+ while (cur_bytenr <= bvec_end) {
+ struct extent_buffer *eb;
+ int done;
+
+ /*
+ * Here we can't use find_extent_buffer(), as it may
+ * try to lock eb->refs_lock, which is not safe in endio
+ * context.
+ */
+ eb = find_extent_buffer_nolock(fs_info, cur_bytenr);
+ ASSERT(eb);
+
+ cur_bytenr = eb->start + eb->len;
+
+ ASSERT(test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags));
+ done = atomic_dec_and_test(&eb->io_pages);
+ ASSERT(done);
+
+ if (bio->bi_status ||
+ test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
+ ClearPageUptodate(page);
+ set_btree_ioerr(page, eb);
+ }
+
+ btrfs_subpage_clear_writeback(fs_info, page, eb->start,
+ eb->len);
+ end_extent_buffer_writeback(eb);
+ /*
+ * free_extent_buffer() will grab spinlock which is not
+ * safe in endio context. Thus here we manually dec
+ * the ref.
+ */
+ atomic_dec(&eb->refs);
+ }
+ }
+ bio_put(bio);
+}
+
+static void end_bio_extent_buffer_writepage(struct btrfs_bio *bbio)
+{
+ struct bio *bio = &bbio->bio;
+ struct bio_vec *bvec;
+ struct extent_buffer *eb;
+ int done;
+ struct bvec_iter_all iter_all;
+
+ ASSERT(!bio_flagged(bio, BIO_CLONED));
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ struct page *page = bvec->bv_page;
+
+ eb = (struct extent_buffer *)page->private;
+ BUG_ON(!eb);
+ done = atomic_dec_and_test(&eb->io_pages);
+
+ if (bio->bi_status ||
+ test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
+ ClearPageUptodate(page);
+ set_btree_ioerr(page, eb);
+ }
+
+ end_page_writeback(page);
+
+ if (!done)
+ continue;
+
+ end_extent_buffer_writeback(eb);
+ }
+
+ bio_put(bio);
+}
+
+static void prepare_eb_write(struct extent_buffer *eb)
+{
+ u32 nritems;
+ unsigned long start;
+ unsigned long end;
+
+ clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
+ atomic_set(&eb->io_pages, num_extent_pages(eb));
+
+ /* Set btree blocks beyond nritems with 0 to avoid stale content */
+ nritems = btrfs_header_nritems(eb);
+ if (btrfs_header_level(eb) > 0) {
+ end = btrfs_node_key_ptr_offset(nritems);
+ memzero_extent_buffer(eb, end, eb->len - end);
+ } else {
+ /*
+ * Leaf:
+ * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
+ */
+ start = btrfs_item_nr_offset(nritems);
+ end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb);
+ memzero_extent_buffer(eb, start, end - start);
+ }
+}
+
+/*
+ * Unlike the work in write_one_eb(), we rely completely on extent locking.
+ * Page locking is only utilized at minimum to keep the VMM code happy.
+ */
+static int write_one_subpage_eb(struct extent_buffer *eb,
+ struct writeback_control *wbc,
+ struct extent_page_data *epd)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct page *page = eb->pages[0];
+ blk_opf_t write_flags = wbc_to_write_flags(wbc);
+ bool no_dirty_ebs = false;
+ int ret;
+
+ prepare_eb_write(eb);
+
+ /* clear_page_dirty_for_io() in subpage helper needs page locked */
+ lock_page(page);
+ btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len);
+
+ /* Check if this is the last dirty bit to update nr_written */
+ no_dirty_ebs = btrfs_subpage_clear_and_test_dirty(fs_info, page,
+ eb->start, eb->len);
+ if (no_dirty_ebs)
+ clear_page_dirty_for_io(page);
+
+ epd->bio_ctrl.end_io_func = end_bio_subpage_eb_writepage;
+
+ ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
+ &epd->bio_ctrl, eb->start, page, eb->len,
+ eb->start - page_offset(page), 0, false);
+ if (ret) {
+ btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len);
+ set_btree_ioerr(page, eb);
+ unlock_page(page);
+
+ if (atomic_dec_and_test(&eb->io_pages))
+ end_extent_buffer_writeback(eb);
+ return -EIO;
+ }
+ unlock_page(page);
+ /*
+ * Submission finished without problem, if no range of the page is
+ * dirty anymore, we have submitted a page. Update nr_written in wbc.
+ */
+ if (no_dirty_ebs)
+ wbc->nr_to_write--;
+ return ret;
+}
+
+static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
+ struct writeback_control *wbc,
+ struct extent_page_data *epd)
+{
+ u64 disk_bytenr = eb->start;
+ int i, num_pages;
+ blk_opf_t write_flags = wbc_to_write_flags(wbc);
+ int ret = 0;
+
+ prepare_eb_write(eb);
+
+ epd->bio_ctrl.end_io_func = end_bio_extent_buffer_writepage;
+
+ num_pages = num_extent_pages(eb);
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = eb->pages[i];
+
+ clear_page_dirty_for_io(p);
+ set_page_writeback(p);
+ ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
+ &epd->bio_ctrl, disk_bytenr, p,
+ PAGE_SIZE, 0, 0, false);
+ if (ret) {
+ set_btree_ioerr(p, eb);
+ if (PageWriteback(p))
+ end_page_writeback(p);
+ if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
+ end_extent_buffer_writeback(eb);
+ ret = -EIO;
+ break;
+ }
+ disk_bytenr += PAGE_SIZE;
+ wbc->nr_to_write--;
+ unlock_page(p);
+ }
+
+ if (unlikely(ret)) {
+ for (; i < num_pages; i++) {
+ struct page *p = eb->pages[i];
+ clear_page_dirty_for_io(p);
+ unlock_page(p);
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Submit one subpage btree page.
+ *
+ * The main difference to submit_eb_page() is:
+ * - Page locking
+ * For subpage, we don't rely on page locking at all.
+ *
+ * - Flush write bio
+ * We only flush bio if we may be unable to fit current extent buffers into
+ * current bio.
+ *
+ * Return >=0 for the number of submitted extent buffers.
+ * Return <0 for fatal error.
+ */
+static int submit_eb_subpage(struct page *page,
+ struct writeback_control *wbc,
+ struct extent_page_data *epd)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ int submitted = 0;
+ u64 page_start = page_offset(page);
+ int bit_start = 0;
+ int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
+ int ret;
+
+ /* Lock and write each dirty extent buffers in the range */
+ while (bit_start < fs_info->subpage_info->bitmap_nr_bits) {
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct extent_buffer *eb;
+ unsigned long flags;
+ u64 start;
+
+ /*
+ * Take private lock to ensure the subpage won't be detached
+ * in the meantime.
+ */
+ spin_lock(&page->mapping->private_lock);
+ if (!PagePrivate(page)) {
+ spin_unlock(&page->mapping->private_lock);
+ break;
+ }
+ spin_lock_irqsave(&subpage->lock, flags);
+ if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset,
+ subpage->bitmaps)) {
+ spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock(&page->mapping->private_lock);
+ bit_start++;
+ continue;
+ }
+
+ start = page_start + bit_start * fs_info->sectorsize;
+ bit_start += sectors_per_node;
+
+ /*
+ * Here we just want to grab the eb without touching extra
+ * spin locks, so call find_extent_buffer_nolock().
+ */
+ eb = find_extent_buffer_nolock(fs_info, start);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock(&page->mapping->private_lock);
+
+ /*
+ * The eb has already reached 0 refs thus find_extent_buffer()
+ * doesn't return it. We don't need to write back such eb
+ * anyway.
+ */
+ if (!eb)
+ continue;
+
+ ret = lock_extent_buffer_for_io(eb, epd);
+ if (ret == 0) {
+ free_extent_buffer(eb);
+ continue;
+ }
+ if (ret < 0) {
+ free_extent_buffer(eb);
+ goto cleanup;
+ }
+ ret = write_one_subpage_eb(eb, wbc, epd);
+ free_extent_buffer(eb);
+ if (ret < 0)
+ goto cleanup;
+ submitted++;
+ }
+ return submitted;
+
+cleanup:
+ /* We hit error, end bio for the submitted extent buffers */
+ submit_write_bio(epd, ret);
+ return ret;
+}
+
+/*
+ * Submit all page(s) of one extent buffer.
+ *
+ * @page: the page of one extent buffer
+ * @eb_context: to determine if we need to submit this page, if current page
+ * belongs to this eb, we don't need to submit
+ *
+ * The caller should pass each page in their bytenr order, and here we use
+ * @eb_context to determine if we have submitted pages of one extent buffer.
+ *
+ * If we have, we just skip until we hit a new page that doesn't belong to
+ * current @eb_context.
+ *
+ * If not, we submit all the page(s) of the extent buffer.
+ *
+ * Return >0 if we have submitted the extent buffer successfully.
+ * Return 0 if we don't need to submit the page, as it's already submitted by
+ * previous call.
+ * Return <0 for fatal error.
+ */
+static int submit_eb_page(struct page *page, struct writeback_control *wbc,
+ struct extent_page_data *epd,
+ struct extent_buffer **eb_context)
+{
+ struct address_space *mapping = page->mapping;
+ struct btrfs_block_group *cache = NULL;
+ struct extent_buffer *eb;
+ int ret;
+
+ if (!PagePrivate(page))
+ return 0;
+
+ if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
+ return submit_eb_subpage(page, wbc, epd);
+
+ spin_lock(&mapping->private_lock);
+ if (!PagePrivate(page)) {
+ spin_unlock(&mapping->private_lock);
+ return 0;
+ }
+
+ eb = (struct extent_buffer *)page->private;
+
+ /*
+ * Shouldn't happen and normally this would be a BUG_ON but no point
+ * crashing the machine for something we can survive anyway.
+ */
+ if (WARN_ON(!eb)) {
+ spin_unlock(&mapping->private_lock);
+ return 0;
+ }
+
+ if (eb == *eb_context) {
+ spin_unlock(&mapping->private_lock);
+ return 0;
+ }
+ ret = atomic_inc_not_zero(&eb->refs);
+ spin_unlock(&mapping->private_lock);
+ if (!ret)
+ return 0;
+
+ if (!btrfs_check_meta_write_pointer(eb->fs_info, eb, &cache)) {
+ /*
+ * If for_sync, this hole will be filled with
+ * trasnsaction commit.
+ */
+ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
+ ret = -EAGAIN;
+ else
+ ret = 0;
+ free_extent_buffer(eb);
+ return ret;
+ }
+
+ *eb_context = eb;
+
+ ret = lock_extent_buffer_for_io(eb, epd);
+ if (ret <= 0) {
+ btrfs_revert_meta_write_pointer(cache, eb);
+ if (cache)
+ btrfs_put_block_group(cache);
+ free_extent_buffer(eb);
+ return ret;
+ }
+ if (cache) {
+ /*
+ * Implies write in zoned mode. Mark the last eb in a block group.
+ */
+ btrfs_schedule_zone_finish_bg(cache, eb);
+ btrfs_put_block_group(cache);
+ }
+ ret = write_one_eb(eb, wbc, epd);
+ free_extent_buffer(eb);
+ if (ret < 0)
+ return ret;
+ return 1;
+}
+
+int btree_write_cache_pages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct extent_buffer *eb_context = NULL;
+ struct extent_page_data epd = {
+ .bio_ctrl = { 0 },
+ .extent_locked = 0,
+ .sync_io = wbc->sync_mode == WB_SYNC_ALL,
+ };
+ struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
+ int ret = 0;
+ int done = 0;
+ int nr_to_write_done = 0;
+ struct pagevec pvec;
+ int nr_pages;
+ pgoff_t index;
+ pgoff_t end; /* Inclusive */
+ int scanned = 0;
+ xa_mark_t tag;
+
+ pagevec_init(&pvec);
+ if (wbc->range_cyclic) {
+ index = mapping->writeback_index; /* Start from prev offset */
+ end = -1;
+ /*
+ * Start from the beginning does not need to cycle over the
+ * range, mark it as scanned.
+ */
+ scanned = (index == 0);
+ } else {
+ index = wbc->range_start >> PAGE_SHIFT;
+ end = wbc->range_end >> PAGE_SHIFT;
+ scanned = 1;
+ }
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ tag = PAGECACHE_TAG_TOWRITE;
+ else
+ tag = PAGECACHE_TAG_DIRTY;
+ btrfs_zoned_meta_io_lock(fs_info);
+retry:
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ tag_pages_for_writeback(mapping, index, end);
+ while (!done && !nr_to_write_done && (index <= end) &&
+ (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
+ tag))) {
+ unsigned i;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ ret = submit_eb_page(page, wbc, &epd, &eb_context);
+ if (ret == 0)
+ continue;
+ if (ret < 0) {
+ done = 1;
+ break;
+ }
+
+ /*
+ * The filesystem may choose to bump up nr_to_write.
+ * We have to make sure to honor the new nr_to_write
+ * at any time.
+ */
+ nr_to_write_done = (wbc->sync_mode == WB_SYNC_NONE &&
+ wbc->nr_to_write <= 0);
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+ if (!scanned && !done) {
+ /*
+ * We hit the last page and there is more work to be done: wrap
+ * back to the start of the file
+ */
+ scanned = 1;
+ index = 0;
+ goto retry;
+ }
+ /*
+ * If something went wrong, don't allow any metadata write bio to be
+ * submitted.
+ *
+ * This would prevent use-after-free if we had dirty pages not
+ * cleaned up, which can still happen by fuzzed images.
+ *
+ * - Bad extent tree
+ * Allowing existing tree block to be allocated for other trees.
+ *
+ * - Log tree operations
+ * Exiting tree blocks get allocated to log tree, bumps its
+ * generation, then get cleaned in tree re-balance.
+ * Such tree block will not be written back, since it's clean,
+ * thus no WRITTEN flag set.
+ * And after log writes back, this tree block is not traced by
+ * any dirty extent_io_tree.
+ *
+ * - Offending tree block gets re-dirtied from its original owner
+ * Since it has bumped generation, no WRITTEN flag, it can be
+ * reused without COWing. This tree block will not be traced
+ * by btrfs_transaction::dirty_pages.
+ *
+ * Now such dirty tree block will not be cleaned by any dirty
+ * extent io tree. Thus we don't want to submit such wild eb
+ * if the fs already has error.
+ *
+ * We can get ret > 0 from submit_extent_page() indicating how many ebs
+ * were submitted. Reset it to 0 to avoid false alerts for the caller.
+ */
+ if (ret > 0)
+ ret = 0;
+ if (!ret && BTRFS_FS_ERROR(fs_info))
+ ret = -EROFS;
+ submit_write_bio(&epd, ret);
+
+ btrfs_zoned_meta_io_unlock(fs_info);
+ return ret;
+}
+
+/**
+ * Walk the list of dirty pages of the given address space and write all of them.
+ *
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @epd: holds context for the write, namely the bio
+ *
+ * If a page is already under I/O, write_cache_pages() skips it, even
+ * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
+ * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
+ * and msync() need to guarantee that all the data which was dirty at the time
+ * the call was made get new I/O started against them. If wbc->sync_mode is
+ * WB_SYNC_ALL then we were called for data integrity and we must wait for
+ * existing IO to complete.
+ */
+static int extent_write_cache_pages(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct extent_page_data *epd)
+{
+ struct inode *inode = mapping->host;
+ int ret = 0;
+ int done = 0;
+ int nr_to_write_done = 0;
+ struct pagevec pvec;
+ int nr_pages;
+ pgoff_t index;
+ pgoff_t end; /* Inclusive */
+ pgoff_t done_index;
+ int range_whole = 0;
+ int scanned = 0;
+ xa_mark_t tag;
+
+ /*
+ * We have to hold onto the inode so that ordered extents can do their
+ * work when the IO finishes. The alternative to this is failing to add
+ * an ordered extent if the igrab() fails there and that is a huge pain
+ * to deal with, so instead just hold onto the inode throughout the
+ * writepages operation. If it fails here we are freeing up the inode
+ * anyway and we'd rather not waste our time writing out stuff that is
+ * going to be truncated anyway.
+ */
+ if (!igrab(inode))
+ return 0;
+
+ pagevec_init(&pvec);
+ if (wbc->range_cyclic) {
+ index = mapping->writeback_index; /* Start from prev offset */
+ end = -1;
+ /*
+ * Start from the beginning does not need to cycle over the
+ * range, mark it as scanned.
+ */
+ scanned = (index == 0);
+ } else {
+ index = wbc->range_start >> PAGE_SHIFT;
+ end = wbc->range_end >> PAGE_SHIFT;
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;
+ scanned = 1;
+ }
+
+ /*
+ * We do the tagged writepage as long as the snapshot flush bit is set
+ * and we are the first one who do the filemap_flush() on this inode.
+ *
+ * The nr_to_write == LONG_MAX is needed to make sure other flushers do
+ * not race in and drop the bit.
+ */
+ if (range_whole && wbc->nr_to_write == LONG_MAX &&
+ test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
+ &BTRFS_I(inode)->runtime_flags))
+ wbc->tagged_writepages = 1;
+
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ tag = PAGECACHE_TAG_TOWRITE;
+ else
+ tag = PAGECACHE_TAG_DIRTY;
+retry:
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ tag_pages_for_writeback(mapping, index, end);
+ done_index = index;
+ while (!done && !nr_to_write_done && (index <= end) &&
+ (nr_pages = pagevec_lookup_range_tag(&pvec, mapping,
+ &index, end, tag))) {
+ unsigned i;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ done_index = page->index + 1;
+ /*
+ * At this point we hold neither the i_pages lock nor
+ * the page lock: the page may be truncated or
+ * invalidated (changing page->mapping to NULL),
+ * or even swizzled back from swapper_space to
+ * tmpfs file mapping
+ */
+ if (!trylock_page(page)) {
+ submit_write_bio(epd, 0);
+ lock_page(page);
+ }
+
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ continue;
+ }
+
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ if (PageWriteback(page))
+ submit_write_bio(epd, 0);
+ wait_on_page_writeback(page);
+ }
+
+ if (PageWriteback(page) ||
+ !clear_page_dirty_for_io(page)) {
+ unlock_page(page);
+ continue;
+ }
+
+ ret = __extent_writepage(page, wbc, epd);
+ if (ret < 0) {
+ done = 1;
+ break;
+ }
+
+ /*
+ * the filesystem may choose to bump up nr_to_write.
+ * We have to make sure to honor the new nr_to_write
+ * at any time
+ */
+ nr_to_write_done = wbc->nr_to_write <= 0;
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+ if (!scanned && !done) {
+ /*
+ * We hit the last page and there is more work to be done: wrap
+ * back to the start of the file
+ */
+ scanned = 1;
+ index = 0;
+
+ /*
+ * If we're looping we could run into a page that is locked by a
+ * writer and that writer could be waiting on writeback for a
+ * page in our current bio, and thus deadlock, so flush the
+ * write bio here.
+ */
+ submit_write_bio(epd, 0);
+ goto retry;
+ }
+
+ if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
+ mapping->writeback_index = done_index;
+
+ btrfs_add_delayed_iput(inode);
+ return ret;
+}
+
+/*
+ * Submit the pages in the range to bio for call sites which delalloc range has
+ * already been ran (aka, ordered extent inserted) and all pages are still
+ * locked.
+ */
+int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
+{
+ bool found_error = false;
+ int first_error = 0;
+ int ret = 0;
+ struct address_space *mapping = inode->i_mapping;
+ struct page *page;
+ u64 cur = start;
+ unsigned long nr_pages;
+ const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize;
+ struct extent_page_data epd = {
+ .bio_ctrl = { 0 },
+ .extent_locked = 1,
+ .sync_io = 1,
+ };
+ struct writeback_control wbc_writepages = {
+ .sync_mode = WB_SYNC_ALL,
+ .range_start = start,
+ .range_end = end + 1,
+ /* We're called from an async helper function */
+ .punt_to_cgroup = 1,
+ .no_cgroup_owner = 1,
+ };
+
+ ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize));
+ nr_pages = (round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE)) >>
+ PAGE_SHIFT;
+ wbc_writepages.nr_to_write = nr_pages * 2;
+
+ wbc_attach_fdatawrite_inode(&wbc_writepages, inode);
+ while (cur <= end) {
+ u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
+
+ page = find_get_page(mapping, cur >> PAGE_SHIFT);
+ /*
+ * All pages in the range are locked since
+ * btrfs_run_delalloc_range(), thus there is no way to clear
+ * the page dirty flag.
+ */
+ ASSERT(PageLocked(page));
+ ASSERT(PageDirty(page));
+ clear_page_dirty_for_io(page);
+ ret = __extent_writepage(page, &wbc_writepages, &epd);
+ ASSERT(ret <= 0);
+ if (ret < 0) {
+ found_error = true;
+ first_error = ret;
+ }
+ put_page(page);
+ cur = cur_end + 1;
+ }
+
+ submit_write_bio(&epd, found_error ? ret : 0);
+
+ wbc_detach_inode(&wbc_writepages);
+ if (found_error)
+ return first_error;
+ return ret;
+}
+
+int extent_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = mapping->host;
+ int ret = 0;
+ struct extent_page_data epd = {
+ .bio_ctrl = { 0 },
+ .extent_locked = 0,
+ .sync_io = wbc->sync_mode == WB_SYNC_ALL,
+ };
+
+ /*
+ * Allow only a single thread to do the reloc work in zoned mode to
+ * protect the write pointer updates.
+ */
+ btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
+ ret = extent_write_cache_pages(mapping, wbc, &epd);
+ submit_write_bio(&epd, ret);
+ btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
+ return ret;
+}
+
+void extent_readahead(struct readahead_control *rac)
+{
+ struct btrfs_bio_ctrl bio_ctrl = { 0 };
+ struct page *pagepool[16];
+ struct extent_map *em_cached = NULL;
+ u64 prev_em_start = (u64)-1;
+ int nr;
+
+ while ((nr = readahead_page_batch(rac, pagepool))) {
+ u64 contig_start = readahead_pos(rac);
+ u64 contig_end = contig_start + readahead_batch_length(rac) - 1;
+
+ contiguous_readpages(pagepool, nr, contig_start, contig_end,
+ &em_cached, &bio_ctrl, &prev_em_start);
+ }
+
+ if (em_cached)
+ free_extent_map(em_cached);
+ submit_one_bio(&bio_ctrl);
+}
+
+/*
+ * basic invalidate_folio code, this waits on any locked or writeback
+ * ranges corresponding to the folio, and then deletes any extent state
+ * records from the tree
+ */
+int extent_invalidate_folio(struct extent_io_tree *tree,
+ struct folio *folio, size_t offset)
+{
+ struct extent_state *cached_state = NULL;
+ u64 start = folio_pos(folio);
+ u64 end = start + folio_size(folio) - 1;
+ size_t blocksize = folio->mapping->host->i_sb->s_blocksize;
+
+ /* This function is only called for the btree inode */
+ ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);
+
+ start += ALIGN(offset, blocksize);
+ if (start > end)
+ return 0;
+
+ lock_extent(tree, start, end, &cached_state);
+ folio_wait_writeback(folio);
+
+ /*
+ * Currently for btree io tree, only EXTENT_LOCKED is utilized,
+ * so here we only need to unlock the extent range to free any
+ * existing extent state.
+ */
+ unlock_extent(tree, start, end, &cached_state);
+ return 0;
+}
+
+/*
+ * a helper for release_folio, this tests for areas of the page that
+ * are locked or under IO and drops the related state bits if it is safe
+ * to drop the page.
+ */
+static int try_release_extent_state(struct extent_io_tree *tree,
+ struct page *page, gfp_t mask)
+{
+ u64 start = page_offset(page);
+ u64 end = start + PAGE_SIZE - 1;
+ int ret = 1;
+
+ if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
+ ret = 0;
+ } else {
+ u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM |
+ EXTENT_DELALLOC_NEW | EXTENT_CTLBITS |
+ EXTENT_QGROUP_RESERVED);
+
+ /*
+ * At this point we can safely clear everything except the
+ * locked bit, the nodatasum bit and the delalloc new bit.
+ * The delalloc new bit will be cleared by ordered extent
+ * completion.
+ */
+ ret = __clear_extent_bit(tree, start, end, clear_bits, NULL,
+ mask, NULL);
+
+ /* if clear_extent_bit failed for enomem reasons,
+ * we can't allow the release to continue.
+ */
+ if (ret < 0)
+ ret = 0;
+ else
+ ret = 1;
+ }
+ return ret;
+}
+
+/*
+ * a helper for release_folio. As long as there are no locked extents
+ * in the range corresponding to the page, both state records and extent
+ * map records are removed
+ */
+int try_release_extent_mapping(struct page *page, gfp_t mask)
+{
+ struct extent_map *em;
+ u64 start = page_offset(page);
+ u64 end = start + PAGE_SIZE - 1;
+ struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host);
+ struct extent_io_tree *tree = &btrfs_inode->io_tree;
+ struct extent_map_tree *map = &btrfs_inode->extent_tree;
+
+ if (gfpflags_allow_blocking(mask) &&
+ page->mapping->host->i_size > SZ_16M) {
+ u64 len;
+ while (start <= end) {
+ struct btrfs_fs_info *fs_info;
+ u64 cur_gen;
+
+ len = end - start + 1;
+ write_lock(&map->lock);
+ em = lookup_extent_mapping(map, start, len);
+ if (!em) {
+ write_unlock(&map->lock);
+ break;
+ }
+ if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
+ em->start != start) {
+ write_unlock(&map->lock);
+ free_extent_map(em);
+ break;
+ }
+ if (test_range_bit(tree, em->start,
+ extent_map_end(em) - 1,
+ EXTENT_LOCKED, 0, NULL))
+ goto next;
+ /*
+ * If it's not in the list of modified extents, used
+ * by a fast fsync, we can remove it. If it's being
+ * logged we can safely remove it since fsync took an
+ * extra reference on the em.
+ */
+ if (list_empty(&em->list) ||
+ test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+ goto remove_em;
+ /*
+ * If it's in the list of modified extents, remove it
+ * only if its generation is older then the current one,
+ * in which case we don't need it for a fast fsync.
+ * Otherwise don't remove it, we could be racing with an
+ * ongoing fast fsync that could miss the new extent.
+ */
+ fs_info = btrfs_inode->root->fs_info;
+ spin_lock(&fs_info->trans_lock);
+ cur_gen = fs_info->generation;
+ spin_unlock(&fs_info->trans_lock);
+ if (em->generation >= cur_gen)
+ goto next;
+remove_em:
+ /*
+ * We only remove extent maps that are not in the list of
+ * modified extents or that are in the list but with a
+ * generation lower then the current generation, so there
+ * is no need to set the full fsync flag on the inode (it
+ * hurts the fsync performance for workloads with a data
+ * size that exceeds or is close to the system's memory).
+ */
+ remove_extent_mapping(map, em);
+ /* once for the rb tree */
+ free_extent_map(em);
+next:
+ start = extent_map_end(em);
+ write_unlock(&map->lock);
+
+ /* once for us */
+ free_extent_map(em);
+
+ cond_resched(); /* Allow large-extent preemption. */
+ }
+ }
+ return try_release_extent_state(tree, page, mask);
+}
+
+/*
+ * To cache previous fiemap extent
+ *
+ * Will be used for merging fiemap extent
+ */
+struct fiemap_cache {
+ u64 offset;
+ u64 phys;
+ u64 len;
+ u32 flags;
+ bool cached;
+};
+
+/*
+ * Helper to submit fiemap extent.
+ *
+ * Will try to merge current fiemap extent specified by @offset, @phys,
+ * @len and @flags with cached one.
+ * And only when we fails to merge, cached one will be submitted as
+ * fiemap extent.
+ *
+ * Return value is the same as fiemap_fill_next_extent().
+ */
+static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
+ struct fiemap_cache *cache,
+ u64 offset, u64 phys, u64 len, u32 flags)
+{
+ int ret = 0;
+
+ /* Set at the end of extent_fiemap(). */
+ ASSERT((flags & FIEMAP_EXTENT_LAST) == 0);
+
+ if (!cache->cached)
+ goto assign;
+
+ /*
+ * Sanity check, extent_fiemap() should have ensured that new
+ * fiemap extent won't overlap with cached one.
+ * Not recoverable.
+ *
+ * NOTE: Physical address can overlap, due to compression
+ */
+ if (cache->offset + cache->len > offset) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ /*
+ * Only merges fiemap extents if
+ * 1) Their logical addresses are continuous
+ *
+ * 2) Their physical addresses are continuous
+ * So truly compressed (physical size smaller than logical size)
+ * extents won't get merged with each other
+ *
+ * 3) Share same flags
+ */
+ if (cache->offset + cache->len == offset &&
+ cache->phys + cache->len == phys &&
+ cache->flags == flags) {
+ cache->len += len;
+ return 0;
+ }
+
+ /* Not mergeable, need to submit cached one */
+ ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
+ cache->len, cache->flags);
+ cache->cached = false;
+ if (ret)
+ return ret;
+assign:
+ cache->cached = true;
+ cache->offset = offset;
+ cache->phys = phys;
+ cache->len = len;
+ cache->flags = flags;
+
+ return 0;
+}
+
+/*
+ * Emit last fiemap cache
+ *
+ * The last fiemap cache may still be cached in the following case:
+ * 0 4k 8k
+ * |<- Fiemap range ->|
+ * |<------------ First extent ----------->|
+ *
+ * In this case, the first extent range will be cached but not emitted.
+ * So we must emit it before ending extent_fiemap().
+ */
+static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
+ struct fiemap_cache *cache)
+{
+ int ret;
+
+ if (!cache->cached)
+ return 0;
+
+ ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
+ cache->len, cache->flags);
+ cache->cached = false;
+ if (ret > 0)
+ ret = 0;
+ return ret;
+}
+
+static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path)
+{
+ struct extent_buffer *clone;
+ struct btrfs_key key;
+ int slot;
+ int ret;
+
+ path->slots[0]++;
+ if (path->slots[0] < btrfs_header_nritems(path->nodes[0]))
+ return 0;
+
+ ret = btrfs_next_leaf(inode->root, path);
+ if (ret != 0)
+ return ret;
+
+ /*
+ * Don't bother with cloning if there are no more file extent items for
+ * our inode.
+ */
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY)
+ return 1;
+
+ /* See the comment at fiemap_search_slot() about why we clone. */
+ clone = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!clone)
+ return -ENOMEM;
+
+ slot = path->slots[0];
+ btrfs_release_path(path);
+ path->nodes[0] = clone;
+ path->slots[0] = slot;
+
+ return 0;
+}
+
+/*
+ * Search for the first file extent item that starts at a given file offset or
+ * the one that starts immediately before that offset.
+ * Returns: 0 on success, < 0 on error, 1 if not found.
+ */
+static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path,
+ u64 file_offset)
+{
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_root *root = inode->root;
+ struct extent_buffer *clone;
+ struct btrfs_key key;
+ int slot;
+ int ret;
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = file_offset;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ if (ret > 0 && path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
+
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret != 0)
+ return ret;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
+ return 1;
+ }
+
+ /*
+ * We clone the leaf and use it during fiemap. This is because while
+ * using the leaf we do expensive things like checking if an extent is
+ * shared, which can take a long time. In order to prevent blocking
+ * other tasks for too long, we use a clone of the leaf. We have locked
+ * the file range in the inode's io tree, so we know none of our file
+ * extent items can change. This way we avoid blocking other tasks that
+ * want to insert items for other inodes in the same leaf or b+tree
+ * rebalance operations (triggered for example when someone is trying
+ * to push items into this leaf when trying to insert an item in a
+ * neighbour leaf).
+ * We also need the private clone because holding a read lock on an
+ * extent buffer of the subvolume's b+tree will make lockdep unhappy
+ * when we call fiemap_fill_next_extent(), because that may cause a page
+ * fault when filling the user space buffer with fiemap data.
+ */
+ clone = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!clone)
+ return -ENOMEM;
+
+ slot = path->slots[0];
+ btrfs_release_path(path);
+ path->nodes[0] = clone;
+ path->slots[0] = slot;
+
+ return 0;
+}
+
+/*
+ * Process a range which is a hole or a prealloc extent in the inode's subvolume
+ * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc
+ * extent. The end offset (@end) is inclusive.
+ */
+static int fiemap_process_hole(struct btrfs_inode *inode,
+ struct fiemap_extent_info *fieinfo,
+ struct fiemap_cache *cache,
+ struct btrfs_backref_shared_cache *backref_cache,
+ u64 disk_bytenr, u64 extent_offset,
+ u64 extent_gen,
+ struct ulist *roots, struct ulist *tmp_ulist,
+ u64 start, u64 end)
+{
+ const u64 i_size = i_size_read(&inode->vfs_inode);
+ const u64 ino = btrfs_ino(inode);
+ u64 cur_offset = start;
+ u64 last_delalloc_end = 0;
+ u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN;
+ bool checked_extent_shared = false;
+ int ret;
+
+ /*
+ * There can be no delalloc past i_size, so don't waste time looking for
+ * it beyond i_size.
+ */
+ while (cur_offset < end && cur_offset < i_size) {
+ u64 delalloc_start;
+ u64 delalloc_end;
+ u64 prealloc_start;
+ u64 prealloc_len = 0;
+ bool delalloc;
+
+ delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
+ &delalloc_start,
+ &delalloc_end);
+ if (!delalloc)
+ break;
+
+ /*
+ * If this is a prealloc extent we have to report every section
+ * of it that has no delalloc.
+ */
+ if (disk_bytenr != 0) {
+ if (last_delalloc_end == 0) {
+ prealloc_start = start;
+ prealloc_len = delalloc_start - start;
+ } else {
+ prealloc_start = last_delalloc_end + 1;
+ prealloc_len = delalloc_start - prealloc_start;
+ }
+ }
+
+ if (prealloc_len > 0) {
+ if (!checked_extent_shared && fieinfo->fi_extents_max) {
+ ret = btrfs_is_data_extent_shared(inode->root,
+ ino, disk_bytenr,
+ extent_gen, roots,
+ tmp_ulist,
+ backref_cache);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ prealloc_flags |= FIEMAP_EXTENT_SHARED;
+
+ checked_extent_shared = true;
+ }
+ ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
+ disk_bytenr + extent_offset,
+ prealloc_len, prealloc_flags);
+ if (ret)
+ return ret;
+ extent_offset += prealloc_len;
+ }
+
+ ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0,
+ delalloc_end + 1 - delalloc_start,
+ FIEMAP_EXTENT_DELALLOC |
+ FIEMAP_EXTENT_UNKNOWN);
+ if (ret)
+ return ret;
+
+ last_delalloc_end = delalloc_end;
+ cur_offset = delalloc_end + 1;
+ extent_offset += cur_offset - delalloc_start;
+ cond_resched();
+ }
+
+ /*
+ * Either we found no delalloc for the whole prealloc extent or we have
+ * a prealloc extent that spans i_size or starts at or after i_size.
+ */
+ if (disk_bytenr != 0 && last_delalloc_end < end) {
+ u64 prealloc_start;
+ u64 prealloc_len;
+
+ if (last_delalloc_end == 0) {
+ prealloc_start = start;
+ prealloc_len = end + 1 - start;
+ } else {
+ prealloc_start = last_delalloc_end + 1;
+ prealloc_len = end + 1 - prealloc_start;
+ }
+
+ if (!checked_extent_shared && fieinfo->fi_extents_max) {
+ ret = btrfs_is_data_extent_shared(inode->root,
+ ino, disk_bytenr,
+ extent_gen, roots,
+ tmp_ulist,
+ backref_cache);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ prealloc_flags |= FIEMAP_EXTENT_SHARED;
+ }
+ ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
+ disk_bytenr + extent_offset,
+ prealloc_len, prealloc_flags);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int fiemap_find_last_extent_offset(struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ u64 *last_extent_end_ret)
+{
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_root *root = inode->root;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *ei;
+ struct btrfs_key key;
+ u64 disk_bytenr;
+ int ret;
+
+ /*
+ * Lookup the last file extent. We're not using i_size here because
+ * there might be preallocation past i_size.
+ */
+ ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0);
+ /* There can't be a file extent item at offset (u64)-1 */
+ ASSERT(ret != 0);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * For a non-existing key, btrfs_search_slot() always leaves us at a
+ * slot > 0, except if the btree is empty, which is impossible because
+ * at least it has the inode item for this inode and all the items for
+ * the root inode 256.
+ */
+ ASSERT(path->slots[0] > 0);
+ path->slots[0]--;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
+ /* No file extent items in the subvolume tree. */
+ *last_extent_end_ret = 0;
+ return 0;
+ }
+
+ /*
+ * For an inline extent, the disk_bytenr is where inline data starts at,
+ * so first check if we have an inline extent item before checking if we
+ * have an implicit hole (disk_bytenr == 0).
+ */
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
+ *last_extent_end_ret = btrfs_file_extent_end(path);
+ return 0;
+ }
+
+ /*
+ * Find the last file extent item that is not a hole (when NO_HOLES is
+ * not enabled). This should take at most 2 iterations in the worst
+ * case: we have one hole file extent item at slot 0 of a leaf and
+ * another hole file extent item as the last item in the previous leaf.
+ * This is because we merge file extent items that represent holes.
+ */
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ while (disk_bytenr == 0) {
+ ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ /* No file extent items that are not holes. */
+ *last_extent_end_ret = 0;
+ return 0;
+ }
+ leaf = path->nodes[0];
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ }
+
+ *last_extent_end_ret = btrfs_file_extent_end(path);
+ return 0;
+}
+
+int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len)
+{
+ const u64 ino = btrfs_ino(inode);
+ struct extent_state *cached_state = NULL;
+ struct btrfs_path *path;
+ struct btrfs_root *root = inode->root;
+ struct fiemap_cache cache = { 0 };
+ struct btrfs_backref_shared_cache *backref_cache;
+ struct ulist *roots;
+ struct ulist *tmp_ulist;
+ u64 last_extent_end;
+ u64 prev_extent_end;
+ u64 lockstart;
+ u64 lockend;
+ bool stopped = false;
+ int ret;
+
+ backref_cache = kzalloc(sizeof(*backref_cache), GFP_KERNEL);
+ path = btrfs_alloc_path();
+ roots = ulist_alloc(GFP_KERNEL);
+ tmp_ulist = ulist_alloc(GFP_KERNEL);
+ if (!backref_cache || !path || !roots || !tmp_ulist) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ lockstart = round_down(start, root->fs_info->sectorsize);
+ lockend = round_up(start + len, root->fs_info->sectorsize);
+ prev_extent_end = lockstart;
+
+ btrfs_inode_lock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+
+ ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
+ if (ret < 0)
+ goto out_unlock;
+ btrfs_release_path(path);
+
+ path->reada = READA_FORWARD;
+ ret = fiemap_search_slot(inode, path, lockstart);
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
+ /*
+ * No file extent item found, but we may have delalloc between
+ * the current offset and i_size. So check for that.
+ */
+ ret = 0;
+ goto check_eof_delalloc;
+ }
+
+ while (prev_extent_end < lockend) {
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_file_extent_item *ei;
+ struct btrfs_key key;
+ u64 extent_end;
+ u64 extent_len;
+ u64 extent_offset = 0;
+ u64 extent_gen;
+ u64 disk_bytenr = 0;
+ u64 flags = 0;
+ int extent_type;
+ u8 compression;
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
+ break;
+
+ extent_end = btrfs_file_extent_end(path);
+
+ /*
+ * The first iteration can leave us at an extent item that ends
+ * before our range's start. Move to the next item.
+ */
+ if (extent_end <= lockstart)
+ goto next_item;
+
+ /* We have in implicit hole (NO_HOLES feature enabled). */
+ if (prev_extent_end < key.offset) {
+ const u64 range_end = min(key.offset, lockend) - 1;
+
+ ret = fiemap_process_hole(inode, fieinfo, &cache,
+ backref_cache, 0, 0, 0,
+ roots, tmp_ulist,
+ prev_extent_end, range_end);
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
+ /* fiemap_fill_next_extent() told us to stop. */
+ stopped = true;
+ break;
+ }
+
+ /* We've reached the end of the fiemap range, stop. */
+ if (key.offset >= lockend) {
+ stopped = true;
+ break;
+ }
+ }
+
+ extent_len = extent_end - key.offset;
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ compression = btrfs_file_extent_compression(leaf, ei);
+ extent_type = btrfs_file_extent_type(leaf, ei);
+ extent_gen = btrfs_file_extent_generation(leaf, ei);
+
+ if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ if (compression == BTRFS_COMPRESS_NONE)
+ extent_offset = btrfs_file_extent_offset(leaf, ei);
+ }
+
+ if (compression != BTRFS_COMPRESS_NONE)
+ flags |= FIEMAP_EXTENT_ENCODED;
+
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ flags |= FIEMAP_EXTENT_DATA_INLINE;
+ flags |= FIEMAP_EXTENT_NOT_ALIGNED;
+ ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0,
+ extent_len, flags);
+ } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ ret = fiemap_process_hole(inode, fieinfo, &cache,
+ backref_cache,
+ disk_bytenr, extent_offset,
+ extent_gen, roots, tmp_ulist,
+ key.offset, extent_end - 1);
+ } else if (disk_bytenr == 0) {
+ /* We have an explicit hole. */
+ ret = fiemap_process_hole(inode, fieinfo, &cache,
+ backref_cache, 0, 0, 0,
+ roots, tmp_ulist,
+ key.offset, extent_end - 1);
+ } else {
+ /* We have a regular extent. */
+ if (fieinfo->fi_extents_max) {
+ ret = btrfs_is_data_extent_shared(root, ino,
+ disk_bytenr,
+ extent_gen,
+ roots,
+ tmp_ulist,
+ backref_cache);
+ if (ret < 0)
+ goto out_unlock;
+ else if (ret > 0)
+ flags |= FIEMAP_EXTENT_SHARED;
+ }
+
+ ret = emit_fiemap_extent(fieinfo, &cache, key.offset,
+ disk_bytenr + extent_offset,
+ extent_len, flags);
+ }
+
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
+ /* fiemap_fill_next_extent() told us to stop. */
+ stopped = true;
+ break;
+ }
+
+ prev_extent_end = extent_end;
+next_item:
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ goto out_unlock;
+ }
+
+ ret = fiemap_next_leaf_item(inode, path);
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
+ /* No more file extent items for this inode. */
+ break;
+ }
+ cond_resched();
+ }
+
+check_eof_delalloc:
+ /*
+ * Release (and free) the path before emitting any final entries to
+ * fiemap_fill_next_extent() to keep lockdep happy. This is because
+ * once we find no more file extent items exist, we may have a
+ * non-cloned leaf, and fiemap_fill_next_extent() can trigger page
+ * faults when copying data to the user space buffer.
+ */
+ btrfs_free_path(path);
+ path = NULL;
+
+ if (!stopped && prev_extent_end < lockend) {
+ ret = fiemap_process_hole(inode, fieinfo, &cache, backref_cache,
+ 0, 0, 0, roots, tmp_ulist,
+ prev_extent_end, lockend - 1);
+ if (ret < 0)
+ goto out_unlock;
+ prev_extent_end = lockend;
+ }
+
+ if (cache.cached && cache.offset + cache.len >= last_extent_end) {
+ const u64 i_size = i_size_read(&inode->vfs_inode);
+
+ if (prev_extent_end < i_size) {
+ u64 delalloc_start;
+ u64 delalloc_end;
+ bool delalloc;
+
+ delalloc = btrfs_find_delalloc_in_range(inode,
+ prev_extent_end,
+ i_size - 1,
+ &delalloc_start,
+ &delalloc_end);
+ if (!delalloc)
+ cache.flags |= FIEMAP_EXTENT_LAST;
+ } else {
+ cache.flags |= FIEMAP_EXTENT_LAST;
+ }
+ }
+
+ ret = emit_last_fiemap_cache(fieinfo, &cache);
+
+out_unlock:
+ unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+ btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+out:
+ kfree(backref_cache);
+ btrfs_free_path(path);
+ ulist_free(roots);
+ ulist_free(tmp_ulist);
+ return ret;
+}
+
+static void __free_extent_buffer(struct extent_buffer *eb)
+{
+ kmem_cache_free(extent_buffer_cache, eb);
+}
+
+int extent_buffer_under_io(const struct extent_buffer *eb)
+{
+ return (atomic_read(&eb->io_pages) ||
+ test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
+ test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+}
+
+static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
+{
+ struct btrfs_subpage *subpage;
+
+ lockdep_assert_held(&page->mapping->private_lock);
+
+ if (PagePrivate(page)) {
+ subpage = (struct btrfs_subpage *)page->private;
+ if (atomic_read(&subpage->eb_refs))
+ return true;
+ /*
+ * Even there is no eb refs here, we may still have
+ * end_page_read() call relying on page::private.
+ */
+ if (atomic_read(&subpage->readers))
+ return true;
+ }
+ return false;
+}
+
+static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
+
+ /*
+ * For mapped eb, we're going to change the page private, which should
+ * be done under the private_lock.
+ */
+ if (mapped)
+ spin_lock(&page->mapping->private_lock);
+
+ if (!PagePrivate(page)) {
+ if (mapped)
+ spin_unlock(&page->mapping->private_lock);
+ return;
+ }
+
+ if (fs_info->nodesize >= PAGE_SIZE) {
+ /*
+ * We do this since we'll remove the pages after we've
+ * removed the eb from the radix tree, so we could race
+ * and have this page now attached to the new eb. So
+ * only clear page_private if it's still connected to
+ * this eb.
+ */
+ if (PagePrivate(page) &&
+ page->private == (unsigned long)eb) {
+ BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+ BUG_ON(PageDirty(page));
+ BUG_ON(PageWriteback(page));
+ /*
+ * We need to make sure we haven't be attached
+ * to a new eb.
+ */
+ detach_page_private(page);
+ }
+ if (mapped)
+ spin_unlock(&page->mapping->private_lock);
+ return;
+ }
+
+ /*
+ * For subpage, we can have dummy eb with page private. In this case,
+ * we can directly detach the private as such page is only attached to
+ * one dummy eb, no sharing.
+ */
+ if (!mapped) {
+ btrfs_detach_subpage(fs_info, page);
+ return;
+ }
+
+ btrfs_page_dec_eb_refs(fs_info, page);
+
+ /*
+ * We can only detach the page private if there are no other ebs in the
+ * page range and no unfinished IO.
+ */
+ if (!page_range_has_eb(fs_info, page))
+ btrfs_detach_subpage(fs_info, page);
+
+ spin_unlock(&page->mapping->private_lock);
+}
+
+/* Release all pages attached to the extent buffer */
+static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
+{
+ int i;
+ int num_pages;
+
+ ASSERT(!extent_buffer_under_io(eb));
+
+ num_pages = num_extent_pages(eb);
+ for (i = 0; i < num_pages; i++) {
+ struct page *page = eb->pages[i];
+
+ if (!page)
+ continue;
+
+ detach_extent_buffer_page(eb, page);
+
+ /* One for when we allocated the page */
+ put_page(page);
+ }
+}
+
+/*
+ * Helper for releasing the extent buffer.
+ */
+static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
+{
+ btrfs_release_extent_buffer_pages(eb);
+ btrfs_leak_debug_del_eb(eb);
+ __free_extent_buffer(eb);
+}
+
+static struct extent_buffer *
+__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
+ unsigned long len)
+{
+ struct extent_buffer *eb = NULL;
+
+ eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
+ eb->start = start;
+ eb->len = len;
+ eb->fs_info = fs_info;
+ eb->bflags = 0;
+ init_rwsem(&eb->lock);
+
+ btrfs_leak_debug_add_eb(eb);
+ INIT_LIST_HEAD(&eb->release_list);
+
+ spin_lock_init(&eb->refs_lock);
+ atomic_set(&eb->refs, 1);
+ atomic_set(&eb->io_pages, 0);
+
+ ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE);
+
+ return eb;
+}
+
+struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
+{
+ int i;
+ struct extent_buffer *new;
+ int num_pages = num_extent_pages(src);
+ int ret;
+
+ new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
+ if (new == NULL)
+ return NULL;
+
+ /*
+ * Set UNMAPPED before calling btrfs_release_extent_buffer(), as
+ * btrfs_release_extent_buffer() have different behavior for
+ * UNMAPPED subpage extent buffer.
+ */
+ set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
+
+ memset(new->pages, 0, sizeof(*new->pages) * num_pages);
+ ret = btrfs_alloc_page_array(num_pages, new->pages);
+ if (ret) {
+ btrfs_release_extent_buffer(new);
+ return NULL;
+ }
+
+ for (i = 0; i < num_pages; i++) {
+ int ret;
+ struct page *p = new->pages[i];
+
+ ret = attach_extent_buffer_page(new, p, NULL);
+ if (ret < 0) {
+ btrfs_release_extent_buffer(new);
+ return NULL;
+ }
+ WARN_ON(PageDirty(p));
+ copy_page(page_address(p), page_address(src->pages[i]));
+ }
+ set_extent_buffer_uptodate(new);
+
+ return new;
+}
+
+struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start, unsigned long len)
+{
+ struct extent_buffer *eb;
+ int num_pages;
+ int i;
+ int ret;
+
+ eb = __alloc_extent_buffer(fs_info, start, len);
+ if (!eb)
+ return NULL;
+
+ num_pages = num_extent_pages(eb);
+ ret = btrfs_alloc_page_array(num_pages, eb->pages);
+ if (ret)
+ goto err;
+
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = eb->pages[i];
+
+ ret = attach_extent_buffer_page(eb, p, NULL);
+ if (ret < 0)
+ goto err;
+ }
+
+ set_extent_buffer_uptodate(eb);
+ btrfs_set_header_nritems(eb, 0);
+ set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
+
+ return eb;
+err:
+ for (i = 0; i < num_pages; i++) {
+ if (eb->pages[i]) {
+ detach_extent_buffer_page(eb, eb->pages[i]);
+ __free_page(eb->pages[i]);
+ }
+ }
+ __free_extent_buffer(eb);
+ return NULL;
+}
+
+struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start)
+{
+ return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize);
+}
+
+static void check_buffer_tree_ref(struct extent_buffer *eb)
+{
+ int refs;
+ /*
+ * The TREE_REF bit is first set when the extent_buffer is added
+ * to the radix tree. It is also reset, if unset, when a new reference
+ * is created by find_extent_buffer.
+ *
+ * It is only cleared in two cases: freeing the last non-tree
+ * reference to the extent_buffer when its STALE bit is set or
+ * calling release_folio when the tree reference is the only reference.
+ *
+ * In both cases, care is taken to ensure that the extent_buffer's
+ * pages are not under io. However, release_folio can be concurrently
+ * called with creating new references, which is prone to race
+ * conditions between the calls to check_buffer_tree_ref in those
+ * codepaths and clearing TREE_REF in try_release_extent_buffer.
+ *
+ * The actual lifetime of the extent_buffer in the radix tree is
+ * adequately protected by the refcount, but the TREE_REF bit and
+ * its corresponding reference are not. To protect against this
+ * class of races, we call check_buffer_tree_ref from the codepaths
+ * which trigger io after they set eb->io_pages. Note that once io is
+ * initiated, TREE_REF can no longer be cleared, so that is the
+ * moment at which any such race is best fixed.
+ */
+ refs = atomic_read(&eb->refs);
+ if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+ return;
+
+ spin_lock(&eb->refs_lock);
+ if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+ atomic_inc(&eb->refs);
+ spin_unlock(&eb->refs_lock);
+}
+
+static void mark_extent_buffer_accessed(struct extent_buffer *eb,
+ struct page *accessed)
+{
+ int num_pages, i;
+
+ check_buffer_tree_ref(eb);
+
+ num_pages = num_extent_pages(eb);
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = eb->pages[i];
+
+ if (p != accessed)
+ mark_page_accessed(p);
+ }
+}
+
+struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start)
+{
+ struct extent_buffer *eb;
+
+ eb = find_extent_buffer_nolock(fs_info, start);
+ if (!eb)
+ return NULL;
+ /*
+ * Lock our eb's refs_lock to avoid races with free_extent_buffer().
+ * When we get our eb it might be flagged with EXTENT_BUFFER_STALE and
+ * another task running free_extent_buffer() might have seen that flag
+ * set, eb->refs == 2, that the buffer isn't under IO (dirty and
+ * writeback flags not set) and it's still in the tree (flag
+ * EXTENT_BUFFER_TREE_REF set), therefore being in the process of
+ * decrementing the extent buffer's reference count twice. So here we
+ * could race and increment the eb's reference count, clear its stale
+ * flag, mark it as dirty and drop our reference before the other task
+ * finishes executing free_extent_buffer, which would later result in
+ * an attempt to free an extent buffer that is dirty.
+ */
+ if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
+ spin_lock(&eb->refs_lock);
+ spin_unlock(&eb->refs_lock);
+ }
+ mark_extent_buffer_accessed(eb, NULL);
+ return eb;
+}
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start)
+{
+ struct extent_buffer *eb, *exists = NULL;
+ int ret;
+
+ eb = find_extent_buffer(fs_info, start);
+ if (eb)
+ return eb;
+ eb = alloc_dummy_extent_buffer(fs_info, start);
+ if (!eb)
+ return ERR_PTR(-ENOMEM);
+ eb->fs_info = fs_info;
+again:
+ ret = radix_tree_preload(GFP_NOFS);
+ if (ret) {
+ exists = ERR_PTR(ret);
+ goto free_eb;
+ }
+ spin_lock(&fs_info->buffer_lock);
+ ret = radix_tree_insert(&fs_info->buffer_radix,
+ start >> fs_info->sectorsize_bits, eb);
+ spin_unlock(&fs_info->buffer_lock);
+ radix_tree_preload_end();
+ if (ret == -EEXIST) {
+ exists = find_extent_buffer(fs_info, start);
+ if (exists)
+ goto free_eb;
+ else
+ goto again;
+ }
+ check_buffer_tree_ref(eb);
+ set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
+
+ return eb;
+free_eb:
+ btrfs_release_extent_buffer(eb);
+ return exists;
+}
+#endif
+
+static struct extent_buffer *grab_extent_buffer(
+ struct btrfs_fs_info *fs_info, struct page *page)
+{
+ struct extent_buffer *exists;
+
+ /*
+ * For subpage case, we completely rely on radix tree to ensure we
+ * don't try to insert two ebs for the same bytenr. So here we always
+ * return NULL and just continue.
+ */
+ if (fs_info->nodesize < PAGE_SIZE)
+ return NULL;
+
+ /* Page not yet attached to an extent buffer */
+ if (!PagePrivate(page))
+ return NULL;
+
+ /*
+ * We could have already allocated an eb for this page and attached one
+ * so lets see if we can get a ref on the existing eb, and if we can we
+ * know it's good and we can just return that one, else we know we can
+ * just overwrite page->private.
+ */
+ exists = (struct extent_buffer *)page->private;
+ if (atomic_inc_not_zero(&exists->refs))
+ return exists;
+
+ WARN_ON(PageDirty(page));
+ detach_page_private(page);
+ return NULL;
+}
+
+static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
+{
+ if (!IS_ALIGNED(start, fs_info->sectorsize)) {
+ btrfs_err(fs_info, "bad tree block start %llu", start);
+ return -EINVAL;
+ }
+
+ if (fs_info->nodesize < PAGE_SIZE &&
+ offset_in_page(start) + fs_info->nodesize > PAGE_SIZE) {
+ btrfs_err(fs_info,
+ "tree block crosses page boundary, start %llu nodesize %u",
+ start, fs_info->nodesize);
+ return -EINVAL;
+ }
+ if (fs_info->nodesize >= PAGE_SIZE &&
+ !PAGE_ALIGNED(start)) {
+ btrfs_err(fs_info,
+ "tree block is not page aligned, start %llu nodesize %u",
+ start, fs_info->nodesize);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start, u64 owner_root, int level)
+{
+ unsigned long len = fs_info->nodesize;
+ int num_pages;
+ int i;
+ unsigned long index = start >> PAGE_SHIFT;
+ struct extent_buffer *eb;
+ struct extent_buffer *exists = NULL;
+ struct page *p;
+ struct address_space *mapping = fs_info->btree_inode->i_mapping;
+ u64 lockdep_owner = owner_root;
+ int uptodate = 1;
+ int ret;
+
+ if (check_eb_alignment(fs_info, start))
+ return ERR_PTR(-EINVAL);
+
+#if BITS_PER_LONG == 32
+ if (start >= MAX_LFS_FILESIZE) {
+ btrfs_err_rl(fs_info,
+ "extent buffer %llu is beyond 32bit page cache limit", start);
+ btrfs_err_32bit_limit(fs_info);
+ return ERR_PTR(-EOVERFLOW);
+ }
+ if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD)
+ btrfs_warn_32bit_limit(fs_info);
+#endif
+
+ eb = find_extent_buffer(fs_info, start);
+ if (eb)
+ return eb;
+
+ eb = __alloc_extent_buffer(fs_info, start, len);
+ if (!eb)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * The reloc trees are just snapshots, so we need them to appear to be
+ * just like any other fs tree WRT lockdep.
+ */
+ if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID)
+ lockdep_owner = BTRFS_FS_TREE_OBJECTID;
+
+ btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level);
+
+ num_pages = num_extent_pages(eb);
+ for (i = 0; i < num_pages; i++, index++) {
+ struct btrfs_subpage *prealloc = NULL;
+
+ p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
+ if (!p) {
+ exists = ERR_PTR(-ENOMEM);
+ goto free_eb;
+ }
+
+ /*
+ * Preallocate page->private for subpage case, so that we won't
+ * allocate memory with private_lock hold. The memory will be
+ * freed by attach_extent_buffer_page() or freed manually if
+ * we exit earlier.
+ *
+ * Although we have ensured one subpage eb can only have one
+ * page, but it may change in the future for 16K page size
+ * support, so we still preallocate the memory in the loop.
+ */
+ if (fs_info->nodesize < PAGE_SIZE) {
+ prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
+ if (IS_ERR(prealloc)) {
+ ret = PTR_ERR(prealloc);
+ unlock_page(p);
+ put_page(p);
+ exists = ERR_PTR(ret);
+ goto free_eb;
+ }
+ }
+
+ spin_lock(&mapping->private_lock);
+ exists = grab_extent_buffer(fs_info, p);
+ if (exists) {
+ spin_unlock(&mapping->private_lock);
+ unlock_page(p);
+ put_page(p);
+ mark_extent_buffer_accessed(exists, p);
+ btrfs_free_subpage(prealloc);
+ goto free_eb;
+ }
+ /* Should not fail, as we have preallocated the memory */
+ ret = attach_extent_buffer_page(eb, p, prealloc);
+ ASSERT(!ret);
+ /*
+ * To inform we have extra eb under allocation, so that
+ * detach_extent_buffer_page() won't release the page private
+ * when the eb hasn't yet been inserted into radix tree.
+ *
+ * The ref will be decreased when the eb released the page, in
+ * detach_extent_buffer_page().
+ * Thus needs no special handling in error path.
+ */
+ btrfs_page_inc_eb_refs(fs_info, p);
+ spin_unlock(&mapping->private_lock);
+
+ WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len));
+ eb->pages[i] = p;
+ if (!PageUptodate(p))
+ uptodate = 0;
+
+ /*
+ * We can't unlock the pages just yet since the extent buffer
+ * hasn't been properly inserted in the radix tree, this
+ * opens a race with btree_release_folio which can free a page
+ * while we are still filling in all pages for the buffer and
+ * we could crash.
+ */
+ }
+ if (uptodate)
+ set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+again:
+ ret = radix_tree_preload(GFP_NOFS);
+ if (ret) {
+ exists = ERR_PTR(ret);
+ goto free_eb;
+ }
+
+ spin_lock(&fs_info->buffer_lock);
+ ret = radix_tree_insert(&fs_info->buffer_radix,
+ start >> fs_info->sectorsize_bits, eb);
+ spin_unlock(&fs_info->buffer_lock);
+ radix_tree_preload_end();
+ if (ret == -EEXIST) {
+ exists = find_extent_buffer(fs_info, start);
+ if (exists)
+ goto free_eb;
+ else
+ goto again;
+ }
+ /* add one reference for the tree */
+ check_buffer_tree_ref(eb);
+ set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
+
+ /*
+ * Now it's safe to unlock the pages because any calls to
+ * btree_release_folio will correctly detect that a page belongs to a
+ * live buffer and won't free them prematurely.
+ */
+ for (i = 0; i < num_pages; i++)
+ unlock_page(eb->pages[i]);
+ return eb;
+
+free_eb:
+ WARN_ON(!atomic_dec_and_test(&eb->refs));
+ for (i = 0; i < num_pages; i++) {
+ if (eb->pages[i])
+ unlock_page(eb->pages[i]);
+ }
+
+ btrfs_release_extent_buffer(eb);
+ return exists;
+}
+
+static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
+{
+ struct extent_buffer *eb =
+ container_of(head, struct extent_buffer, rcu_head);
+
+ __free_extent_buffer(eb);
+}
+
+static int release_extent_buffer(struct extent_buffer *eb)
+ __releases(&eb->refs_lock)
+{
+ lockdep_assert_held(&eb->refs_lock);
+
+ WARN_ON(atomic_read(&eb->refs) == 0);
+ if (atomic_dec_and_test(&eb->refs)) {
+ if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+
+ spin_unlock(&eb->refs_lock);
+
+ spin_lock(&fs_info->buffer_lock);
+ radix_tree_delete(&fs_info->buffer_radix,
+ eb->start >> fs_info->sectorsize_bits);
+ spin_unlock(&fs_info->buffer_lock);
+ } else {
+ spin_unlock(&eb->refs_lock);
+ }
+
+ btrfs_leak_debug_del_eb(eb);
+ /* Should be safe to release our pages at this point */
+ btrfs_release_extent_buffer_pages(eb);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) {
+ __free_extent_buffer(eb);
+ return 1;
+ }
+#endif
+ call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
+ return 1;
+ }
+ spin_unlock(&eb->refs_lock);
+
+ return 0;
+}
+
+void free_extent_buffer(struct extent_buffer *eb)
+{
+ int refs;
+ if (!eb)
+ return;
+
+ refs = atomic_read(&eb->refs);
+ while (1) {
+ if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
+ || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
+ refs == 1))
+ break;
+ if (atomic_try_cmpxchg(&eb->refs, &refs, refs - 1))
+ return;
+ }
+
+ spin_lock(&eb->refs_lock);
+ if (atomic_read(&eb->refs) == 2 &&
+ test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
+ !extent_buffer_under_io(eb) &&
+ test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+ atomic_dec(&eb->refs);
+
+ /*
+ * I know this is terrible, but it's temporary until we stop tracking
+ * the uptodate bits and such for the extent buffers.
+ */
+ release_extent_buffer(eb);
+}
+
+void free_extent_buffer_stale(struct extent_buffer *eb)
+{
+ if (!eb)
+ return;
+
+ spin_lock(&eb->refs_lock);
+ set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
+
+ if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
+ test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+ atomic_dec(&eb->refs);
+ release_extent_buffer(eb);
+}
+
+static void btree_clear_page_dirty(struct page *page)
+{
+ ASSERT(PageDirty(page));
+ ASSERT(PageLocked(page));
+ clear_page_dirty_for_io(page);
+ xa_lock_irq(&page->mapping->i_pages);
+ if (!PageDirty(page))
+ __xa_clear_mark(&page->mapping->i_pages,
+ page_index(page), PAGECACHE_TAG_DIRTY);
+ xa_unlock_irq(&page->mapping->i_pages);
+}
+
+static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct page *page = eb->pages[0];
+ bool last;
+
+ /* btree_clear_page_dirty() needs page locked */
+ lock_page(page);
+ last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start,
+ eb->len);
+ if (last)
+ btree_clear_page_dirty(page);
+ unlock_page(page);
+ WARN_ON(atomic_read(&eb->refs) == 0);
+}
+
+void clear_extent_buffer_dirty(const struct extent_buffer *eb)
+{
+ int i;
+ int num_pages;
+ struct page *page;
+
+ if (eb->fs_info->nodesize < PAGE_SIZE)
+ return clear_subpage_extent_buffer_dirty(eb);
+
+ num_pages = num_extent_pages(eb);
+
+ for (i = 0; i < num_pages; i++) {
+ page = eb->pages[i];
+ if (!PageDirty(page))
+ continue;
+ lock_page(page);
+ btree_clear_page_dirty(page);
+ ClearPageError(page);
+ unlock_page(page);
+ }
+ WARN_ON(atomic_read(&eb->refs) == 0);
+}
+
+bool set_extent_buffer_dirty(struct extent_buffer *eb)
+{
+ int i;
+ int num_pages;
+ bool was_dirty;
+
+ check_buffer_tree_ref(eb);
+
+ was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
+
+ num_pages = num_extent_pages(eb);
+ WARN_ON(atomic_read(&eb->refs) == 0);
+ WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
+
+ if (!was_dirty) {
+ bool subpage = eb->fs_info->nodesize < PAGE_SIZE;
+
+ /*
+ * For subpage case, we can have other extent buffers in the
+ * same page, and in clear_subpage_extent_buffer_dirty() we
+ * have to clear page dirty without subpage lock held.
+ * This can cause race where our page gets dirty cleared after
+ * we just set it.
+ *
+ * Thankfully, clear_subpage_extent_buffer_dirty() has locked
+ * its page for other reasons, we can use page lock to prevent
+ * the above race.
+ */
+ if (subpage)
+ lock_page(eb->pages[0]);
+ for (i = 0; i < num_pages; i++)
+ btrfs_page_set_dirty(eb->fs_info, eb->pages[i],
+ eb->start, eb->len);
+ if (subpage)
+ unlock_page(eb->pages[0]);
+ }
+#ifdef CONFIG_BTRFS_DEBUG
+ for (i = 0; i < num_pages; i++)
+ ASSERT(PageDirty(eb->pages[i]));
+#endif
+
+ return was_dirty;
+}
+
+void clear_extent_buffer_uptodate(struct extent_buffer *eb)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct page *page;
+ int num_pages;
+ int i;
+
+ clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+ num_pages = num_extent_pages(eb);
+ for (i = 0; i < num_pages; i++) {
+ page = eb->pages[i];
+ if (!page)
+ continue;
+
+ /*
+ * This is special handling for metadata subpage, as regular
+ * btrfs_is_subpage() can not handle cloned/dummy metadata.
+ */
+ if (fs_info->nodesize >= PAGE_SIZE)
+ ClearPageUptodate(page);
+ else
+ btrfs_subpage_clear_uptodate(fs_info, page, eb->start,
+ eb->len);
+ }
+}
+
+void set_extent_buffer_uptodate(struct extent_buffer *eb)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct page *page;
+ int num_pages;
+ int i;
+
+ set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+ num_pages = num_extent_pages(eb);
+ for (i = 0; i < num_pages; i++) {
+ page = eb->pages[i];
+
+ /*
+ * This is special handling for metadata subpage, as regular
+ * btrfs_is_subpage() can not handle cloned/dummy metadata.
+ */
+ if (fs_info->nodesize >= PAGE_SIZE)
+ SetPageUptodate(page);
+ else
+ btrfs_subpage_set_uptodate(fs_info, page, eb->start,
+ eb->len);
+ }
+}
+
+static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
+ int mirror_num)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct extent_io_tree *io_tree;
+ struct page *page = eb->pages[0];
+ struct btrfs_bio_ctrl bio_ctrl = {
+ .mirror_num = mirror_num,
+ };
+ int ret = 0;
+
+ ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags));
+ ASSERT(PagePrivate(page));
+ io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
+
+ if (wait == WAIT_NONE) {
+ if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1))
+ return -EAGAIN;
+ } else {
+ ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1, NULL);
+ if (ret < 0)
+ return ret;
+ }
+
+ ret = 0;
+ if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) ||
+ PageUptodate(page) ||
+ btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) {
+ set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+ unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, NULL);
+ return ret;
+ }
+
+ clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
+ eb->read_mirror = 0;
+ atomic_set(&eb->io_pages, 1);
+ check_buffer_tree_ref(eb);
+ bio_ctrl.end_io_func = end_bio_extent_readpage;
+
+ btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);
+
+ btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len);
+ ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl,
+ eb->start, page, eb->len,
+ eb->start - page_offset(page), 0, true);
+ if (ret) {
+ /*
+ * In the endio function, if we hit something wrong we will
+ * increase the io_pages, so here we need to decrease it for
+ * error path.
+ */
+ atomic_dec(&eb->io_pages);
+ }
+ submit_one_bio(&bio_ctrl);
+ if (ret || wait != WAIT_COMPLETE)
+ return ret;
+
+ wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, EXTENT_LOCKED);
+ if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
+ ret = -EIO;
+ return ret;
+}
+
+int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
+{
+ int i;
+ struct page *page;
+ int err;
+ int ret = 0;
+ int locked_pages = 0;
+ int all_uptodate = 1;
+ int num_pages;
+ unsigned long num_reads = 0;
+ struct btrfs_bio_ctrl bio_ctrl = {
+ .mirror_num = mirror_num,
+ };
+
+ if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
+ return 0;
+
+ /*
+ * We could have had EXTENT_BUFFER_UPTODATE cleared by the write
+ * operation, which could potentially still be in flight. In this case
+ * we simply want to return an error.
+ */
+ if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)))
+ return -EIO;
+
+ if (eb->fs_info->nodesize < PAGE_SIZE)
+ return read_extent_buffer_subpage(eb, wait, mirror_num);
+
+ num_pages = num_extent_pages(eb);
+ for (i = 0; i < num_pages; i++) {
+ page = eb->pages[i];
+ if (wait == WAIT_NONE) {
+ /*
+ * WAIT_NONE is only utilized by readahead. If we can't
+ * acquire the lock atomically it means either the eb
+ * is being read out or under modification.
+ * Either way the eb will be or has been cached,
+ * readahead can exit safely.
+ */
+ if (!trylock_page(page))
+ goto unlock_exit;
+ } else {
+ lock_page(page);
+ }
+ locked_pages++;
+ }
+ /*
+ * We need to firstly lock all pages to make sure that
+ * the uptodate bit of our pages won't be affected by
+ * clear_extent_buffer_uptodate().
+ */
+ for (i = 0; i < num_pages; i++) {
+ page = eb->pages[i];
+ if (!PageUptodate(page)) {
+ num_reads++;
+ all_uptodate = 0;
+ }
+ }
+
+ if (all_uptodate) {
+ set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+ goto unlock_exit;
+ }
+
+ clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
+ eb->read_mirror = 0;
+ atomic_set(&eb->io_pages, num_reads);
+ /*
+ * It is possible for release_folio to clear the TREE_REF bit before we
+ * set io_pages. See check_buffer_tree_ref for a more detailed comment.
+ */
+ check_buffer_tree_ref(eb);
+ bio_ctrl.end_io_func = end_bio_extent_readpage;
+ for (i = 0; i < num_pages; i++) {
+ page = eb->pages[i];
+
+ if (!PageUptodate(page)) {
+ if (ret) {
+ atomic_dec(&eb->io_pages);
+ unlock_page(page);
+ continue;
+ }
+
+ ClearPageError(page);
+ err = submit_extent_page(REQ_OP_READ, NULL,
+ &bio_ctrl, page_offset(page), page,
+ PAGE_SIZE, 0, 0, false);
+ if (err) {
+ /*
+ * We failed to submit the bio so it's the
+ * caller's responsibility to perform cleanup
+ * i.e unlock page/set error bit.
+ */
+ ret = err;
+ SetPageError(page);
+ unlock_page(page);
+ atomic_dec(&eb->io_pages);
+ }
+ } else {
+ unlock_page(page);
+ }
+ }
+
+ submit_one_bio(&bio_ctrl);
+
+ if (ret || wait != WAIT_COMPLETE)
+ return ret;
+
+ for (i = 0; i < num_pages; i++) {
+ page = eb->pages[i];
+ wait_on_page_locked(page);
+ if (!PageUptodate(page))
+ ret = -EIO;
+ }
+
+ return ret;
+
+unlock_exit:
+ while (locked_pages > 0) {
+ locked_pages--;
+ page = eb->pages[locked_pages];
+ unlock_page(page);
+ }
+ return ret;
+}
+
+static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
+ unsigned long len)
+{
+ btrfs_warn(eb->fs_info,
+ "access to eb bytenr %llu len %lu out of range start %lu len %lu",
+ eb->start, eb->len, start, len);
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+
+ return true;
+}
+
+/*
+ * Check if the [start, start + len) range is valid before reading/writing
+ * the eb.
+ * NOTE: @start and @len are offset inside the eb, not logical address.
+ *
+ * Caller should not touch the dst/src memory if this function returns error.
+ */
+static inline int check_eb_range(const struct extent_buffer *eb,
+ unsigned long start, unsigned long len)
+{
+ unsigned long offset;
+
+ /* start, start + len should not go beyond eb->len nor overflow */
+ if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len))
+ return report_eb_range(eb, start, len);
+
+ return false;
+}
+
+void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
+ unsigned long start, unsigned long len)
+{
+ size_t cur;
+ size_t offset;
+ struct page *page;
+ char *kaddr;
+ char *dst = (char *)dstv;
+ unsigned long i = get_eb_page_index(start);
+
+ if (check_eb_range(eb, start, len)) {
+ /*
+ * Invalid range hit, reset the memory, so callers won't get
+ * some random garbage for their uninitialzed memory.
+ */
+ memset(dstv, 0, len);
+ return;
+ }
+
+ offset = get_eb_offset_in_page(eb, start);
+
+ while (len > 0) {
+ page = eb->pages[i];
+
+ cur = min(len, (PAGE_SIZE - offset));
+ kaddr = page_address(page);
+ memcpy(dst, kaddr + offset, cur);
+
+ dst += cur;
+ len -= cur;
+ offset = 0;
+ i++;
+ }
+}
+
+int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
+ void __user *dstv,
+ unsigned long start, unsigned long len)
+{
+ size_t cur;
+ size_t offset;
+ struct page *page;
+ char *kaddr;
+ char __user *dst = (char __user *)dstv;
+ unsigned long i = get_eb_page_index(start);
+ int ret = 0;
+
+ WARN_ON(start > eb->len);
+ WARN_ON(start + len > eb->start + eb->len);
+
+ offset = get_eb_offset_in_page(eb, start);
+
+ while (len > 0) {
+ page = eb->pages[i];
+
+ cur = min(len, (PAGE_SIZE - offset));
+ kaddr = page_address(page);
+ if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
+ ret = -EFAULT;
+ break;
+ }
+
+ dst += cur;
+ len -= cur;
+ offset = 0;
+ i++;
+ }
+
+ return ret;
+}
+
+int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
+ unsigned long start, unsigned long len)
+{
+ size_t cur;
+ size_t offset;
+ struct page *page;
+ char *kaddr;
+ char *ptr = (char *)ptrv;
+ unsigned long i = get_eb_page_index(start);
+ int ret = 0;
+
+ if (check_eb_range(eb, start, len))
+ return -EINVAL;
+
+ offset = get_eb_offset_in_page(eb, start);
+
+ while (len > 0) {
+ page = eb->pages[i];
+
+ cur = min(len, (PAGE_SIZE - offset));
+
+ kaddr = page_address(page);
+ ret = memcmp(ptr, kaddr + offset, cur);
+ if (ret)
+ break;
+
+ ptr += cur;
+ len -= cur;
+ offset = 0;
+ i++;
+ }
+ return ret;
+}
+
+/*
+ * Check that the extent buffer is uptodate.
+ *
+ * For regular sector size == PAGE_SIZE case, check if @page is uptodate.
+ * For subpage case, check if the range covered by the eb has EXTENT_UPTODATE.
+ */
+static void assert_eb_page_uptodate(const struct extent_buffer *eb,
+ struct page *page)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+
+ /*
+ * If we are using the commit root we could potentially clear a page
+ * Uptodate while we're using the extent buffer that we've previously
+ * looked up. We don't want to complain in this case, as the page was
+ * valid before, we just didn't write it out. Instead we want to catch
+ * the case where we didn't actually read the block properly, which
+ * would have !PageUptodate && !PageError, as we clear PageError before
+ * reading.
+ */
+ if (fs_info->nodesize < PAGE_SIZE) {
+ bool uptodate, error;
+
+ uptodate = btrfs_subpage_test_uptodate(fs_info, page,
+ eb->start, eb->len);
+ error = btrfs_subpage_test_error(fs_info, page, eb->start, eb->len);
+ WARN_ON(!uptodate && !error);
+ } else {
+ WARN_ON(!PageUptodate(page) && !PageError(page));
+ }
+}
+
+void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
+ const void *srcv)
+{
+ char *kaddr;
+
+ assert_eb_page_uptodate(eb, eb->pages[0]);
+ kaddr = page_address(eb->pages[0]) +
+ get_eb_offset_in_page(eb, offsetof(struct btrfs_header,
+ chunk_tree_uuid));
+ memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
+}
+
+void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
+{
+ char *kaddr;
+
+ assert_eb_page_uptodate(eb, eb->pages[0]);
+ kaddr = page_address(eb->pages[0]) +
+ get_eb_offset_in_page(eb, offsetof(struct btrfs_header, fsid));
+ memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
+}
+
+void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
+ unsigned long start, unsigned long len)
+{
+ size_t cur;
+ size_t offset;
+ struct page *page;
+ char *kaddr;
+ char *src = (char *)srcv;
+ unsigned long i = get_eb_page_index(start);
+
+ WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags));
+
+ if (check_eb_range(eb, start, len))
+ return;
+
+ offset = get_eb_offset_in_page(eb, start);
+
+ while (len > 0) {
+ page = eb->pages[i];
+ assert_eb_page_uptodate(eb, page);
+
+ cur = min(len, PAGE_SIZE - offset);
+ kaddr = page_address(page);
+ memcpy(kaddr + offset, src, cur);
+
+ src += cur;
+ len -= cur;
+ offset = 0;
+ i++;
+ }
+}
+
+void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
+ unsigned long len)
+{
+ size_t cur;
+ size_t offset;
+ struct page *page;
+ char *kaddr;
+ unsigned long i = get_eb_page_index(start);
+
+ if (check_eb_range(eb, start, len))
+ return;
+
+ offset = get_eb_offset_in_page(eb, start);
+
+ while (len > 0) {
+ page = eb->pages[i];
+ assert_eb_page_uptodate(eb, page);
+
+ cur = min(len, PAGE_SIZE - offset);
+ kaddr = page_address(page);
+ memset(kaddr + offset, 0, cur);
+
+ len -= cur;
+ offset = 0;
+ i++;
+ }
+}
+
+void copy_extent_buffer_full(const struct extent_buffer *dst,
+ const struct extent_buffer *src)
+{
+ int i;
+ int num_pages;
+
+ ASSERT(dst->len == src->len);
+
+ if (dst->fs_info->nodesize >= PAGE_SIZE) {
+ num_pages = num_extent_pages(dst);
+ for (i = 0; i < num_pages; i++)
+ copy_page(page_address(dst->pages[i]),
+ page_address(src->pages[i]));
+ } else {
+ size_t src_offset = get_eb_offset_in_page(src, 0);
+ size_t dst_offset = get_eb_offset_in_page(dst, 0);
+
+ ASSERT(src->fs_info->nodesize < PAGE_SIZE);
+ memcpy(page_address(dst->pages[0]) + dst_offset,
+ page_address(src->pages[0]) + src_offset,
+ src->len);
+ }
+}
+
+void copy_extent_buffer(const struct extent_buffer *dst,
+ const struct extent_buffer *src,
+ unsigned long dst_offset, unsigned long src_offset,
+ unsigned long len)
+{
+ u64 dst_len = dst->len;
+ size_t cur;
+ size_t offset;
+ struct page *page;
+ char *kaddr;
+ unsigned long i = get_eb_page_index(dst_offset);
+
+ if (check_eb_range(dst, dst_offset, len) ||
+ check_eb_range(src, src_offset, len))
+ return;
+
+ WARN_ON(src->len != dst_len);
+
+ offset = get_eb_offset_in_page(dst, dst_offset);
+
+ while (len > 0) {
+ page = dst->pages[i];
+ assert_eb_page_uptodate(dst, page);
+
+ cur = min(len, (unsigned long)(PAGE_SIZE - offset));
+
+ kaddr = page_address(page);
+ read_extent_buffer(src, kaddr + offset, src_offset, cur);
+
+ src_offset += cur;
+ len -= cur;
+ offset = 0;
+ i++;
+ }
+}
+
+/*
+ * eb_bitmap_offset() - calculate the page and offset of the byte containing the
+ * given bit number
+ * @eb: the extent buffer
+ * @start: offset of the bitmap item in the extent buffer
+ * @nr: bit number
+ * @page_index: return index of the page in the extent buffer that contains the
+ * given bit number
+ * @page_offset: return offset into the page given by page_index
+ *
+ * This helper hides the ugliness of finding the byte in an extent buffer which
+ * contains a given bit.
+ */
+static inline void eb_bitmap_offset(const struct extent_buffer *eb,
+ unsigned long start, unsigned long nr,
+ unsigned long *page_index,
+ size_t *page_offset)
+{
+ size_t byte_offset = BIT_BYTE(nr);
+ size_t offset;
+
+ /*
+ * The byte we want is the offset of the extent buffer + the offset of
+ * the bitmap item in the extent buffer + the offset of the byte in the
+ * bitmap item.
+ */
+ offset = start + offset_in_page(eb->start) + byte_offset;
+
+ *page_index = offset >> PAGE_SHIFT;
+ *page_offset = offset_in_page(offset);
+}
+
+/**
+ * extent_buffer_test_bit - determine whether a bit in a bitmap item is set
+ * @eb: the extent buffer
+ * @start: offset of the bitmap item in the extent buffer
+ * @nr: bit number to test
+ */
+int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
+ unsigned long nr)
+{
+ u8 *kaddr;
+ struct page *page;
+ unsigned long i;
+ size_t offset;
+
+ eb_bitmap_offset(eb, start, nr, &i, &offset);
+ page = eb->pages[i];
+ assert_eb_page_uptodate(eb, page);
+ kaddr = page_address(page);
+ return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
+}
+
+/**
+ * extent_buffer_bitmap_set - set an area of a bitmap
+ * @eb: the extent buffer
+ * @start: offset of the bitmap item in the extent buffer
+ * @pos: bit number of the first bit
+ * @len: number of bits to set
+ */
+void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
+ unsigned long pos, unsigned long len)
+{
+ u8 *kaddr;
+ struct page *page;
+ unsigned long i;
+ size_t offset;
+ const unsigned int size = pos + len;
+ int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
+ u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
+
+ eb_bitmap_offset(eb, start, pos, &i, &offset);
+ page = eb->pages[i];
+ assert_eb_page_uptodate(eb, page);
+ kaddr = page_address(page);
+
+ while (len >= bits_to_set) {
+ kaddr[offset] |= mask_to_set;
+ len -= bits_to_set;
+ bits_to_set = BITS_PER_BYTE;
+ mask_to_set = ~0;
+ if (++offset >= PAGE_SIZE && len > 0) {
+ offset = 0;
+ page = eb->pages[++i];
+ assert_eb_page_uptodate(eb, page);
+ kaddr = page_address(page);
+ }
+ }
+ if (len) {
+ mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
+ kaddr[offset] |= mask_to_set;
+ }
+}
+
+
+/**
+ * extent_buffer_bitmap_clear - clear an area of a bitmap
+ * @eb: the extent buffer
+ * @start: offset of the bitmap item in the extent buffer
+ * @pos: bit number of the first bit
+ * @len: number of bits to clear
+ */
+void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
+ unsigned long start, unsigned long pos,
+ unsigned long len)
+{
+ u8 *kaddr;
+ struct page *page;
+ unsigned long i;
+ size_t offset;
+ const unsigned int size = pos + len;
+ int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
+ u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);
+
+ eb_bitmap_offset(eb, start, pos, &i, &offset);
+ page = eb->pages[i];
+ assert_eb_page_uptodate(eb, page);
+ kaddr = page_address(page);
+
+ while (len >= bits_to_clear) {
+ kaddr[offset] &= ~mask_to_clear;
+ len -= bits_to_clear;
+ bits_to_clear = BITS_PER_BYTE;
+ mask_to_clear = ~0;
+ if (++offset >= PAGE_SIZE && len > 0) {
+ offset = 0;
+ page = eb->pages[++i];
+ assert_eb_page_uptodate(eb, page);
+ kaddr = page_address(page);
+ }
+ }
+ if (len) {
+ mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
+ kaddr[offset] &= ~mask_to_clear;
+ }
+}
+
+static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
+{
+ unsigned long distance = (src > dst) ? src - dst : dst - src;
+ return distance < len;
+}
+
+static void copy_pages(struct page *dst_page, struct page *src_page,
+ unsigned long dst_off, unsigned long src_off,
+ unsigned long len)
+{
+ char *dst_kaddr = page_address(dst_page);
+ char *src_kaddr;
+ int must_memmove = 0;
+
+ if (dst_page != src_page) {
+ src_kaddr = page_address(src_page);
+ } else {
+ src_kaddr = dst_kaddr;
+ if (areas_overlap(src_off, dst_off, len))
+ must_memmove = 1;
+ }
+
+ if (must_memmove)
+ memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
+ else
+ memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
+}
+
+void memcpy_extent_buffer(const struct extent_buffer *dst,
+ unsigned long dst_offset, unsigned long src_offset,
+ unsigned long len)
+{
+ size_t cur;
+ size_t dst_off_in_page;
+ size_t src_off_in_page;
+ unsigned long dst_i;
+ unsigned long src_i;
+
+ if (check_eb_range(dst, dst_offset, len) ||
+ check_eb_range(dst, src_offset, len))
+ return;
+
+ while (len > 0) {
+ dst_off_in_page = get_eb_offset_in_page(dst, dst_offset);
+ src_off_in_page = get_eb_offset_in_page(dst, src_offset);
+
+ dst_i = get_eb_page_index(dst_offset);
+ src_i = get_eb_page_index(src_offset);
+
+ cur = min(len, (unsigned long)(PAGE_SIZE -
+ src_off_in_page));
+ cur = min_t(unsigned long, cur,
+ (unsigned long)(PAGE_SIZE - dst_off_in_page));
+
+ copy_pages(dst->pages[dst_i], dst->pages[src_i],
+ dst_off_in_page, src_off_in_page, cur);
+
+ src_offset += cur;
+ dst_offset += cur;
+ len -= cur;
+ }
+}
+
+void memmove_extent_buffer(const struct extent_buffer *dst,
+ unsigned long dst_offset, unsigned long src_offset,
+ unsigned long len)
+{
+ size_t cur;
+ size_t dst_off_in_page;
+ size_t src_off_in_page;
+ unsigned long dst_end = dst_offset + len - 1;
+ unsigned long src_end = src_offset + len - 1;
+ unsigned long dst_i;
+ unsigned long src_i;
+
+ if (check_eb_range(dst, dst_offset, len) ||
+ check_eb_range(dst, src_offset, len))
+ return;
+ if (dst_offset < src_offset) {
+ memcpy_extent_buffer(dst, dst_offset, src_offset, len);
+ return;
+ }
+ while (len > 0) {
+ dst_i = get_eb_page_index(dst_end);
+ src_i = get_eb_page_index(src_end);
+
+ dst_off_in_page = get_eb_offset_in_page(dst, dst_end);
+ src_off_in_page = get_eb_offset_in_page(dst, src_end);
+
+ cur = min_t(unsigned long, len, src_off_in_page + 1);
+ cur = min(cur, dst_off_in_page + 1);
+ copy_pages(dst->pages[dst_i], dst->pages[src_i],
+ dst_off_in_page - cur + 1,
+ src_off_in_page - cur + 1, cur);
+
+ dst_end -= cur;
+ src_end -= cur;
+ len -= cur;
+ }
+}
+
+#define GANG_LOOKUP_SIZE 16
+static struct extent_buffer *get_next_extent_buffer(
+ struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
+{
+ struct extent_buffer *gang[GANG_LOOKUP_SIZE];
+ struct extent_buffer *found = NULL;
+ u64 page_start = page_offset(page);
+ u64 cur = page_start;
+
+ ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
+ lockdep_assert_held(&fs_info->buffer_lock);
+
+ while (cur < page_start + PAGE_SIZE) {
+ int ret;
+ int i;
+
+ ret = radix_tree_gang_lookup(&fs_info->buffer_radix,
+ (void **)gang, cur >> fs_info->sectorsize_bits,
+ min_t(unsigned int, GANG_LOOKUP_SIZE,
+ PAGE_SIZE / fs_info->nodesize));
+ if (ret == 0)
+ goto out;
+ for (i = 0; i < ret; i++) {
+ /* Already beyond page end */
+ if (gang[i]->start >= page_start + PAGE_SIZE)
+ goto out;
+ /* Found one */
+ if (gang[i]->start >= bytenr) {
+ found = gang[i];
+ goto out;
+ }
+ }
+ cur = gang[ret - 1]->start + gang[ret - 1]->len;
+ }
+out:
+ return found;
+}
+
+static int try_release_subpage_extent_buffer(struct page *page)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ u64 cur = page_offset(page);
+ const u64 end = page_offset(page) + PAGE_SIZE;
+ int ret;
+
+ while (cur < end) {
+ struct extent_buffer *eb = NULL;
+
+ /*
+ * Unlike try_release_extent_buffer() which uses page->private
+ * to grab buffer, for subpage case we rely on radix tree, thus
+ * we need to ensure radix tree consistency.
+ *
+ * We also want an atomic snapshot of the radix tree, thus go
+ * with spinlock rather than RCU.
+ */
+ spin_lock(&fs_info->buffer_lock);
+ eb = get_next_extent_buffer(fs_info, page, cur);
+ if (!eb) {
+ /* No more eb in the page range after or at cur */
+ spin_unlock(&fs_info->buffer_lock);
+ break;
+ }
+ cur = eb->start + eb->len;
+
+ /*
+ * The same as try_release_extent_buffer(), to ensure the eb
+ * won't disappear out from under us.
+ */
+ spin_lock(&eb->refs_lock);
+ if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
+ spin_unlock(&eb->refs_lock);
+ spin_unlock(&fs_info->buffer_lock);
+ break;
+ }
+ spin_unlock(&fs_info->buffer_lock);
+
+ /*
+ * If tree ref isn't set then we know the ref on this eb is a
+ * real ref, so just return, this eb will likely be freed soon
+ * anyway.
+ */
+ if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
+ spin_unlock(&eb->refs_lock);
+ break;
+ }
+
+ /*
+ * Here we don't care about the return value, we will always
+ * check the page private at the end. And
+ * release_extent_buffer() will release the refs_lock.
+ */
+ release_extent_buffer(eb);
+ }
+ /*
+ * Finally to check if we have cleared page private, as if we have
+ * released all ebs in the page, the page private should be cleared now.
+ */
+ spin_lock(&page->mapping->private_lock);
+ if (!PagePrivate(page))
+ ret = 1;
+ else
+ ret = 0;
+ spin_unlock(&page->mapping->private_lock);
+ return ret;
+
+}
+
+int try_release_extent_buffer(struct page *page)
+{
+ struct extent_buffer *eb;
+
+ if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
+ return try_release_subpage_extent_buffer(page);
+
+ /*
+ * We need to make sure nobody is changing page->private, as we rely on
+ * page->private as the pointer to extent buffer.
+ */
+ spin_lock(&page->mapping->private_lock);
+ if (!PagePrivate(page)) {
+ spin_unlock(&page->mapping->private_lock);
+ return 1;
+ }
+
+ eb = (struct extent_buffer *)page->private;
+ BUG_ON(!eb);
+
+ /*
+ * This is a little awful but should be ok, we need to make sure that
+ * the eb doesn't disappear out from under us while we're looking at
+ * this page.
+ */
+ spin_lock(&eb->refs_lock);
+ if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
+ spin_unlock(&eb->refs_lock);
+ spin_unlock(&page->mapping->private_lock);
+ return 0;
+ }
+ spin_unlock(&page->mapping->private_lock);
+
+ /*
+ * If tree ref isn't set then we know the ref on this eb is a real ref,
+ * so just return, this page will likely be freed soon anyway.
+ */
+ if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
+ spin_unlock(&eb->refs_lock);
+ return 0;
+ }
+
+ return release_extent_buffer(eb);
+}
+
+/*
+ * btrfs_readahead_tree_block - attempt to readahead a child block
+ * @fs_info: the fs_info
+ * @bytenr: bytenr to read
+ * @owner_root: objectid of the root that owns this eb
+ * @gen: generation for the uptodate check, can be 0
+ * @level: level for the eb
+ *
+ * Attempt to readahead a tree block at @bytenr. If @gen is 0 then we do a
+ * normal uptodate check of the eb, without checking the generation. If we have
+ * to read the block we will not block on anything.
+ */
+void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
+ u64 bytenr, u64 owner_root, u64 gen, int level)
+{
+ struct extent_buffer *eb;
+ int ret;
+
+ eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
+ if (IS_ERR(eb))
+ return;
+
+ if (btrfs_buffer_uptodate(eb, gen, 1)) {
+ free_extent_buffer(eb);
+ return;
+ }
+
+ ret = read_extent_buffer_pages(eb, WAIT_NONE, 0);
+ if (ret < 0)
+ free_extent_buffer_stale(eb);
+ else
+ free_extent_buffer(eb);
+}
+
+/*
+ * btrfs_readahead_node_child - readahead a node's child block
+ * @node: parent node we're reading from
+ * @slot: slot in the parent node for the child we want to read
+ *
+ * A helper for btrfs_readahead_tree_block, we simply read the bytenr pointed at
+ * the slot in the node provided.
+ */
+void btrfs_readahead_node_child(struct extent_buffer *node, int slot)
+{
+ btrfs_readahead_tree_block(node->fs_info,
+ btrfs_node_blockptr(node, slot),
+ btrfs_header_owner(node),
+ btrfs_node_ptr_generation(node, slot),
+ btrfs_header_level(node) - 1);
+}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
new file mode 100644
index 000000000..7929f054d
--- /dev/null
+++ b/fs/btrfs/extent_io.h
@@ -0,0 +1,296 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_EXTENT_IO_H
+#define BTRFS_EXTENT_IO_H
+
+#include <linux/rbtree.h>
+#include <linux/refcount.h>
+#include <linux/fiemap.h>
+#include <linux/btrfs_tree.h>
+#include "compression.h"
+#include "ulist.h"
+
+enum {
+ EXTENT_BUFFER_UPTODATE,
+ EXTENT_BUFFER_DIRTY,
+ EXTENT_BUFFER_CORRUPT,
+ /* this got triggered by readahead */
+ EXTENT_BUFFER_READAHEAD,
+ EXTENT_BUFFER_TREE_REF,
+ EXTENT_BUFFER_STALE,
+ EXTENT_BUFFER_WRITEBACK,
+ /* read IO error */
+ EXTENT_BUFFER_READ_ERR,
+ EXTENT_BUFFER_UNMAPPED,
+ EXTENT_BUFFER_IN_TREE,
+ /* write IO error */
+ EXTENT_BUFFER_WRITE_ERR,
+ EXTENT_BUFFER_NO_CHECK,
+};
+
+/* these are flags for __process_pages_contig */
+#define PAGE_UNLOCK (1 << 0)
+/* Page starts writeback, clear dirty bit and set writeback bit */
+#define PAGE_START_WRITEBACK (1 << 1)
+#define PAGE_END_WRITEBACK (1 << 2)
+#define PAGE_SET_ORDERED (1 << 3)
+#define PAGE_SET_ERROR (1 << 4)
+#define PAGE_LOCK (1 << 5)
+
+/*
+ * page->private values. Every page that is controlled by the extent
+ * map has page->private set to one.
+ */
+#define EXTENT_PAGE_PRIVATE 1
+
+/*
+ * The extent buffer bitmap operations are done with byte granularity instead of
+ * word granularity for two reasons:
+ * 1. The bitmaps must be little-endian on disk.
+ * 2. Bitmap items are not guaranteed to be aligned to a word and therefore a
+ * single word in a bitmap may straddle two pages in the extent buffer.
+ */
+#define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE)
+#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1)
+#define BITMAP_FIRST_BYTE_MASK(start) \
+ ((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK)
+#define BITMAP_LAST_BYTE_MASK(nbits) \
+ (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1)))
+
+struct btrfs_bio;
+struct btrfs_root;
+struct btrfs_inode;
+struct btrfs_fs_info;
+struct io_failure_record;
+struct extent_io_tree;
+
+int __init extent_buffer_init_cachep(void);
+void __cold extent_buffer_free_cachep(void);
+
+typedef void (submit_bio_hook_t)(struct inode *inode, struct bio *bio,
+ int mirror_num,
+ enum btrfs_compression_type compress_type);
+
+typedef blk_status_t (extent_submit_bio_start_t)(struct inode *inode,
+ struct bio *bio, u64 dio_file_offset);
+
+#define INLINE_EXTENT_BUFFER_PAGES (BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE)
+struct extent_buffer {
+ u64 start;
+ unsigned long len;
+ unsigned long bflags;
+ struct btrfs_fs_info *fs_info;
+ spinlock_t refs_lock;
+ atomic_t refs;
+ atomic_t io_pages;
+ int read_mirror;
+ struct rcu_head rcu_head;
+ pid_t lock_owner;
+ /* >= 0 if eb belongs to a log tree, -1 otherwise */
+ s8 log_index;
+
+ struct rw_semaphore lock;
+
+ struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
+ struct list_head release_list;
+#ifdef CONFIG_BTRFS_DEBUG
+ struct list_head leak_list;
+#endif
+};
+
+/*
+ * Structure to record how many bytes and which ranges are set/cleared
+ */
+struct extent_changeset {
+ /* How many bytes are set/cleared in this operation */
+ u64 bytes_changed;
+
+ /* Changed ranges */
+ struct ulist range_changed;
+};
+
+static inline void extent_changeset_init(struct extent_changeset *changeset)
+{
+ changeset->bytes_changed = 0;
+ ulist_init(&changeset->range_changed);
+}
+
+static inline struct extent_changeset *extent_changeset_alloc(void)
+{
+ struct extent_changeset *ret;
+
+ ret = kmalloc(sizeof(*ret), GFP_KERNEL);
+ if (!ret)
+ return NULL;
+
+ extent_changeset_init(ret);
+ return ret;
+}
+
+static inline void extent_changeset_release(struct extent_changeset *changeset)
+{
+ if (!changeset)
+ return;
+ changeset->bytes_changed = 0;
+ ulist_release(&changeset->range_changed);
+}
+
+static inline void extent_changeset_free(struct extent_changeset *changeset)
+{
+ if (!changeset)
+ return;
+ extent_changeset_release(changeset);
+ kfree(changeset);
+}
+
+struct extent_map_tree;
+
+int try_release_extent_mapping(struct page *page, gfp_t mask);
+int try_release_extent_buffer(struct page *page);
+
+int btrfs_read_folio(struct file *file, struct folio *folio);
+int extent_write_locked_range(struct inode *inode, u64 start, u64 end);
+int extent_writepages(struct address_space *mapping,
+ struct writeback_control *wbc);
+int btree_write_cache_pages(struct address_space *mapping,
+ struct writeback_control *wbc);
+void extent_readahead(struct readahead_control *rac);
+int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len);
+int set_page_extent_mapped(struct page *page);
+void clear_page_extent_mapped(struct page *page);
+
+struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start, u64 owner_root, int level);
+struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start, unsigned long len);
+struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start);
+struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src);
+struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start);
+void free_extent_buffer(struct extent_buffer *eb);
+void free_extent_buffer_stale(struct extent_buffer *eb);
+#define WAIT_NONE 0
+#define WAIT_COMPLETE 1
+#define WAIT_PAGE_LOCK 2
+int read_extent_buffer_pages(struct extent_buffer *eb, int wait,
+ int mirror_num);
+void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
+void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
+ u64 bytenr, u64 owner_root, u64 gen, int level);
+void btrfs_readahead_node_child(struct extent_buffer *node, int slot);
+
+static inline int num_extent_pages(const struct extent_buffer *eb)
+{
+ /*
+ * For sectorsize == PAGE_SIZE case, since nodesize is always aligned to
+ * sectorsize, it's just eb->len >> PAGE_SHIFT.
+ *
+ * For sectorsize < PAGE_SIZE case, we could have nodesize < PAGE_SIZE,
+ * thus have to ensure we get at least one page.
+ */
+ return (eb->len >> PAGE_SHIFT) ?: 1;
+}
+
+static inline int extent_buffer_uptodate(const struct extent_buffer *eb)
+{
+ return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+}
+
+int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
+ unsigned long start, unsigned long len);
+void read_extent_buffer(const struct extent_buffer *eb, void *dst,
+ unsigned long start,
+ unsigned long len);
+int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
+ void __user *dst, unsigned long start,
+ unsigned long len);
+void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *src);
+void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
+ const void *src);
+void write_extent_buffer(const struct extent_buffer *eb, const void *src,
+ unsigned long start, unsigned long len);
+void copy_extent_buffer_full(const struct extent_buffer *dst,
+ const struct extent_buffer *src);
+void copy_extent_buffer(const struct extent_buffer *dst,
+ const struct extent_buffer *src,
+ unsigned long dst_offset, unsigned long src_offset,
+ unsigned long len);
+void memcpy_extent_buffer(const struct extent_buffer *dst,
+ unsigned long dst_offset, unsigned long src_offset,
+ unsigned long len);
+void memmove_extent_buffer(const struct extent_buffer *dst,
+ unsigned long dst_offset, unsigned long src_offset,
+ unsigned long len);
+void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
+ unsigned long len);
+int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
+ unsigned long pos);
+void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
+ unsigned long pos, unsigned long len);
+void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
+ unsigned long start, unsigned long pos,
+ unsigned long len);
+void clear_extent_buffer_dirty(const struct extent_buffer *eb);
+bool set_extent_buffer_dirty(struct extent_buffer *eb);
+void set_extent_buffer_uptodate(struct extent_buffer *eb);
+void clear_extent_buffer_uptodate(struct extent_buffer *eb);
+int extent_buffer_under_io(const struct extent_buffer *eb);
+void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
+void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
+void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
+ struct page *locked_page,
+ u32 bits_to_clear, unsigned long page_ops);
+int extent_invalidate_folio(struct extent_io_tree *tree,
+ struct folio *folio, size_t offset);
+
+int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array);
+
+void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
+int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num);
+
+/*
+ * When IO fails, either with EIO or csum verification fails, we
+ * try other mirrors that might have a good copy of the data. This
+ * io_failure_record is used to record state as we go through all the
+ * mirrors. If another mirror has good data, the sector is set up to date
+ * and things continue. If a good mirror can't be found, the original
+ * bio end_io callback is called to indicate things have failed.
+ */
+struct io_failure_record {
+ /* Use rb_simple_node for search/insert */
+ struct {
+ struct rb_node rb_node;
+ u64 bytenr;
+ };
+ struct page *page;
+ u64 len;
+ u64 logical;
+ int this_mirror;
+ int failed_mirror;
+ int num_copies;
+};
+
+int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
+ u32 bio_offset, struct page *page, unsigned int pgoff,
+ submit_bio_hook_t *submit_bio_hook);
+void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end);
+int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start,
+ struct page *page, unsigned int pg_offset);
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+bool find_lock_delalloc_range(struct inode *inode,
+ struct page *locked_page, u64 *start,
+ u64 *end);
+#endif
+struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start);
+
+#ifdef CONFIG_BTRFS_DEBUG
+void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info);
+#else
+#define btrfs_extent_buffer_leak_debug_check(fs_info) do {} while (0)
+#endif
+
+#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
new file mode 100644
index 000000000..56d7580fd
--- /dev/null
+++ b/fs/btrfs/extent_map.c
@@ -0,0 +1,985 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include "ctree.h"
+#include "volumes.h"
+#include "extent_map.h"
+#include "compression.h"
+#include "btrfs_inode.h"
+
+
+static struct kmem_cache *extent_map_cache;
+
+int __init extent_map_init(void)
+{
+ extent_map_cache = kmem_cache_create("btrfs_extent_map",
+ sizeof(struct extent_map), 0,
+ SLAB_MEM_SPREAD, NULL);
+ if (!extent_map_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void __cold extent_map_exit(void)
+{
+ kmem_cache_destroy(extent_map_cache);
+}
+
+/**
+ * extent_map_tree_init - initialize extent map tree
+ * @tree: tree to initialize
+ *
+ * Initialize the extent tree @tree. Should be called for each new inode
+ * or other user of the extent_map interface.
+ */
+void extent_map_tree_init(struct extent_map_tree *tree)
+{
+ tree->map = RB_ROOT_CACHED;
+ INIT_LIST_HEAD(&tree->modified_extents);
+ rwlock_init(&tree->lock);
+}
+
+/**
+ * alloc_extent_map - allocate new extent map structure
+ *
+ * Allocate a new extent_map structure. The new structure is
+ * returned with a reference count of one and needs to be
+ * freed using free_extent_map()
+ */
+struct extent_map *alloc_extent_map(void)
+{
+ struct extent_map *em;
+ em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
+ if (!em)
+ return NULL;
+ RB_CLEAR_NODE(&em->rb_node);
+ em->compress_type = BTRFS_COMPRESS_NONE;
+ refcount_set(&em->refs, 1);
+ INIT_LIST_HEAD(&em->list);
+ return em;
+}
+
+/**
+ * free_extent_map - drop reference count of an extent_map
+ * @em: extent map being released
+ *
+ * Drops the reference out on @em by one and free the structure
+ * if the reference count hits zero.
+ */
+void free_extent_map(struct extent_map *em)
+{
+ if (!em)
+ return;
+ if (refcount_dec_and_test(&em->refs)) {
+ WARN_ON(extent_map_in_tree(em));
+ WARN_ON(!list_empty(&em->list));
+ if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
+ kfree(em->map_lookup);
+ kmem_cache_free(extent_map_cache, em);
+ }
+}
+
+/* simple helper to do math around the end of an extent, handling wrap */
+static u64 range_end(u64 start, u64 len)
+{
+ if (start + len < start)
+ return (u64)-1;
+ return start + len;
+}
+
+static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
+{
+ struct rb_node **p = &root->rb_root.rb_node;
+ struct rb_node *parent = NULL;
+ struct extent_map *entry = NULL;
+ struct rb_node *orig_parent = NULL;
+ u64 end = range_end(em->start, em->len);
+ bool leftmost = true;
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct extent_map, rb_node);
+
+ if (em->start < entry->start) {
+ p = &(*p)->rb_left;
+ } else if (em->start >= extent_map_end(entry)) {
+ p = &(*p)->rb_right;
+ leftmost = false;
+ } else {
+ return -EEXIST;
+ }
+ }
+
+ orig_parent = parent;
+ while (parent && em->start >= extent_map_end(entry)) {
+ parent = rb_next(parent);
+ entry = rb_entry(parent, struct extent_map, rb_node);
+ }
+ if (parent)
+ if (end > entry->start && em->start < extent_map_end(entry))
+ return -EEXIST;
+
+ parent = orig_parent;
+ entry = rb_entry(parent, struct extent_map, rb_node);
+ while (parent && em->start < entry->start) {
+ parent = rb_prev(parent);
+ entry = rb_entry(parent, struct extent_map, rb_node);
+ }
+ if (parent)
+ if (end > entry->start && em->start < extent_map_end(entry))
+ return -EEXIST;
+
+ rb_link_node(&em->rb_node, orig_parent, p);
+ rb_insert_color_cached(&em->rb_node, root, leftmost);
+ return 0;
+}
+
+/*
+ * search through the tree for an extent_map with a given offset. If
+ * it can't be found, try to find some neighboring extents
+ */
+static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
+ struct rb_node **prev_or_next_ret)
+{
+ struct rb_node *n = root->rb_node;
+ struct rb_node *prev = NULL;
+ struct rb_node *orig_prev = NULL;
+ struct extent_map *entry;
+ struct extent_map *prev_entry = NULL;
+
+ ASSERT(prev_or_next_ret);
+
+ while (n) {
+ entry = rb_entry(n, struct extent_map, rb_node);
+ prev = n;
+ prev_entry = entry;
+
+ if (offset < entry->start)
+ n = n->rb_left;
+ else if (offset >= extent_map_end(entry))
+ n = n->rb_right;
+ else
+ return n;
+ }
+
+ orig_prev = prev;
+ while (prev && offset >= extent_map_end(prev_entry)) {
+ prev = rb_next(prev);
+ prev_entry = rb_entry(prev, struct extent_map, rb_node);
+ }
+
+ /*
+ * Previous extent map found, return as in this case the caller does not
+ * care about the next one.
+ */
+ if (prev) {
+ *prev_or_next_ret = prev;
+ return NULL;
+ }
+
+ prev = orig_prev;
+ prev_entry = rb_entry(prev, struct extent_map, rb_node);
+ while (prev && offset < prev_entry->start) {
+ prev = rb_prev(prev);
+ prev_entry = rb_entry(prev, struct extent_map, rb_node);
+ }
+ *prev_or_next_ret = prev;
+
+ return NULL;
+}
+
+/* check to see if two extent_map structs are adjacent and safe to merge */
+static int mergable_maps(struct extent_map *prev, struct extent_map *next)
+{
+ if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
+ return 0;
+
+ /*
+ * don't merge compressed extents, we need to know their
+ * actual size
+ */
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
+ return 0;
+
+ if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) ||
+ test_bit(EXTENT_FLAG_LOGGING, &next->flags))
+ return 0;
+
+ /*
+ * We don't want to merge stuff that hasn't been written to the log yet
+ * since it may not reflect exactly what is on disk, and that would be
+ * bad.
+ */
+ if (!list_empty(&prev->list) || !list_empty(&next->list))
+ return 0;
+
+ ASSERT(next->block_start != EXTENT_MAP_DELALLOC &&
+ prev->block_start != EXTENT_MAP_DELALLOC);
+
+ if (prev->map_lookup || next->map_lookup)
+ ASSERT(test_bit(EXTENT_FLAG_FS_MAPPING, &prev->flags) &&
+ test_bit(EXTENT_FLAG_FS_MAPPING, &next->flags));
+
+ if (extent_map_end(prev) == next->start &&
+ prev->flags == next->flags &&
+ prev->map_lookup == next->map_lookup &&
+ ((next->block_start == EXTENT_MAP_HOLE &&
+ prev->block_start == EXTENT_MAP_HOLE) ||
+ (next->block_start == EXTENT_MAP_INLINE &&
+ prev->block_start == EXTENT_MAP_INLINE) ||
+ (next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
+ next->block_start == extent_map_block_end(prev)))) {
+ return 1;
+ }
+ return 0;
+}
+
+static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
+{
+ struct extent_map *merge = NULL;
+ struct rb_node *rb;
+
+ /*
+ * We can't modify an extent map that is in the tree and that is being
+ * used by another task, as it can cause that other task to see it in
+ * inconsistent state during the merging. We always have 1 reference for
+ * the tree and 1 for this task (which is unpinning the extent map or
+ * clearing the logging flag), so anything > 2 means it's being used by
+ * other tasks too.
+ */
+ if (refcount_read(&em->refs) > 2)
+ return;
+
+ if (em->start != 0) {
+ rb = rb_prev(&em->rb_node);
+ if (rb)
+ merge = rb_entry(rb, struct extent_map, rb_node);
+ if (rb && mergable_maps(merge, em)) {
+ em->start = merge->start;
+ em->orig_start = merge->orig_start;
+ em->len += merge->len;
+ em->block_len += merge->block_len;
+ em->block_start = merge->block_start;
+ em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
+ em->mod_start = merge->mod_start;
+ em->generation = max(em->generation, merge->generation);
+ set_bit(EXTENT_FLAG_MERGED, &em->flags);
+
+ rb_erase_cached(&merge->rb_node, &tree->map);
+ RB_CLEAR_NODE(&merge->rb_node);
+ free_extent_map(merge);
+ }
+ }
+
+ rb = rb_next(&em->rb_node);
+ if (rb)
+ merge = rb_entry(rb, struct extent_map, rb_node);
+ if (rb && mergable_maps(em, merge)) {
+ em->len += merge->len;
+ em->block_len += merge->block_len;
+ rb_erase_cached(&merge->rb_node, &tree->map);
+ RB_CLEAR_NODE(&merge->rb_node);
+ em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
+ em->generation = max(em->generation, merge->generation);
+ set_bit(EXTENT_FLAG_MERGED, &em->flags);
+ free_extent_map(merge);
+ }
+}
+
+/**
+ * unpin_extent_cache - unpin an extent from the cache
+ * @tree: tree to unpin the extent in
+ * @start: logical offset in the file
+ * @len: length of the extent
+ * @gen: generation that this extent has been modified in
+ *
+ * Called after an extent has been written to disk properly. Set the generation
+ * to the generation that actually added the file item to the inode so we know
+ * we need to sync this extent when we call fsync().
+ */
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
+ u64 gen)
+{
+ int ret = 0;
+ struct extent_map *em;
+ bool prealloc = false;
+
+ write_lock(&tree->lock);
+ em = lookup_extent_mapping(tree, start, len);
+
+ WARN_ON(!em || em->start != start);
+
+ if (!em)
+ goto out;
+
+ em->generation = gen;
+ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+ em->mod_start = em->start;
+ em->mod_len = em->len;
+
+ if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
+ prealloc = true;
+ clear_bit(EXTENT_FLAG_FILLING, &em->flags);
+ }
+
+ try_merge_map(tree, em);
+
+ if (prealloc) {
+ em->mod_start = em->start;
+ em->mod_len = em->len;
+ }
+
+ free_extent_map(em);
+out:
+ write_unlock(&tree->lock);
+ return ret;
+
+}
+
+void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
+{
+ lockdep_assert_held_write(&tree->lock);
+
+ clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+ if (extent_map_in_tree(em))
+ try_merge_map(tree, em);
+}
+
+static inline void setup_extent_mapping(struct extent_map_tree *tree,
+ struct extent_map *em,
+ int modified)
+{
+ refcount_inc(&em->refs);
+ em->mod_start = em->start;
+ em->mod_len = em->len;
+
+ if (modified)
+ list_move(&em->list, &tree->modified_extents);
+ else
+ try_merge_map(tree, em);
+}
+
+static void extent_map_device_set_bits(struct extent_map *em, unsigned bits)
+{
+ struct map_lookup *map = em->map_lookup;
+ u64 stripe_size = em->orig_block_len;
+ int i;
+
+ for (i = 0; i < map->num_stripes; i++) {
+ struct btrfs_io_stripe *stripe = &map->stripes[i];
+ struct btrfs_device *device = stripe->dev;
+
+ set_extent_bits_nowait(&device->alloc_state, stripe->physical,
+ stripe->physical + stripe_size - 1, bits);
+ }
+}
+
+static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
+{
+ struct map_lookup *map = em->map_lookup;
+ u64 stripe_size = em->orig_block_len;
+ int i;
+
+ for (i = 0; i < map->num_stripes; i++) {
+ struct btrfs_io_stripe *stripe = &map->stripes[i];
+ struct btrfs_device *device = stripe->dev;
+
+ __clear_extent_bit(&device->alloc_state, stripe->physical,
+ stripe->physical + stripe_size - 1, bits,
+ NULL, GFP_NOWAIT, NULL);
+ }
+}
+
+/**
+ * Add new extent map to the extent tree
+ *
+ * @tree: tree to insert new map in
+ * @em: map to insert
+ * @modified: indicate whether the given @em should be added to the
+ * modified list, which indicates the extent needs to be logged
+ *
+ * Insert @em into @tree or perform a simple forward/backward merge with
+ * existing mappings. The extent_map struct passed in will be inserted
+ * into the tree directly, with an additional reference taken, or a
+ * reference dropped if the merge attempt was successful.
+ */
+int add_extent_mapping(struct extent_map_tree *tree,
+ struct extent_map *em, int modified)
+{
+ int ret = 0;
+
+ lockdep_assert_held_write(&tree->lock);
+
+ ret = tree_insert(&tree->map, em);
+ if (ret)
+ goto out;
+
+ setup_extent_mapping(tree, em, modified);
+ if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) {
+ extent_map_device_set_bits(em, CHUNK_ALLOCATED);
+ extent_map_device_clear_bits(em, CHUNK_TRIMMED);
+ }
+out:
+ return ret;
+}
+
+static struct extent_map *
+__lookup_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len, int strict)
+{
+ struct extent_map *em;
+ struct rb_node *rb_node;
+ struct rb_node *prev_or_next = NULL;
+ u64 end = range_end(start, len);
+
+ rb_node = __tree_search(&tree->map.rb_root, start, &prev_or_next);
+ if (!rb_node) {
+ if (prev_or_next)
+ rb_node = prev_or_next;
+ else
+ return NULL;
+ }
+
+ em = rb_entry(rb_node, struct extent_map, rb_node);
+
+ if (strict && !(end > em->start && start < extent_map_end(em)))
+ return NULL;
+
+ refcount_inc(&em->refs);
+ return em;
+}
+
+/**
+ * lookup_extent_mapping - lookup extent_map
+ * @tree: tree to lookup in
+ * @start: byte offset to start the search
+ * @len: length of the lookup range
+ *
+ * Find and return the first extent_map struct in @tree that intersects the
+ * [start, len] range. There may be additional objects in the tree that
+ * intersect, so check the object returned carefully to make sure that no
+ * additional lookups are needed.
+ */
+struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len)
+{
+ return __lookup_extent_mapping(tree, start, len, 1);
+}
+
+/**
+ * search_extent_mapping - find a nearby extent map
+ * @tree: tree to lookup in
+ * @start: byte offset to start the search
+ * @len: length of the lookup range
+ *
+ * Find and return the first extent_map struct in @tree that intersects the
+ * [start, len] range.
+ *
+ * If one can't be found, any nearby extent may be returned
+ */
+struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len)
+{
+ return __lookup_extent_mapping(tree, start, len, 0);
+}
+
+/**
+ * remove_extent_mapping - removes an extent_map from the extent tree
+ * @tree: extent tree to remove from
+ * @em: extent map being removed
+ *
+ * Removes @em from @tree. No reference counts are dropped, and no checks
+ * are done to see if the range is in use
+ */
+void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
+{
+ lockdep_assert_held_write(&tree->lock);
+
+ WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
+ rb_erase_cached(&em->rb_node, &tree->map);
+ if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+ list_del_init(&em->list);
+ if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
+ extent_map_device_clear_bits(em, CHUNK_ALLOCATED);
+ RB_CLEAR_NODE(&em->rb_node);
+}
+
+void replace_extent_mapping(struct extent_map_tree *tree,
+ struct extent_map *cur,
+ struct extent_map *new,
+ int modified)
+{
+ lockdep_assert_held_write(&tree->lock);
+
+ WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags));
+ ASSERT(extent_map_in_tree(cur));
+ if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
+ list_del_init(&cur->list);
+ rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map);
+ RB_CLEAR_NODE(&cur->rb_node);
+
+ setup_extent_mapping(tree, new, modified);
+}
+
+static struct extent_map *next_extent_map(const struct extent_map *em)
+{
+ struct rb_node *next;
+
+ next = rb_next(&em->rb_node);
+ if (!next)
+ return NULL;
+ return container_of(next, struct extent_map, rb_node);
+}
+
+/*
+ * Get the extent map that immediately follows another one.
+ *
+ * @tree: The extent map tree that the extent map belong to.
+ * Holding read or write access on the tree's lock is required.
+ * @em: An extent map from the given tree. The caller must ensure that
+ * between getting @em and between calling this function, the
+ * extent map @em is not removed from the tree - for example, by
+ * holding the tree's lock for the duration of those 2 operations.
+ *
+ * Returns the extent map that immediately follows @em, or NULL if @em is the
+ * last extent map in the tree.
+ */
+struct extent_map *btrfs_next_extent_map(const struct extent_map_tree *tree,
+ const struct extent_map *em)
+{
+ struct extent_map *next;
+
+ /* The lock must be acquired either in read mode or write mode. */
+ lockdep_assert_held(&tree->lock);
+ ASSERT(extent_map_in_tree(em));
+
+ next = next_extent_map(em);
+ if (next)
+ refcount_inc(&next->refs);
+
+ return next;
+}
+
+static struct extent_map *prev_extent_map(struct extent_map *em)
+{
+ struct rb_node *prev;
+
+ prev = rb_prev(&em->rb_node);
+ if (!prev)
+ return NULL;
+ return container_of(prev, struct extent_map, rb_node);
+}
+
+/*
+ * Helper for btrfs_get_extent. Given an existing extent in the tree,
+ * the existing extent is the nearest extent to map_start,
+ * and an extent that you want to insert, deal with overlap and insert
+ * the best fitted new extent into the tree.
+ */
+static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
+ struct extent_map *existing,
+ struct extent_map *em,
+ u64 map_start)
+{
+ struct extent_map *prev;
+ struct extent_map *next;
+ u64 start;
+ u64 end;
+ u64 start_diff;
+
+ BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
+
+ if (existing->start > map_start) {
+ next = existing;
+ prev = prev_extent_map(next);
+ } else {
+ prev = existing;
+ next = next_extent_map(prev);
+ }
+
+ start = prev ? extent_map_end(prev) : em->start;
+ start = max_t(u64, start, em->start);
+ end = next ? next->start : extent_map_end(em);
+ end = min_t(u64, end, extent_map_end(em));
+ start_diff = start - em->start;
+ em->start = start;
+ em->len = end - start;
+ if (em->block_start < EXTENT_MAP_LAST_BYTE &&
+ !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ em->block_start += start_diff;
+ em->block_len = em->len;
+ }
+ return add_extent_mapping(em_tree, em, 0);
+}
+
+/**
+ * Add extent mapping into em_tree
+ *
+ * @fs_info: the filesystem
+ * @em_tree: extent tree into which we want to insert the extent mapping
+ * @em_in: extent we are inserting
+ * @start: start of the logical range btrfs_get_extent() is requesting
+ * @len: length of the logical range btrfs_get_extent() is requesting
+ *
+ * Note that @em_in's range may be different from [start, start+len),
+ * but they must be overlapped.
+ *
+ * Insert @em_in into @em_tree. In case there is an overlapping range, handle
+ * the -EEXIST by either:
+ * a) Returning the existing extent in @em_in if @start is within the
+ * existing em.
+ * b) Merge the existing extent with @em_in passed in.
+ *
+ * Return 0 on success, otherwise -EEXIST.
+ *
+ */
+int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
+ struct extent_map_tree *em_tree,
+ struct extent_map **em_in, u64 start, u64 len)
+{
+ int ret;
+ struct extent_map *em = *em_in;
+
+ ret = add_extent_mapping(em_tree, em, 0);
+ /* it is possible that someone inserted the extent into the tree
+ * while we had the lock dropped. It is also possible that
+ * an overlapping map exists in the tree
+ */
+ if (ret == -EEXIST) {
+ struct extent_map *existing;
+
+ ret = 0;
+
+ existing = search_extent_mapping(em_tree, start, len);
+
+ trace_btrfs_handle_em_exist(fs_info, existing, em, start, len);
+
+ /*
+ * existing will always be non-NULL, since there must be
+ * extent causing the -EEXIST.
+ */
+ if (start >= existing->start &&
+ start < extent_map_end(existing)) {
+ free_extent_map(em);
+ *em_in = existing;
+ ret = 0;
+ } else {
+ u64 orig_start = em->start;
+ u64 orig_len = em->len;
+
+ /*
+ * The existing extent map is the one nearest to
+ * the [start, start + len) range which overlaps
+ */
+ ret = merge_extent_mapping(em_tree, existing,
+ em, start);
+ if (ret) {
+ free_extent_map(em);
+ *em_in = NULL;
+ WARN_ONCE(ret,
+"unexpected error %d: merge existing(start %llu len %llu) with em(start %llu len %llu)\n",
+ ret, existing->start, existing->len,
+ orig_start, orig_len);
+ }
+ free_extent_map(existing);
+ }
+ }
+
+ ASSERT(ret == 0 || ret == -EEXIST);
+ return ret;
+}
+
+/*
+ * Drop all extent maps from a tree in the fastest possible way, rescheduling
+ * if needed. This avoids searching the tree, from the root down to the first
+ * extent map, before each deletion.
+ */
+static void drop_all_extent_maps_fast(struct extent_map_tree *tree)
+{
+ write_lock(&tree->lock);
+ while (!RB_EMPTY_ROOT(&tree->map.rb_root)) {
+ struct extent_map *em;
+ struct rb_node *node;
+
+ node = rb_first_cached(&tree->map);
+ em = rb_entry(node, struct extent_map, rb_node);
+ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+ clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+ remove_extent_mapping(tree, em);
+ free_extent_map(em);
+ cond_resched_rwlock_write(&tree->lock);
+ }
+ write_unlock(&tree->lock);
+}
+
+/*
+ * Drop all extent maps in a given range.
+ *
+ * @inode: The target inode.
+ * @start: Start offset of the range.
+ * @end: End offset of the range (inclusive value).
+ * @skip_pinned: Indicate if pinned extent maps should be ignored or not.
+ *
+ * This drops all the extent maps that intersect the given range [@start, @end].
+ * Extent maps that partially overlap the range and extend behind or beyond it,
+ * are split.
+ * The caller should have locked an appropriate file range in the inode's io
+ * tree before calling this function.
+ */
+void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
+ bool skip_pinned)
+{
+ struct extent_map *split;
+ struct extent_map *split2;
+ struct extent_map *em;
+ struct extent_map_tree *em_tree = &inode->extent_tree;
+ u64 len = end - start + 1;
+
+ WARN_ON(end < start);
+ if (end == (u64)-1) {
+ if (start == 0 && !skip_pinned) {
+ drop_all_extent_maps_fast(em_tree);
+ return;
+ }
+ len = (u64)-1;
+ } else {
+ /* Make end offset exclusive for use in the loop below. */
+ end++;
+ }
+
+ /*
+ * It's ok if we fail to allocate the extent maps, see the comment near
+ * the bottom of the loop below. We only need two spare extent maps in
+ * the worst case, where the first extent map that intersects our range
+ * starts before the range and the last extent map that intersects our
+ * range ends after our range (and they might be the same extent map),
+ * because we need to split those two extent maps at the boundaries.
+ */
+ split = alloc_extent_map();
+ split2 = alloc_extent_map();
+
+ write_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, len);
+
+ while (em) {
+ /* extent_map_end() returns exclusive value (last byte + 1). */
+ const u64 em_end = extent_map_end(em);
+ struct extent_map *next_em = NULL;
+ u64 gen;
+ unsigned long flags;
+ bool modified;
+ bool compressed;
+
+ if (em_end < end) {
+ next_em = next_extent_map(em);
+ if (next_em) {
+ if (next_em->start < end)
+ refcount_inc(&next_em->refs);
+ else
+ next_em = NULL;
+ }
+ }
+
+ if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
+ start = em_end;
+ goto next;
+ }
+
+ flags = em->flags;
+ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+ /*
+ * In case we split the extent map, we want to preserve the
+ * EXTENT_FLAG_LOGGING flag on our extent map, but we don't want
+ * it on the new extent maps.
+ */
+ clear_bit(EXTENT_FLAG_LOGGING, &flags);
+ modified = !list_empty(&em->list);
+
+ /*
+ * The extent map does not cross our target range, so no need to
+ * split it, we can remove it directly.
+ */
+ if (em->start >= start && em_end <= end)
+ goto remove_em;
+
+ gen = em->generation;
+ compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+
+ if (em->start < start) {
+ if (!split) {
+ split = split2;
+ split2 = NULL;
+ if (!split)
+ goto remove_em;
+ }
+ split->start = em->start;
+ split->len = start - em->start;
+
+ if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+ split->orig_start = em->orig_start;
+ split->block_start = em->block_start;
+
+ if (compressed)
+ split->block_len = em->block_len;
+ else
+ split->block_len = split->len;
+ split->orig_block_len = max(split->block_len,
+ em->orig_block_len);
+ split->ram_bytes = em->ram_bytes;
+ } else {
+ split->orig_start = split->start;
+ split->block_len = 0;
+ split->block_start = em->block_start;
+ split->orig_block_len = 0;
+ split->ram_bytes = split->len;
+ }
+
+ split->generation = gen;
+ split->flags = flags;
+ split->compress_type = em->compress_type;
+ replace_extent_mapping(em_tree, em, split, modified);
+ free_extent_map(split);
+ split = split2;
+ split2 = NULL;
+ }
+ if (em_end > end) {
+ if (!split) {
+ split = split2;
+ split2 = NULL;
+ if (!split)
+ goto remove_em;
+ }
+ split->start = end;
+ split->len = em_end - end;
+ split->block_start = em->block_start;
+ split->flags = flags;
+ split->compress_type = em->compress_type;
+ split->generation = gen;
+
+ if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+ split->orig_block_len = max(em->block_len,
+ em->orig_block_len);
+
+ split->ram_bytes = em->ram_bytes;
+ if (compressed) {
+ split->block_len = em->block_len;
+ split->orig_start = em->orig_start;
+ } else {
+ const u64 diff = start + len - em->start;
+
+ split->block_len = split->len;
+ split->block_start += diff;
+ split->orig_start = em->orig_start;
+ }
+ } else {
+ split->ram_bytes = split->len;
+ split->orig_start = split->start;
+ split->block_len = 0;
+ split->orig_block_len = 0;
+ }
+
+ if (extent_map_in_tree(em)) {
+ replace_extent_mapping(em_tree, em, split,
+ modified);
+ } else {
+ int ret;
+
+ ret = add_extent_mapping(em_tree, split,
+ modified);
+ /* Logic error, shouldn't happen. */
+ ASSERT(ret == 0);
+ if (WARN_ON(ret != 0) && modified)
+ btrfs_set_inode_full_sync(inode);
+ }
+ free_extent_map(split);
+ split = NULL;
+ }
+remove_em:
+ if (extent_map_in_tree(em)) {
+ /*
+ * If the extent map is still in the tree it means that
+ * either of the following is true:
+ *
+ * 1) It fits entirely in our range (doesn't end beyond
+ * it or starts before it);
+ *
+ * 2) It starts before our range and/or ends after our
+ * range, and we were not able to allocate the extent
+ * maps for split operations, @split and @split2.
+ *
+ * If we are at case 2) then we just remove the entire
+ * extent map - this is fine since if anyone needs it to
+ * access the subranges outside our range, will just
+ * load it again from the subvolume tree's file extent
+ * item. However if the extent map was in the list of
+ * modified extents, then we must mark the inode for a
+ * full fsync, otherwise a fast fsync will miss this
+ * extent if it's new and needs to be logged.
+ */
+ if ((em->start < start || em_end > end) && modified) {
+ ASSERT(!split);
+ btrfs_set_inode_full_sync(inode);
+ }
+ remove_extent_mapping(em_tree, em);
+ }
+
+ /*
+ * Once for the tree reference (we replaced or removed the
+ * extent map from the tree).
+ */
+ free_extent_map(em);
+next:
+ /* Once for us (for our lookup reference). */
+ free_extent_map(em);
+
+ em = next_em;
+ }
+
+ write_unlock(&em_tree->lock);
+
+ free_extent_map(split);
+ free_extent_map(split2);
+}
+
+/*
+ * Replace a range in the inode's extent map tree with a new extent map.
+ *
+ * @inode: The target inode.
+ * @new_em: The new extent map to add to the inode's extent map tree.
+ * @modified: Indicate if the new extent map should be added to the list of
+ * modified extents (for fast fsync tracking).
+ *
+ * Drops all the extent maps in the inode's extent map tree that intersect the
+ * range of the new extent map and adds the new extent map to the tree.
+ * The caller should have locked an appropriate file range in the inode's io
+ * tree before calling this function.
+ */
+int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
+ struct extent_map *new_em,
+ bool modified)
+{
+ const u64 end = new_em->start + new_em->len - 1;
+ struct extent_map_tree *tree = &inode->extent_tree;
+ int ret;
+
+ ASSERT(!extent_map_in_tree(new_em));
+
+ /*
+ * The caller has locked an appropriate file range in the inode's io
+ * tree, but getting -EEXIST when adding the new extent map can still
+ * happen in case there are extents that partially cover the range, and
+ * this is due to two tasks operating on different parts of the extent.
+ * See commit 18e83ac75bfe67 ("Btrfs: fix unexpected EEXIST from
+ * btrfs_get_extent") for an example and details.
+ */
+ do {
+ btrfs_drop_extent_map_range(inode, new_em->start, end, false);
+ write_lock(&tree->lock);
+ ret = add_extent_mapping(tree, new_em, modified);
+ write_unlock(&tree->lock);
+ } while (ret == -EEXIST);
+
+ return ret;
+}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
new file mode 100644
index 000000000..68d3f2c9e
--- /dev/null
+++ b/fs/btrfs/extent_map.h
@@ -0,0 +1,118 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_EXTENT_MAP_H
+#define BTRFS_EXTENT_MAP_H
+
+#include <linux/rbtree.h>
+#include <linux/refcount.h>
+
+#define EXTENT_MAP_LAST_BYTE ((u64)-4)
+#define EXTENT_MAP_HOLE ((u64)-3)
+#define EXTENT_MAP_INLINE ((u64)-2)
+/* used only during fiemap calls */
+#define EXTENT_MAP_DELALLOC ((u64)-1)
+
+/* bits for the extent_map::flags field */
+enum {
+ /* this entry not yet on disk, don't free it */
+ EXTENT_FLAG_PINNED,
+ EXTENT_FLAG_COMPRESSED,
+ /* pre-allocated extent */
+ EXTENT_FLAG_PREALLOC,
+ /* Logging this extent */
+ EXTENT_FLAG_LOGGING,
+ /* Filling in a preallocated extent */
+ EXTENT_FLAG_FILLING,
+ /* filesystem extent mapping type */
+ EXTENT_FLAG_FS_MAPPING,
+ /* This em is merged from two or more physically adjacent ems */
+ EXTENT_FLAG_MERGED,
+};
+
+struct extent_map {
+ struct rb_node rb_node;
+
+ /* all of these are in bytes */
+ u64 start;
+ u64 len;
+ u64 mod_start;
+ u64 mod_len;
+ u64 orig_start;
+ u64 orig_block_len;
+ u64 ram_bytes;
+ u64 block_start;
+ u64 block_len;
+
+ /*
+ * Generation of the extent map, for merged em it's the highest
+ * generation of all merged ems.
+ * For non-merged extents, it's from btrfs_file_extent_item::generation.
+ */
+ u64 generation;
+ unsigned long flags;
+ /* Used for chunk mappings, flag EXTENT_FLAG_FS_MAPPING must be set */
+ struct map_lookup *map_lookup;
+ refcount_t refs;
+ unsigned int compress_type;
+ struct list_head list;
+};
+
+struct extent_map_tree {
+ struct rb_root_cached map;
+ struct list_head modified_extents;
+ rwlock_t lock;
+};
+
+struct btrfs_inode;
+
+static inline int extent_map_in_tree(const struct extent_map *em)
+{
+ return !RB_EMPTY_NODE(&em->rb_node);
+}
+
+static inline u64 extent_map_end(struct extent_map *em)
+{
+ if (em->start + em->len < em->start)
+ return (u64)-1;
+ return em->start + em->len;
+}
+
+static inline u64 extent_map_block_end(struct extent_map *em)
+{
+ if (em->block_start + em->block_len < em->block_start)
+ return (u64)-1;
+ return em->block_start + em->block_len;
+}
+
+void extent_map_tree_init(struct extent_map_tree *tree);
+struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len);
+struct extent_map *btrfs_next_extent_map(const struct extent_map_tree *tree,
+ const struct extent_map *em);
+int add_extent_mapping(struct extent_map_tree *tree,
+ struct extent_map *em, int modified);
+void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
+void replace_extent_mapping(struct extent_map_tree *tree,
+ struct extent_map *cur,
+ struct extent_map *new,
+ int modified);
+
+struct extent_map *alloc_extent_map(void);
+void free_extent_map(struct extent_map *em);
+int __init extent_map_init(void);
+void __cold extent_map_exit(void);
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);
+void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);
+struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len);
+int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
+ struct extent_map_tree *em_tree,
+ struct extent_map **em_in, u64 start, u64 len);
+void btrfs_drop_extent_map_range(struct btrfs_inode *inode,
+ u64 start, u64 end,
+ bool skip_pinned);
+int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
+ struct extent_map *new_em,
+ bool modified);
+
+#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
new file mode 100644
index 000000000..14478da87
--- /dev/null
+++ b/fs/btrfs/file-item.c
@@ -0,0 +1,1304 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ */
+
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/sched/mm.h>
+#include <crypto/hash.h>
+#include "misc.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "volumes.h"
+#include "print-tree.h"
+#include "compression.h"
+
+#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \
+ sizeof(struct btrfs_item) * 2) / \
+ size) - 1))
+
+#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
+ PAGE_SIZE))
+
+/**
+ * Set inode's size according to filesystem options
+ *
+ * @inode: inode we want to update the disk_i_size for
+ * @new_i_size: i_size we want to set to, 0 if we use i_size
+ *
+ * With NO_HOLES set this simply sets the disk_is_size to whatever i_size_read()
+ * returns as it is perfectly fine with a file that has holes without hole file
+ * extent items.
+ *
+ * However without NO_HOLES we need to only return the area that is contiguous
+ * from the 0 offset of the file. Otherwise we could end up adjust i_size up
+ * to an extent that has a gap in between.
+ *
+ * Finally new_i_size should only be set in the case of truncate where we're not
+ * ready to use i_size_read() as the limiter yet.
+ */
+void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ u64 start, end, i_size;
+ int ret;
+
+ spin_lock(&inode->lock);
+ i_size = new_i_size ?: i_size_read(&inode->vfs_inode);
+ if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
+ inode->disk_i_size = i_size;
+ goto out_unlock;
+ }
+
+ ret = find_contiguous_extent_bit(&inode->file_extent_tree, 0, &start,
+ &end, EXTENT_DIRTY);
+ if (!ret && start == 0)
+ i_size = min(i_size, end + 1);
+ else
+ i_size = 0;
+ inode->disk_i_size = i_size;
+out_unlock:
+ spin_unlock(&inode->lock);
+}
+
+/**
+ * Mark range within a file as having a new extent inserted
+ *
+ * @inode: inode being modified
+ * @start: start file offset of the file extent we've inserted
+ * @len: logical length of the file extent item
+ *
+ * Call when we are inserting a new file extent where there was none before.
+ * Does not need to call this in the case where we're replacing an existing file
+ * extent, however if not sure it's fine to call this multiple times.
+ *
+ * The start and len must match the file extent item, so thus must be sectorsize
+ * aligned.
+ */
+int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
+ u64 len)
+{
+ if (len == 0)
+ return 0;
+
+ ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize));
+
+ if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
+ return 0;
+ return set_extent_bits(&inode->file_extent_tree, start, start + len - 1,
+ EXTENT_DIRTY);
+}
+
+/**
+ * Marks an inode range as not having a backing extent
+ *
+ * @inode: inode being modified
+ * @start: start file offset of the file extent we've inserted
+ * @len: logical length of the file extent item
+ *
+ * Called when we drop a file extent, for example when we truncate. Doesn't
+ * need to be called for cases where we're replacing a file extent, like when
+ * we've COWed a file extent.
+ *
+ * The start and len must match the file extent item, so thus must be sectorsize
+ * aligned.
+ */
+int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
+ u64 len)
+{
+ if (len == 0)
+ return 0;
+
+ ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize) ||
+ len == (u64)-1);
+
+ if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
+ return 0;
+ return clear_extent_bit(&inode->file_extent_tree, start,
+ start + len - 1, EXTENT_DIRTY, NULL);
+}
+
+static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info,
+ u16 csum_size)
+{
+ u32 ncsums = (PAGE_SIZE - sizeof(struct btrfs_ordered_sum)) / csum_size;
+
+ return ncsums * fs_info->sectorsize;
+}
+
+/*
+ * Calculate the total size needed to allocate for an ordered sum structure
+ * spanning @bytes in the file.
+ */
+static int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes)
+{
+ int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize);
+
+ return sizeof(struct btrfs_ordered_sum) + num_sectors * fs_info->csum_size;
+}
+
+int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 objectid, u64 pos, u64 num_bytes)
+{
+ int ret = 0;
+ struct btrfs_file_extent_item *item;
+ struct btrfs_key file_key;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ file_key.objectid = objectid;
+ file_key.offset = pos;
+ file_key.type = BTRFS_EXTENT_DATA_KEY;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &file_key,
+ sizeof(*item));
+ if (ret < 0)
+ goto out;
+ BUG_ON(ret); /* Can't happen */
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_disk_bytenr(leaf, item, 0);
+ btrfs_set_file_extent_disk_num_bytes(leaf, item, 0);
+ btrfs_set_file_extent_offset(leaf, item, 0);
+ btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
+ btrfs_set_file_extent_ram_bytes(leaf, item, num_bytes);
+ btrfs_set_file_extent_generation(leaf, item, trans->transid);
+ btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
+ btrfs_set_file_extent_compression(leaf, item, 0);
+ btrfs_set_file_extent_encryption(leaf, item, 0);
+ btrfs_set_file_extent_other_encoding(leaf, item, 0);
+
+ btrfs_mark_buffer_dirty(leaf);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static struct btrfs_csum_item *
+btrfs_lookup_csum(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 bytenr, int cow)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret;
+ struct btrfs_key file_key;
+ struct btrfs_key found_key;
+ struct btrfs_csum_item *item;
+ struct extent_buffer *leaf;
+ u64 csum_offset = 0;
+ const u32 csum_size = fs_info->csum_size;
+ int csums_in_item;
+
+ file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ file_key.offset = bytenr;
+ file_key.type = BTRFS_EXTENT_CSUM_KEY;
+ ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
+ if (ret < 0)
+ goto fail;
+ leaf = path->nodes[0];
+ if (ret > 0) {
+ ret = 1;
+ if (path->slots[0] == 0)
+ goto fail;
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.type != BTRFS_EXTENT_CSUM_KEY)
+ goto fail;
+
+ csum_offset = (bytenr - found_key.offset) >>
+ fs_info->sectorsize_bits;
+ csums_in_item = btrfs_item_size(leaf, path->slots[0]);
+ csums_in_item /= csum_size;
+
+ if (csum_offset == csums_in_item) {
+ ret = -EFBIG;
+ goto fail;
+ } else if (csum_offset > csums_in_item) {
+ goto fail;
+ }
+ }
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
+ item = (struct btrfs_csum_item *)((unsigned char *)item +
+ csum_offset * csum_size);
+ return item;
+fail:
+ if (ret > 0)
+ ret = -ENOENT;
+ return ERR_PTR(ret);
+}
+
+int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid,
+ u64 offset, int mod)
+{
+ struct btrfs_key file_key;
+ int ins_len = mod < 0 ? -1 : 0;
+ int cow = mod != 0;
+
+ file_key.objectid = objectid;
+ file_key.offset = offset;
+ file_key.type = BTRFS_EXTENT_DATA_KEY;
+
+ return btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
+}
+
+/*
+ * Find checksums for logical bytenr range [disk_bytenr, disk_bytenr + len) and
+ * estore the result to @dst.
+ *
+ * Return >0 for the number of sectors we found.
+ * Return 0 for the range [disk_bytenr, disk_bytenr + sectorsize) has no csum
+ * for it. Caller may want to try next sector until one range is hit.
+ * Return <0 for fatal error.
+ */
+static int search_csum_tree(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path, u64 disk_bytenr,
+ u64 len, u8 *dst)
+{
+ struct btrfs_root *csum_root;
+ struct btrfs_csum_item *item = NULL;
+ struct btrfs_key key;
+ const u32 sectorsize = fs_info->sectorsize;
+ const u32 csum_size = fs_info->csum_size;
+ u32 itemsize;
+ int ret;
+ u64 csum_start;
+ u64 csum_len;
+
+ ASSERT(IS_ALIGNED(disk_bytenr, sectorsize) &&
+ IS_ALIGNED(len, sectorsize));
+
+ /* Check if the current csum item covers disk_bytenr */
+ if (path->nodes[0]) {
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_csum_item);
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ itemsize = btrfs_item_size(path->nodes[0], path->slots[0]);
+
+ csum_start = key.offset;
+ csum_len = (itemsize / csum_size) * sectorsize;
+
+ if (in_range(disk_bytenr, csum_start, csum_len))
+ goto found;
+ }
+
+ /* Current item doesn't contain the desired range, search again */
+ btrfs_release_path(path);
+ csum_root = btrfs_csum_root(fs_info, disk_bytenr);
+ item = btrfs_lookup_csum(NULL, csum_root, path, disk_bytenr, 0);
+ if (IS_ERR(item)) {
+ ret = PTR_ERR(item);
+ goto out;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ itemsize = btrfs_item_size(path->nodes[0], path->slots[0]);
+
+ csum_start = key.offset;
+ csum_len = (itemsize / csum_size) * sectorsize;
+ ASSERT(in_range(disk_bytenr, csum_start, csum_len));
+
+found:
+ ret = (min(csum_start + csum_len, disk_bytenr + len) -
+ disk_bytenr) >> fs_info->sectorsize_bits;
+ read_extent_buffer(path->nodes[0], dst, (unsigned long)item,
+ ret * csum_size);
+out:
+ if (ret == -ENOENT || ret == -EFBIG)
+ ret = 0;
+ return ret;
+}
+
+/*
+ * Locate the file_offset of @cur_disk_bytenr of a @bio.
+ *
+ * Bio of btrfs represents read range of
+ * [bi_sector << 9, bi_sector << 9 + bi_size).
+ * Knowing this, we can iterate through each bvec to locate the page belong to
+ * @cur_disk_bytenr and get the file offset.
+ *
+ * @inode is used to determine if the bvec page really belongs to @inode.
+ *
+ * Return 0 if we can't find the file offset
+ * Return >0 if we find the file offset and restore it to @file_offset_ret
+ */
+static int search_file_offset_in_bio(struct bio *bio, struct inode *inode,
+ u64 disk_bytenr, u64 *file_offset_ret)
+{
+ struct bvec_iter iter;
+ struct bio_vec bvec;
+ u64 cur = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ int ret = 0;
+
+ bio_for_each_segment(bvec, bio, iter) {
+ struct page *page = bvec.bv_page;
+
+ if (cur > disk_bytenr)
+ break;
+ if (cur + bvec.bv_len <= disk_bytenr) {
+ cur += bvec.bv_len;
+ continue;
+ }
+ ASSERT(in_range(disk_bytenr, cur, bvec.bv_len));
+ if (page->mapping && page->mapping->host &&
+ page->mapping->host == inode) {
+ ret = 1;
+ *file_offset_ret = page_offset(page) + bvec.bv_offset +
+ disk_bytenr - cur;
+ break;
+ }
+ }
+ return ret;
+}
+
+/**
+ * Lookup the checksum for the read bio in csum tree.
+ *
+ * @inode: inode that the bio is for.
+ * @bio: bio to look up.
+ * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return
+ * checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If
+ * NULL, the checksum buffer is allocated and returned in
+ * btrfs_bio(bio)->csum instead.
+ *
+ * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise.
+ */
+blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_bio *bbio = NULL;
+ struct btrfs_path *path;
+ const u32 sectorsize = fs_info->sectorsize;
+ const u32 csum_size = fs_info->csum_size;
+ u32 orig_len = bio->bi_iter.bi_size;
+ u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ u64 cur_disk_bytenr;
+ u8 *csum;
+ const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits;
+ int count = 0;
+ blk_status_t ret = BLK_STS_OK;
+
+ if ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) ||
+ test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))
+ return BLK_STS_OK;
+
+ /*
+ * This function is only called for read bio.
+ *
+ * This means two things:
+ * - All our csums should only be in csum tree
+ * No ordered extents csums, as ordered extents are only for write
+ * path.
+ * - No need to bother any other info from bvec
+ * Since we're looking up csums, the only important info is the
+ * disk_bytenr and the length, which can be extracted from bi_iter
+ * directly.
+ */
+ ASSERT(bio_op(bio) == REQ_OP_READ);
+ path = btrfs_alloc_path();
+ if (!path)
+ return BLK_STS_RESOURCE;
+
+ if (!dst) {
+ bbio = btrfs_bio(bio);
+
+ if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
+ bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS);
+ if (!bbio->csum) {
+ btrfs_free_path(path);
+ return BLK_STS_RESOURCE;
+ }
+ } else {
+ bbio->csum = bbio->csum_inline;
+ }
+ csum = bbio->csum;
+ } else {
+ csum = dst;
+ }
+
+ /*
+ * If requested number of sectors is larger than one leaf can contain,
+ * kick the readahead for csum tree.
+ */
+ if (nblocks > fs_info->csums_per_leaf)
+ path->reada = READA_FORWARD;
+
+ /*
+ * the free space stuff is only read when it hasn't been
+ * updated in the current transaction. So, we can safely
+ * read from the commit root and sidestep a nasty deadlock
+ * between reading the free space cache and updating the csum tree.
+ */
+ if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+ }
+
+ for (cur_disk_bytenr = orig_disk_bytenr;
+ cur_disk_bytenr < orig_disk_bytenr + orig_len;
+ cur_disk_bytenr += (count * sectorsize)) {
+ u64 search_len = orig_disk_bytenr + orig_len - cur_disk_bytenr;
+ unsigned int sector_offset;
+ u8 *csum_dst;
+
+ /*
+ * Although both cur_disk_bytenr and orig_disk_bytenr is u64,
+ * we're calculating the offset to the bio start.
+ *
+ * Bio size is limited to UINT_MAX, thus unsigned int is large
+ * enough to contain the raw result, not to mention the right
+ * shifted result.
+ */
+ ASSERT(cur_disk_bytenr - orig_disk_bytenr < UINT_MAX);
+ sector_offset = (cur_disk_bytenr - orig_disk_bytenr) >>
+ fs_info->sectorsize_bits;
+ csum_dst = csum + sector_offset * csum_size;
+
+ count = search_csum_tree(fs_info, path, cur_disk_bytenr,
+ search_len, csum_dst);
+ if (count < 0) {
+ ret = errno_to_blk_status(count);
+ if (bbio)
+ btrfs_bio_free_csum(bbio);
+ break;
+ }
+
+ /*
+ * We didn't find a csum for this range. We need to make sure
+ * we complain loudly about this, because we are not NODATASUM.
+ *
+ * However for the DATA_RELOC inode we could potentially be
+ * relocating data extents for a NODATASUM inode, so the inode
+ * itself won't be marked with NODATASUM, but the extent we're
+ * copying is in fact NODATASUM. If we don't find a csum we
+ * assume this is the case.
+ */
+ if (count == 0) {
+ memset(csum_dst, 0, csum_size);
+ count = 1;
+
+ if (BTRFS_I(inode)->root->root_key.objectid ==
+ BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ u64 file_offset;
+ int ret;
+
+ ret = search_file_offset_in_bio(bio, inode,
+ cur_disk_bytenr, &file_offset);
+ if (ret)
+ set_extent_bits(io_tree, file_offset,
+ file_offset + sectorsize - 1,
+ EXTENT_NODATASUM);
+ } else {
+ btrfs_warn_rl(fs_info,
+ "csum hole found for disk bytenr range [%llu, %llu)",
+ cur_disk_bytenr, cur_disk_bytenr + sectorsize);
+ }
+ }
+ }
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
+ struct list_head *list, int search_commit,
+ bool nowait)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_ordered_sum *sums;
+ struct btrfs_csum_item *item;
+ LIST_HEAD(tmplist);
+ unsigned long offset;
+ int ret;
+ size_t size;
+ u64 csum_end;
+ const u32 csum_size = fs_info->csum_size;
+
+ ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
+ IS_ALIGNED(end + 1, fs_info->sectorsize));
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->nowait = nowait;
+ if (search_commit) {
+ path->skip_locking = 1;
+ path->reada = READA_FORWARD;
+ path->search_commit_root = 1;
+ }
+
+ key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ key.offset = start;
+ key.type = BTRFS_EXTENT_CSUM_KEY;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto fail;
+ if (ret > 0 && path->slots[0] > 0) {
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
+ if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
+ key.type == BTRFS_EXTENT_CSUM_KEY) {
+ offset = (start - key.offset) >> fs_info->sectorsize_bits;
+ if (offset * csum_size <
+ btrfs_item_size(leaf, path->slots[0] - 1))
+ path->slots[0]--;
+ }
+ }
+
+ while (start <= end) {
+ leaf = path->nodes[0];
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto fail;
+ if (ret > 0)
+ break;
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+ key.type != BTRFS_EXTENT_CSUM_KEY ||
+ key.offset > end)
+ break;
+
+ if (key.offset > start)
+ start = key.offset;
+
+ size = btrfs_item_size(leaf, path->slots[0]);
+ csum_end = key.offset + (size / csum_size) * fs_info->sectorsize;
+ if (csum_end <= start) {
+ path->slots[0]++;
+ continue;
+ }
+
+ csum_end = min(csum_end, end + 1);
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_csum_item);
+ while (start < csum_end) {
+ size = min_t(size_t, csum_end - start,
+ max_ordered_sum_bytes(fs_info, csum_size));
+ sums = kzalloc(btrfs_ordered_sum_size(fs_info, size),
+ GFP_NOFS);
+ if (!sums) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ sums->bytenr = start;
+ sums->len = size;
+
+ offset = (start - key.offset) >> fs_info->sectorsize_bits;
+ offset *= csum_size;
+ size >>= fs_info->sectorsize_bits;
+
+ read_extent_buffer(path->nodes[0],
+ sums->sums,
+ ((unsigned long)item) + offset,
+ csum_size * size);
+
+ start += fs_info->sectorsize * size;
+ list_add_tail(&sums->list, &tmplist);
+ }
+ path->slots[0]++;
+ }
+ ret = 0;
+fail:
+ while (ret < 0 && !list_empty(&tmplist)) {
+ sums = list_entry(tmplist.next, struct btrfs_ordered_sum, list);
+ list_del(&sums->list);
+ kfree(sums);
+ }
+ list_splice_tail(&tmplist, list);
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+/**
+ * Calculate checksums of the data contained inside a bio
+ *
+ * @inode: Owner of the data inside the bio
+ * @bio: Contains the data to be checksummed
+ * @offset: If (u64)-1, @bio may contain discontiguous bio vecs, so the
+ * file offsets are determined from the page offsets in the bio.
+ * Otherwise, this is the starting file offset of the bio vecs in
+ * @bio, which must be contiguous.
+ * @one_ordered: If true, @bio only refers to one ordered extent.
+ */
+blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
+ u64 offset, bool one_ordered)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
+ struct btrfs_ordered_sum *sums;
+ struct btrfs_ordered_extent *ordered = NULL;
+ const bool use_page_offsets = (offset == (u64)-1);
+ char *data;
+ struct bvec_iter iter;
+ struct bio_vec bvec;
+ int index;
+ unsigned int blockcount;
+ unsigned long total_bytes = 0;
+ unsigned long this_sum_bytes = 0;
+ int i;
+ unsigned nofs_flag;
+
+ nofs_flag = memalloc_nofs_save();
+ sums = kvzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size),
+ GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
+
+ if (!sums)
+ return BLK_STS_RESOURCE;
+
+ sums->len = bio->bi_iter.bi_size;
+ INIT_LIST_HEAD(&sums->list);
+
+ sums->bytenr = bio->bi_iter.bi_sector << 9;
+ index = 0;
+
+ shash->tfm = fs_info->csum_shash;
+
+ bio_for_each_segment(bvec, bio, iter) {
+ if (use_page_offsets)
+ offset = page_offset(bvec.bv_page) + bvec.bv_offset;
+
+ if (!ordered) {
+ ordered = btrfs_lookup_ordered_extent(inode, offset);
+ /*
+ * The bio range is not covered by any ordered extent,
+ * must be a code logic error.
+ */
+ if (unlikely(!ordered)) {
+ WARN(1, KERN_WARNING
+ "no ordered extent for root %llu ino %llu offset %llu\n",
+ inode->root->root_key.objectid,
+ btrfs_ino(inode), offset);
+ kvfree(sums);
+ return BLK_STS_IOERR;
+ }
+ }
+
+ blockcount = BTRFS_BYTES_TO_BLKS(fs_info,
+ bvec.bv_len + fs_info->sectorsize
+ - 1);
+
+ for (i = 0; i < blockcount; i++) {
+ if (!one_ordered &&
+ !in_range(offset, ordered->file_offset,
+ ordered->num_bytes)) {
+ unsigned long bytes_left;
+
+ sums->len = this_sum_bytes;
+ this_sum_bytes = 0;
+ btrfs_add_ordered_sum(ordered, sums);
+ btrfs_put_ordered_extent(ordered);
+
+ bytes_left = bio->bi_iter.bi_size - total_bytes;
+
+ nofs_flag = memalloc_nofs_save();
+ sums = kvzalloc(btrfs_ordered_sum_size(fs_info,
+ bytes_left), GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
+ if (!sums)
+ return BLK_STS_RESOURCE;
+
+ sums->len = bytes_left;
+ ordered = btrfs_lookup_ordered_extent(inode,
+ offset);
+ ASSERT(ordered); /* Logic error */
+ sums->bytenr = (bio->bi_iter.bi_sector << 9)
+ + total_bytes;
+ index = 0;
+ }
+
+ data = bvec_kmap_local(&bvec);
+ crypto_shash_digest(shash,
+ data + (i * fs_info->sectorsize),
+ fs_info->sectorsize,
+ sums->sums + index);
+ kunmap_local(data);
+ index += fs_info->csum_size;
+ offset += fs_info->sectorsize;
+ this_sum_bytes += fs_info->sectorsize;
+ total_bytes += fs_info->sectorsize;
+ }
+
+ }
+ this_sum_bytes = 0;
+ btrfs_add_ordered_sum(ordered, sums);
+ btrfs_put_ordered_extent(ordered);
+ return 0;
+}
+
+/*
+ * helper function for csum removal, this expects the
+ * key to describe the csum pointed to by the path, and it expects
+ * the csum to overlap the range [bytenr, len]
+ *
+ * The csum should not be entirely contained in the range and the
+ * range should not be entirely contained in the csum.
+ *
+ * This calls btrfs_truncate_item with the correct args based on the
+ * overlap, and fixes up the key as required.
+ */
+static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ struct btrfs_key *key,
+ u64 bytenr, u64 len)
+{
+ struct extent_buffer *leaf;
+ const u32 csum_size = fs_info->csum_size;
+ u64 csum_end;
+ u64 end_byte = bytenr + len;
+ u32 blocksize_bits = fs_info->sectorsize_bits;
+
+ leaf = path->nodes[0];
+ csum_end = btrfs_item_size(leaf, path->slots[0]) / csum_size;
+ csum_end <<= blocksize_bits;
+ csum_end += key->offset;
+
+ if (key->offset < bytenr && csum_end <= end_byte) {
+ /*
+ * [ bytenr - len ]
+ * [ ]
+ * [csum ]
+ * A simple truncate off the end of the item
+ */
+ u32 new_size = (bytenr - key->offset) >> blocksize_bits;
+ new_size *= csum_size;
+ btrfs_truncate_item(path, new_size, 1);
+ } else if (key->offset >= bytenr && csum_end > end_byte &&
+ end_byte > key->offset) {
+ /*
+ * [ bytenr - len ]
+ * [ ]
+ * [csum ]
+ * we need to truncate from the beginning of the csum
+ */
+ u32 new_size = (csum_end - end_byte) >> blocksize_bits;
+ new_size *= csum_size;
+
+ btrfs_truncate_item(path, new_size, 0);
+
+ key->offset = end_byte;
+ btrfs_set_item_key_safe(fs_info, path, key);
+ } else {
+ BUG();
+ }
+}
+
+/*
+ * deletes the csum items from the csum tree for a given
+ * range of bytes.
+ */
+int btrfs_del_csums(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr, u64 len)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ u64 end_byte = bytenr + len;
+ u64 csum_end;
+ struct extent_buffer *leaf;
+ int ret = 0;
+ const u32 csum_size = fs_info->csum_size;
+ u32 blocksize_bits = fs_info->sectorsize_bits;
+
+ ASSERT(root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID ||
+ root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ while (1) {
+ key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ key.offset = end_byte - 1;
+ key.type = BTRFS_EXTENT_CSUM_KEY;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0) {
+ ret = 0;
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+ } else if (ret < 0) {
+ break;
+ }
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+ if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+ key.type != BTRFS_EXTENT_CSUM_KEY) {
+ break;
+ }
+
+ if (key.offset >= end_byte)
+ break;
+
+ csum_end = btrfs_item_size(leaf, path->slots[0]) / csum_size;
+ csum_end <<= blocksize_bits;
+ csum_end += key.offset;
+
+ /* this csum ends before we start, we're done */
+ if (csum_end <= bytenr)
+ break;
+
+ /* delete the entire item, it is inside our range */
+ if (key.offset >= bytenr && csum_end <= end_byte) {
+ int del_nr = 1;
+
+ /*
+ * Check how many csum items preceding this one in this
+ * leaf correspond to our range and then delete them all
+ * at once.
+ */
+ if (key.offset > bytenr && path->slots[0] > 0) {
+ int slot = path->slots[0] - 1;
+
+ while (slot >= 0) {
+ struct btrfs_key pk;
+
+ btrfs_item_key_to_cpu(leaf, &pk, slot);
+ if (pk.offset < bytenr ||
+ pk.type != BTRFS_EXTENT_CSUM_KEY ||
+ pk.objectid !=
+ BTRFS_EXTENT_CSUM_OBJECTID)
+ break;
+ path->slots[0] = slot;
+ del_nr++;
+ key.offset = pk.offset;
+ slot--;
+ }
+ }
+ ret = btrfs_del_items(trans, root, path,
+ path->slots[0], del_nr);
+ if (ret)
+ break;
+ if (key.offset == bytenr)
+ break;
+ } else if (key.offset < bytenr && csum_end > end_byte) {
+ unsigned long offset;
+ unsigned long shift_len;
+ unsigned long item_offset;
+ /*
+ * [ bytenr - len ]
+ * [csum ]
+ *
+ * Our bytes are in the middle of the csum,
+ * we need to split this item and insert a new one.
+ *
+ * But we can't drop the path because the
+ * csum could change, get removed, extended etc.
+ *
+ * The trick here is the max size of a csum item leaves
+ * enough room in the tree block for a single
+ * item header. So, we split the item in place,
+ * adding a new header pointing to the existing
+ * bytes. Then we loop around again and we have
+ * a nicely formed csum item that we can neatly
+ * truncate.
+ */
+ offset = (bytenr - key.offset) >> blocksize_bits;
+ offset *= csum_size;
+
+ shift_len = (len >> blocksize_bits) * csum_size;
+
+ item_offset = btrfs_item_ptr_offset(leaf,
+ path->slots[0]);
+
+ memzero_extent_buffer(leaf, item_offset + offset,
+ shift_len);
+ key.offset = bytenr;
+
+ /*
+ * btrfs_split_item returns -EAGAIN when the
+ * item changed size or key
+ */
+ ret = btrfs_split_item(trans, root, path, &key, offset);
+ if (ret && ret != -EAGAIN) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+ ret = 0;
+
+ key.offset = end_byte - 1;
+ } else {
+ truncate_one_csum(fs_info, path, &key, bytenr, len);
+ if (key.offset < bytenr)
+ break;
+ }
+ btrfs_release_path(path);
+ }
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int find_next_csum_offset(struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 *next_offset)
+{
+ const u32 nritems = btrfs_header_nritems(path->nodes[0]);
+ struct btrfs_key found_key;
+ int slot = path->slots[0] + 1;
+ int ret;
+
+ if (nritems == 0 || slot >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ *next_offset = (u64)-1;
+ return 0;
+ }
+ slot = path->slots[0];
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
+
+ if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+ found_key.type != BTRFS_EXTENT_CSUM_KEY)
+ *next_offset = (u64)-1;
+ else
+ *next_offset = found_key.offset;
+
+ return 0;
+}
+
+int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_ordered_sum *sums)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_key file_key;
+ struct btrfs_key found_key;
+ struct btrfs_path *path;
+ struct btrfs_csum_item *item;
+ struct btrfs_csum_item *item_end;
+ struct extent_buffer *leaf = NULL;
+ u64 next_offset;
+ u64 total_bytes = 0;
+ u64 csum_offset;
+ u64 bytenr;
+ u32 ins_size;
+ int index = 0;
+ int found_next;
+ int ret;
+ const u32 csum_size = fs_info->csum_size;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+again:
+ next_offset = (u64)-1;
+ found_next = 0;
+ bytenr = sums->bytenr + total_bytes;
+ file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ file_key.offset = bytenr;
+ file_key.type = BTRFS_EXTENT_CSUM_KEY;
+
+ item = btrfs_lookup_csum(trans, root, path, bytenr, 1);
+ if (!IS_ERR(item)) {
+ ret = 0;
+ leaf = path->nodes[0];
+ item_end = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_csum_item);
+ item_end = (struct btrfs_csum_item *)((char *)item_end +
+ btrfs_item_size(leaf, path->slots[0]));
+ goto found;
+ }
+ ret = PTR_ERR(item);
+ if (ret != -EFBIG && ret != -ENOENT)
+ goto out;
+
+ if (ret == -EFBIG) {
+ u32 item_size;
+ /* we found one, but it isn't big enough yet */
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size(leaf, path->slots[0]);
+ if ((item_size / csum_size) >=
+ MAX_CSUM_ITEMS(fs_info, csum_size)) {
+ /* already at max size, make a new one */
+ goto insert;
+ }
+ } else {
+ /* We didn't find a csum item, insert one. */
+ ret = find_next_csum_offset(root, path, &next_offset);
+ if (ret < 0)
+ goto out;
+ found_next = 1;
+ goto insert;
+ }
+
+ /*
+ * At this point, we know the tree has a checksum item that ends at an
+ * offset matching the start of the checksum range we want to insert.
+ * We try to extend that item as much as possible and then add as many
+ * checksums to it as they fit.
+ *
+ * First check if the leaf has enough free space for at least one
+ * checksum. If it has go directly to the item extension code, otherwise
+ * release the path and do a search for insertion before the extension.
+ */
+ if (btrfs_leaf_free_space(leaf) >= csum_size) {
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ csum_offset = (bytenr - found_key.offset) >>
+ fs_info->sectorsize_bits;
+ goto extend_csum;
+ }
+
+ btrfs_release_path(path);
+ path->search_for_extension = 1;
+ ret = btrfs_search_slot(trans, root, &file_key, path,
+ csum_size, 1);
+ path->search_for_extension = 0;
+ if (ret < 0)
+ goto out;
+
+ if (ret > 0) {
+ if (path->slots[0] == 0)
+ goto insert;
+ path->slots[0]--;
+ }
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ csum_offset = (bytenr - found_key.offset) >> fs_info->sectorsize_bits;
+
+ if (found_key.type != BTRFS_EXTENT_CSUM_KEY ||
+ found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+ csum_offset >= MAX_CSUM_ITEMS(fs_info, csum_size)) {
+ goto insert;
+ }
+
+extend_csum:
+ if (csum_offset == btrfs_item_size(leaf, path->slots[0]) /
+ csum_size) {
+ int extend_nr;
+ u64 tmp;
+ u32 diff;
+
+ tmp = sums->len - total_bytes;
+ tmp >>= fs_info->sectorsize_bits;
+ WARN_ON(tmp < 1);
+ extend_nr = max_t(int, 1, tmp);
+
+ /*
+ * A log tree can already have checksum items with a subset of
+ * the checksums we are trying to log. This can happen after
+ * doing a sequence of partial writes into prealloc extents and
+ * fsyncs in between, with a full fsync logging a larger subrange
+ * of an extent for which a previous fast fsync logged a smaller
+ * subrange. And this happens in particular due to merging file
+ * extent items when we complete an ordered extent for a range
+ * covered by a prealloc extent - this is done at
+ * btrfs_mark_extent_written().
+ *
+ * So if we try to extend the previous checksum item, which has
+ * a range that ends at the start of the range we want to insert,
+ * make sure we don't extend beyond the start offset of the next
+ * checksum item. If we are at the last item in the leaf, then
+ * forget the optimization of extending and add a new checksum
+ * item - it is not worth the complexity of releasing the path,
+ * getting the first key for the next leaf, repeat the btree
+ * search, etc, because log trees are temporary anyway and it
+ * would only save a few bytes of leaf space.
+ */
+ if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+ if (path->slots[0] + 1 >=
+ btrfs_header_nritems(path->nodes[0])) {
+ ret = find_next_csum_offset(root, path, &next_offset);
+ if (ret < 0)
+ goto out;
+ found_next = 1;
+ goto insert;
+ }
+
+ ret = find_next_csum_offset(root, path, &next_offset);
+ if (ret < 0)
+ goto out;
+
+ tmp = (next_offset - bytenr) >> fs_info->sectorsize_bits;
+ if (tmp <= INT_MAX)
+ extend_nr = min_t(int, extend_nr, tmp);
+ }
+
+ diff = (csum_offset + extend_nr) * csum_size;
+ diff = min(diff,
+ MAX_CSUM_ITEMS(fs_info, csum_size) * csum_size);
+
+ diff = diff - btrfs_item_size(leaf, path->slots[0]);
+ diff = min_t(u32, btrfs_leaf_free_space(leaf), diff);
+ diff /= csum_size;
+ diff *= csum_size;
+
+ btrfs_extend_item(path, diff);
+ ret = 0;
+ goto csum;
+ }
+
+insert:
+ btrfs_release_path(path);
+ csum_offset = 0;
+ if (found_next) {
+ u64 tmp;
+
+ tmp = sums->len - total_bytes;
+ tmp >>= fs_info->sectorsize_bits;
+ tmp = min(tmp, (next_offset - file_key.offset) >>
+ fs_info->sectorsize_bits);
+
+ tmp = max_t(u64, 1, tmp);
+ tmp = min_t(u64, tmp, MAX_CSUM_ITEMS(fs_info, csum_size));
+ ins_size = csum_size * tmp;
+ } else {
+ ins_size = csum_size;
+ }
+ ret = btrfs_insert_empty_item(trans, root, path, &file_key,
+ ins_size);
+ if (ret < 0)
+ goto out;
+ if (WARN_ON(ret != 0))
+ goto out;
+ leaf = path->nodes[0];
+csum:
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
+ item_end = (struct btrfs_csum_item *)((unsigned char *)item +
+ btrfs_item_size(leaf, path->slots[0]));
+ item = (struct btrfs_csum_item *)((unsigned char *)item +
+ csum_offset * csum_size);
+found:
+ ins_size = (u32)(sums->len - total_bytes) >> fs_info->sectorsize_bits;
+ ins_size *= csum_size;
+ ins_size = min_t(u32, (unsigned long)item_end - (unsigned long)item,
+ ins_size);
+ write_extent_buffer(leaf, sums->sums + index, (unsigned long)item,
+ ins_size);
+
+ index += ins_size;
+ ins_size /= csum_size;
+ total_bytes += ins_size * fs_info->sectorsize;
+
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+ if (total_bytes < sums->len) {
+ btrfs_release_path(path);
+ cond_resched();
+ goto again;
+ }
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
+ const struct btrfs_path *path,
+ struct btrfs_file_extent_item *fi,
+ const bool new_inline,
+ struct extent_map *em)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_root *root = inode->root;
+ struct extent_buffer *leaf = path->nodes[0];
+ const int slot = path->slots[0];
+ struct btrfs_key key;
+ u64 extent_start, extent_end;
+ u64 bytenr;
+ u8 type = btrfs_file_extent_type(leaf, fi);
+ int compress_type = btrfs_file_extent_compression(leaf, fi);
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ extent_start = key.offset;
+ extent_end = btrfs_file_extent_end(path);
+ em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+ em->generation = btrfs_file_extent_generation(leaf, fi);
+ if (type == BTRFS_FILE_EXTENT_REG ||
+ type == BTRFS_FILE_EXTENT_PREALLOC) {
+ em->start = extent_start;
+ em->len = extent_end - extent_start;
+ em->orig_start = extent_start -
+ btrfs_file_extent_offset(leaf, fi);
+ em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ if (bytenr == 0) {
+ em->block_start = EXTENT_MAP_HOLE;
+ return;
+ }
+ if (compress_type != BTRFS_COMPRESS_NONE) {
+ set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ em->compress_type = compress_type;
+ em->block_start = bytenr;
+ em->block_len = em->orig_block_len;
+ } else {
+ bytenr += btrfs_file_extent_offset(leaf, fi);
+ em->block_start = bytenr;
+ em->block_len = em->len;
+ if (type == BTRFS_FILE_EXTENT_PREALLOC)
+ set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+ }
+ } else if (type == BTRFS_FILE_EXTENT_INLINE) {
+ em->block_start = EXTENT_MAP_INLINE;
+ em->start = extent_start;
+ em->len = extent_end - extent_start;
+ /*
+ * Initialize orig_start and block_len with the same values
+ * as in inode.c:btrfs_get_extent().
+ */
+ em->orig_start = EXTENT_MAP_HOLE;
+ em->block_len = (u64)-1;
+ if (!new_inline && compress_type != BTRFS_COMPRESS_NONE) {
+ set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ em->compress_type = compress_type;
+ }
+ } else {
+ btrfs_err(fs_info,
+ "unknown file extent item type %d, inode %llu, offset %llu, "
+ "root %llu", type, btrfs_ino(inode), extent_start,
+ root->root_key.objectid);
+ }
+}
+
+/*
+ * Returns the end offset (non inclusive) of the file extent item the given path
+ * points to. If it points to an inline extent, the returned offset is rounded
+ * up to the sector size.
+ */
+u64 btrfs_file_extent_end(const struct btrfs_path *path)
+{
+ const struct extent_buffer *leaf = path->nodes[0];
+ const int slot = path->slots[0];
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+ u64 end;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) {
+ end = btrfs_file_extent_ram_bytes(leaf, fi);
+ end = ALIGN(key.offset + end, leaf->fs_info->sectorsize);
+ } else {
+ end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+ }
+
+ return end;
+}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
new file mode 100644
index 000000000..1783a0fbf
--- /dev/null
+++ b/fs/btrfs/file.c
@@ -0,0 +1,3856 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ */
+
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/backing-dev.h>
+#include <linux/falloc.h>
+#include <linux/writeback.h>
+#include <linux/compat.h>
+#include <linux/slab.h>
+#include <linux/btrfs.h>
+#include <linux/uio.h>
+#include <linux/iversion.h>
+#include <linux/fsverity.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "print-tree.h"
+#include "tree-log.h"
+#include "locking.h"
+#include "volumes.h"
+#include "qgroup.h"
+#include "compression.h"
+#include "delalloc-space.h"
+#include "reflink.h"
+#include "subpage.h"
+
+/* simple helper to fault in pages and copy. This should go away
+ * and be replaced with calls into generic code.
+ */
+static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
+ struct page **prepared_pages,
+ struct iov_iter *i)
+{
+ size_t copied = 0;
+ size_t total_copied = 0;
+ int pg = 0;
+ int offset = offset_in_page(pos);
+
+ while (write_bytes > 0) {
+ size_t count = min_t(size_t,
+ PAGE_SIZE - offset, write_bytes);
+ struct page *page = prepared_pages[pg];
+ /*
+ * Copy data from userspace to the current page
+ */
+ copied = copy_page_from_iter_atomic(page, offset, count, i);
+
+ /* Flush processor's dcache for this page */
+ flush_dcache_page(page);
+
+ /*
+ * if we get a partial write, we can end up with
+ * partially up to date pages. These add
+ * a lot of complexity, so make sure they don't
+ * happen by forcing this copy to be retried.
+ *
+ * The rest of the btrfs_file_write code will fall
+ * back to page at a time copies after we return 0.
+ */
+ if (unlikely(copied < count)) {
+ if (!PageUptodate(page)) {
+ iov_iter_revert(i, copied);
+ copied = 0;
+ }
+ if (!copied)
+ break;
+ }
+
+ write_bytes -= copied;
+ total_copied += copied;
+ offset += copied;
+ if (offset == PAGE_SIZE) {
+ pg++;
+ offset = 0;
+ }
+ }
+ return total_copied;
+}
+
+/*
+ * unlocks pages after btrfs_file_write is done with them
+ */
+static void btrfs_drop_pages(struct btrfs_fs_info *fs_info,
+ struct page **pages, size_t num_pages,
+ u64 pos, u64 copied)
+{
+ size_t i;
+ u64 block_start = round_down(pos, fs_info->sectorsize);
+ u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
+
+ ASSERT(block_len <= U32_MAX);
+ for (i = 0; i < num_pages; i++) {
+ /* page checked is some magic around finding pages that
+ * have been modified without going through btrfs_set_page_dirty
+ * clear it here. There should be no need to mark the pages
+ * accessed as prepare_pages should have marked them accessed
+ * in prepare_pages via find_or_create_page()
+ */
+ btrfs_page_clamp_clear_checked(fs_info, pages[i], block_start,
+ block_len);
+ unlock_page(pages[i]);
+ put_page(pages[i]);
+ }
+}
+
+/*
+ * After btrfs_copy_from_user(), update the following things for delalloc:
+ * - Mark newly dirtied pages as DELALLOC in the io tree.
+ * Used to advise which range is to be written back.
+ * - Mark modified pages as Uptodate/Dirty and not needing COW fixup
+ * - Update inode size for past EOF write
+ */
+int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
+ size_t num_pages, loff_t pos, size_t write_bytes,
+ struct extent_state **cached, bool noreserve)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ int err = 0;
+ int i;
+ u64 num_bytes;
+ u64 start_pos;
+ u64 end_of_last_block;
+ u64 end_pos = pos + write_bytes;
+ loff_t isize = i_size_read(&inode->vfs_inode);
+ unsigned int extra_bits = 0;
+
+ if (write_bytes == 0)
+ return 0;
+
+ if (noreserve)
+ extra_bits |= EXTENT_NORESERVE;
+
+ start_pos = round_down(pos, fs_info->sectorsize);
+ num_bytes = round_up(write_bytes + pos - start_pos,
+ fs_info->sectorsize);
+ ASSERT(num_bytes <= U32_MAX);
+
+ end_of_last_block = start_pos + num_bytes - 1;
+
+ /*
+ * The pages may have already been dirty, clear out old accounting so
+ * we can set things up properly
+ */
+ clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+ cached);
+
+ err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
+ extra_bits, cached);
+ if (err)
+ return err;
+
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = pages[i];
+
+ btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes);
+ btrfs_page_clamp_clear_checked(fs_info, p, start_pos, num_bytes);
+ btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes);
+ }
+
+ /*
+ * we've only changed i_size in ram, and we haven't updated
+ * the disk i_size. There is no need to log the inode
+ * at this time.
+ */
+ if (end_pos > isize)
+ i_size_write(&inode->vfs_inode, end_pos);
+ return 0;
+}
+
+/*
+ * this is very complex, but the basic idea is to drop all extents
+ * in the range start - end. hint_block is filled in with a block number
+ * that would be a good hint to the block allocator for this file.
+ *
+ * If an extent intersects the range but is not entirely inside the range
+ * it is either truncated or split. Anything entirely inside the range
+ * is deleted from the tree.
+ *
+ * Note: the VFS' inode number of bytes is not updated, it's up to the caller
+ * to deal with that. We set the field 'bytes_found' of the arguments structure
+ * with the number of allocated bytes found in the target range, so that the
+ * caller can update the inode's number of bytes in an atomic way when
+ * replacing extents in a range to avoid races with stat(2).
+ */
+int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_drop_extents_args *args)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_ref ref = { 0 };
+ struct btrfs_key key;
+ struct btrfs_key new_key;
+ u64 ino = btrfs_ino(inode);
+ u64 search_start = args->start;
+ u64 disk_bytenr = 0;
+ u64 num_bytes = 0;
+ u64 extent_offset = 0;
+ u64 extent_end = 0;
+ u64 last_end = args->start;
+ int del_nr = 0;
+ int del_slot = 0;
+ int extent_type;
+ int recow;
+ int ret;
+ int modify_tree = -1;
+ int update_refs;
+ int found = 0;
+ struct btrfs_path *path = args->path;
+
+ args->bytes_found = 0;
+ args->extent_inserted = false;
+
+ /* Must always have a path if ->replace_extent is true */
+ ASSERT(!(args->replace_extent && !args->path));
+
+ if (!path) {
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ if (args->drop_cache)
+ btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false);
+
+ if (args->start >= inode->disk_i_size && !args->replace_extent)
+ modify_tree = 0;
+
+ update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
+ while (1) {
+ recow = 0;
+ ret = btrfs_lookup_file_extent(trans, root, path, ino,
+ search_start, modify_tree);
+ if (ret < 0)
+ break;
+ if (ret > 0 && path->slots[0] > 0 && search_start == args->start) {
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
+ if (key.objectid == ino &&
+ key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
+ ret = 0;
+next_slot:
+ leaf = path->nodes[0];
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ BUG_ON(del_nr > 0);
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ break;
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ leaf = path->nodes[0];
+ recow = 1;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+ if (key.objectid > ino)
+ break;
+ if (WARN_ON_ONCE(key.objectid < ino) ||
+ key.type < BTRFS_EXTENT_DATA_KEY) {
+ ASSERT(del_nr == 0);
+ path->slots[0]++;
+ goto next_slot;
+ }
+ if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end)
+ break;
+
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(leaf, fi);
+
+ if (extent_type == BTRFS_FILE_EXTENT_REG ||
+ extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ extent_offset = btrfs_file_extent_offset(leaf, fi);
+ extent_end = key.offset +
+ btrfs_file_extent_num_bytes(leaf, fi);
+ } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ extent_end = key.offset +
+ btrfs_file_extent_ram_bytes(leaf, fi);
+ } else {
+ /* can't happen */
+ BUG();
+ }
+
+ /*
+ * Don't skip extent items representing 0 byte lengths. They
+ * used to be created (bug) if while punching holes we hit
+ * -ENOSPC condition. So if we find one here, just ensure we
+ * delete it, otherwise we would insert a new file extent item
+ * with the same key (offset) as that 0 bytes length file
+ * extent item in the call to setup_items_for_insert() later
+ * in this function.
+ */
+ if (extent_end == key.offset && extent_end >= search_start) {
+ last_end = extent_end;
+ goto delete_extent_item;
+ }
+
+ if (extent_end <= search_start) {
+ path->slots[0]++;
+ goto next_slot;
+ }
+
+ found = 1;
+ search_start = max(key.offset, args->start);
+ if (recow || !modify_tree) {
+ modify_tree = -1;
+ btrfs_release_path(path);
+ continue;
+ }
+
+ /*
+ * | - range to drop - |
+ * | -------- extent -------- |
+ */
+ if (args->start > key.offset && args->end < extent_end) {
+ BUG_ON(del_nr > 0);
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ ret = -EOPNOTSUPP;
+ break;
+ }
+
+ memcpy(&new_key, &key, sizeof(new_key));
+ new_key.offset = args->start;
+ ret = btrfs_duplicate_item(trans, root, path,
+ &new_key);
+ if (ret == -EAGAIN) {
+ btrfs_release_path(path);
+ continue;
+ }
+ if (ret < 0)
+ break;
+
+ leaf = path->nodes[0];
+ fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ args->start - key.offset);
+
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ extent_offset += args->start - key.offset;
+ btrfs_set_file_extent_offset(leaf, fi, extent_offset);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_end - args->start);
+ btrfs_mark_buffer_dirty(leaf);
+
+ if (update_refs && disk_bytenr > 0) {
+ btrfs_init_generic_ref(&ref,
+ BTRFS_ADD_DELAYED_REF,
+ disk_bytenr, num_bytes, 0);
+ btrfs_init_data_ref(&ref,
+ root->root_key.objectid,
+ new_key.objectid,
+ args->start - extent_offset,
+ 0, false);
+ ret = btrfs_inc_extent_ref(trans, &ref);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+ }
+ key.offset = args->start;
+ }
+ /*
+ * From here on out we will have actually dropped something, so
+ * last_end can be updated.
+ */
+ last_end = extent_end;
+
+ /*
+ * | ---- range to drop ----- |
+ * | -------- extent -------- |
+ */
+ if (args->start <= key.offset && args->end < extent_end) {
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ ret = -EOPNOTSUPP;
+ break;
+ }
+
+ memcpy(&new_key, &key, sizeof(new_key));
+ new_key.offset = args->end;
+ btrfs_set_item_key_safe(fs_info, path, &new_key);
+
+ extent_offset += args->end - key.offset;
+ btrfs_set_file_extent_offset(leaf, fi, extent_offset);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_end - args->end);
+ btrfs_mark_buffer_dirty(leaf);
+ if (update_refs && disk_bytenr > 0)
+ args->bytes_found += args->end - key.offset;
+ break;
+ }
+
+ search_start = extent_end;
+ /*
+ * | ---- range to drop ----- |
+ * | -------- extent -------- |
+ */
+ if (args->start > key.offset && args->end >= extent_end) {
+ BUG_ON(del_nr > 0);
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ ret = -EOPNOTSUPP;
+ break;
+ }
+
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ args->start - key.offset);
+ btrfs_mark_buffer_dirty(leaf);
+ if (update_refs && disk_bytenr > 0)
+ args->bytes_found += extent_end - args->start;
+ if (args->end == extent_end)
+ break;
+
+ path->slots[0]++;
+ goto next_slot;
+ }
+
+ /*
+ * | ---- range to drop ----- |
+ * | ------ extent ------ |
+ */
+ if (args->start <= key.offset && args->end >= extent_end) {
+delete_extent_item:
+ if (del_nr == 0) {
+ del_slot = path->slots[0];
+ del_nr = 1;
+ } else {
+ BUG_ON(del_slot + del_nr != path->slots[0]);
+ del_nr++;
+ }
+
+ if (update_refs &&
+ extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ args->bytes_found += extent_end - key.offset;
+ extent_end = ALIGN(extent_end,
+ fs_info->sectorsize);
+ } else if (update_refs && disk_bytenr > 0) {
+ btrfs_init_generic_ref(&ref,
+ BTRFS_DROP_DELAYED_REF,
+ disk_bytenr, num_bytes, 0);
+ btrfs_init_data_ref(&ref,
+ root->root_key.objectid,
+ key.objectid,
+ key.offset - extent_offset, 0,
+ false);
+ ret = btrfs_free_extent(trans, &ref);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+ args->bytes_found += extent_end - key.offset;
+ }
+
+ if (args->end == extent_end)
+ break;
+
+ if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
+ path->slots[0]++;
+ goto next_slot;
+ }
+
+ ret = btrfs_del_items(trans, root, path, del_slot,
+ del_nr);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+
+ del_nr = 0;
+ del_slot = 0;
+
+ btrfs_release_path(path);
+ continue;
+ }
+
+ BUG();
+ }
+
+ if (!ret && del_nr > 0) {
+ /*
+ * Set path->slots[0] to first slot, so that after the delete
+ * if items are move off from our leaf to its immediate left or
+ * right neighbor leafs, we end up with a correct and adjusted
+ * path->slots[0] for our insertion (if args->replace_extent).
+ */
+ path->slots[0] = del_slot;
+ ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ }
+
+ leaf = path->nodes[0];
+ /*
+ * If btrfs_del_items() was called, it might have deleted a leaf, in
+ * which case it unlocked our path, so check path->locks[0] matches a
+ * write lock.
+ */
+ if (!ret && args->replace_extent &&
+ path->locks[0] == BTRFS_WRITE_LOCK &&
+ btrfs_leaf_free_space(leaf) >=
+ sizeof(struct btrfs_item) + args->extent_item_size) {
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = args->start;
+ if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
+ struct btrfs_key slot_key;
+
+ btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
+ if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
+ path->slots[0]++;
+ }
+ btrfs_setup_item_for_insert(root, path, &key, args->extent_item_size);
+ args->extent_inserted = true;
+ }
+
+ if (!args->path)
+ btrfs_free_path(path);
+ else if (!args->extent_inserted)
+ btrfs_release_path(path);
+out:
+ args->drop_end = found ? min(args->end, last_end) : args->end;
+
+ return ret;
+}
+
+static int extent_mergeable(struct extent_buffer *leaf, int slot,
+ u64 objectid, u64 bytenr, u64 orig_offset,
+ u64 *start, u64 *end)
+{
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+ u64 extent_end;
+
+ if (slot < 0 || slot >= btrfs_header_nritems(leaf))
+ return 0;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
+ return 0;
+
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
+ btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
+ btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
+ btrfs_file_extent_compression(leaf, fi) ||
+ btrfs_file_extent_encryption(leaf, fi) ||
+ btrfs_file_extent_other_encoding(leaf, fi))
+ return 0;
+
+ extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+ if ((*start && *start != key.offset) || (*end && *end != extent_end))
+ return 0;
+
+ *start = key.offset;
+ *end = extent_end;
+ return 1;
+}
+
+/*
+ * Mark extent in the range start - end as written.
+ *
+ * This changes extent type from 'pre-allocated' to 'regular'. If only
+ * part of extent is marked as written, the extent will be split into
+ * two or three.
+ */
+int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode, u64 start, u64 end)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *root = inode->root;
+ struct extent_buffer *leaf;
+ struct btrfs_path *path;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_ref ref = { 0 };
+ struct btrfs_key key;
+ struct btrfs_key new_key;
+ u64 bytenr;
+ u64 num_bytes;
+ u64 extent_end;
+ u64 orig_offset;
+ u64 other_start;
+ u64 other_end;
+ u64 split;
+ int del_nr = 0;
+ int del_slot = 0;
+ int recow;
+ int ret = 0;
+ u64 ino = btrfs_ino(inode);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+again:
+ recow = 0;
+ split = start;
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = split;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+ if (ret > 0 && path->slots[0] > 0)
+ path->slots[0]--;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != ino ||
+ key.type != BTRFS_EXTENT_DATA_KEY) {
+ ret = -EINVAL;
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) {
+ ret = -EINVAL;
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+ if (key.offset > start || extent_end < end) {
+ ret = -EINVAL;
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
+ memcpy(&new_key, &key, sizeof(new_key));
+
+ if (start == key.offset && end < extent_end) {
+ other_start = 0;
+ other_end = start;
+ if (extent_mergeable(leaf, path->slots[0] - 1,
+ ino, bytenr, orig_offset,
+ &other_start, &other_end)) {
+ new_key.offset = end;
+ btrfs_set_item_key_safe(fs_info, path, &new_key);
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi,
+ trans->transid);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_end - end);
+ btrfs_set_file_extent_offset(leaf, fi,
+ end - orig_offset);
+ fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi,
+ trans->transid);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ end - other_start);
+ btrfs_mark_buffer_dirty(leaf);
+ goto out;
+ }
+ }
+
+ if (start > key.offset && end == extent_end) {
+ other_start = end;
+ other_end = 0;
+ if (extent_mergeable(leaf, path->slots[0] + 1,
+ ino, bytenr, orig_offset,
+ &other_start, &other_end)) {
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ start - key.offset);
+ btrfs_set_file_extent_generation(leaf, fi,
+ trans->transid);
+ path->slots[0]++;
+ new_key.offset = start;
+ btrfs_set_item_key_safe(fs_info, path, &new_key);
+
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi,
+ trans->transid);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ other_end - start);
+ btrfs_set_file_extent_offset(leaf, fi,
+ start - orig_offset);
+ btrfs_mark_buffer_dirty(leaf);
+ goto out;
+ }
+ }
+
+ while (start > key.offset || end < extent_end) {
+ if (key.offset == start)
+ split = end;
+
+ new_key.offset = split;
+ ret = btrfs_duplicate_item(trans, root, path, &new_key);
+ if (ret == -EAGAIN) {
+ btrfs_release_path(path);
+ goto again;
+ }
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ split - key.offset);
+
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+ btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_end - split);
+ btrfs_mark_buffer_dirty(leaf);
+
+ btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
+ num_bytes, 0);
+ btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
+ orig_offset, 0, false);
+ ret = btrfs_inc_extent_ref(trans, &ref);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ if (split == start) {
+ key.offset = start;
+ } else {
+ if (start != key.offset) {
+ ret = -EINVAL;
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ path->slots[0]--;
+ extent_end = end;
+ }
+ recow = 1;
+ }
+
+ other_start = end;
+ other_end = 0;
+ btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
+ num_bytes, 0);
+ btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset,
+ 0, false);
+ if (extent_mergeable(leaf, path->slots[0] + 1,
+ ino, bytenr, orig_offset,
+ &other_start, &other_end)) {
+ if (recow) {
+ btrfs_release_path(path);
+ goto again;
+ }
+ extent_end = other_end;
+ del_slot = path->slots[0] + 1;
+ del_nr++;
+ ret = btrfs_free_extent(trans, &ref);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ }
+ other_start = 0;
+ other_end = start;
+ if (extent_mergeable(leaf, path->slots[0] - 1,
+ ino, bytenr, orig_offset,
+ &other_start, &other_end)) {
+ if (recow) {
+ btrfs_release_path(path);
+ goto again;
+ }
+ key.offset = other_start;
+ del_slot = path->slots[0];
+ del_nr++;
+ ret = btrfs_free_extent(trans, &ref);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ }
+ if (del_nr == 0) {
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_type(leaf, fi,
+ BTRFS_FILE_EXTENT_REG);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+ btrfs_mark_buffer_dirty(leaf);
+ } else {
+ fi = btrfs_item_ptr(leaf, del_slot - 1,
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_type(leaf, fi,
+ BTRFS_FILE_EXTENT_REG);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_end - key.offset);
+ btrfs_mark_buffer_dirty(leaf);
+
+ ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ }
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * on error we return an unlocked page and the error value
+ * on success we return a locked page and 0
+ */
+static int prepare_uptodate_page(struct inode *inode,
+ struct page *page, u64 pos,
+ bool force_uptodate)
+{
+ struct folio *folio = page_folio(page);
+ int ret = 0;
+
+ if (((pos & (PAGE_SIZE - 1)) || force_uptodate) &&
+ !PageUptodate(page)) {
+ ret = btrfs_read_folio(NULL, folio);
+ if (ret)
+ return ret;
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ return -EIO;
+ }
+
+ /*
+ * Since btrfs_read_folio() will unlock the folio before it
+ * returns, there is a window where btrfs_release_folio() can be
+ * called to release the page. Here we check both inode
+ * mapping and PagePrivate() to make sure the page was not
+ * released.
+ *
+ * The private flag check is essential for subpage as we need
+ * to store extra bitmap using page->private.
+ */
+ if (page->mapping != inode->i_mapping || !PagePrivate(page)) {
+ unlock_page(page);
+ return -EAGAIN;
+ }
+ }
+ return 0;
+}
+
+static unsigned int get_prepare_fgp_flags(bool nowait)
+{
+ unsigned int fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;
+
+ if (nowait)
+ fgp_flags |= FGP_NOWAIT;
+
+ return fgp_flags;
+}
+
+static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
+{
+ gfp_t gfp;
+
+ gfp = btrfs_alloc_write_mask(inode->i_mapping);
+ if (nowait) {
+ gfp &= ~__GFP_DIRECT_RECLAIM;
+ gfp |= GFP_NOWAIT;
+ }
+
+ return gfp;
+}
+
+/*
+ * this just gets pages into the page cache and locks them down.
+ */
+static noinline int prepare_pages(struct inode *inode, struct page **pages,
+ size_t num_pages, loff_t pos,
+ size_t write_bytes, bool force_uptodate,
+ bool nowait)
+{
+ int i;
+ unsigned long index = pos >> PAGE_SHIFT;
+ gfp_t mask = get_prepare_gfp_flags(inode, nowait);
+ unsigned int fgp_flags = get_prepare_fgp_flags(nowait);
+ int err = 0;
+ int faili;
+
+ for (i = 0; i < num_pages; i++) {
+again:
+ pages[i] = pagecache_get_page(inode->i_mapping, index + i,
+ fgp_flags, mask | __GFP_WRITE);
+ if (!pages[i]) {
+ faili = i - 1;
+ if (nowait)
+ err = -EAGAIN;
+ else
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ err = set_page_extent_mapped(pages[i]);
+ if (err < 0) {
+ faili = i;
+ goto fail;
+ }
+
+ if (i == 0)
+ err = prepare_uptodate_page(inode, pages[i], pos,
+ force_uptodate);
+ if (!err && i == num_pages - 1)
+ err = prepare_uptodate_page(inode, pages[i],
+ pos + write_bytes, false);
+ if (err) {
+ put_page(pages[i]);
+ if (!nowait && err == -EAGAIN) {
+ err = 0;
+ goto again;
+ }
+ faili = i - 1;
+ goto fail;
+ }
+ wait_on_page_writeback(pages[i]);
+ }
+
+ return 0;
+fail:
+ while (faili >= 0) {
+ unlock_page(pages[faili]);
+ put_page(pages[faili]);
+ faili--;
+ }
+ return err;
+
+}
+
+/*
+ * This function locks the extent and properly waits for data=ordered extents
+ * to finish before allowing the pages to be modified if need.
+ *
+ * The return value:
+ * 1 - the extent is locked
+ * 0 - the extent is not locked, and everything is OK
+ * -EAGAIN - need re-prepare the pages
+ * the other < 0 number - Something wrong happens
+ */
+static noinline int
+lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
+ size_t num_pages, loff_t pos,
+ size_t write_bytes,
+ u64 *lockstart, u64 *lockend, bool nowait,
+ struct extent_state **cached_state)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ u64 start_pos;
+ u64 last_pos;
+ int i;
+ int ret = 0;
+
+ start_pos = round_down(pos, fs_info->sectorsize);
+ last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
+
+ if (start_pos < inode->vfs_inode.i_size) {
+ struct btrfs_ordered_extent *ordered;
+
+ if (nowait) {
+ if (!try_lock_extent(&inode->io_tree, start_pos, last_pos)) {
+ for (i = 0; i < num_pages; i++) {
+ unlock_page(pages[i]);
+ put_page(pages[i]);
+ pages[i] = NULL;
+ }
+
+ return -EAGAIN;
+ }
+ } else {
+ lock_extent(&inode->io_tree, start_pos, last_pos, cached_state);
+ }
+
+ ordered = btrfs_lookup_ordered_range(inode, start_pos,
+ last_pos - start_pos + 1);
+ if (ordered &&
+ ordered->file_offset + ordered->num_bytes > start_pos &&
+ ordered->file_offset <= last_pos) {
+ unlock_extent(&inode->io_tree, start_pos, last_pos,
+ cached_state);
+ for (i = 0; i < num_pages; i++) {
+ unlock_page(pages[i]);
+ put_page(pages[i]);
+ }
+ btrfs_start_ordered_extent(ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ return -EAGAIN;
+ }
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+
+ *lockstart = start_pos;
+ *lockend = last_pos;
+ ret = 1;
+ }
+
+ /*
+ * We should be called after prepare_pages() which should have locked
+ * all pages in the range.
+ */
+ for (i = 0; i < num_pages; i++)
+ WARN_ON(!PageLocked(pages[i]));
+
+ return ret;
+}
+
+/*
+ * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
+ *
+ * @pos: File offset.
+ * @write_bytes: The length to write, will be updated to the nocow writeable
+ * range.
+ *
+ * This function will flush ordered extents in the range to ensure proper
+ * nocow checks.
+ *
+ * Return:
+ * > 0 If we can nocow, and updates @write_bytes.
+ * 0 If we can't do a nocow write.
+ * -EAGAIN If we can't do a nocow write because snapshoting of the inode's
+ * root is in progress.
+ * < 0 If an error happened.
+ *
+ * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
+ */
+int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
+ size_t *write_bytes, bool nowait)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_root *root = inode->root;
+ u64 lockstart, lockend;
+ u64 num_bytes;
+ int ret;
+
+ if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
+ return 0;
+
+ if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
+ return -EAGAIN;
+
+ lockstart = round_down(pos, fs_info->sectorsize);
+ lockend = round_up(pos + *write_bytes,
+ fs_info->sectorsize) - 1;
+ num_bytes = lockend - lockstart + 1;
+
+ if (nowait) {
+ if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend)) {
+ btrfs_drew_write_unlock(&root->snapshot_lock);
+ return -EAGAIN;
+ }
+ } else {
+ btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, NULL);
+ }
+ ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
+ NULL, NULL, NULL, nowait, false);
+ if (ret <= 0)
+ btrfs_drew_write_unlock(&root->snapshot_lock);
+ else
+ *write_bytes = min_t(size_t, *write_bytes ,
+ num_bytes - pos + lockstart);
+ unlock_extent(&inode->io_tree, lockstart, lockend, NULL);
+
+ return ret;
+}
+
+void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
+{
+ btrfs_drew_write_unlock(&inode->root->snapshot_lock);
+}
+
+static void update_time_for_write(struct inode *inode)
+{
+ struct timespec64 now;
+
+ if (IS_NOCMTIME(inode))
+ return;
+
+ now = current_time(inode);
+ if (!timespec64_equal(&inode->i_mtime, &now))
+ inode->i_mtime = now;
+
+ if (!timespec64_equal(&inode->i_ctime, &now))
+ inode->i_ctime = now;
+
+ if (IS_I_VERSION(inode))
+ inode_inc_iversion(inode);
+}
+
+static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
+ size_t count)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ loff_t pos = iocb->ki_pos;
+ int ret;
+ loff_t oldsize;
+ loff_t start_pos;
+
+ /*
+ * Quickly bail out on NOWAIT writes if we don't have the nodatacow or
+ * prealloc flags, as without those flags we always have to COW. We will
+ * later check if we can really COW into the target range (using
+ * can_nocow_extent() at btrfs_get_blocks_direct_write()).
+ */
+ if ((iocb->ki_flags & IOCB_NOWAIT) &&
+ !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
+ return -EAGAIN;
+
+ current->backing_dev_info = inode_to_bdi(inode);
+ ret = file_remove_privs(file);
+ if (ret)
+ return ret;
+
+ /*
+ * We reserve space for updating the inode when we reserve space for the
+ * extent we are going to write, so we will enospc out there. We don't
+ * need to start yet another transaction to update the inode as we will
+ * update the inode when we finish writing whatever data we write.
+ */
+ update_time_for_write(inode);
+
+ start_pos = round_down(pos, fs_info->sectorsize);
+ oldsize = i_size_read(inode);
+ if (start_pos > oldsize) {
+ /* Expand hole size to cover write data, preventing empty gap */
+ loff_t end_pos = round_up(pos + count, fs_info->sectorsize);
+
+ ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos);
+ if (ret) {
+ current->backing_dev_info = NULL;
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
+ struct iov_iter *i)
+{
+ struct file *file = iocb->ki_filp;
+ loff_t pos;
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct page **pages = NULL;
+ struct extent_changeset *data_reserved = NULL;
+ u64 release_bytes = 0;
+ u64 lockstart;
+ u64 lockend;
+ size_t num_written = 0;
+ int nrptrs;
+ ssize_t ret;
+ bool only_release_metadata = false;
+ bool force_page_uptodate = false;
+ loff_t old_isize = i_size_read(inode);
+ unsigned int ilock_flags = 0;
+ const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
+ unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
+
+ if (nowait)
+ ilock_flags |= BTRFS_ILOCK_TRY;
+
+ ret = btrfs_inode_lock(inode, ilock_flags);
+ if (ret < 0)
+ return ret;
+
+ ret = generic_write_checks(iocb, i);
+ if (ret <= 0)
+ goto out;
+
+ ret = btrfs_write_check(iocb, i, ret);
+ if (ret < 0)
+ goto out;
+
+ pos = iocb->ki_pos;
+ nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
+ PAGE_SIZE / (sizeof(struct page *)));
+ nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
+ nrptrs = max(nrptrs, 8);
+ pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
+ if (!pages) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ while (iov_iter_count(i) > 0) {
+ struct extent_state *cached_state = NULL;
+ size_t offset = offset_in_page(pos);
+ size_t sector_offset;
+ size_t write_bytes = min(iov_iter_count(i),
+ nrptrs * (size_t)PAGE_SIZE -
+ offset);
+ size_t num_pages;
+ size_t reserve_bytes;
+ size_t dirty_pages;
+ size_t copied;
+ size_t dirty_sectors;
+ size_t num_sectors;
+ int extents_locked;
+
+ /*
+ * Fault pages before locking them in prepare_pages
+ * to avoid recursive lock
+ */
+ if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
+ ret = -EFAULT;
+ break;
+ }
+
+ only_release_metadata = false;
+ sector_offset = pos & (fs_info->sectorsize - 1);
+
+ extent_changeset_release(data_reserved);
+ ret = btrfs_check_data_free_space(BTRFS_I(inode),
+ &data_reserved, pos,
+ write_bytes, nowait);
+ if (ret < 0) {
+ int can_nocow;
+
+ if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) {
+ ret = -EAGAIN;
+ break;
+ }
+
+ /*
+ * If we don't have to COW at the offset, reserve
+ * metadata only. write_bytes may get smaller than
+ * requested here.
+ */
+ can_nocow = btrfs_check_nocow_lock(BTRFS_I(inode), pos,
+ &write_bytes, nowait);
+ if (can_nocow < 0)
+ ret = can_nocow;
+ if (can_nocow > 0)
+ ret = 0;
+ if (ret)
+ break;
+ only_release_metadata = true;
+ }
+
+ num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE);
+ WARN_ON(num_pages > nrptrs);
+ reserve_bytes = round_up(write_bytes + sector_offset,
+ fs_info->sectorsize);
+ WARN_ON(reserve_bytes == 0);
+ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
+ reserve_bytes,
+ reserve_bytes, nowait);
+ if (ret) {
+ if (!only_release_metadata)
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
+ data_reserved, pos,
+ write_bytes);
+ else
+ btrfs_check_nocow_unlock(BTRFS_I(inode));
+
+ if (nowait && ret == -ENOSPC)
+ ret = -EAGAIN;
+ break;
+ }
+
+ release_bytes = reserve_bytes;
+again:
+ ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags);
+ if (ret) {
+ btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
+ break;
+ }
+
+ /*
+ * This is going to setup the pages array with the number of
+ * pages we want, so we don't really need to worry about the
+ * contents of pages from loop to loop
+ */
+ ret = prepare_pages(inode, pages, num_pages,
+ pos, write_bytes, force_page_uptodate, false);
+ if (ret) {
+ btrfs_delalloc_release_extents(BTRFS_I(inode),
+ reserve_bytes);
+ break;
+ }
+
+ extents_locked = lock_and_cleanup_extent_if_need(
+ BTRFS_I(inode), pages,
+ num_pages, pos, write_bytes, &lockstart,
+ &lockend, nowait, &cached_state);
+ if (extents_locked < 0) {
+ if (!nowait && extents_locked == -EAGAIN)
+ goto again;
+
+ btrfs_delalloc_release_extents(BTRFS_I(inode),
+ reserve_bytes);
+ ret = extents_locked;
+ break;
+ }
+
+ copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
+
+ num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
+ dirty_sectors = round_up(copied + sector_offset,
+ fs_info->sectorsize);
+ dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
+
+ /*
+ * if we have trouble faulting in the pages, fall
+ * back to one page at a time
+ */
+ if (copied < write_bytes)
+ nrptrs = 1;
+
+ if (copied == 0) {
+ force_page_uptodate = true;
+ dirty_sectors = 0;
+ dirty_pages = 0;
+ } else {
+ force_page_uptodate = false;
+ dirty_pages = DIV_ROUND_UP(copied + offset,
+ PAGE_SIZE);
+ }
+
+ if (num_sectors > dirty_sectors) {
+ /* release everything except the sectors we dirtied */
+ release_bytes -= dirty_sectors << fs_info->sectorsize_bits;
+ if (only_release_metadata) {
+ btrfs_delalloc_release_metadata(BTRFS_I(inode),
+ release_bytes, true);
+ } else {
+ u64 __pos;
+
+ __pos = round_down(pos,
+ fs_info->sectorsize) +
+ (dirty_pages << PAGE_SHIFT);
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ data_reserved, __pos,
+ release_bytes, true);
+ }
+ }
+
+ release_bytes = round_up(copied + sector_offset,
+ fs_info->sectorsize);
+
+ ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
+ dirty_pages, pos, copied,
+ &cached_state, only_release_metadata);
+
+ /*
+ * If we have not locked the extent range, because the range's
+ * start offset is >= i_size, we might still have a non-NULL
+ * cached extent state, acquired while marking the extent range
+ * as delalloc through btrfs_dirty_pages(). Therefore free any
+ * possible cached extent state to avoid a memory leak.
+ */
+ if (extents_locked)
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, &cached_state);
+ else
+ free_extent_state(cached_state);
+
+ btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
+ if (ret) {
+ btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
+ break;
+ }
+
+ release_bytes = 0;
+ if (only_release_metadata)
+ btrfs_check_nocow_unlock(BTRFS_I(inode));
+
+ btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
+
+ cond_resched();
+
+ pos += copied;
+ num_written += copied;
+ }
+
+ kfree(pages);
+
+ if (release_bytes) {
+ if (only_release_metadata) {
+ btrfs_check_nocow_unlock(BTRFS_I(inode));
+ btrfs_delalloc_release_metadata(BTRFS_I(inode),
+ release_bytes, true);
+ } else {
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ data_reserved,
+ round_down(pos, fs_info->sectorsize),
+ release_bytes, true);
+ }
+ }
+
+ extent_changeset_free(data_reserved);
+ if (num_written > 0) {
+ pagecache_isize_extended(inode, old_isize, iocb->ki_pos);
+ iocb->ki_pos += num_written;
+ }
+out:
+ btrfs_inode_unlock(inode, ilock_flags);
+ return num_written ? num_written : ret;
+}
+
+static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
+ const struct iov_iter *iter, loff_t offset)
+{
+ const u32 blocksize_mask = fs_info->sectorsize - 1;
+
+ if (offset & blocksize_mask)
+ return -EINVAL;
+
+ if (iov_iter_alignment(iter) & blocksize_mask)
+ return -EINVAL;
+
+ return 0;
+}
+
+static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ loff_t pos;
+ ssize_t written = 0;
+ ssize_t written_buffered;
+ size_t prev_left = 0;
+ loff_t endbyte;
+ ssize_t err;
+ unsigned int ilock_flags = 0;
+ struct iomap_dio *dio;
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ ilock_flags |= BTRFS_ILOCK_TRY;
+
+ /*
+ * If the write DIO is within EOF, use a shared lock and also only if
+ * security bits will likely not be dropped by file_remove_privs() called
+ * from btrfs_write_check(). Either will need to be rechecked after the
+ * lock was acquired.
+ */
+ if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
+ ilock_flags |= BTRFS_ILOCK_SHARED;
+
+relock:
+ err = btrfs_inode_lock(inode, ilock_flags);
+ if (err < 0)
+ return err;
+
+ /* Shared lock cannot be used with security bits set. */
+ if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
+ btrfs_inode_unlock(inode, ilock_flags);
+ ilock_flags &= ~BTRFS_ILOCK_SHARED;
+ goto relock;
+ }
+
+ err = generic_write_checks(iocb, from);
+ if (err <= 0) {
+ btrfs_inode_unlock(inode, ilock_flags);
+ return err;
+ }
+
+ err = btrfs_write_check(iocb, from, err);
+ if (err < 0) {
+ btrfs_inode_unlock(inode, ilock_flags);
+ goto out;
+ }
+
+ pos = iocb->ki_pos;
+ /*
+ * Re-check since file size may have changed just before taking the
+ * lock or pos may have changed because of O_APPEND in generic_write_check()
+ */
+ if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
+ pos + iov_iter_count(from) > i_size_read(inode)) {
+ btrfs_inode_unlock(inode, ilock_flags);
+ ilock_flags &= ~BTRFS_ILOCK_SHARED;
+ goto relock;
+ }
+
+ if (check_direct_IO(fs_info, from, pos)) {
+ btrfs_inode_unlock(inode, ilock_flags);
+ goto buffered;
+ }
+
+ /*
+ * The iov_iter can be mapped to the same file range we are writing to.
+ * If that's the case, then we will deadlock in the iomap code, because
+ * it first calls our callback btrfs_dio_iomap_begin(), which will create
+ * an ordered extent, and after that it will fault in the pages that the
+ * iov_iter refers to. During the fault in we end up in the readahead
+ * pages code (starting at btrfs_readahead()), which will lock the range,
+ * find that ordered extent and then wait for it to complete (at
+ * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
+ * obviously the ordered extent can never complete as we didn't submit
+ * yet the respective bio(s). This always happens when the buffer is
+ * memory mapped to the same file range, since the iomap DIO code always
+ * invalidates pages in the target file range (after starting and waiting
+ * for any writeback).
+ *
+ * So here we disable page faults in the iov_iter and then retry if we
+ * got -EFAULT, faulting in the pages before the retry.
+ */
+ from->nofault = true;
+ dio = btrfs_dio_write(iocb, from, written);
+ from->nofault = false;
+
+ /*
+ * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync
+ * iocb, and that needs to lock the inode. So unlock it before calling
+ * iomap_dio_complete() to avoid a deadlock.
+ */
+ btrfs_inode_unlock(inode, ilock_flags);
+
+ if (IS_ERR_OR_NULL(dio))
+ err = PTR_ERR_OR_ZERO(dio);
+ else
+ err = iomap_dio_complete(dio);
+
+ /* No increment (+=) because iomap returns a cumulative value. */
+ if (err > 0)
+ written = err;
+
+ if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) {
+ const size_t left = iov_iter_count(from);
+ /*
+ * We have more data left to write. Try to fault in as many as
+ * possible of the remainder pages and retry. We do this without
+ * releasing and locking again the inode, to prevent races with
+ * truncate.
+ *
+ * Also, in case the iov refers to pages in the file range of the
+ * file we want to write to (due to a mmap), we could enter an
+ * infinite loop if we retry after faulting the pages in, since
+ * iomap will invalidate any pages in the range early on, before
+ * it tries to fault in the pages of the iov. So we keep track of
+ * how much was left of iov in the previous EFAULT and fallback
+ * to buffered IO in case we haven't made any progress.
+ */
+ if (left == prev_left) {
+ err = -ENOTBLK;
+ } else {
+ fault_in_iov_iter_readable(from, left);
+ prev_left = left;
+ goto relock;
+ }
+ }
+
+ /*
+ * If 'err' is -ENOTBLK or we have not written all data, then it means
+ * we must fallback to buffered IO.
+ */
+ if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from))
+ goto out;
+
+buffered:
+ /*
+ * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
+ * it must retry the operation in a context where blocking is acceptable,
+ * since we currently don't have NOWAIT semantics support for buffered IO
+ * and may block there for many reasons (reserving space for example).
+ */
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ err = -EAGAIN;
+ goto out;
+ }
+
+ pos = iocb->ki_pos;
+ written_buffered = btrfs_buffered_write(iocb, from);
+ if (written_buffered < 0) {
+ err = written_buffered;
+ goto out;
+ }
+ /*
+ * Ensure all data is persisted. We want the next direct IO read to be
+ * able to read what was just written.
+ */
+ endbyte = pos + written_buffered - 1;
+ err = btrfs_fdatawrite_range(inode, pos, endbyte);
+ if (err)
+ goto out;
+ err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
+ if (err)
+ goto out;
+ written += written_buffered;
+ iocb->ki_pos = pos + written_buffered;
+ invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
+ endbyte >> PAGE_SHIFT);
+out:
+ return err < 0 ? err : written;
+}
+
+static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
+ const struct btrfs_ioctl_encoded_io_args *encoded)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ loff_t count;
+ ssize_t ret;
+
+ btrfs_inode_lock(inode, 0);
+ count = encoded->len;
+ ret = generic_write_checks_count(iocb, &count);
+ if (ret == 0 && count != encoded->len) {
+ /*
+ * The write got truncated by generic_write_checks_count(). We
+ * can't do a partial encoded write.
+ */
+ ret = -EFBIG;
+ }
+ if (ret || encoded->len == 0)
+ goto out;
+
+ ret = btrfs_write_check(iocb, from, encoded->len);
+ if (ret < 0)
+ goto out;
+
+ ret = btrfs_do_encoded_write(iocb, from, encoded);
+out:
+ btrfs_inode_unlock(inode, 0);
+ return ret;
+}
+
+ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
+ const struct btrfs_ioctl_encoded_io_args *encoded)
+{
+ struct file *file = iocb->ki_filp;
+ struct btrfs_inode *inode = BTRFS_I(file_inode(file));
+ ssize_t num_written, num_sync;
+ const bool sync = iocb_is_dsync(iocb);
+
+ /*
+ * If the fs flips readonly due to some impossible error, although we
+ * have opened a file as writable, we have to stop this write operation
+ * to ensure consistency.
+ */
+ if (BTRFS_FS_ERROR(inode->root->fs_info))
+ return -EROFS;
+
+ if (encoded && (iocb->ki_flags & IOCB_NOWAIT))
+ return -EOPNOTSUPP;
+
+ if (sync)
+ atomic_inc(&inode->sync_writers);
+
+ if (encoded) {
+ num_written = btrfs_encoded_write(iocb, from, encoded);
+ num_sync = encoded->len;
+ } else if (iocb->ki_flags & IOCB_DIRECT) {
+ num_written = btrfs_direct_write(iocb, from);
+ num_sync = num_written;
+ } else {
+ num_written = btrfs_buffered_write(iocb, from);
+ num_sync = num_written;
+ }
+
+ btrfs_set_inode_last_sub_trans(inode);
+
+ if (num_sync > 0) {
+ num_sync = generic_write_sync(iocb, num_sync);
+ if (num_sync < 0)
+ num_written = num_sync;
+ }
+
+ if (sync)
+ atomic_dec(&inode->sync_writers);
+
+ current->backing_dev_info = NULL;
+ return num_written;
+}
+
+static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ return btrfs_do_write_iter(iocb, from, NULL);
+}
+
+int btrfs_release_file(struct inode *inode, struct file *filp)
+{
+ struct btrfs_file_private *private = filp->private_data;
+
+ if (private && private->filldir_buf)
+ kfree(private->filldir_buf);
+ kfree(private);
+ filp->private_data = NULL;
+
+ /*
+ * Set by setattr when we are about to truncate a file from a non-zero
+ * size to a zero size. This tries to flush down new bytes that may
+ * have been written if the application were using truncate to replace
+ * a file in place.
+ */
+ if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_flush(inode->i_mapping);
+ return 0;
+}
+
+static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
+{
+ int ret;
+ struct blk_plug plug;
+
+ /*
+ * This is only called in fsync, which would do synchronous writes, so
+ * a plug can merge adjacent IOs as much as possible. Esp. in case of
+ * multiple disks using raid profile, a large IO can be split to
+ * several segments of stripe length (currently 64K).
+ */
+ blk_start_plug(&plug);
+ atomic_inc(&BTRFS_I(inode)->sync_writers);
+ ret = btrfs_fdatawrite_range(inode, start, end);
+ atomic_dec(&BTRFS_I(inode)->sync_writers);
+ blk_finish_plug(&plug);
+
+ return ret;
+}
+
+static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_inode *inode = BTRFS_I(ctx->inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+
+ if (btrfs_inode_in_log(inode, fs_info->generation) &&
+ list_empty(&ctx->ordered_extents))
+ return true;
+
+ /*
+ * If we are doing a fast fsync we can not bail out if the inode's
+ * last_trans is <= then the last committed transaction, because we only
+ * update the last_trans of the inode during ordered extent completion,
+ * and for a fast fsync we don't wait for that, we only wait for the
+ * writeback to complete.
+ */
+ if (inode->last_trans <= fs_info->last_trans_committed &&
+ (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
+ list_empty(&ctx->ordered_extents)))
+ return true;
+
+ return false;
+}
+
+/*
+ * fsync call for both files and directories. This logs the inode into
+ * the tree log instead of forcing full commits whenever possible.
+ *
+ * It needs to call filemap_fdatawait so that all ordered extent updates are
+ * in the metadata btree are up to date for copying to the log.
+ *
+ * It drops the inode mutex before doing the tree log commit. This is an
+ * important optimization for directories because holding the mutex prevents
+ * new operations on the dir while we write to disk.
+ */
+int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
+{
+ struct dentry *dentry = file_dentry(file);
+ struct inode *inode = d_inode(dentry);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_log_ctx ctx;
+ int ret = 0, err;
+ u64 len;
+ bool full_sync;
+
+ trace_btrfs_sync_file(file, datasync);
+
+ btrfs_init_log_ctx(&ctx, inode);
+
+ /*
+ * Always set the range to a full range, otherwise we can get into
+ * several problems, from missing file extent items to represent holes
+ * when not using the NO_HOLES feature, to log tree corruption due to
+ * races between hole detection during logging and completion of ordered
+ * extents outside the range, to missing checksums due to ordered extents
+ * for which we flushed only a subset of their pages.
+ */
+ start = 0;
+ end = LLONG_MAX;
+ len = (u64)LLONG_MAX + 1;
+
+ /*
+ * We write the dirty pages in the range and wait until they complete
+ * out of the ->i_mutex. If so, we can flush the dirty pages by
+ * multi-task, and make the performance up. See
+ * btrfs_wait_ordered_range for an explanation of the ASYNC check.
+ */
+ ret = start_ordered_ops(inode, start, end);
+ if (ret)
+ goto out;
+
+ btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
+
+ atomic_inc(&root->log_batch);
+
+ /*
+ * Before we acquired the inode's lock and the mmap lock, someone may
+ * have dirtied more pages in the target range. We need to make sure
+ * that writeback for any such pages does not start while we are logging
+ * the inode, because if it does, any of the following might happen when
+ * we are not doing a full inode sync:
+ *
+ * 1) We log an extent after its writeback finishes but before its
+ * checksums are added to the csum tree, leading to -EIO errors
+ * when attempting to read the extent after a log replay.
+ *
+ * 2) We can end up logging an extent before its writeback finishes.
+ * Therefore after the log replay we will have a file extent item
+ * pointing to an unwritten extent (and no data checksums as well).
+ *
+ * So trigger writeback for any eventual new dirty pages and then we
+ * wait for all ordered extents to complete below.
+ */
+ ret = start_ordered_ops(inode, start, end);
+ if (ret) {
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
+ goto out;
+ }
+
+ /*
+ * Always check for the full sync flag while holding the inode's lock,
+ * to avoid races with other tasks. The flag must be either set all the
+ * time during logging or always off all the time while logging.
+ * We check the flag here after starting delalloc above, because when
+ * running delalloc the full sync flag may be set if we need to drop
+ * extra extent map ranges due to temporary memory allocation failures.
+ */
+ full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+
+ /*
+ * We have to do this here to avoid the priority inversion of waiting on
+ * IO of a lower priority task while holding a transaction open.
+ *
+ * For a full fsync we wait for the ordered extents to complete while
+ * for a fast fsync we wait just for writeback to complete, and then
+ * attach the ordered extents to the transaction so that a transaction
+ * commit waits for their completion, to avoid data loss if we fsync,
+ * the current transaction commits before the ordered extents complete
+ * and a power failure happens right after that.
+ *
+ * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the
+ * logical address recorded in the ordered extent may change. We need
+ * to wait for the IO to stabilize the logical address.
+ */
+ if (full_sync || btrfs_is_zoned(fs_info)) {
+ ret = btrfs_wait_ordered_range(inode, start, len);
+ } else {
+ /*
+ * Get our ordered extents as soon as possible to avoid doing
+ * checksum lookups in the csum tree, and use instead the
+ * checksums attached to the ordered extents.
+ */
+ btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
+ &ctx.ordered_extents);
+ ret = filemap_fdatawait_range(inode->i_mapping, start, end);
+ }
+
+ if (ret)
+ goto out_release_extents;
+
+ atomic_inc(&root->log_batch);
+
+ smp_mb();
+ if (skip_inode_logging(&ctx)) {
+ /*
+ * We've had everything committed since the last time we were
+ * modified so clear this flag in case it was set for whatever
+ * reason, it's no longer relevant.
+ */
+ clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ /*
+ * An ordered extent might have started before and completed
+ * already with io errors, in which case the inode was not
+ * updated and we end up here. So check the inode's mapping
+ * for any errors that might have happened since we last
+ * checked called fsync.
+ */
+ ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
+ goto out_release_extents;
+ }
+
+ /*
+ * We use start here because we will need to wait on the IO to complete
+ * in btrfs_sync_log, which could require joining a transaction (for
+ * example checking cross references in the nocow path). If we use join
+ * here we could get into a situation where we're waiting on IO to
+ * happen that is blocked on a transaction trying to commit. With start
+ * we inc the extwriter counter, so we wait for all extwriters to exit
+ * before we start blocking joiners. This comment is to keep somebody
+ * from thinking they are super smart and changing this to
+ * btrfs_join_transaction *cough*Josef*cough*.
+ */
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_release_extents;
+ }
+ trans->in_fsync = true;
+
+ ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
+ btrfs_release_log_ctx_extents(&ctx);
+ if (ret < 0) {
+ /* Fallthrough and commit/free transaction. */
+ ret = BTRFS_LOG_FORCE_COMMIT;
+ }
+
+ /* we've logged all the items and now have a consistent
+ * version of the file in the log. It is possible that
+ * someone will come in and modify the file, but that's
+ * fine because the log is consistent on disk, and we
+ * have references to all of the file's extents
+ *
+ * It is possible that someone will come in and log the
+ * file again, but that will end up using the synchronization
+ * inside btrfs_sync_log to keep things safe.
+ */
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
+
+ if (ret == BTRFS_NO_LOG_SYNC) {
+ ret = btrfs_end_transaction(trans);
+ goto out;
+ }
+
+ /* We successfully logged the inode, attempt to sync the log. */
+ if (!ret) {
+ ret = btrfs_sync_log(trans, root, &ctx);
+ if (!ret) {
+ ret = btrfs_end_transaction(trans);
+ goto out;
+ }
+ }
+
+ /*
+ * At this point we need to commit the transaction because we had
+ * btrfs_need_log_full_commit() or some other error.
+ *
+ * If we didn't do a full sync we have to stop the trans handle, wait on
+ * the ordered extents, start it again and commit the transaction. If
+ * we attempt to wait on the ordered extents here we could deadlock with
+ * something like fallocate() that is holding the extent lock trying to
+ * start a transaction while some other thread is trying to commit the
+ * transaction while we (fsync) are currently holding the transaction
+ * open.
+ */
+ if (!full_sync) {
+ ret = btrfs_end_transaction(trans);
+ if (ret)
+ goto out;
+ ret = btrfs_wait_ordered_range(inode, start, len);
+ if (ret)
+ goto out;
+
+ /*
+ * This is safe to use here because we're only interested in
+ * making sure the transaction that had the ordered extents is
+ * committed. We aren't waiting on anything past this point,
+ * we're purely getting the transaction and committing it.
+ */
+ trans = btrfs_attach_transaction_barrier(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+
+ /*
+ * We committed the transaction and there's no currently
+ * running transaction, this means everything we care
+ * about made it to disk and we are done.
+ */
+ if (ret == -ENOENT)
+ ret = 0;
+ goto out;
+ }
+ }
+
+ ret = btrfs_commit_transaction(trans);
+out:
+ ASSERT(list_empty(&ctx.list));
+ ASSERT(list_empty(&ctx.conflict_inodes));
+ err = file_check_and_advance_wb_err(file);
+ if (!ret)
+ ret = err;
+ return ret > 0 ? -EIO : ret;
+
+out_release_extents:
+ btrfs_release_log_ctx_extents(&ctx);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
+ goto out;
+}
+
+static const struct vm_operations_struct btrfs_file_vm_ops = {
+ .fault = filemap_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = btrfs_page_mkwrite,
+};
+
+static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ struct address_space *mapping = filp->f_mapping;
+
+ if (!mapping->a_ops->read_folio)
+ return -ENOEXEC;
+
+ file_accessed(filp);
+ vma->vm_ops = &btrfs_file_vm_ops;
+
+ return 0;
+}
+
+static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
+ int slot, u64 start, u64 end)
+{
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+
+ if (slot < 0 || slot >= btrfs_header_nritems(leaf))
+ return 0;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != btrfs_ino(inode) ||
+ key.type != BTRFS_EXTENT_DATA_KEY)
+ return 0;
+
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
+ return 0;
+
+ if (btrfs_file_extent_disk_bytenr(leaf, fi))
+ return 0;
+
+ if (key.offset == end)
+ return 1;
+ if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
+ return 1;
+ return 0;
+}
+
+static int fill_holes(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path, u64 offset, u64 end)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *root = inode->root;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *fi;
+ struct extent_map *hole_em;
+ struct btrfs_key key;
+ int ret;
+
+ if (btrfs_fs_incompat(fs_info, NO_HOLES))
+ goto out;
+
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = offset;
+
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret <= 0) {
+ /*
+ * We should have dropped this offset, so if we find it then
+ * something has gone horribly wrong.
+ */
+ if (ret == 0)
+ ret = -EINVAL;
+ return ret;
+ }
+
+ leaf = path->nodes[0];
+ if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
+ u64 num_bytes;
+
+ path->slots[0]--;
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
+ end - offset;
+ btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
+ btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
+ btrfs_set_file_extent_offset(leaf, fi, 0);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+ btrfs_mark_buffer_dirty(leaf);
+ goto out;
+ }
+
+ if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
+ u64 num_bytes;
+
+ key.offset = offset;
+ btrfs_set_item_key_safe(fs_info, path, &key);
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
+ offset;
+ btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
+ btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
+ btrfs_set_file_extent_offset(leaf, fi, 0);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+ btrfs_mark_buffer_dirty(leaf);
+ goto out;
+ }
+ btrfs_release_path(path);
+
+ ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset,
+ end - offset);
+ if (ret)
+ return ret;
+
+out:
+ btrfs_release_path(path);
+
+ hole_em = alloc_extent_map();
+ if (!hole_em) {
+ btrfs_drop_extent_map_range(inode, offset, end - 1, false);
+ btrfs_set_inode_full_sync(inode);
+ } else {
+ hole_em->start = offset;
+ hole_em->len = end - offset;
+ hole_em->ram_bytes = hole_em->len;
+ hole_em->orig_start = offset;
+
+ hole_em->block_start = EXTENT_MAP_HOLE;
+ hole_em->block_len = 0;
+ hole_em->orig_block_len = 0;
+ hole_em->compress_type = BTRFS_COMPRESS_NONE;
+ hole_em->generation = trans->transid;
+
+ ret = btrfs_replace_extent_map_range(inode, hole_em, true);
+ free_extent_map(hole_em);
+ if (ret)
+ btrfs_set_inode_full_sync(inode);
+ }
+
+ return 0;
+}
+
+/*
+ * Find a hole extent on given inode and change start/len to the end of hole
+ * extent.(hole/vacuum extent whose em->start <= start &&
+ * em->start + em->len > start)
+ * When a hole extent is found, return 1 and modify start/len.
+ */
+static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_map *em;
+ int ret = 0;
+
+ em = btrfs_get_extent(inode, NULL, 0,
+ round_down(*start, fs_info->sectorsize),
+ round_up(*len, fs_info->sectorsize));
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+
+ /* Hole or vacuum extent(only exists in no-hole mode) */
+ if (em->block_start == EXTENT_MAP_HOLE) {
+ ret = 1;
+ *len = em->start + em->len > *start + *len ?
+ 0 : *start + *len - em->start - em->len;
+ *start = em->start + em->len;
+ }
+ free_extent_map(em);
+ return ret;
+}
+
+static void btrfs_punch_hole_lock_range(struct inode *inode,
+ const u64 lockstart,
+ const u64 lockend,
+ struct extent_state **cached_state)
+{
+ /*
+ * For subpage case, if the range is not at page boundary, we could
+ * have pages at the leading/tailing part of the range.
+ * This could lead to dead loop since filemap_range_has_page()
+ * will always return true.
+ * So here we need to do extra page alignment for
+ * filemap_range_has_page().
+ */
+ const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
+ const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;
+
+ while (1) {
+ truncate_pagecache_range(inode, lockstart, lockend);
+
+ lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ cached_state);
+ /*
+ * We can't have ordered extents in the range, nor dirty/writeback
+ * pages, because we have locked the inode's VFS lock in exclusive
+ * mode, we have locked the inode's i_mmap_lock in exclusive mode,
+ * we have flushed all delalloc in the range and we have waited
+ * for any ordered extents in the range to complete.
+ * We can race with anyone reading pages from this range, so after
+ * locking the range check if we have pages in the range, and if
+ * we do, unlock the range and retry.
+ */
+ if (!filemap_range_has_page(inode->i_mapping, page_lockstart,
+ page_lockend))
+ break;
+
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ cached_state);
+ }
+
+ btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend);
+}
+
+static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_replace_extent_info *extent_info,
+ const u64 replace_len,
+ const u64 bytes_to_drop)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_file_extent_item *extent;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ int slot;
+ struct btrfs_ref ref = { 0 };
+ int ret;
+
+ if (replace_len == 0)
+ return 0;
+
+ if (extent_info->disk_offset == 0 &&
+ btrfs_fs_incompat(fs_info, NO_HOLES)) {
+ btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
+ return 0;
+ }
+
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = extent_info->file_offset;
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ sizeof(struct btrfs_file_extent_item));
+ if (ret)
+ return ret;
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ write_extent_buffer(leaf, extent_info->extent_buf,
+ btrfs_item_ptr_offset(leaf, slot),
+ sizeof(struct btrfs_file_extent_item));
+ extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+ ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
+ btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
+ btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
+ if (extent_info->is_new_extent)
+ btrfs_set_file_extent_generation(leaf, extent, trans->transid);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
+ ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset,
+ replace_len);
+ if (ret)
+ return ret;
+
+ /* If it's a hole, nothing more needs to be done. */
+ if (extent_info->disk_offset == 0) {
+ btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
+ return 0;
+ }
+
+ btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop);
+
+ if (extent_info->is_new_extent && extent_info->insertions == 0) {
+ key.objectid = extent_info->disk_offset;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = extent_info->disk_len;
+ ret = btrfs_alloc_reserved_file_extent(trans, root,
+ btrfs_ino(inode),
+ extent_info->file_offset,
+ extent_info->qgroup_reserved,
+ &key);
+ } else {
+ u64 ref_offset;
+
+ btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
+ extent_info->disk_offset,
+ extent_info->disk_len, 0);
+ ref_offset = extent_info->file_offset - extent_info->data_offset;
+ btrfs_init_data_ref(&ref, root->root_key.objectid,
+ btrfs_ino(inode), ref_offset, 0, false);
+ ret = btrfs_inc_extent_ref(trans, &ref);
+ }
+
+ extent_info->insertions++;
+
+ return ret;
+}
+
+/*
+ * The respective range must have been previously locked, as well as the inode.
+ * The end offset is inclusive (last byte of the range).
+ * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
+ * the file range with an extent.
+ * When not punching a hole, we don't want to end up in a state where we dropped
+ * extents without inserting a new one, so we must abort the transaction to avoid
+ * a corruption.
+ */
+int btrfs_replace_file_extents(struct btrfs_inode *inode,
+ struct btrfs_path *path, const u64 start,
+ const u64 end,
+ struct btrfs_replace_extent_info *extent_info,
+ struct btrfs_trans_handle **trans_out)
+{
+ struct btrfs_drop_extents_args drop_args = { 0 };
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
+ u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
+ struct btrfs_trans_handle *trans = NULL;
+ struct btrfs_block_rsv *rsv;
+ unsigned int rsv_count;
+ u64 cur_offset;
+ u64 len = end - start;
+ int ret = 0;
+
+ if (end <= start)
+ return -EINVAL;
+
+ rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
+ if (!rsv) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1);
+ rsv->failfast = true;
+
+ /*
+ * 1 - update the inode
+ * 1 - removing the extents in the range
+ * 1 - adding the hole extent if no_holes isn't set or if we are
+ * replacing the range with a new extent
+ */
+ if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
+ rsv_count = 3;
+ else
+ rsv_count = 2;
+
+ trans = btrfs_start_transaction(root, rsv_count);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto out_free;
+ }
+
+ ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
+ min_size, false);
+ if (WARN_ON(ret))
+ goto out_trans;
+ trans->block_rsv = rsv;
+
+ cur_offset = start;
+ drop_args.path = path;
+ drop_args.end = end + 1;
+ drop_args.drop_cache = true;
+ while (cur_offset < end) {
+ drop_args.start = cur_offset;
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
+ /* If we are punching a hole decrement the inode's byte count */
+ if (!extent_info)
+ btrfs_update_inode_bytes(inode, 0,
+ drop_args.bytes_found);
+ if (ret != -ENOSPC) {
+ /*
+ * The only time we don't want to abort is if we are
+ * attempting to clone a partial inline extent, in which
+ * case we'll get EOPNOTSUPP. However if we aren't
+ * clone we need to abort no matter what, because if we
+ * got EOPNOTSUPP via prealloc then we messed up and
+ * need to abort.
+ */
+ if (ret &&
+ (ret != -EOPNOTSUPP ||
+ (extent_info && extent_info->is_new_extent)))
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+
+ trans->block_rsv = &fs_info->trans_block_rsv;
+
+ if (!extent_info && cur_offset < drop_args.drop_end &&
+ cur_offset < ino_size) {
+ ret = fill_holes(trans, inode, path, cur_offset,
+ drop_args.drop_end);
+ if (ret) {
+ /*
+ * If we failed then we didn't insert our hole
+ * entries for the area we dropped, so now the
+ * fs is corrupted, so we must abort the
+ * transaction.
+ */
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+ } else if (!extent_info && cur_offset < drop_args.drop_end) {
+ /*
+ * We are past the i_size here, but since we didn't
+ * insert holes we need to clear the mapped area so we
+ * know to not set disk_i_size in this area until a new
+ * file extent is inserted here.
+ */
+ ret = btrfs_inode_clear_file_extent_range(inode,
+ cur_offset,
+ drop_args.drop_end - cur_offset);
+ if (ret) {
+ /*
+ * We couldn't clear our area, so we could
+ * presumably adjust up and corrupt the fs, so
+ * we need to abort.
+ */
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+ }
+
+ if (extent_info &&
+ drop_args.drop_end > extent_info->file_offset) {
+ u64 replace_len = drop_args.drop_end -
+ extent_info->file_offset;
+
+ ret = btrfs_insert_replace_extent(trans, inode, path,
+ extent_info, replace_len,
+ drop_args.bytes_found);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+ extent_info->data_len -= replace_len;
+ extent_info->data_offset += replace_len;
+ extent_info->file_offset += replace_len;
+ }
+
+ /*
+ * We are releasing our handle on the transaction, balance the
+ * dirty pages of the btree inode and flush delayed items, and
+ * then get a new transaction handle, which may now point to a
+ * new transaction in case someone else may have committed the
+ * transaction we used to replace/drop file extent items. So
+ * bump the inode's iversion and update mtime and ctime except
+ * if we are called from a dedupe context. This is because a
+ * power failure/crash may happen after the transaction is
+ * committed and before we finish replacing/dropping all the
+ * file extent items we need.
+ */
+ inode_inc_iversion(&inode->vfs_inode);
+
+ if (!extent_info || extent_info->update_times) {
+ inode->vfs_inode.i_mtime = current_time(&inode->vfs_inode);
+ inode->vfs_inode.i_ctime = inode->vfs_inode.i_mtime;
+ }
+
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret)
+ break;
+
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty(fs_info);
+
+ trans = btrfs_start_transaction(root, rsv_count);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ break;
+ }
+
+ ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
+ rsv, min_size, false);
+ if (WARN_ON(ret))
+ break;
+ trans->block_rsv = rsv;
+
+ cur_offset = drop_args.drop_end;
+ len = end - cur_offset;
+ if (!extent_info && len) {
+ ret = find_first_non_hole(inode, &cur_offset, &len);
+ if (unlikely(ret < 0))
+ break;
+ if (ret && !len) {
+ ret = 0;
+ break;
+ }
+ }
+ }
+
+ /*
+ * If we were cloning, force the next fsync to be a full one since we
+ * we replaced (or just dropped in the case of cloning holes when
+ * NO_HOLES is enabled) file extent items and did not setup new extent
+ * maps for the replacement extents (or holes).
+ */
+ if (extent_info && !extent_info->is_new_extent)
+ btrfs_set_inode_full_sync(inode);
+
+ if (ret)
+ goto out_trans;
+
+ trans->block_rsv = &fs_info->trans_block_rsv;
+ /*
+ * If we are using the NO_HOLES feature we might have had already an
+ * hole that overlaps a part of the region [lockstart, lockend] and
+ * ends at (or beyond) lockend. Since we have no file extent items to
+ * represent holes, drop_end can be less than lockend and so we must
+ * make sure we have an extent map representing the existing hole (the
+ * call to __btrfs_drop_extents() might have dropped the existing extent
+ * map representing the existing hole), otherwise the fast fsync path
+ * will not record the existence of the hole region
+ * [existing_hole_start, lockend].
+ */
+ if (drop_args.drop_end <= end)
+ drop_args.drop_end = end + 1;
+ /*
+ * Don't insert file hole extent item if it's for a range beyond eof
+ * (because it's useless) or if it represents a 0 bytes range (when
+ * cur_offset == drop_end).
+ */
+ if (!extent_info && cur_offset < ino_size &&
+ cur_offset < drop_args.drop_end) {
+ ret = fill_holes(trans, inode, path, cur_offset,
+ drop_args.drop_end);
+ if (ret) {
+ /* Same comment as above. */
+ btrfs_abort_transaction(trans, ret);
+ goto out_trans;
+ }
+ } else if (!extent_info && cur_offset < drop_args.drop_end) {
+ /* See the comment in the loop above for the reasoning here. */
+ ret = btrfs_inode_clear_file_extent_range(inode, cur_offset,
+ drop_args.drop_end - cur_offset);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_trans;
+ }
+
+ }
+ if (extent_info) {
+ ret = btrfs_insert_replace_extent(trans, inode, path,
+ extent_info, extent_info->data_len,
+ drop_args.bytes_found);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_trans;
+ }
+ }
+
+out_trans:
+ if (!trans)
+ goto out_free;
+
+ trans->block_rsv = &fs_info->trans_block_rsv;
+ if (ret)
+ btrfs_end_transaction(trans);
+ else
+ *trans_out = trans;
+out_free:
+ btrfs_free_block_rsv(fs_info, rsv);
+out:
+ return ret;
+}
+
+static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_state *cached_state = NULL;
+ struct btrfs_path *path;
+ struct btrfs_trans_handle *trans = NULL;
+ u64 lockstart;
+ u64 lockend;
+ u64 tail_start;
+ u64 tail_len;
+ u64 orig_start = offset;
+ int ret = 0;
+ bool same_block;
+ u64 ino_size;
+ bool truncated_block = false;
+ bool updated_inode = false;
+
+ btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
+
+ ret = btrfs_wait_ordered_range(inode, offset, len);
+ if (ret)
+ goto out_only_mutex;
+
+ ino_size = round_up(inode->i_size, fs_info->sectorsize);
+ ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
+ if (ret < 0)
+ goto out_only_mutex;
+ if (ret && !len) {
+ /* Already in a large hole */
+ ret = 0;
+ goto out_only_mutex;
+ }
+
+ ret = file_modified(file);
+ if (ret)
+ goto out_only_mutex;
+
+ lockstart = round_up(offset, fs_info->sectorsize);
+ lockend = round_down(offset + len, fs_info->sectorsize) - 1;
+ same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
+ == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
+ /*
+ * We needn't truncate any block which is beyond the end of the file
+ * because we are sure there is no data there.
+ */
+ /*
+ * Only do this if we are in the same block and we aren't doing the
+ * entire block.
+ */
+ if (same_block && len < fs_info->sectorsize) {
+ if (offset < ino_size) {
+ truncated_block = true;
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
+ 0);
+ } else {
+ ret = 0;
+ }
+ goto out_only_mutex;
+ }
+
+ /* zero back part of the first block */
+ if (offset < ino_size) {
+ truncated_block = true;
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
+ if (ret) {
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
+ return ret;
+ }
+ }
+
+ /* Check the aligned pages after the first unaligned page,
+ * if offset != orig_start, which means the first unaligned page
+ * including several following pages are already in holes,
+ * the extra check can be skipped */
+ if (offset == orig_start) {
+ /* after truncate page, check hole again */
+ len = offset + len - lockstart;
+ offset = lockstart;
+ ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
+ if (ret < 0)
+ goto out_only_mutex;
+ if (ret && !len) {
+ ret = 0;
+ goto out_only_mutex;
+ }
+ lockstart = offset;
+ }
+
+ /* Check the tail unaligned part is in a hole */
+ tail_start = lockend + 1;
+ tail_len = offset + len - tail_start;
+ if (tail_len) {
+ ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len);
+ if (unlikely(ret < 0))
+ goto out_only_mutex;
+ if (!ret) {
+ /* zero the front end of the last page */
+ if (tail_start + tail_len < ino_size) {
+ truncated_block = true;
+ ret = btrfs_truncate_block(BTRFS_I(inode),
+ tail_start + tail_len,
+ 0, 1);
+ if (ret)
+ goto out_only_mutex;
+ }
+ }
+ }
+
+ if (lockend < lockstart) {
+ ret = 0;
+ goto out_only_mutex;
+ }
+
+ btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state);
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart,
+ lockend, NULL, &trans);
+ btrfs_free_path(path);
+ if (ret)
+ goto out;
+
+ ASSERT(trans != NULL);
+ inode_inc_iversion(inode);
+ inode->i_mtime = current_time(inode);
+ inode->i_ctime = inode->i_mtime;
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ updated_inode = true;
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty(fs_info);
+out:
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state);
+out_only_mutex:
+ if (!updated_inode && truncated_block && !ret) {
+ /*
+ * If we only end up zeroing part of a page, we still need to
+ * update the inode item, so that all the time fields are
+ * updated as well as the necessary btrfs inode in memory fields
+ * for detecting, at fsync time, if the inode isn't yet in the
+ * log tree or it's there but not up to date.
+ */
+ struct timespec64 now = current_time(inode);
+
+ inode_inc_iversion(inode);
+ inode->i_mtime = now;
+ inode->i_ctime = now;
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ } else {
+ int ret2;
+
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret2 = btrfs_end_transaction(trans);
+ if (!ret)
+ ret = ret2;
+ }
+ }
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
+ return ret;
+}
+
+/* Helper structure to record which range is already reserved */
+struct falloc_range {
+ struct list_head list;
+ u64 start;
+ u64 len;
+};
+
+/*
+ * Helper function to add falloc range
+ *
+ * Caller should have locked the larger range of extent containing
+ * [start, len)
+ */
+static int add_falloc_range(struct list_head *head, u64 start, u64 len)
+{
+ struct falloc_range *range = NULL;
+
+ if (!list_empty(head)) {
+ /*
+ * As fallocate iterates by bytenr order, we only need to check
+ * the last range.
+ */
+ range = list_last_entry(head, struct falloc_range, list);
+ if (range->start + range->len == start) {
+ range->len += len;
+ return 0;
+ }
+ }
+
+ range = kmalloc(sizeof(*range), GFP_KERNEL);
+ if (!range)
+ return -ENOMEM;
+ range->start = start;
+ range->len = len;
+ list_add_tail(&range->list, head);
+ return 0;
+}
+
+static int btrfs_fallocate_update_isize(struct inode *inode,
+ const u64 end,
+ const int mode)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+ int ret2;
+
+ if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
+ return 0;
+
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ inode->i_ctime = current_time(inode);
+ i_size_write(inode, end);
+ btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret2 = btrfs_end_transaction(trans);
+
+ return ret ? ret : ret2;
+}
+
+enum {
+ RANGE_BOUNDARY_WRITTEN_EXTENT,
+ RANGE_BOUNDARY_PREALLOC_EXTENT,
+ RANGE_BOUNDARY_HOLE,
+};
+
+static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
+ u64 offset)
+{
+ const u64 sectorsize = inode->root->fs_info->sectorsize;
+ struct extent_map *em;
+ int ret;
+
+ offset = round_down(offset, sectorsize);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+
+ if (em->block_start == EXTENT_MAP_HOLE)
+ ret = RANGE_BOUNDARY_HOLE;
+ else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
+ else
+ ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
+
+ free_extent_map(em);
+ return ret;
+}
+
+static int btrfs_zero_range(struct inode *inode,
+ loff_t offset,
+ loff_t len,
+ const int mode)
+{
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct extent_map *em;
+ struct extent_changeset *data_reserved = NULL;
+ int ret;
+ u64 alloc_hint = 0;
+ const u64 sectorsize = fs_info->sectorsize;
+ u64 alloc_start = round_down(offset, sectorsize);
+ u64 alloc_end = round_up(offset + len, sectorsize);
+ u64 bytes_to_reserve = 0;
+ bool space_reserved = false;
+
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
+ alloc_end - alloc_start);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out;
+ }
+
+ /*
+ * Avoid hole punching and extent allocation for some cases. More cases
+ * could be considered, but these are unlikely common and we keep things
+ * as simple as possible for now. Also, intentionally, if the target
+ * range contains one or more prealloc extents together with regular
+ * extents and holes, we drop all the existing extents and allocate a
+ * new prealloc extent, so that we get a larger contiguous disk extent.
+ */
+ if (em->start <= alloc_start &&
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ const u64 em_end = em->start + em->len;
+
+ if (em_end >= offset + len) {
+ /*
+ * The whole range is already a prealloc extent,
+ * do nothing except updating the inode's i_size if
+ * needed.
+ */
+ free_extent_map(em);
+ ret = btrfs_fallocate_update_isize(inode, offset + len,
+ mode);
+ goto out;
+ }
+ /*
+ * Part of the range is already a prealloc extent, so operate
+ * only on the remaining part of the range.
+ */
+ alloc_start = em_end;
+ ASSERT(IS_ALIGNED(alloc_start, sectorsize));
+ len = offset + len - alloc_start;
+ offset = alloc_start;
+ alloc_hint = em->block_start + em->len;
+ }
+ free_extent_map(em);
+
+ if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
+ BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
+ sectorsize);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out;
+ }
+
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ free_extent_map(em);
+ ret = btrfs_fallocate_update_isize(inode, offset + len,
+ mode);
+ goto out;
+ }
+ if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
+ free_extent_map(em);
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
+ 0);
+ if (!ret)
+ ret = btrfs_fallocate_update_isize(inode,
+ offset + len,
+ mode);
+ return ret;
+ }
+ free_extent_map(em);
+ alloc_start = round_down(offset, sectorsize);
+ alloc_end = alloc_start + sectorsize;
+ goto reserve_space;
+ }
+
+ alloc_start = round_up(offset, sectorsize);
+ alloc_end = round_down(offset + len, sectorsize);
+
+ /*
+ * For unaligned ranges, check the pages at the boundaries, they might
+ * map to an extent, in which case we need to partially zero them, or
+ * they might map to a hole, in which case we need our allocation range
+ * to cover them.
+ */
+ if (!IS_ALIGNED(offset, sectorsize)) {
+ ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
+ offset);
+ if (ret < 0)
+ goto out;
+ if (ret == RANGE_BOUNDARY_HOLE) {
+ alloc_start = round_down(offset, sectorsize);
+ ret = 0;
+ } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
+ if (ret)
+ goto out;
+ } else {
+ ret = 0;
+ }
+ }
+
+ if (!IS_ALIGNED(offset + len, sectorsize)) {
+ ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
+ offset + len);
+ if (ret < 0)
+ goto out;
+ if (ret == RANGE_BOUNDARY_HOLE) {
+ alloc_end = round_up(offset + len, sectorsize);
+ ret = 0;
+ } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset + len,
+ 0, 1);
+ if (ret)
+ goto out;
+ } else {
+ ret = 0;
+ }
+ }
+
+reserve_space:
+ if (alloc_start < alloc_end) {
+ struct extent_state *cached_state = NULL;
+ const u64 lockstart = alloc_start;
+ const u64 lockend = alloc_end - 1;
+
+ bytes_to_reserve = alloc_end - alloc_start;
+ ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
+ bytes_to_reserve);
+ if (ret < 0)
+ goto out;
+ space_reserved = true;
+ btrfs_punch_hole_lock_range(inode, lockstart, lockend,
+ &cached_state);
+ ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
+ alloc_start, bytes_to_reserve);
+ if (ret) {
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, &cached_state);
+ goto out;
+ }
+ ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
+ alloc_end - alloc_start,
+ i_blocksize(inode),
+ offset + len, &alloc_hint);
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state);
+ /* btrfs_prealloc_file_range releases reserved space on error */
+ if (ret) {
+ space_reserved = false;
+ goto out;
+ }
+ }
+ ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
+ out:
+ if (ret && space_reserved)
+ btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
+ alloc_start, bytes_to_reserve);
+ extent_changeset_free(data_reserved);
+
+ return ret;
+}
+
+static long btrfs_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ struct extent_state *cached_state = NULL;
+ struct extent_changeset *data_reserved = NULL;
+ struct falloc_range *range;
+ struct falloc_range *tmp;
+ struct list_head reserve_list;
+ u64 cur_offset;
+ u64 last_byte;
+ u64 alloc_start;
+ u64 alloc_end;
+ u64 alloc_hint = 0;
+ u64 locked_end;
+ u64 actual_end = 0;
+ u64 data_space_needed = 0;
+ u64 data_space_reserved = 0;
+ u64 qgroup_reserved = 0;
+ struct extent_map *em;
+ int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize;
+ int ret;
+
+ /* Do not allow fallocate in ZONED mode */
+ if (btrfs_is_zoned(btrfs_sb(inode->i_sb)))
+ return -EOPNOTSUPP;
+
+ alloc_start = round_down(offset, blocksize);
+ alloc_end = round_up(offset + len, blocksize);
+ cur_offset = alloc_start;
+
+ /* Make sure we aren't being give some crap mode */
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+ FALLOC_FL_ZERO_RANGE))
+ return -EOPNOTSUPP;
+
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ return btrfs_punch_hole(file, offset, len);
+
+ btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
+ ret = inode_newsize_ok(inode, offset + len);
+ if (ret)
+ goto out;
+ }
+
+ ret = file_modified(file);
+ if (ret)
+ goto out;
+
+ /*
+ * TODO: Move these two operations after we have checked
+ * accurate reserved space, or fallocate can still fail but
+ * with page truncated or size expanded.
+ *
+ * But that's a minor problem and won't do much harm BTW.
+ */
+ if (alloc_start > inode->i_size) {
+ ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode),
+ alloc_start);
+ if (ret)
+ goto out;
+ } else if (offset + len > inode->i_size) {
+ /*
+ * If we are fallocating from the end of the file onward we
+ * need to zero out the end of the block if i_size lands in the
+ * middle of a block.
+ */
+ ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
+ if (ret)
+ goto out;
+ }
+
+ /*
+ * We have locked the inode at the VFS level (in exclusive mode) and we
+ * have locked the i_mmap_lock lock (in exclusive mode). Now before
+ * locking the file range, flush all dealloc in the range and wait for
+ * all ordered extents in the range to complete. After this we can lock
+ * the file range and, due to the previous locking we did, we know there
+ * can't be more delalloc or ordered extents in the range.
+ */
+ ret = btrfs_wait_ordered_range(inode, alloc_start,
+ alloc_end - alloc_start);
+ if (ret)
+ goto out;
+
+ if (mode & FALLOC_FL_ZERO_RANGE) {
+ ret = btrfs_zero_range(inode, offset, len, mode);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
+ return ret;
+ }
+
+ locked_end = alloc_end - 1;
+ lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+ &cached_state);
+
+ btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end);
+
+ /* First, check if we exceed the qgroup limit */
+ INIT_LIST_HEAD(&reserve_list);
+ while (cur_offset < alloc_end) {
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
+ alloc_end - cur_offset);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ break;
+ }
+ last_byte = min(extent_map_end(em), alloc_end);
+ actual_end = min_t(u64, extent_map_end(em), offset + len);
+ last_byte = ALIGN(last_byte, blocksize);
+ if (em->block_start == EXTENT_MAP_HOLE ||
+ (cur_offset >= inode->i_size &&
+ !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+ const u64 range_len = last_byte - cur_offset;
+
+ ret = add_falloc_range(&reserve_list, cur_offset, range_len);
+ if (ret < 0) {
+ free_extent_map(em);
+ break;
+ }
+ ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
+ &data_reserved, cur_offset, range_len);
+ if (ret < 0) {
+ free_extent_map(em);
+ break;
+ }
+ qgroup_reserved += range_len;
+ data_space_needed += range_len;
+ }
+ free_extent_map(em);
+ cur_offset = last_byte;
+ }
+
+ if (!ret && data_space_needed > 0) {
+ /*
+ * We are safe to reserve space here as we can't have delalloc
+ * in the range, see above.
+ */
+ ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
+ data_space_needed);
+ if (!ret)
+ data_space_reserved = data_space_needed;
+ }
+
+ /*
+ * If ret is still 0, means we're OK to fallocate.
+ * Or just cleanup the list and exit.
+ */
+ list_for_each_entry_safe(range, tmp, &reserve_list, list) {
+ if (!ret) {
+ ret = btrfs_prealloc_file_range(inode, mode,
+ range->start,
+ range->len, i_blocksize(inode),
+ offset + len, &alloc_hint);
+ /*
+ * btrfs_prealloc_file_range() releases space even
+ * if it returns an error.
+ */
+ data_space_reserved -= range->len;
+ qgroup_reserved -= range->len;
+ } else if (data_space_reserved > 0) {
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
+ data_reserved, range->start,
+ range->len);
+ data_space_reserved -= range->len;
+ qgroup_reserved -= range->len;
+ } else if (qgroup_reserved > 0) {
+ btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved,
+ range->start, range->len, NULL);
+ qgroup_reserved -= range->len;
+ }
+ list_del(&range->list);
+ kfree(range);
+ }
+ if (ret < 0)
+ goto out_unlock;
+
+ /*
+ * We didn't need to allocate any more space, but we still extended the
+ * size of the file so we need to update i_size and the inode item.
+ */
+ ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
+out_unlock:
+ unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+ &cached_state);
+out:
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
+ extent_changeset_free(data_reserved);
+ return ret;
+}
+
+/*
+ * Helper for btrfs_find_delalloc_in_range(). Find a subrange in a given range
+ * that has unflushed and/or flushing delalloc. There might be other adjacent
+ * subranges after the one it found, so btrfs_find_delalloc_in_range() keeps
+ * looping while it gets adjacent subranges, and merging them together.
+ */
+static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end,
+ u64 *delalloc_start_ret, u64 *delalloc_end_ret)
+{
+ const u64 len = end + 1 - start;
+ struct extent_map_tree *em_tree = &inode->extent_tree;
+ struct extent_map *em;
+ u64 em_end;
+ u64 delalloc_len;
+
+ /*
+ * Search the io tree first for EXTENT_DELALLOC. If we find any, it
+ * means we have delalloc (dirty pages) for which writeback has not
+ * started yet.
+ */
+ *delalloc_start_ret = start;
+ delalloc_len = count_range_bits(&inode->io_tree, delalloc_start_ret, end,
+ len, EXTENT_DELALLOC, 1);
+ /*
+ * If delalloc was found then *delalloc_start_ret has a sector size
+ * aligned value (rounded down).
+ */
+ if (delalloc_len > 0)
+ *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1;
+
+ /*
+ * Now also check if there's any extent map in the range that does not
+ * map to a hole or prealloc extent. We do this because:
+ *
+ * 1) When delalloc is flushed, the file range is locked, we clear the
+ * EXTENT_DELALLOC bit from the io tree and create an extent map for
+ * an allocated extent. So we might just have been called after
+ * delalloc is flushed and before the ordered extent completes and
+ * inserts the new file extent item in the subvolume's btree;
+ *
+ * 2) We may have an extent map created by flushing delalloc for a
+ * subrange that starts before the subrange we found marked with
+ * EXTENT_DELALLOC in the io tree.
+ */
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, len);
+ if (!em) {
+ read_unlock(&em_tree->lock);
+ return (delalloc_len > 0);
+ }
+
+ /* extent_map_end() returns a non-inclusive end offset. */
+ em_end = extent_map_end(em);
+
+ /*
+ * If we have a hole/prealloc extent map, check the next one if this one
+ * ends before our range's end.
+ */
+ if ((em->block_start == EXTENT_MAP_HOLE ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) && em_end < end) {
+ struct extent_map *next_em;
+
+ next_em = btrfs_next_extent_map(em_tree, em);
+ free_extent_map(em);
+
+ /*
+ * There's no next extent map or the next one starts beyond our
+ * range, return the range found in the io tree (if any).
+ */
+ if (!next_em || next_em->start > end) {
+ read_unlock(&em_tree->lock);
+ free_extent_map(next_em);
+ return (delalloc_len > 0);
+ }
+
+ em_end = extent_map_end(next_em);
+ em = next_em;
+ }
+
+ read_unlock(&em_tree->lock);
+
+ /*
+ * We have a hole or prealloc extent that ends at or beyond our range's
+ * end, return the range found in the io tree (if any).
+ */
+ if (em->block_start == EXTENT_MAP_HOLE ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ free_extent_map(em);
+ return (delalloc_len > 0);
+ }
+
+ /*
+ * We don't have any range as EXTENT_DELALLOC in the io tree, so the
+ * extent map is the only subrange representing delalloc.
+ */
+ if (delalloc_len == 0) {
+ *delalloc_start_ret = em->start;
+ *delalloc_end_ret = min(end, em_end - 1);
+ free_extent_map(em);
+ return true;
+ }
+
+ /*
+ * The extent map represents a delalloc range that starts before the
+ * delalloc range we found in the io tree.
+ */
+ if (em->start < *delalloc_start_ret) {
+ *delalloc_start_ret = em->start;
+ /*
+ * If the ranges are adjacent, return a combined range.
+ * Otherwise return the extent map's range.
+ */
+ if (em_end < *delalloc_start_ret)
+ *delalloc_end_ret = min(end, em_end - 1);
+
+ free_extent_map(em);
+ return true;
+ }
+
+ /*
+ * The extent map starts after the delalloc range we found in the io
+ * tree. If it's adjacent, return a combined range, otherwise return
+ * the range found in the io tree.
+ */
+ if (*delalloc_end_ret + 1 == em->start)
+ *delalloc_end_ret = min(end, em_end - 1);
+
+ free_extent_map(em);
+ return true;
+}
+
+/*
+ * Check if there's delalloc in a given range.
+ *
+ * @inode: The inode.
+ * @start: The start offset of the range. It does not need to be
+ * sector size aligned.
+ * @end: The end offset (inclusive value) of the search range.
+ * It does not need to be sector size aligned.
+ * @delalloc_start_ret: Output argument, set to the start offset of the
+ * subrange found with delalloc (may not be sector size
+ * aligned).
+ * @delalloc_end_ret: Output argument, set to he end offset (inclusive value)
+ * of the subrange found with delalloc.
+ *
+ * Returns true if a subrange with delalloc is found within the given range, and
+ * if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and
+ * end offsets of the subrange.
+ */
+bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
+ u64 *delalloc_start_ret, u64 *delalloc_end_ret)
+{
+ u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize);
+ u64 prev_delalloc_end = 0;
+ bool ret = false;
+
+ while (cur_offset <= end) {
+ u64 delalloc_start;
+ u64 delalloc_end;
+ bool delalloc;
+
+ delalloc = find_delalloc_subrange(inode, cur_offset, end,
+ &delalloc_start,
+ &delalloc_end);
+ if (!delalloc)
+ break;
+
+ if (prev_delalloc_end == 0) {
+ /* First subrange found. */
+ *delalloc_start_ret = max(delalloc_start, start);
+ *delalloc_end_ret = delalloc_end;
+ ret = true;
+ } else if (delalloc_start == prev_delalloc_end + 1) {
+ /* Subrange adjacent to the previous one, merge them. */
+ *delalloc_end_ret = delalloc_end;
+ } else {
+ /* Subrange not adjacent to the previous one, exit. */
+ break;
+ }
+
+ prev_delalloc_end = delalloc_end;
+ cur_offset = delalloc_end + 1;
+ cond_resched();
+ }
+
+ return ret;
+}
+
+/*
+ * Check if there's a hole or delalloc range in a range representing a hole (or
+ * prealloc extent) found in the inode's subvolume btree.
+ *
+ * @inode: The inode.
+ * @whence: Seek mode (SEEK_DATA or SEEK_HOLE).
+ * @start: Start offset of the hole region. It does not need to be sector
+ * size aligned.
+ * @end: End offset (inclusive value) of the hole region. It does not
+ * need to be sector size aligned.
+ * @start_ret: Return parameter, used to set the start of the subrange in the
+ * hole that matches the search criteria (seek mode), if such
+ * subrange is found (return value of the function is true).
+ * The value returned here may not be sector size aligned.
+ *
+ * Returns true if a subrange matching the given seek mode is found, and if one
+ * is found, it updates @start_ret with the start of the subrange.
+ */
+static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence,
+ u64 start, u64 end, u64 *start_ret)
+{
+ u64 delalloc_start;
+ u64 delalloc_end;
+ bool delalloc;
+
+ delalloc = btrfs_find_delalloc_in_range(inode, start, end,
+ &delalloc_start, &delalloc_end);
+ if (delalloc && whence == SEEK_DATA) {
+ *start_ret = delalloc_start;
+ return true;
+ }
+
+ if (delalloc && whence == SEEK_HOLE) {
+ /*
+ * We found delalloc but it starts after out start offset. So we
+ * have a hole between our start offset and the delalloc start.
+ */
+ if (start < delalloc_start) {
+ *start_ret = start;
+ return true;
+ }
+ /*
+ * Delalloc range starts at our start offset.
+ * If the delalloc range's length is smaller than our range,
+ * then it means we have a hole that starts where the delalloc
+ * subrange ends.
+ */
+ if (delalloc_end < end) {
+ *start_ret = delalloc_end + 1;
+ return true;
+ }
+
+ /* There's delalloc for the whole range. */
+ return false;
+ }
+
+ if (!delalloc && whence == SEEK_HOLE) {
+ *start_ret = start;
+ return true;
+ }
+
+ /*
+ * No delalloc in the range and we are seeking for data. The caller has
+ * to iterate to the next extent item in the subvolume btree.
+ */
+ return false;
+}
+
+static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
+ int whence)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_state *cached_state = NULL;
+ const loff_t i_size = i_size_read(&inode->vfs_inode);
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_root *root = inode->root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ u64 last_extent_end;
+ u64 lockstart;
+ u64 lockend;
+ u64 start;
+ int ret;
+ bool found = false;
+
+ if (i_size == 0 || offset >= i_size)
+ return -ENXIO;
+
+ /*
+ * Quick path. If the inode has no prealloc extents and its number of
+ * bytes used matches its i_size, then it can not have holes.
+ */
+ if (whence == SEEK_HOLE &&
+ !(inode->flags & BTRFS_INODE_PREALLOC) &&
+ inode_get_bytes(&inode->vfs_inode) == i_size)
+ return i_size;
+
+ /*
+ * offset can be negative, in this case we start finding DATA/HOLE from
+ * the very start of the file.
+ */
+ start = max_t(loff_t, 0, offset);
+
+ lockstart = round_down(start, fs_info->sectorsize);
+ lockend = round_up(i_size, fs_info->sectorsize);
+ if (lockend <= lockstart)
+ lockend = lockstart + fs_info->sectorsize;
+ lockend--;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = READA_FORWARD;
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = start;
+
+ last_extent_end = lockstart;
+
+ lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0 && path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
+
+ while (start < i_size) {
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_file_extent_item *extent;
+ u64 extent_end;
+ u8 type;
+
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
+ break;
+
+ extent_end = btrfs_file_extent_end(path);
+
+ /*
+ * In the first iteration we may have a slot that points to an
+ * extent that ends before our start offset, so skip it.
+ */
+ if (extent_end <= start) {
+ path->slots[0]++;
+ continue;
+ }
+
+ /* We have an implicit hole, NO_HOLES feature is likely set. */
+ if (last_extent_end < key.offset) {
+ u64 search_start = last_extent_end;
+ u64 found_start;
+
+ /*
+ * First iteration, @start matches @offset and it's
+ * within the hole.
+ */
+ if (start == offset)
+ search_start = offset;
+
+ found = find_desired_extent_in_hole(inode, whence,
+ search_start,
+ key.offset - 1,
+ &found_start);
+ if (found) {
+ start = found_start;
+ break;
+ }
+ /*
+ * Didn't find data or a hole (due to delalloc) in the
+ * implicit hole range, so need to analyze the extent.
+ */
+ }
+
+ extent = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ type = btrfs_file_extent_type(leaf, extent);
+
+ /*
+ * Can't access the extent's disk_bytenr field if this is an
+ * inline extent, since at that offset, it's where the extent
+ * data starts.
+ */
+ if (type == BTRFS_FILE_EXTENT_PREALLOC ||
+ (type == BTRFS_FILE_EXTENT_REG &&
+ btrfs_file_extent_disk_bytenr(leaf, extent) == 0)) {
+ /*
+ * Explicit hole or prealloc extent, search for delalloc.
+ * A prealloc extent is treated like a hole.
+ */
+ u64 search_start = key.offset;
+ u64 found_start;
+
+ /*
+ * First iteration, @start matches @offset and it's
+ * within the hole.
+ */
+ if (start == offset)
+ search_start = offset;
+
+ found = find_desired_extent_in_hole(inode, whence,
+ search_start,
+ extent_end - 1,
+ &found_start);
+ if (found) {
+ start = found_start;
+ break;
+ }
+ /*
+ * Didn't find data or a hole (due to delalloc) in the
+ * implicit hole range, so need to analyze the next
+ * extent item.
+ */
+ } else {
+ /*
+ * Found a regular or inline extent.
+ * If we are seeking for data, adjust the start offset
+ * and stop, we're done.
+ */
+ if (whence == SEEK_DATA) {
+ start = max_t(u64, key.offset, offset);
+ found = true;
+ break;
+ }
+ /*
+ * Else, we are seeking for a hole, check the next file
+ * extent item.
+ */
+ }
+
+ start = extent_end;
+ last_extent_end = extent_end;
+ path->slots[0]++;
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ goto out;
+ }
+ cond_resched();
+ }
+
+ /* We have an implicit hole from the last extent found up to i_size. */
+ if (!found && start < i_size) {
+ found = find_desired_extent_in_hole(inode, whence, start,
+ i_size - 1, &start);
+ if (!found)
+ start = i_size;
+ }
+
+out:
+ unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+ btrfs_free_path(path);
+
+ if (ret < 0)
+ return ret;
+
+ if (whence == SEEK_DATA && start >= i_size)
+ return -ENXIO;
+
+ return min_t(loff_t, start, i_size);
+}
+
+static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct inode *inode = file->f_mapping->host;
+
+ switch (whence) {
+ default:
+ return generic_file_llseek(file, offset, whence);
+ case SEEK_DATA:
+ case SEEK_HOLE:
+ btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
+ offset = find_desired_extent(BTRFS_I(inode), offset, whence);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ break;
+ }
+
+ if (offset < 0)
+ return offset;
+
+ return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+}
+
+static int btrfs_file_open(struct inode *inode, struct file *filp)
+{
+ int ret;
+
+ filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC;
+
+ ret = fsverity_file_open(inode, filp);
+ if (ret)
+ return ret;
+ return generic_file_open(inode, filp);
+}
+
+static int check_direct_read(struct btrfs_fs_info *fs_info,
+ const struct iov_iter *iter, loff_t offset)
+{
+ int ret;
+ int i, seg;
+
+ ret = check_direct_IO(fs_info, iter, offset);
+ if (ret < 0)
+ return ret;
+
+ if (!iter_is_iovec(iter))
+ return 0;
+
+ for (seg = 0; seg < iter->nr_segs; seg++)
+ for (i = seg + 1; i < iter->nr_segs; i++)
+ if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
+ return -EINVAL;
+ return 0;
+}
+
+static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ size_t prev_left = 0;
+ ssize_t read = 0;
+ ssize_t ret;
+
+ if (fsverity_active(inode))
+ return 0;
+
+ if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos))
+ return 0;
+
+ btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
+again:
+ /*
+ * This is similar to what we do for direct IO writes, see the comment
+ * at btrfs_direct_write(), but we also disable page faults in addition
+ * to disabling them only at the iov_iter level. This is because when
+ * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
+ * which can still trigger page fault ins despite having set ->nofault
+ * to true of our 'to' iov_iter.
+ *
+ * The difference to direct IO writes is that we deadlock when trying
+ * to lock the extent range in the inode's tree during he page reads
+ * triggered by the fault in (while for writes it is due to waiting for
+ * our own ordered extent). This is because for direct IO reads,
+ * btrfs_dio_iomap_begin() returns with the extent range locked, which
+ * is only unlocked in the endio callback (end_bio_extent_readpage()).
+ */
+ pagefault_disable();
+ to->nofault = true;
+ ret = btrfs_dio_read(iocb, to, read);
+ to->nofault = false;
+ pagefault_enable();
+
+ /* No increment (+=) because iomap returns a cumulative value. */
+ if (ret > 0)
+ read = ret;
+
+ if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
+ const size_t left = iov_iter_count(to);
+
+ if (left == prev_left) {
+ /*
+ * We didn't make any progress since the last attempt,
+ * fallback to a buffered read for the remainder of the
+ * range. This is just to avoid any possibility of looping
+ * for too long.
+ */
+ ret = read;
+ } else {
+ /*
+ * We made some progress since the last retry or this is
+ * the first time we are retrying. Fault in as many pages
+ * as possible and retry.
+ */
+ fault_in_iov_iter_writeable(to, left);
+ prev_left = left;
+ goto again;
+ }
+ }
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ return ret < 0 ? ret : read;
+}
+
+static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ ssize_t ret = 0;
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ ret = btrfs_direct_read(iocb, to);
+ if (ret < 0 || !iov_iter_count(to) ||
+ iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
+ return ret;
+ }
+
+ return filemap_read(iocb, to, ret);
+}
+
+const struct file_operations btrfs_file_operations = {
+ .llseek = btrfs_file_llseek,
+ .read_iter = btrfs_file_read_iter,
+ .splice_read = generic_file_splice_read,
+ .write_iter = btrfs_file_write_iter,
+ .splice_write = iter_file_splice_write,
+ .mmap = btrfs_file_mmap,
+ .open = btrfs_file_open,
+ .release = btrfs_release_file,
+ .get_unmapped_area = thp_get_unmapped_area,
+ .fsync = btrfs_sync_file,
+ .fallocate = btrfs_fallocate,
+ .unlocked_ioctl = btrfs_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = btrfs_compat_ioctl,
+#endif
+ .remap_file_range = btrfs_remap_file_range,
+};
+
+int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
+{
+ int ret;
+
+ /*
+ * So with compression we will find and lock a dirty page and clear the
+ * first one as dirty, setup an async extent, and immediately return
+ * with the entire range locked but with nobody actually marked with
+ * writeback. So we can't just filemap_write_and_wait_range() and
+ * expect it to work since it will just kick off a thread to do the
+ * actual work. So we need to call filemap_fdatawrite_range _again_
+ * since it will wait on the page lock, which won't be unlocked until
+ * after the pages have been marked as writeback and so we're good to go
+ * from there. We have to do this otherwise we'll miss the ordered
+ * extents and that results in badness. Please Josef, do not think you
+ * know better and pull this out at some point in the future, it is
+ * right and you are wrong.
+ */
+ ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+ if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+
+ return ret;
+}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
new file mode 100644
index 000000000..b27795e13
--- /dev/null
+++ b/fs/btrfs/free-space-cache.c
@@ -0,0 +1,4292 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2008 Red Hat. All rights reserved.
+ */
+
+#include <linux/pagemap.h>
+#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include <linux/math64.h>
+#include <linux/ratelimit.h>
+#include <linux/error-injection.h>
+#include <linux/sched/mm.h>
+#include "misc.h"
+#include "ctree.h"
+#include "free-space-cache.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "extent_io.h"
+#include "volumes.h"
+#include "space-info.h"
+#include "delalloc-space.h"
+#include "block-group.h"
+#include "discard.h"
+#include "subpage.h"
+#include "inode-item.h"
+
+#define BITS_PER_BITMAP (PAGE_SIZE * 8UL)
+#define MAX_CACHE_BYTES_PER_GIG SZ_64K
+#define FORCE_EXTENT_THRESHOLD SZ_1M
+
+struct btrfs_trim_range {
+ u64 start;
+ u64 bytes;
+ struct list_head list;
+};
+
+static int link_free_space(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info);
+static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info, bool update_stat);
+static int search_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *bitmap_info, u64 *offset,
+ u64 *bytes, bool for_alloc);
+static void free_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *bitmap_info);
+static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info, u64 offset,
+ u64 bytes, bool update_stats);
+
+static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
+{
+ struct btrfs_free_space *info;
+ struct rb_node *node;
+
+ while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
+ info = rb_entry(node, struct btrfs_free_space, offset_index);
+ if (!info->bitmap) {
+ unlink_free_space(ctl, info, true);
+ kmem_cache_free(btrfs_free_space_cachep, info);
+ } else {
+ free_bitmap(ctl, info);
+ }
+
+ cond_resched_lock(&ctl->tree_lock);
+ }
+}
+
+static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 offset)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_key key;
+ struct btrfs_key location;
+ struct btrfs_disk_key disk_key;
+ struct btrfs_free_space_header *header;
+ struct extent_buffer *leaf;
+ struct inode *inode = NULL;
+ unsigned nofs_flag;
+ int ret;
+
+ key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+ key.offset = offset;
+ key.type = 0;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0) {
+ btrfs_release_path(path);
+ return ERR_PTR(-ENOENT);
+ }
+
+ leaf = path->nodes[0];
+ header = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_free_space_header);
+ btrfs_free_space_key(leaf, header, &disk_key);
+ btrfs_disk_key_to_cpu(&location, &disk_key);
+ btrfs_release_path(path);
+
+ /*
+ * We are often under a trans handle at this point, so we need to make
+ * sure NOFS is set to keep us from deadlocking.
+ */
+ nofs_flag = memalloc_nofs_save();
+ inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path);
+ btrfs_release_path(path);
+ memalloc_nofs_restore(nofs_flag);
+ if (IS_ERR(inode))
+ return inode;
+
+ mapping_set_gfp_mask(inode->i_mapping,
+ mapping_gfp_constraint(inode->i_mapping,
+ ~(__GFP_FS | __GFP_HIGHMEM)));
+
+ return inode;
+}
+
+struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
+ struct btrfs_path *path)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct inode *inode = NULL;
+ u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
+
+ spin_lock(&block_group->lock);
+ if (block_group->inode)
+ inode = igrab(block_group->inode);
+ spin_unlock(&block_group->lock);
+ if (inode)
+ return inode;
+
+ inode = __lookup_free_space_inode(fs_info->tree_root, path,
+ block_group->start);
+ if (IS_ERR(inode))
+ return inode;
+
+ spin_lock(&block_group->lock);
+ if (!((BTRFS_I(inode)->flags & flags) == flags)) {
+ btrfs_info(fs_info, "Old style space inode found, converting.");
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
+ BTRFS_INODE_NODATACOW;
+ block_group->disk_cache_state = BTRFS_DC_CLEAR;
+ }
+
+ if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags))
+ block_group->inode = igrab(inode);
+ spin_unlock(&block_group->lock);
+
+ return inode;
+}
+
+static int __create_free_space_inode(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ u64 ino, u64 offset)
+{
+ struct btrfs_key key;
+ struct btrfs_disk_key disk_key;
+ struct btrfs_free_space_header *header;
+ struct btrfs_inode_item *inode_item;
+ struct extent_buffer *leaf;
+ /* We inline CRCs for the free disk space cache */
+ const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC |
+ BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
+ int ret;
+
+ ret = btrfs_insert_empty_inode(trans, root, path, ino);
+ if (ret)
+ return ret;
+
+ leaf = path->nodes[0];
+ inode_item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_inode_item);
+ btrfs_item_key(leaf, &disk_key, path->slots[0]);
+ memzero_extent_buffer(leaf, (unsigned long)inode_item,
+ sizeof(*inode_item));
+ btrfs_set_inode_generation(leaf, inode_item, trans->transid);
+ btrfs_set_inode_size(leaf, inode_item, 0);
+ btrfs_set_inode_nbytes(leaf, inode_item, 0);
+ btrfs_set_inode_uid(leaf, inode_item, 0);
+ btrfs_set_inode_gid(leaf, inode_item, 0);
+ btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
+ btrfs_set_inode_flags(leaf, inode_item, flags);
+ btrfs_set_inode_nlink(leaf, inode_item, 1);
+ btrfs_set_inode_transid(leaf, inode_item, trans->transid);
+ btrfs_set_inode_block_group(leaf, inode_item, offset);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
+ key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+ key.offset = offset;
+ key.type = 0;
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ sizeof(struct btrfs_free_space_header));
+ if (ret < 0) {
+ btrfs_release_path(path);
+ return ret;
+ }
+
+ leaf = path->nodes[0];
+ header = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_free_space_header);
+ memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
+ btrfs_set_free_space_key(leaf, header, &disk_key);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
+ return 0;
+}
+
+int create_free_space_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path)
+{
+ int ret;
+ u64 ino;
+
+ ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino);
+ if (ret < 0)
+ return ret;
+
+ return __create_free_space_inode(trans->fs_info->tree_root, trans, path,
+ ino, block_group->start);
+}
+
+/*
+ * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode
+ * handles lookup, otherwise it takes ownership and iputs the inode.
+ * Don't reuse an inode pointer after passing it into this function.
+ */
+int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct btrfs_block_group *block_group)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ if (!inode)
+ inode = lookup_free_space_inode(block_group, path);
+ if (IS_ERR(inode)) {
+ if (PTR_ERR(inode) != -ENOENT)
+ ret = PTR_ERR(inode);
+ goto out;
+ }
+ ret = btrfs_orphan_add(trans, BTRFS_I(inode));
+ if (ret) {
+ btrfs_add_delayed_iput(inode);
+ goto out;
+ }
+ clear_nlink(inode);
+ /* One for the block groups ref */
+ spin_lock(&block_group->lock);
+ if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) {
+ block_group->inode = NULL;
+ spin_unlock(&block_group->lock);
+ iput(inode);
+ } else {
+ spin_unlock(&block_group->lock);
+ }
+ /* One for the lookup ref */
+ btrfs_add_delayed_iput(inode);
+
+ key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+ key.type = 0;
+ key.offset = block_group->start;
+ ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path,
+ -1, 1);
+ if (ret) {
+ if (ret > 0)
+ ret = 0;
+ goto out;
+ }
+ ret = btrfs_del_item(trans, trans->fs_info->tree_root, path);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *rsv)
+{
+ u64 needed_bytes;
+ int ret;
+
+ /* 1 for slack space, 1 for updating the inode */
+ needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) +
+ btrfs_calc_metadata_size(fs_info, 1);
+
+ spin_lock(&rsv->lock);
+ if (rsv->reserved < needed_bytes)
+ ret = -ENOSPC;
+ else
+ ret = 0;
+ spin_unlock(&rsv->lock);
+ return ret;
+}
+
+int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct inode *vfs_inode)
+{
+ struct btrfs_truncate_control control = {
+ .inode = BTRFS_I(vfs_inode),
+ .new_size = 0,
+ .ino = btrfs_ino(BTRFS_I(vfs_inode)),
+ .min_type = BTRFS_EXTENT_DATA_KEY,
+ .clear_extent_range = true,
+ };
+ struct btrfs_inode *inode = BTRFS_I(vfs_inode);
+ struct btrfs_root *root = inode->root;
+ struct extent_state *cached_state = NULL;
+ int ret = 0;
+ bool locked = false;
+
+ if (block_group) {
+ struct btrfs_path *path = btrfs_alloc_path();
+
+ if (!path) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ locked = true;
+ mutex_lock(&trans->transaction->cache_write_mutex);
+ if (!list_empty(&block_group->io_list)) {
+ list_del_init(&block_group->io_list);
+
+ btrfs_wait_cache_io(trans, block_group, path);
+ btrfs_put_block_group(block_group);
+ }
+
+ /*
+ * now that we've truncated the cache away, its no longer
+ * setup or written
+ */
+ spin_lock(&block_group->lock);
+ block_group->disk_cache_state = BTRFS_DC_CLEAR;
+ spin_unlock(&block_group->lock);
+ btrfs_free_path(path);
+ }
+
+ btrfs_i_size_write(inode, 0);
+ truncate_pagecache(vfs_inode, 0);
+
+ lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
+ btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
+
+ /*
+ * We skip the throttling logic for free space cache inodes, so we don't
+ * need to check for -EAGAIN.
+ */
+ ret = btrfs_truncate_inode_items(trans, root, &control);
+
+ inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
+ btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
+
+ unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
+ if (ret)
+ goto fail;
+
+ ret = btrfs_update_inode(trans, root, inode);
+
+fail:
+ if (locked)
+ mutex_unlock(&trans->transaction->cache_write_mutex);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+
+ return ret;
+}
+
+static void readahead_cache(struct inode *inode)
+{
+ struct file_ra_state ra;
+ unsigned long last_index;
+
+ file_ra_state_init(&ra, inode->i_mapping);
+ last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
+
+ page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index);
+}
+
+static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
+ int write)
+{
+ int num_pages;
+
+ num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+
+ /* Make sure we can fit our crcs and generation into the first page */
+ if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
+ return -ENOSPC;
+
+ memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
+
+ io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
+ if (!io_ctl->pages)
+ return -ENOMEM;
+
+ io_ctl->num_pages = num_pages;
+ io_ctl->fs_info = btrfs_sb(inode->i_sb);
+ io_ctl->inode = inode;
+
+ return 0;
+}
+ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO);
+
+static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
+{
+ kfree(io_ctl->pages);
+ io_ctl->pages = NULL;
+}
+
+static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
+{
+ if (io_ctl->cur) {
+ io_ctl->cur = NULL;
+ io_ctl->orig = NULL;
+ }
+}
+
+static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
+{
+ ASSERT(io_ctl->index < io_ctl->num_pages);
+ io_ctl->page = io_ctl->pages[io_ctl->index++];
+ io_ctl->cur = page_address(io_ctl->page);
+ io_ctl->orig = io_ctl->cur;
+ io_ctl->size = PAGE_SIZE;
+ if (clear)
+ clear_page(io_ctl->cur);
+}
+
+static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
+{
+ int i;
+
+ io_ctl_unmap_page(io_ctl);
+
+ for (i = 0; i < io_ctl->num_pages; i++) {
+ if (io_ctl->pages[i]) {
+ btrfs_page_clear_checked(io_ctl->fs_info,
+ io_ctl->pages[i],
+ page_offset(io_ctl->pages[i]),
+ PAGE_SIZE);
+ unlock_page(io_ctl->pages[i]);
+ put_page(io_ctl->pages[i]);
+ }
+ }
+}
+
+static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
+{
+ struct page *page;
+ struct inode *inode = io_ctl->inode;
+ gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+ int i;
+
+ for (i = 0; i < io_ctl->num_pages; i++) {
+ int ret;
+
+ page = find_or_create_page(inode->i_mapping, i, mask);
+ if (!page) {
+ io_ctl_drop_pages(io_ctl);
+ return -ENOMEM;
+ }
+
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ unlock_page(page);
+ put_page(page);
+ io_ctl_drop_pages(io_ctl);
+ return ret;
+ }
+
+ io_ctl->pages[i] = page;
+ if (uptodate && !PageUptodate(page)) {
+ btrfs_read_folio(NULL, page_folio(page));
+ lock_page(page);
+ if (page->mapping != inode->i_mapping) {
+ btrfs_err(BTRFS_I(inode)->root->fs_info,
+ "free space cache page truncated");
+ io_ctl_drop_pages(io_ctl);
+ return -EIO;
+ }
+ if (!PageUptodate(page)) {
+ btrfs_err(BTRFS_I(inode)->root->fs_info,
+ "error reading free space cache");
+ io_ctl_drop_pages(io_ctl);
+ return -EIO;
+ }
+ }
+ }
+
+ for (i = 0; i < io_ctl->num_pages; i++)
+ clear_page_dirty_for_io(io_ctl->pages[i]);
+
+ return 0;
+}
+
+static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
+{
+ io_ctl_map_page(io_ctl, 1);
+
+ /*
+ * Skip the csum areas. If we don't check crcs then we just have a
+ * 64bit chunk at the front of the first page.
+ */
+ io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
+ io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
+
+ put_unaligned_le64(generation, io_ctl->cur);
+ io_ctl->cur += sizeof(u64);
+}
+
+static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
+{
+ u64 cache_gen;
+
+ /*
+ * Skip the crc area. If we don't check crcs then we just have a 64bit
+ * chunk at the front of the first page.
+ */
+ io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
+ io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
+
+ cache_gen = get_unaligned_le64(io_ctl->cur);
+ if (cache_gen != generation) {
+ btrfs_err_rl(io_ctl->fs_info,
+ "space cache generation (%llu) does not match inode (%llu)",
+ cache_gen, generation);
+ io_ctl_unmap_page(io_ctl);
+ return -EIO;
+ }
+ io_ctl->cur += sizeof(u64);
+ return 0;
+}
+
+static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
+{
+ u32 *tmp;
+ u32 crc = ~(u32)0;
+ unsigned offset = 0;
+
+ if (index == 0)
+ offset = sizeof(u32) * io_ctl->num_pages;
+
+ crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
+ btrfs_crc32c_final(crc, (u8 *)&crc);
+ io_ctl_unmap_page(io_ctl);
+ tmp = page_address(io_ctl->pages[0]);
+ tmp += index;
+ *tmp = crc;
+}
+
+static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
+{
+ u32 *tmp, val;
+ u32 crc = ~(u32)0;
+ unsigned offset = 0;
+
+ if (index == 0)
+ offset = sizeof(u32) * io_ctl->num_pages;
+
+ tmp = page_address(io_ctl->pages[0]);
+ tmp += index;
+ val = *tmp;
+
+ io_ctl_map_page(io_ctl, 0);
+ crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
+ btrfs_crc32c_final(crc, (u8 *)&crc);
+ if (val != crc) {
+ btrfs_err_rl(io_ctl->fs_info,
+ "csum mismatch on free space cache");
+ io_ctl_unmap_page(io_ctl);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
+ void *bitmap)
+{
+ struct btrfs_free_space_entry *entry;
+
+ if (!io_ctl->cur)
+ return -ENOSPC;
+
+ entry = io_ctl->cur;
+ put_unaligned_le64(offset, &entry->offset);
+ put_unaligned_le64(bytes, &entry->bytes);
+ entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
+ BTRFS_FREE_SPACE_EXTENT;
+ io_ctl->cur += sizeof(struct btrfs_free_space_entry);
+ io_ctl->size -= sizeof(struct btrfs_free_space_entry);
+
+ if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
+ return 0;
+
+ io_ctl_set_crc(io_ctl, io_ctl->index - 1);
+
+ /* No more pages to map */
+ if (io_ctl->index >= io_ctl->num_pages)
+ return 0;
+
+ /* map the next page */
+ io_ctl_map_page(io_ctl, 1);
+ return 0;
+}
+
+static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
+{
+ if (!io_ctl->cur)
+ return -ENOSPC;
+
+ /*
+ * If we aren't at the start of the current page, unmap this one and
+ * map the next one if there is any left.
+ */
+ if (io_ctl->cur != io_ctl->orig) {
+ io_ctl_set_crc(io_ctl, io_ctl->index - 1);
+ if (io_ctl->index >= io_ctl->num_pages)
+ return -ENOSPC;
+ io_ctl_map_page(io_ctl, 0);
+ }
+
+ copy_page(io_ctl->cur, bitmap);
+ io_ctl_set_crc(io_ctl, io_ctl->index - 1);
+ if (io_ctl->index < io_ctl->num_pages)
+ io_ctl_map_page(io_ctl, 0);
+ return 0;
+}
+
+static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
+{
+ /*
+ * If we're not on the boundary we know we've modified the page and we
+ * need to crc the page.
+ */
+ if (io_ctl->cur != io_ctl->orig)
+ io_ctl_set_crc(io_ctl, io_ctl->index - 1);
+ else
+ io_ctl_unmap_page(io_ctl);
+
+ while (io_ctl->index < io_ctl->num_pages) {
+ io_ctl_map_page(io_ctl, 1);
+ io_ctl_set_crc(io_ctl, io_ctl->index - 1);
+ }
+}
+
+static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
+ struct btrfs_free_space *entry, u8 *type)
+{
+ struct btrfs_free_space_entry *e;
+ int ret;
+
+ if (!io_ctl->cur) {
+ ret = io_ctl_check_crc(io_ctl, io_ctl->index);
+ if (ret)
+ return ret;
+ }
+
+ e = io_ctl->cur;
+ entry->offset = get_unaligned_le64(&e->offset);
+ entry->bytes = get_unaligned_le64(&e->bytes);
+ *type = e->type;
+ io_ctl->cur += sizeof(struct btrfs_free_space_entry);
+ io_ctl->size -= sizeof(struct btrfs_free_space_entry);
+
+ if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
+ return 0;
+
+ io_ctl_unmap_page(io_ctl);
+
+ return 0;
+}
+
+static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
+ struct btrfs_free_space *entry)
+{
+ int ret;
+
+ ret = io_ctl_check_crc(io_ctl, io_ctl->index);
+ if (ret)
+ return ret;
+
+ copy_page(entry->bitmap, io_ctl->cur);
+ io_ctl_unmap_page(io_ctl);
+
+ return 0;
+}
+
+static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
+{
+ struct btrfs_block_group *block_group = ctl->block_group;
+ u64 max_bytes;
+ u64 bitmap_bytes;
+ u64 extent_bytes;
+ u64 size = block_group->length;
+ u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
+ u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
+
+ max_bitmaps = max_t(u64, max_bitmaps, 1);
+
+ if (ctl->total_bitmaps > max_bitmaps)
+ btrfs_err(block_group->fs_info,
+"invalid free space control: bg start=%llu len=%llu total_bitmaps=%u unit=%u max_bitmaps=%llu bytes_per_bg=%llu",
+ block_group->start, block_group->length,
+ ctl->total_bitmaps, ctl->unit, max_bitmaps,
+ bytes_per_bg);
+ ASSERT(ctl->total_bitmaps <= max_bitmaps);
+
+ /*
+ * We are trying to keep the total amount of memory used per 1GiB of
+ * space to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation
+ * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of
+ * bitmaps, we may end up using more memory than this.
+ */
+ if (size < SZ_1G)
+ max_bytes = MAX_CACHE_BYTES_PER_GIG;
+ else
+ max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
+
+ bitmap_bytes = ctl->total_bitmaps * ctl->unit;
+
+ /*
+ * we want the extent entry threshold to always be at most 1/2 the max
+ * bytes we can have, or whatever is less than that.
+ */
+ extent_bytes = max_bytes - bitmap_bytes;
+ extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
+
+ ctl->extents_thresh =
+ div_u64(extent_bytes, sizeof(struct btrfs_free_space));
+}
+
+static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
+ struct btrfs_free_space_ctl *ctl,
+ struct btrfs_path *path, u64 offset)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_free_space_header *header;
+ struct extent_buffer *leaf;
+ struct btrfs_io_ctl io_ctl;
+ struct btrfs_key key;
+ struct btrfs_free_space *e, *n;
+ LIST_HEAD(bitmaps);
+ u64 num_entries;
+ u64 num_bitmaps;
+ u64 generation;
+ u8 type;
+ int ret = 0;
+
+ /* Nothing in the space cache, goodbye */
+ if (!i_size_read(inode))
+ return 0;
+
+ key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+ key.offset = offset;
+ key.type = 0;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return 0;
+ else if (ret > 0) {
+ btrfs_release_path(path);
+ return 0;
+ }
+
+ ret = -1;
+
+ leaf = path->nodes[0];
+ header = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_free_space_header);
+ num_entries = btrfs_free_space_entries(leaf, header);
+ num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
+ generation = btrfs_free_space_generation(leaf, header);
+ btrfs_release_path(path);
+
+ if (!BTRFS_I(inode)->generation) {
+ btrfs_info(fs_info,
+ "the free space cache file (%llu) is invalid, skip it",
+ offset);
+ return 0;
+ }
+
+ if (BTRFS_I(inode)->generation != generation) {
+ btrfs_err(fs_info,
+ "free space inode generation (%llu) did not match free space cache generation (%llu)",
+ BTRFS_I(inode)->generation, generation);
+ return 0;
+ }
+
+ if (!num_entries)
+ return 0;
+
+ ret = io_ctl_init(&io_ctl, inode, 0);
+ if (ret)
+ return ret;
+
+ readahead_cache(inode);
+
+ ret = io_ctl_prepare_pages(&io_ctl, true);
+ if (ret)
+ goto out;
+
+ ret = io_ctl_check_crc(&io_ctl, 0);
+ if (ret)
+ goto free_cache;
+
+ ret = io_ctl_check_generation(&io_ctl, generation);
+ if (ret)
+ goto free_cache;
+
+ while (num_entries) {
+ e = kmem_cache_zalloc(btrfs_free_space_cachep,
+ GFP_NOFS);
+ if (!e) {
+ ret = -ENOMEM;
+ goto free_cache;
+ }
+
+ ret = io_ctl_read_entry(&io_ctl, e, &type);
+ if (ret) {
+ kmem_cache_free(btrfs_free_space_cachep, e);
+ goto free_cache;
+ }
+
+ if (!e->bytes) {
+ ret = -1;
+ kmem_cache_free(btrfs_free_space_cachep, e);
+ goto free_cache;
+ }
+
+ if (type == BTRFS_FREE_SPACE_EXTENT) {
+ spin_lock(&ctl->tree_lock);
+ ret = link_free_space(ctl, e);
+ spin_unlock(&ctl->tree_lock);
+ if (ret) {
+ btrfs_err(fs_info,
+ "Duplicate entries in free space cache, dumping");
+ kmem_cache_free(btrfs_free_space_cachep, e);
+ goto free_cache;
+ }
+ } else {
+ ASSERT(num_bitmaps);
+ num_bitmaps--;
+ e->bitmap = kmem_cache_zalloc(
+ btrfs_free_space_bitmap_cachep, GFP_NOFS);
+ if (!e->bitmap) {
+ ret = -ENOMEM;
+ kmem_cache_free(
+ btrfs_free_space_cachep, e);
+ goto free_cache;
+ }
+ spin_lock(&ctl->tree_lock);
+ ret = link_free_space(ctl, e);
+ if (ret) {
+ spin_unlock(&ctl->tree_lock);
+ btrfs_err(fs_info,
+ "Duplicate entries in free space cache, dumping");
+ kmem_cache_free(btrfs_free_space_cachep, e);
+ goto free_cache;
+ }
+ ctl->total_bitmaps++;
+ recalculate_thresholds(ctl);
+ spin_unlock(&ctl->tree_lock);
+ list_add_tail(&e->list, &bitmaps);
+ }
+
+ num_entries--;
+ }
+
+ io_ctl_unmap_page(&io_ctl);
+
+ /*
+ * We add the bitmaps at the end of the entries in order that
+ * the bitmap entries are added to the cache.
+ */
+ list_for_each_entry_safe(e, n, &bitmaps, list) {
+ list_del_init(&e->list);
+ ret = io_ctl_read_bitmap(&io_ctl, e);
+ if (ret)
+ goto free_cache;
+ }
+
+ io_ctl_drop_pages(&io_ctl);
+ ret = 1;
+out:
+ io_ctl_free(&io_ctl);
+ return ret;
+free_cache:
+ io_ctl_drop_pages(&io_ctl);
+
+ spin_lock(&ctl->tree_lock);
+ __btrfs_remove_free_space_cache(ctl);
+ spin_unlock(&ctl->tree_lock);
+ goto out;
+}
+
+static int copy_free_space_cache(struct btrfs_block_group *block_group,
+ struct btrfs_free_space_ctl *ctl)
+{
+ struct btrfs_free_space *info;
+ struct rb_node *n;
+ int ret = 0;
+
+ while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) {
+ info = rb_entry(n, struct btrfs_free_space, offset_index);
+ if (!info->bitmap) {
+ unlink_free_space(ctl, info, true);
+ ret = btrfs_add_free_space(block_group, info->offset,
+ info->bytes);
+ kmem_cache_free(btrfs_free_space_cachep, info);
+ } else {
+ u64 offset = info->offset;
+ u64 bytes = ctl->unit;
+
+ while (search_bitmap(ctl, info, &offset, &bytes,
+ false) == 0) {
+ ret = btrfs_add_free_space(block_group, offset,
+ bytes);
+ if (ret)
+ break;
+ bitmap_clear_bits(ctl, info, offset, bytes, true);
+ offset = info->offset;
+ bytes = ctl->unit;
+ }
+ free_bitmap(ctl, info);
+ }
+ cond_resched();
+ }
+ return ret;
+}
+
+static struct lock_class_key btrfs_free_space_inode_key;
+
+int load_free_space_cache(struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space_ctl tmp_ctl = {};
+ struct inode *inode;
+ struct btrfs_path *path;
+ int ret = 0;
+ bool matched;
+ u64 used = block_group->used;
+
+ /*
+ * Because we could potentially discard our loaded free space, we want
+ * to load everything into a temporary structure first, and then if it's
+ * valid copy it all into the actual free space ctl.
+ */
+ btrfs_init_free_space_ctl(block_group, &tmp_ctl);
+
+ /*
+ * If this block group has been marked to be cleared for one reason or
+ * another then we can't trust the on disk cache, so just return.
+ */
+ spin_lock(&block_group->lock);
+ if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
+ spin_unlock(&block_group->lock);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return 0;
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+
+ /*
+ * We must pass a path with search_commit_root set to btrfs_iget in
+ * order to avoid a deadlock when allocating extents for the tree root.
+ *
+ * When we are COWing an extent buffer from the tree root, when looking
+ * for a free extent, at extent-tree.c:find_free_extent(), we can find
+ * block group without its free space cache loaded. When we find one
+ * we must load its space cache which requires reading its free space
+ * cache's inode item from the root tree. If this inode item is located
+ * in the same leaf that we started COWing before, then we end up in
+ * deadlock on the extent buffer (trying to read lock it when we
+ * previously write locked it).
+ *
+ * It's safe to read the inode item using the commit root because
+ * block groups, once loaded, stay in memory forever (until they are
+ * removed) as well as their space caches once loaded. New block groups
+ * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
+ * we will never try to read their inode item while the fs is mounted.
+ */
+ inode = lookup_free_space_inode(block_group, path);
+ if (IS_ERR(inode)) {
+ btrfs_free_path(path);
+ return 0;
+ }
+
+ /* We may have converted the inode and made the cache invalid. */
+ spin_lock(&block_group->lock);
+ if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
+ spin_unlock(&block_group->lock);
+ btrfs_free_path(path);
+ goto out;
+ }
+ spin_unlock(&block_group->lock);
+
+ /*
+ * Reinitialize the class of struct inode's mapping->invalidate_lock for
+ * free space inodes to prevent false positives related to locks for normal
+ * inodes.
+ */
+ lockdep_set_class(&(&inode->i_data)->invalidate_lock,
+ &btrfs_free_space_inode_key);
+
+ ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl,
+ path, block_group->start);
+ btrfs_free_path(path);
+ if (ret <= 0)
+ goto out;
+
+ matched = (tmp_ctl.free_space == (block_group->length - used -
+ block_group->bytes_super));
+
+ if (matched) {
+ ret = copy_free_space_cache(block_group, &tmp_ctl);
+ /*
+ * ret == 1 means we successfully loaded the free space cache,
+ * so we need to re-set it here.
+ */
+ if (ret == 0)
+ ret = 1;
+ } else {
+ /*
+ * We need to call the _locked variant so we don't try to update
+ * the discard counters.
+ */
+ spin_lock(&tmp_ctl.tree_lock);
+ __btrfs_remove_free_space_cache(&tmp_ctl);
+ spin_unlock(&tmp_ctl.tree_lock);
+ btrfs_warn(fs_info,
+ "block group %llu has wrong amount of free space",
+ block_group->start);
+ ret = -1;
+ }
+out:
+ if (ret < 0) {
+ /* This cache is bogus, make sure it gets cleared */
+ spin_lock(&block_group->lock);
+ block_group->disk_cache_state = BTRFS_DC_CLEAR;
+ spin_unlock(&block_group->lock);
+ ret = 0;
+
+ btrfs_warn(fs_info,
+ "failed to load free space cache for block group %llu, rebuilding it now",
+ block_group->start);
+ }
+
+ spin_lock(&ctl->tree_lock);
+ btrfs_discard_update_discardable(block_group);
+ spin_unlock(&ctl->tree_lock);
+ iput(inode);
+ return ret;
+}
+
+static noinline_for_stack
+int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
+ struct btrfs_free_space_ctl *ctl,
+ struct btrfs_block_group *block_group,
+ int *entries, int *bitmaps,
+ struct list_head *bitmap_list)
+{
+ int ret;
+ struct btrfs_free_cluster *cluster = NULL;
+ struct btrfs_free_cluster *cluster_locked = NULL;
+ struct rb_node *node = rb_first(&ctl->free_space_offset);
+ struct btrfs_trim_range *trim_entry;
+
+ /* Get the cluster for this block_group if it exists */
+ if (block_group && !list_empty(&block_group->cluster_list)) {
+ cluster = list_entry(block_group->cluster_list.next,
+ struct btrfs_free_cluster,
+ block_group_list);
+ }
+
+ if (!node && cluster) {
+ cluster_locked = cluster;
+ spin_lock(&cluster_locked->lock);
+ node = rb_first(&cluster->root);
+ cluster = NULL;
+ }
+
+ /* Write out the extent entries */
+ while (node) {
+ struct btrfs_free_space *e;
+
+ e = rb_entry(node, struct btrfs_free_space, offset_index);
+ *entries += 1;
+
+ ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
+ e->bitmap);
+ if (ret)
+ goto fail;
+
+ if (e->bitmap) {
+ list_add_tail(&e->list, bitmap_list);
+ *bitmaps += 1;
+ }
+ node = rb_next(node);
+ if (!node && cluster) {
+ node = rb_first(&cluster->root);
+ cluster_locked = cluster;
+ spin_lock(&cluster_locked->lock);
+ cluster = NULL;
+ }
+ }
+ if (cluster_locked) {
+ spin_unlock(&cluster_locked->lock);
+ cluster_locked = NULL;
+ }
+
+ /*
+ * Make sure we don't miss any range that was removed from our rbtree
+ * because trimming is running. Otherwise after a umount+mount (or crash
+ * after committing the transaction) we would leak free space and get
+ * an inconsistent free space cache report from fsck.
+ */
+ list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
+ ret = io_ctl_add_entry(io_ctl, trim_entry->start,
+ trim_entry->bytes, NULL);
+ if (ret)
+ goto fail;
+ *entries += 1;
+ }
+
+ return 0;
+fail:
+ if (cluster_locked)
+ spin_unlock(&cluster_locked->lock);
+ return -ENOSPC;
+}
+
+static noinline_for_stack int
+update_cache_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode,
+ struct btrfs_path *path, u64 offset,
+ int entries, int bitmaps)
+{
+ struct btrfs_key key;
+ struct btrfs_free_space_header *header;
+ struct extent_buffer *leaf;
+ int ret;
+
+ key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+ key.offset = offset;
+ key.type = 0;
+
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret < 0) {
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
+ EXTENT_DELALLOC, NULL);
+ goto fail;
+ }
+ leaf = path->nodes[0];
+ if (ret > 0) {
+ struct btrfs_key found_key;
+ ASSERT(path->slots[0]);
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
+ found_key.offset != offset) {
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
+ inode->i_size - 1, EXTENT_DELALLOC,
+ NULL);
+ btrfs_release_path(path);
+ goto fail;
+ }
+ }
+
+ BTRFS_I(inode)->generation = trans->transid;
+ header = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_free_space_header);
+ btrfs_set_free_space_entries(leaf, header, entries);
+ btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
+ btrfs_set_free_space_generation(leaf, header, trans->transid);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
+ return 0;
+
+fail:
+ return -1;
+}
+
+static noinline_for_stack int write_pinned_extent_entries(
+ struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_io_ctl *io_ctl,
+ int *entries)
+{
+ u64 start, extent_start, extent_end, len;
+ struct extent_io_tree *unpin = NULL;
+ int ret;
+
+ if (!block_group)
+ return 0;
+
+ /*
+ * We want to add any pinned extents to our free space cache
+ * so we don't leak the space
+ *
+ * We shouldn't have switched the pinned extents yet so this is the
+ * right one
+ */
+ unpin = &trans->transaction->pinned_extents;
+
+ start = block_group->start;
+
+ while (start < block_group->start + block_group->length) {
+ ret = find_first_extent_bit(unpin, start,
+ &extent_start, &extent_end,
+ EXTENT_DIRTY, NULL);
+ if (ret)
+ return 0;
+
+ /* This pinned extent is out of our range */
+ if (extent_start >= block_group->start + block_group->length)
+ return 0;
+
+ extent_start = max(extent_start, start);
+ extent_end = min(block_group->start + block_group->length,
+ extent_end + 1);
+ len = extent_end - extent_start;
+
+ *entries += 1;
+ ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
+ if (ret)
+ return -ENOSPC;
+
+ start = extent_end;
+ }
+
+ return 0;
+}
+
+static noinline_for_stack int
+write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
+{
+ struct btrfs_free_space *entry, *next;
+ int ret;
+
+ /* Write out the bitmaps */
+ list_for_each_entry_safe(entry, next, bitmap_list, list) {
+ ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
+ if (ret)
+ return -ENOSPC;
+ list_del_init(&entry->list);
+ }
+
+ return 0;
+}
+
+static int flush_dirty_cache(struct inode *inode)
+{
+ int ret;
+
+ ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ if (ret)
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
+ EXTENT_DELALLOC, NULL);
+
+ return ret;
+}
+
+static void noinline_for_stack
+cleanup_bitmap_list(struct list_head *bitmap_list)
+{
+ struct btrfs_free_space *entry, *next;
+
+ list_for_each_entry_safe(entry, next, bitmap_list, list)
+ list_del_init(&entry->list);
+}
+
+static void noinline_for_stack
+cleanup_write_cache_enospc(struct inode *inode,
+ struct btrfs_io_ctl *io_ctl,
+ struct extent_state **cached_state)
+{
+ io_ctl_drop_pages(io_ctl);
+ unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+ cached_state);
+}
+
+static int __btrfs_wait_cache_io(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_io_ctl *io_ctl,
+ struct btrfs_path *path, u64 offset)
+{
+ int ret;
+ struct inode *inode = io_ctl->inode;
+
+ if (!inode)
+ return 0;
+
+ /* Flush the dirty pages in the cache file. */
+ ret = flush_dirty_cache(inode);
+ if (ret)
+ goto out;
+
+ /* Update the cache item to tell everyone this cache file is valid. */
+ ret = update_cache_item(trans, root, inode, path, offset,
+ io_ctl->entries, io_ctl->bitmaps);
+out:
+ if (ret) {
+ invalidate_inode_pages2(inode->i_mapping);
+ BTRFS_I(inode)->generation = 0;
+ if (block_group)
+ btrfs_debug(root->fs_info,
+ "failed to write free space cache for block group %llu error %d",
+ block_group->start, ret);
+ }
+ btrfs_update_inode(trans, root, BTRFS_I(inode));
+
+ if (block_group) {
+ /* the dirty list is protected by the dirty_bgs_lock */
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+
+ /* the disk_cache_state is protected by the block group lock */
+ spin_lock(&block_group->lock);
+
+ /*
+ * only mark this as written if we didn't get put back on
+ * the dirty list while waiting for IO. Otherwise our
+ * cache state won't be right, and we won't get written again
+ */
+ if (!ret && list_empty(&block_group->dirty_list))
+ block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+ else if (ret)
+ block_group->disk_cache_state = BTRFS_DC_ERROR;
+
+ spin_unlock(&block_group->lock);
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+ io_ctl->inode = NULL;
+ iput(inode);
+ }
+
+ return ret;
+
+}
+
+int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path)
+{
+ return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans,
+ block_group, &block_group->io_ctl,
+ path, block_group->start);
+}
+
+/**
+ * Write out cached info to an inode
+ *
+ * @root: root the inode belongs to
+ * @inode: freespace inode we are writing out
+ * @ctl: free space cache we are going to write out
+ * @block_group: block_group for this cache if it belongs to a block_group
+ * @io_ctl: holds context for the io
+ * @trans: the trans handle
+ *
+ * This function writes out a free space cache struct to disk for quick recovery
+ * on mount. This will return 0 if it was successful in writing the cache out,
+ * or an errno if it was not.
+ */
+static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
+ struct btrfs_free_space_ctl *ctl,
+ struct btrfs_block_group *block_group,
+ struct btrfs_io_ctl *io_ctl,
+ struct btrfs_trans_handle *trans)
+{
+ struct extent_state *cached_state = NULL;
+ LIST_HEAD(bitmap_list);
+ int entries = 0;
+ int bitmaps = 0;
+ int ret;
+ int must_iput = 0;
+
+ if (!i_size_read(inode))
+ return -EIO;
+
+ WARN_ON(io_ctl->pages);
+ ret = io_ctl_init(io_ctl, inode, 1);
+ if (ret)
+ return ret;
+
+ if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
+ down_write(&block_group->data_rwsem);
+ spin_lock(&block_group->lock);
+ if (block_group->delalloc_bytes) {
+ block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+ spin_unlock(&block_group->lock);
+ up_write(&block_group->data_rwsem);
+ BTRFS_I(inode)->generation = 0;
+ ret = 0;
+ must_iput = 1;
+ goto out;
+ }
+ spin_unlock(&block_group->lock);
+ }
+
+ /* Lock all pages first so we can lock the extent safely. */
+ ret = io_ctl_prepare_pages(io_ctl, false);
+ if (ret)
+ goto out_unlock;
+
+ lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+ &cached_state);
+
+ io_ctl_set_generation(io_ctl, trans->transid);
+
+ mutex_lock(&ctl->cache_writeout_mutex);
+ /* Write out the extent entries in the free space cache */
+ spin_lock(&ctl->tree_lock);
+ ret = write_cache_extent_entries(io_ctl, ctl,
+ block_group, &entries, &bitmaps,
+ &bitmap_list);
+ if (ret)
+ goto out_nospc_locked;
+
+ /*
+ * Some spaces that are freed in the current transaction are pinned,
+ * they will be added into free space cache after the transaction is
+ * committed, we shouldn't lose them.
+ *
+ * If this changes while we are working we'll get added back to
+ * the dirty list and redo it. No locking needed
+ */
+ ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries);
+ if (ret)
+ goto out_nospc_locked;
+
+ /*
+ * At last, we write out all the bitmaps and keep cache_writeout_mutex
+ * locked while doing it because a concurrent trim can be manipulating
+ * or freeing the bitmap.
+ */
+ ret = write_bitmap_entries(io_ctl, &bitmap_list);
+ spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+ if (ret)
+ goto out_nospc;
+
+ /* Zero out the rest of the pages just to make sure */
+ io_ctl_zero_remaining_pages(io_ctl);
+
+ /* Everything is written out, now we dirty the pages in the file. */
+ ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages,
+ io_ctl->num_pages, 0, i_size_read(inode),
+ &cached_state, false);
+ if (ret)
+ goto out_nospc;
+
+ if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+ up_write(&block_group->data_rwsem);
+ /*
+ * Release the pages and unlock the extent, we will flush
+ * them out later
+ */
+ io_ctl_drop_pages(io_ctl);
+ io_ctl_free(io_ctl);
+
+ unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+ &cached_state);
+
+ /*
+ * at this point the pages are under IO and we're happy,
+ * The caller is responsible for waiting on them and updating
+ * the cache and the inode
+ */
+ io_ctl->entries = entries;
+ io_ctl->bitmaps = bitmaps;
+
+ ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
+ if (ret)
+ goto out;
+
+ return 0;
+
+out_nospc_locked:
+ cleanup_bitmap_list(&bitmap_list);
+ spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+
+out_nospc:
+ cleanup_write_cache_enospc(inode, io_ctl, &cached_state);
+
+out_unlock:
+ if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+ up_write(&block_group->data_rwsem);
+
+out:
+ io_ctl->inode = NULL;
+ io_ctl_free(io_ctl);
+ if (ret) {
+ invalidate_inode_pages2(inode->i_mapping);
+ BTRFS_I(inode)->generation = 0;
+ }
+ btrfs_update_inode(trans, root, BTRFS_I(inode));
+ if (must_iput)
+ iput(inode);
+ return ret;
+}
+
+int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct inode *inode;
+ int ret = 0;
+
+ spin_lock(&block_group->lock);
+ if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
+ spin_unlock(&block_group->lock);
+
+ inode = lookup_free_space_inode(block_group, path);
+ if (IS_ERR(inode))
+ return 0;
+
+ ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
+ block_group, &block_group->io_ctl, trans);
+ if (ret) {
+ btrfs_debug(fs_info,
+ "failed to write free space cache for block group %llu error %d",
+ block_group->start, ret);
+ spin_lock(&block_group->lock);
+ block_group->disk_cache_state = BTRFS_DC_ERROR;
+ spin_unlock(&block_group->lock);
+
+ block_group->io_ctl.inode = NULL;
+ iput(inode);
+ }
+
+ /*
+ * if ret == 0 the caller is expected to call btrfs_wait_cache_io
+ * to wait for IO and put the inode
+ */
+
+ return ret;
+}
+
+static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
+ u64 offset)
+{
+ ASSERT(offset >= bitmap_start);
+ offset -= bitmap_start;
+ return (unsigned long)(div_u64(offset, unit));
+}
+
+static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
+{
+ return (unsigned long)(div_u64(bytes, unit));
+}
+
+static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
+ u64 offset)
+{
+ u64 bitmap_start;
+ u64 bytes_per_bitmap;
+
+ bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
+ bitmap_start = offset - ctl->start;
+ bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
+ bitmap_start *= bytes_per_bitmap;
+ bitmap_start += ctl->start;
+
+ return bitmap_start;
+}
+
+static int tree_insert_offset(struct rb_root *root, u64 offset,
+ struct rb_node *node, int bitmap)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct btrfs_free_space *info;
+
+ while (*p) {
+ parent = *p;
+ info = rb_entry(parent, struct btrfs_free_space, offset_index);
+
+ if (offset < info->offset) {
+ p = &(*p)->rb_left;
+ } else if (offset > info->offset) {
+ p = &(*p)->rb_right;
+ } else {
+ /*
+ * we could have a bitmap entry and an extent entry
+ * share the same offset. If this is the case, we want
+ * the extent entry to always be found first if we do a
+ * linear search through the tree, since we want to have
+ * the quickest allocation time, and allocating from an
+ * extent is faster than allocating from a bitmap. So
+ * if we're inserting a bitmap and we find an entry at
+ * this offset, we want to go right, or after this entry
+ * logically. If we are inserting an extent and we've
+ * found a bitmap, we want to go left, or before
+ * logically.
+ */
+ if (bitmap) {
+ if (info->bitmap) {
+ WARN_ON_ONCE(1);
+ return -EEXIST;
+ }
+ p = &(*p)->rb_right;
+ } else {
+ if (!info->bitmap) {
+ WARN_ON_ONCE(1);
+ return -EEXIST;
+ }
+ p = &(*p)->rb_left;
+ }
+ }
+ }
+
+ rb_link_node(node, parent, p);
+ rb_insert_color(node, root);
+
+ return 0;
+}
+
+/*
+ * This is a little subtle. We *only* have ->max_extent_size set if we actually
+ * searched through the bitmap and figured out the largest ->max_extent_size,
+ * otherwise it's 0. In the case that it's 0 we don't want to tell the
+ * allocator the wrong thing, we want to use the actual real max_extent_size
+ * we've found already if it's larger, or we want to use ->bytes.
+ *
+ * This matters because find_free_space() will skip entries who's ->bytes is
+ * less than the required bytes. So if we didn't search down this bitmap, we
+ * may pick some previous entry that has a smaller ->max_extent_size than we
+ * have. For example, assume we have two entries, one that has
+ * ->max_extent_size set to 4K and ->bytes set to 1M. A second entry hasn't set
+ * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous. We will
+ * call into find_free_space(), and return with max_extent_size == 4K, because
+ * that first bitmap entry had ->max_extent_size set, but the second one did
+ * not. If instead we returned 8K we'd come in searching for 8K, and find the
+ * 8K contiguous range.
+ *
+ * Consider the other case, we have 2 8K chunks in that second entry and still
+ * don't have ->max_extent_size set. We'll return 16K, and the next time the
+ * allocator comes in it'll fully search our second bitmap, and this time it'll
+ * get an uptodate value of 8K as the maximum chunk size. Then we'll get the
+ * right allocation the next loop through.
+ */
+static inline u64 get_max_extent_size(const struct btrfs_free_space *entry)
+{
+ if (entry->bitmap && entry->max_extent_size)
+ return entry->max_extent_size;
+ return entry->bytes;
+}
+
+/*
+ * We want the largest entry to be leftmost, so this is inverted from what you'd
+ * normally expect.
+ */
+static bool entry_less(struct rb_node *node, const struct rb_node *parent)
+{
+ const struct btrfs_free_space *entry, *exist;
+
+ entry = rb_entry(node, struct btrfs_free_space, bytes_index);
+ exist = rb_entry(parent, struct btrfs_free_space, bytes_index);
+ return get_max_extent_size(exist) < get_max_extent_size(entry);
+}
+
+/*
+ * searches the tree for the given offset.
+ *
+ * fuzzy - If this is set, then we are trying to make an allocation, and we just
+ * want a section that has at least bytes size and comes at or after the given
+ * offset.
+ */
+static struct btrfs_free_space *
+tree_search_offset(struct btrfs_free_space_ctl *ctl,
+ u64 offset, int bitmap_only, int fuzzy)
+{
+ struct rb_node *n = ctl->free_space_offset.rb_node;
+ struct btrfs_free_space *entry = NULL, *prev = NULL;
+
+ /* find entry that is closest to the 'offset' */
+ while (n) {
+ entry = rb_entry(n, struct btrfs_free_space, offset_index);
+ prev = entry;
+
+ if (offset < entry->offset)
+ n = n->rb_left;
+ else if (offset > entry->offset)
+ n = n->rb_right;
+ else
+ break;
+
+ entry = NULL;
+ }
+
+ if (bitmap_only) {
+ if (!entry)
+ return NULL;
+ if (entry->bitmap)
+ return entry;
+
+ /*
+ * bitmap entry and extent entry may share same offset,
+ * in that case, bitmap entry comes after extent entry.
+ */
+ n = rb_next(n);
+ if (!n)
+ return NULL;
+ entry = rb_entry(n, struct btrfs_free_space, offset_index);
+ if (entry->offset != offset)
+ return NULL;
+
+ WARN_ON(!entry->bitmap);
+ return entry;
+ } else if (entry) {
+ if (entry->bitmap) {
+ /*
+ * if previous extent entry covers the offset,
+ * we should return it instead of the bitmap entry
+ */
+ n = rb_prev(&entry->offset_index);
+ if (n) {
+ prev = rb_entry(n, struct btrfs_free_space,
+ offset_index);
+ if (!prev->bitmap &&
+ prev->offset + prev->bytes > offset)
+ entry = prev;
+ }
+ }
+ return entry;
+ }
+
+ if (!prev)
+ return NULL;
+
+ /* find last entry before the 'offset' */
+ entry = prev;
+ if (entry->offset > offset) {
+ n = rb_prev(&entry->offset_index);
+ if (n) {
+ entry = rb_entry(n, struct btrfs_free_space,
+ offset_index);
+ ASSERT(entry->offset <= offset);
+ } else {
+ if (fuzzy)
+ return entry;
+ else
+ return NULL;
+ }
+ }
+
+ if (entry->bitmap) {
+ n = rb_prev(&entry->offset_index);
+ if (n) {
+ prev = rb_entry(n, struct btrfs_free_space,
+ offset_index);
+ if (!prev->bitmap &&
+ prev->offset + prev->bytes > offset)
+ return prev;
+ }
+ if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
+ return entry;
+ } else if (entry->offset + entry->bytes > offset)
+ return entry;
+
+ if (!fuzzy)
+ return NULL;
+
+ while (1) {
+ n = rb_next(&entry->offset_index);
+ if (!n)
+ return NULL;
+ entry = rb_entry(n, struct btrfs_free_space, offset_index);
+ if (entry->bitmap) {
+ if (entry->offset + BITS_PER_BITMAP *
+ ctl->unit > offset)
+ break;
+ } else {
+ if (entry->offset + entry->bytes > offset)
+ break;
+ }
+ }
+ return entry;
+}
+
+static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info,
+ bool update_stat)
+{
+ rb_erase(&info->offset_index, &ctl->free_space_offset);
+ rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
+ ctl->free_extents--;
+
+ if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
+ ctl->discardable_extents[BTRFS_STAT_CURR]--;
+ ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes;
+ }
+
+ if (update_stat)
+ ctl->free_space -= info->bytes;
+}
+
+static int link_free_space(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info)
+{
+ int ret = 0;
+
+ ASSERT(info->bytes || info->bitmap);
+ ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
+ &info->offset_index, (info->bitmap != NULL));
+ if (ret)
+ return ret;
+
+ rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);
+
+ if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
+ ctl->discardable_extents[BTRFS_STAT_CURR]++;
+ ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
+ }
+
+ ctl->free_space += info->bytes;
+ ctl->free_extents++;
+ return ret;
+}
+
+static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info)
+{
+ ASSERT(info->bitmap);
+
+ /*
+ * If our entry is empty it's because we're on a cluster and we don't
+ * want to re-link it into our ctl bytes index.
+ */
+ if (RB_EMPTY_NODE(&info->bytes_index))
+ return;
+
+ rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
+ rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);
+}
+
+static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info,
+ u64 offset, u64 bytes, bool update_stat)
+{
+ unsigned long start, count, end;
+ int extent_delta = -1;
+
+ start = offset_to_bit(info->offset, ctl->unit, offset);
+ count = bytes_to_bits(bytes, ctl->unit);
+ end = start + count;
+ ASSERT(end <= BITS_PER_BITMAP);
+
+ bitmap_clear(info->bitmap, start, count);
+
+ info->bytes -= bytes;
+ if (info->max_extent_size > ctl->unit)
+ info->max_extent_size = 0;
+
+ relink_bitmap_entry(ctl, info);
+
+ if (start && test_bit(start - 1, info->bitmap))
+ extent_delta++;
+
+ if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
+ extent_delta++;
+
+ info->bitmap_extents += extent_delta;
+ if (!btrfs_free_space_trimmed(info)) {
+ ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
+ ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
+ }
+
+ if (update_stat)
+ ctl->free_space -= bytes;
+}
+
+static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info, u64 offset,
+ u64 bytes)
+{
+ unsigned long start, count, end;
+ int extent_delta = 1;
+
+ start = offset_to_bit(info->offset, ctl->unit, offset);
+ count = bytes_to_bits(bytes, ctl->unit);
+ end = start + count;
+ ASSERT(end <= BITS_PER_BITMAP);
+
+ bitmap_set(info->bitmap, start, count);
+
+ /*
+ * We set some bytes, we have no idea what the max extent size is
+ * anymore.
+ */
+ info->max_extent_size = 0;
+ info->bytes += bytes;
+ ctl->free_space += bytes;
+
+ relink_bitmap_entry(ctl, info);
+
+ if (start && test_bit(start - 1, info->bitmap))
+ extent_delta--;
+
+ if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
+ extent_delta--;
+
+ info->bitmap_extents += extent_delta;
+ if (!btrfs_free_space_trimmed(info)) {
+ ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
+ ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes;
+ }
+}
+
+/*
+ * If we can not find suitable extent, we will use bytes to record
+ * the size of the max extent.
+ */
+static int search_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *bitmap_info, u64 *offset,
+ u64 *bytes, bool for_alloc)
+{
+ unsigned long found_bits = 0;
+ unsigned long max_bits = 0;
+ unsigned long bits, i;
+ unsigned long next_zero;
+ unsigned long extent_bits;
+
+ /*
+ * Skip searching the bitmap if we don't have a contiguous section that
+ * is large enough for this allocation.
+ */
+ if (for_alloc &&
+ bitmap_info->max_extent_size &&
+ bitmap_info->max_extent_size < *bytes) {
+ *bytes = bitmap_info->max_extent_size;
+ return -1;
+ }
+
+ i = offset_to_bit(bitmap_info->offset, ctl->unit,
+ max_t(u64, *offset, bitmap_info->offset));
+ bits = bytes_to_bits(*bytes, ctl->unit);
+
+ for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
+ if (for_alloc && bits == 1) {
+ found_bits = 1;
+ break;
+ }
+ next_zero = find_next_zero_bit(bitmap_info->bitmap,
+ BITS_PER_BITMAP, i);
+ extent_bits = next_zero - i;
+ if (extent_bits >= bits) {
+ found_bits = extent_bits;
+ break;
+ } else if (extent_bits > max_bits) {
+ max_bits = extent_bits;
+ }
+ i = next_zero;
+ }
+
+ if (found_bits) {
+ *offset = (u64)(i * ctl->unit) + bitmap_info->offset;
+ *bytes = (u64)(found_bits) * ctl->unit;
+ return 0;
+ }
+
+ *bytes = (u64)(max_bits) * ctl->unit;
+ bitmap_info->max_extent_size = *bytes;
+ relink_bitmap_entry(ctl, bitmap_info);
+ return -1;
+}
+
+/* Cache the size of the max extent in bytes */
+static struct btrfs_free_space *
+find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
+ unsigned long align, u64 *max_extent_size, bool use_bytes_index)
+{
+ struct btrfs_free_space *entry;
+ struct rb_node *node;
+ u64 tmp;
+ u64 align_off;
+ int ret;
+
+ if (!ctl->free_space_offset.rb_node)
+ goto out;
+again:
+ if (use_bytes_index) {
+ node = rb_first_cached(&ctl->free_space_bytes);
+ } else {
+ entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset),
+ 0, 1);
+ if (!entry)
+ goto out;
+ node = &entry->offset_index;
+ }
+
+ for (; node; node = rb_next(node)) {
+ if (use_bytes_index)
+ entry = rb_entry(node, struct btrfs_free_space,
+ bytes_index);
+ else
+ entry = rb_entry(node, struct btrfs_free_space,
+ offset_index);
+
+ /*
+ * If we are using the bytes index then all subsequent entries
+ * in this tree are going to be < bytes, so simply set the max
+ * extent size and exit the loop.
+ *
+ * If we're using the offset index then we need to keep going
+ * through the rest of the tree.
+ */
+ if (entry->bytes < *bytes) {
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
+ if (use_bytes_index)
+ break;
+ continue;
+ }
+
+ /* make sure the space returned is big enough
+ * to match our requested alignment
+ */
+ if (*bytes >= align) {
+ tmp = entry->offset - ctl->start + align - 1;
+ tmp = div64_u64(tmp, align);
+ tmp = tmp * align + ctl->start;
+ align_off = tmp - entry->offset;
+ } else {
+ align_off = 0;
+ tmp = entry->offset;
+ }
+
+ /*
+ * We don't break here if we're using the bytes index because we
+ * may have another entry that has the correct alignment that is
+ * the right size, so we don't want to miss that possibility.
+ * At worst this adds another loop through the logic, but if we
+ * broke here we could prematurely ENOSPC.
+ */
+ if (entry->bytes < *bytes + align_off) {
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
+ continue;
+ }
+
+ if (entry->bitmap) {
+ struct rb_node *old_next = rb_next(node);
+ u64 size = *bytes;
+
+ ret = search_bitmap(ctl, entry, &tmp, &size, true);
+ if (!ret) {
+ *offset = tmp;
+ *bytes = size;
+ return entry;
+ } else {
+ *max_extent_size =
+ max(get_max_extent_size(entry),
+ *max_extent_size);
+ }
+
+ /*
+ * The bitmap may have gotten re-arranged in the space
+ * index here because the max_extent_size may have been
+ * updated. Start from the beginning again if this
+ * happened.
+ */
+ if (use_bytes_index && old_next != rb_next(node))
+ goto again;
+ continue;
+ }
+
+ *offset = tmp;
+ *bytes = entry->bytes - align_off;
+ return entry;
+ }
+out:
+ return NULL;
+}
+
+static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info, u64 offset)
+{
+ info->offset = offset_to_bitmap(ctl, offset);
+ info->bytes = 0;
+ info->bitmap_extents = 0;
+ INIT_LIST_HEAD(&info->list);
+ link_free_space(ctl, info);
+ ctl->total_bitmaps++;
+ recalculate_thresholds(ctl);
+}
+
+static void free_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *bitmap_info)
+{
+ /*
+ * Normally when this is called, the bitmap is completely empty. However,
+ * if we are blowing up the free space cache for one reason or another
+ * via __btrfs_remove_free_space_cache(), then it may not be freed and
+ * we may leave stats on the table.
+ */
+ if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) {
+ ctl->discardable_extents[BTRFS_STAT_CURR] -=
+ bitmap_info->bitmap_extents;
+ ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes;
+
+ }
+ unlink_free_space(ctl, bitmap_info, true);
+ kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
+ kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
+ ctl->total_bitmaps--;
+ recalculate_thresholds(ctl);
+}
+
+static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *bitmap_info,
+ u64 *offset, u64 *bytes)
+{
+ u64 end;
+ u64 search_start, search_bytes;
+ int ret;
+
+again:
+ end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
+
+ /*
+ * We need to search for bits in this bitmap. We could only cover some
+ * of the extent in this bitmap thanks to how we add space, so we need
+ * to search for as much as it as we can and clear that amount, and then
+ * go searching for the next bit.
+ */
+ search_start = *offset;
+ search_bytes = ctl->unit;
+ search_bytes = min(search_bytes, end - search_start + 1);
+ ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
+ false);
+ if (ret < 0 || search_start != *offset)
+ return -EINVAL;
+
+ /* We may have found more bits than what we need */
+ search_bytes = min(search_bytes, *bytes);
+
+ /* Cannot clear past the end of the bitmap */
+ search_bytes = min(search_bytes, end - search_start + 1);
+
+ bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes, true);
+ *offset += search_bytes;
+ *bytes -= search_bytes;
+
+ if (*bytes) {
+ struct rb_node *next = rb_next(&bitmap_info->offset_index);
+ if (!bitmap_info->bytes)
+ free_bitmap(ctl, bitmap_info);
+
+ /*
+ * no entry after this bitmap, but we still have bytes to
+ * remove, so something has gone wrong.
+ */
+ if (!next)
+ return -EINVAL;
+
+ bitmap_info = rb_entry(next, struct btrfs_free_space,
+ offset_index);
+
+ /*
+ * if the next entry isn't a bitmap we need to return to let the
+ * extent stuff do its work.
+ */
+ if (!bitmap_info->bitmap)
+ return -EAGAIN;
+
+ /*
+ * Ok the next item is a bitmap, but it may not actually hold
+ * the information for the rest of this free space stuff, so
+ * look for it, and if we don't find it return so we can try
+ * everything over again.
+ */
+ search_start = *offset;
+ search_bytes = ctl->unit;
+ ret = search_bitmap(ctl, bitmap_info, &search_start,
+ &search_bytes, false);
+ if (ret < 0 || search_start != *offset)
+ return -EAGAIN;
+
+ goto again;
+ } else if (!bitmap_info->bytes)
+ free_bitmap(ctl, bitmap_info);
+
+ return 0;
+}
+
+static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info, u64 offset,
+ u64 bytes, enum btrfs_trim_state trim_state)
+{
+ u64 bytes_to_set = 0;
+ u64 end;
+
+ /*
+ * This is a tradeoff to make bitmap trim state minimal. We mark the
+ * whole bitmap untrimmed if at any point we add untrimmed regions.
+ */
+ if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) {
+ if (btrfs_free_space_trimmed(info)) {
+ ctl->discardable_extents[BTRFS_STAT_CURR] +=
+ info->bitmap_extents;
+ ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
+ }
+ info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+ }
+
+ end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
+
+ bytes_to_set = min(end - offset, bytes);
+
+ bitmap_set_bits(ctl, info, offset, bytes_to_set);
+
+ return bytes_to_set;
+
+}
+
+static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info)
+{
+ struct btrfs_block_group *block_group = ctl->block_group;
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ bool forced = false;
+
+#ifdef CONFIG_BTRFS_DEBUG
+ if (btrfs_should_fragment_free_space(block_group))
+ forced = true;
+#endif
+
+ /* This is a way to reclaim large regions from the bitmaps. */
+ if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD)
+ return false;
+
+ /*
+ * If we are below the extents threshold then we can add this as an
+ * extent, and don't have to deal with the bitmap
+ */
+ if (!forced && ctl->free_extents < ctl->extents_thresh) {
+ /*
+ * If this block group has some small extents we don't want to
+ * use up all of our free slots in the cache with them, we want
+ * to reserve them to larger extents, however if we have plenty
+ * of cache left then go ahead an dadd them, no sense in adding
+ * the overhead of a bitmap if we don't have to.
+ */
+ if (info->bytes <= fs_info->sectorsize * 8) {
+ if (ctl->free_extents * 3 <= ctl->extents_thresh)
+ return false;
+ } else {
+ return false;
+ }
+ }
+
+ /*
+ * The original block groups from mkfs can be really small, like 8
+ * megabytes, so don't bother with a bitmap for those entries. However
+ * some block groups can be smaller than what a bitmap would cover but
+ * are still large enough that they could overflow the 32k memory limit,
+ * so allow those block groups to still be allowed to have a bitmap
+ * entry.
+ */
+ if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length)
+ return false;
+
+ return true;
+}
+
+static const struct btrfs_free_space_op free_space_op = {
+ .use_bitmap = use_bitmap,
+};
+
+static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info)
+{
+ struct btrfs_free_space *bitmap_info;
+ struct btrfs_block_group *block_group = NULL;
+ int added = 0;
+ u64 bytes, offset, bytes_added;
+ enum btrfs_trim_state trim_state;
+ int ret;
+
+ bytes = info->bytes;
+ offset = info->offset;
+ trim_state = info->trim_state;
+
+ if (!ctl->op->use_bitmap(ctl, info))
+ return 0;
+
+ if (ctl->op == &free_space_op)
+ block_group = ctl->block_group;
+again:
+ /*
+ * Since we link bitmaps right into the cluster we need to see if we
+ * have a cluster here, and if so and it has our bitmap we need to add
+ * the free space to that bitmap.
+ */
+ if (block_group && !list_empty(&block_group->cluster_list)) {
+ struct btrfs_free_cluster *cluster;
+ struct rb_node *node;
+ struct btrfs_free_space *entry;
+
+ cluster = list_entry(block_group->cluster_list.next,
+ struct btrfs_free_cluster,
+ block_group_list);
+ spin_lock(&cluster->lock);
+ node = rb_first(&cluster->root);
+ if (!node) {
+ spin_unlock(&cluster->lock);
+ goto no_cluster_bitmap;
+ }
+
+ entry = rb_entry(node, struct btrfs_free_space, offset_index);
+ if (!entry->bitmap) {
+ spin_unlock(&cluster->lock);
+ goto no_cluster_bitmap;
+ }
+
+ if (entry->offset == offset_to_bitmap(ctl, offset)) {
+ bytes_added = add_bytes_to_bitmap(ctl, entry, offset,
+ bytes, trim_state);
+ bytes -= bytes_added;
+ offset += bytes_added;
+ }
+ spin_unlock(&cluster->lock);
+ if (!bytes) {
+ ret = 1;
+ goto out;
+ }
+ }
+
+no_cluster_bitmap:
+ bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
+ 1, 0);
+ if (!bitmap_info) {
+ ASSERT(added == 0);
+ goto new_bitmap;
+ }
+
+ bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
+ trim_state);
+ bytes -= bytes_added;
+ offset += bytes_added;
+ added = 0;
+
+ if (!bytes) {
+ ret = 1;
+ goto out;
+ } else
+ goto again;
+
+new_bitmap:
+ if (info && info->bitmap) {
+ add_new_bitmap(ctl, info, offset);
+ added = 1;
+ info = NULL;
+ goto again;
+ } else {
+ spin_unlock(&ctl->tree_lock);
+
+ /* no pre-allocated info, allocate a new one */
+ if (!info) {
+ info = kmem_cache_zalloc(btrfs_free_space_cachep,
+ GFP_NOFS);
+ if (!info) {
+ spin_lock(&ctl->tree_lock);
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ /* allocate the bitmap */
+ info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep,
+ GFP_NOFS);
+ info->trim_state = BTRFS_TRIM_STATE_TRIMMED;
+ spin_lock(&ctl->tree_lock);
+ if (!info->bitmap) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ goto again;
+ }
+
+out:
+ if (info) {
+ if (info->bitmap)
+ kmem_cache_free(btrfs_free_space_bitmap_cachep,
+ info->bitmap);
+ kmem_cache_free(btrfs_free_space_cachep, info);
+ }
+
+ return ret;
+}
+
+/*
+ * Free space merging rules:
+ * 1) Merge trimmed areas together
+ * 2) Let untrimmed areas coalesce with trimmed areas
+ * 3) Always pull neighboring regions from bitmaps
+ *
+ * The above rules are for when we merge free space based on btrfs_trim_state.
+ * Rules 2 and 3 are subtle because they are suboptimal, but are done for the
+ * same reason: to promote larger extent regions which makes life easier for
+ * find_free_extent(). Rule 2 enables coalescing based on the common path
+ * being returning free space from btrfs_finish_extent_commit(). So when free
+ * space is trimmed, it will prevent aggregating trimmed new region and
+ * untrimmed regions in the rb_tree. Rule 3 is purely to obtain larger extents
+ * and provide find_free_extent() with the largest extents possible hoping for
+ * the reuse path.
+ */
+static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info, bool update_stat)
+{
+ struct btrfs_free_space *left_info = NULL;
+ struct btrfs_free_space *right_info;
+ bool merged = false;
+ u64 offset = info->offset;
+ u64 bytes = info->bytes;
+ const bool is_trimmed = btrfs_free_space_trimmed(info);
+
+ /*
+ * first we want to see if there is free space adjacent to the range we
+ * are adding, if there is remove that struct and add a new one to
+ * cover the entire range
+ */
+ right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
+ if (right_info && rb_prev(&right_info->offset_index))
+ left_info = rb_entry(rb_prev(&right_info->offset_index),
+ struct btrfs_free_space, offset_index);
+ else if (!right_info)
+ left_info = tree_search_offset(ctl, offset - 1, 0, 0);
+
+ /* See try_merge_free_space() comment. */
+ if (right_info && !right_info->bitmap &&
+ (!is_trimmed || btrfs_free_space_trimmed(right_info))) {
+ unlink_free_space(ctl, right_info, update_stat);
+ info->bytes += right_info->bytes;
+ kmem_cache_free(btrfs_free_space_cachep, right_info);
+ merged = true;
+ }
+
+ /* See try_merge_free_space() comment. */
+ if (left_info && !left_info->bitmap &&
+ left_info->offset + left_info->bytes == offset &&
+ (!is_trimmed || btrfs_free_space_trimmed(left_info))) {
+ unlink_free_space(ctl, left_info, update_stat);
+ info->offset = left_info->offset;
+ info->bytes += left_info->bytes;
+ kmem_cache_free(btrfs_free_space_cachep, left_info);
+ merged = true;
+ }
+
+ return merged;
+}
+
+static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info,
+ bool update_stat)
+{
+ struct btrfs_free_space *bitmap;
+ unsigned long i;
+ unsigned long j;
+ const u64 end = info->offset + info->bytes;
+ const u64 bitmap_offset = offset_to_bitmap(ctl, end);
+ u64 bytes;
+
+ bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
+ if (!bitmap)
+ return false;
+
+ i = offset_to_bit(bitmap->offset, ctl->unit, end);
+ j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
+ if (j == i)
+ return false;
+ bytes = (j - i) * ctl->unit;
+ info->bytes += bytes;
+
+ /* See try_merge_free_space() comment. */
+ if (!btrfs_free_space_trimmed(bitmap))
+ info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+
+ bitmap_clear_bits(ctl, bitmap, end, bytes, update_stat);
+
+ if (!bitmap->bytes)
+ free_bitmap(ctl, bitmap);
+
+ return true;
+}
+
+static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info,
+ bool update_stat)
+{
+ struct btrfs_free_space *bitmap;
+ u64 bitmap_offset;
+ unsigned long i;
+ unsigned long j;
+ unsigned long prev_j;
+ u64 bytes;
+
+ bitmap_offset = offset_to_bitmap(ctl, info->offset);
+ /* If we're on a boundary, try the previous logical bitmap. */
+ if (bitmap_offset == info->offset) {
+ if (info->offset == 0)
+ return false;
+ bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
+ }
+
+ bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
+ if (!bitmap)
+ return false;
+
+ i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
+ j = 0;
+ prev_j = (unsigned long)-1;
+ for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
+ if (j > i)
+ break;
+ prev_j = j;
+ }
+ if (prev_j == i)
+ return false;
+
+ if (prev_j == (unsigned long)-1)
+ bytes = (i + 1) * ctl->unit;
+ else
+ bytes = (i - prev_j) * ctl->unit;
+
+ info->offset -= bytes;
+ info->bytes += bytes;
+
+ /* See try_merge_free_space() comment. */
+ if (!btrfs_free_space_trimmed(bitmap))
+ info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+
+ bitmap_clear_bits(ctl, bitmap, info->offset, bytes, update_stat);
+
+ if (!bitmap->bytes)
+ free_bitmap(ctl, bitmap);
+
+ return true;
+}
+
+/*
+ * We prefer always to allocate from extent entries, both for clustered and
+ * non-clustered allocation requests. So when attempting to add a new extent
+ * entry, try to see if there's adjacent free space in bitmap entries, and if
+ * there is, migrate that space from the bitmaps to the extent.
+ * Like this we get better chances of satisfying space allocation requests
+ * because we attempt to satisfy them based on a single cache entry, and never
+ * on 2 or more entries - even if the entries represent a contiguous free space
+ * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
+ * ends).
+ */
+static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info,
+ bool update_stat)
+{
+ /*
+ * Only work with disconnected entries, as we can change their offset,
+ * and must be extent entries.
+ */
+ ASSERT(!info->bitmap);
+ ASSERT(RB_EMPTY_NODE(&info->offset_index));
+
+ if (ctl->total_bitmaps > 0) {
+ bool stole_end;
+ bool stole_front = false;
+
+ stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
+ if (ctl->total_bitmaps > 0)
+ stole_front = steal_from_bitmap_to_front(ctl, info,
+ update_stat);
+
+ if (stole_end || stole_front)
+ try_merge_free_space(ctl, info, update_stat);
+ }
+}
+
+int __btrfs_add_free_space(struct btrfs_block_group *block_group,
+ u64 offset, u64 bytes,
+ enum btrfs_trim_state trim_state)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *info;
+ int ret = 0;
+ u64 filter_bytes = bytes;
+
+ ASSERT(!btrfs_is_zoned(fs_info));
+
+ info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
+ if (!info)
+ return -ENOMEM;
+
+ info->offset = offset;
+ info->bytes = bytes;
+ info->trim_state = trim_state;
+ RB_CLEAR_NODE(&info->offset_index);
+ RB_CLEAR_NODE(&info->bytes_index);
+
+ spin_lock(&ctl->tree_lock);
+
+ if (try_merge_free_space(ctl, info, true))
+ goto link;
+
+ /*
+ * There was no extent directly to the left or right of this new
+ * extent then we know we're going to have to allocate a new extent, so
+ * before we do that see if we need to drop this into a bitmap
+ */
+ ret = insert_into_bitmap(ctl, info);
+ if (ret < 0) {
+ goto out;
+ } else if (ret) {
+ ret = 0;
+ goto out;
+ }
+link:
+ /*
+ * Only steal free space from adjacent bitmaps if we're sure we're not
+ * going to add the new free space to existing bitmap entries - because
+ * that would mean unnecessary work that would be reverted. Therefore
+ * attempt to steal space from bitmaps if we're adding an extent entry.
+ */
+ steal_from_bitmap(ctl, info, true);
+
+ filter_bytes = max(filter_bytes, info->bytes);
+
+ ret = link_free_space(ctl, info);
+ if (ret)
+ kmem_cache_free(btrfs_free_space_cachep, info);
+out:
+ btrfs_discard_update_discardable(block_group);
+ spin_unlock(&ctl->tree_lock);
+
+ if (ret) {
+ btrfs_crit(fs_info, "unable to add free space :%d", ret);
+ ASSERT(ret != -EEXIST);
+ }
+
+ if (trim_state != BTRFS_TRIM_STATE_TRIMMED) {
+ btrfs_discard_check_filter(block_group, filter_bytes);
+ btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
+ }
+
+ return ret;
+}
+
+static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
+ u64 bytenr, u64 size, bool used)
+{
+ struct btrfs_space_info *sinfo = block_group->space_info;
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ u64 offset = bytenr - block_group->start;
+ u64 to_free, to_unusable;
+ int bg_reclaim_threshold = 0;
+ bool initial = (size == block_group->length);
+ u64 reclaimable_unusable;
+
+ WARN_ON(!initial && offset + size > block_group->zone_capacity);
+
+ if (!initial)
+ bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold);
+
+ spin_lock(&ctl->tree_lock);
+ if (!used)
+ to_free = size;
+ else if (initial)
+ to_free = block_group->zone_capacity;
+ else if (offset >= block_group->alloc_offset)
+ to_free = size;
+ else if (offset + size <= block_group->alloc_offset)
+ to_free = 0;
+ else
+ to_free = offset + size - block_group->alloc_offset;
+ to_unusable = size - to_free;
+
+ ctl->free_space += to_free;
+ /*
+ * If the block group is read-only, we should account freed space into
+ * bytes_readonly.
+ */
+ if (!block_group->ro)
+ block_group->zone_unusable += to_unusable;
+ spin_unlock(&ctl->tree_lock);
+ if (!used) {
+ spin_lock(&block_group->lock);
+ block_group->alloc_offset -= size;
+ spin_unlock(&block_group->lock);
+ }
+
+ reclaimable_unusable = block_group->zone_unusable -
+ (block_group->length - block_group->zone_capacity);
+ /* All the region is now unusable. Mark it as unused and reclaim */
+ if (block_group->zone_unusable == block_group->length) {
+ btrfs_mark_bg_unused(block_group);
+ } else if (bg_reclaim_threshold &&
+ reclaimable_unusable >=
+ div_factor_fine(block_group->zone_capacity,
+ bg_reclaim_threshold)) {
+ btrfs_mark_bg_to_reclaim(block_group);
+ }
+
+ return 0;
+}
+
+int btrfs_add_free_space(struct btrfs_block_group *block_group,
+ u64 bytenr, u64 size)
+{
+ enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+
+ if (btrfs_is_zoned(block_group->fs_info))
+ return __btrfs_add_free_space_zoned(block_group, bytenr, size,
+ true);
+
+ if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC))
+ trim_state = BTRFS_TRIM_STATE_TRIMMED;
+
+ return __btrfs_add_free_space(block_group, bytenr, size, trim_state);
+}
+
+int btrfs_add_free_space_unused(struct btrfs_block_group *block_group,
+ u64 bytenr, u64 size)
+{
+ if (btrfs_is_zoned(block_group->fs_info))
+ return __btrfs_add_free_space_zoned(block_group, bytenr, size,
+ false);
+
+ return btrfs_add_free_space(block_group, bytenr, size);
+}
+
+/*
+ * This is a subtle distinction because when adding free space back in general,
+ * we want it to be added as untrimmed for async. But in the case where we add
+ * it on loading of a block group, we want to consider it trimmed.
+ */
+int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
+ u64 bytenr, u64 size)
+{
+ enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+
+ if (btrfs_is_zoned(block_group->fs_info))
+ return __btrfs_add_free_space_zoned(block_group, bytenr, size,
+ true);
+
+ if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) ||
+ btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
+ trim_state = BTRFS_TRIM_STATE_TRIMMED;
+
+ return __btrfs_add_free_space(block_group, bytenr, size, trim_state);
+}
+
+int btrfs_remove_free_space(struct btrfs_block_group *block_group,
+ u64 offset, u64 bytes)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *info;
+ int ret;
+ bool re_search = false;
+
+ if (btrfs_is_zoned(block_group->fs_info)) {
+ /*
+ * This can happen with conventional zones when replaying log.
+ * Since the allocation info of tree-log nodes are not recorded
+ * to the extent-tree, calculate_alloc_pointer() failed to
+ * advance the allocation pointer after last allocated tree log
+ * node blocks.
+ *
+ * This function is called from
+ * btrfs_pin_extent_for_log_replay() when replaying the log.
+ * Advance the pointer not to overwrite the tree-log nodes.
+ */
+ if (block_group->start + block_group->alloc_offset <
+ offset + bytes) {
+ block_group->alloc_offset =
+ offset + bytes - block_group->start;
+ }
+ return 0;
+ }
+
+ spin_lock(&ctl->tree_lock);
+
+again:
+ ret = 0;
+ if (!bytes)
+ goto out_lock;
+
+ info = tree_search_offset(ctl, offset, 0, 0);
+ if (!info) {
+ /*
+ * oops didn't find an extent that matched the space we wanted
+ * to remove, look for a bitmap instead
+ */
+ info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
+ 1, 0);
+ if (!info) {
+ /*
+ * If we found a partial bit of our free space in a
+ * bitmap but then couldn't find the other part this may
+ * be a problem, so WARN about it.
+ */
+ WARN_ON(re_search);
+ goto out_lock;
+ }
+ }
+
+ re_search = false;
+ if (!info->bitmap) {
+ unlink_free_space(ctl, info, true);
+ if (offset == info->offset) {
+ u64 to_free = min(bytes, info->bytes);
+
+ info->bytes -= to_free;
+ info->offset += to_free;
+ if (info->bytes) {
+ ret = link_free_space(ctl, info);
+ WARN_ON(ret);
+ } else {
+ kmem_cache_free(btrfs_free_space_cachep, info);
+ }
+
+ offset += to_free;
+ bytes -= to_free;
+ goto again;
+ } else {
+ u64 old_end = info->bytes + info->offset;
+
+ info->bytes = offset - info->offset;
+ ret = link_free_space(ctl, info);
+ WARN_ON(ret);
+ if (ret)
+ goto out_lock;
+
+ /* Not enough bytes in this entry to satisfy us */
+ if (old_end < offset + bytes) {
+ bytes -= old_end - offset;
+ offset = old_end;
+ goto again;
+ } else if (old_end == offset + bytes) {
+ /* all done */
+ goto out_lock;
+ }
+ spin_unlock(&ctl->tree_lock);
+
+ ret = __btrfs_add_free_space(block_group,
+ offset + bytes,
+ old_end - (offset + bytes),
+ info->trim_state);
+ WARN_ON(ret);
+ goto out;
+ }
+ }
+
+ ret = remove_from_bitmap(ctl, info, &offset, &bytes);
+ if (ret == -EAGAIN) {
+ re_search = true;
+ goto again;
+ }
+out_lock:
+ btrfs_discard_update_discardable(block_group);
+ spin_unlock(&ctl->tree_lock);
+out:
+ return ret;
+}
+
+void btrfs_dump_free_space(struct btrfs_block_group *block_group,
+ u64 bytes)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *info;
+ struct rb_node *n;
+ int count = 0;
+
+ /*
+ * Zoned btrfs does not use free space tree and cluster. Just print
+ * out the free space after the allocation offset.
+ */
+ if (btrfs_is_zoned(fs_info)) {
+ btrfs_info(fs_info, "free space %llu active %d",
+ block_group->zone_capacity - block_group->alloc_offset,
+ test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ &block_group->runtime_flags));
+ return;
+ }
+
+ spin_lock(&ctl->tree_lock);
+ for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
+ info = rb_entry(n, struct btrfs_free_space, offset_index);
+ if (info->bytes >= bytes && !block_group->ro)
+ count++;
+ btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s",
+ info->offset, info->bytes,
+ (info->bitmap) ? "yes" : "no");
+ }
+ spin_unlock(&ctl->tree_lock);
+ btrfs_info(fs_info, "block group has cluster?: %s",
+ list_empty(&block_group->cluster_list) ? "no" : "yes");
+ btrfs_info(fs_info,
+ "%d blocks of free space at or bigger than bytes is", count);
+}
+
+void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
+ struct btrfs_free_space_ctl *ctl)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+
+ spin_lock_init(&ctl->tree_lock);
+ ctl->unit = fs_info->sectorsize;
+ ctl->start = block_group->start;
+ ctl->block_group = block_group;
+ ctl->op = &free_space_op;
+ ctl->free_space_bytes = RB_ROOT_CACHED;
+ INIT_LIST_HEAD(&ctl->trimming_ranges);
+ mutex_init(&ctl->cache_writeout_mutex);
+
+ /*
+ * we only want to have 32k of ram per block group for keeping
+ * track of free space, and if we pass 1/2 of that we want to
+ * start converting things over to using bitmaps
+ */
+ ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
+}
+
+/*
+ * for a given cluster, put all of its extents back into the free
+ * space cache. If the block group passed doesn't match the block group
+ * pointed to by the cluster, someone else raced in and freed the
+ * cluster already. In that case, we just return without changing anything
+ */
+static void __btrfs_return_cluster_to_free_space(
+ struct btrfs_block_group *block_group,
+ struct btrfs_free_cluster *cluster)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *entry;
+ struct rb_node *node;
+
+ spin_lock(&cluster->lock);
+ if (cluster->block_group != block_group) {
+ spin_unlock(&cluster->lock);
+ return;
+ }
+
+ cluster->block_group = NULL;
+ cluster->window_start = 0;
+ list_del_init(&cluster->block_group_list);
+
+ node = rb_first(&cluster->root);
+ while (node) {
+ bool bitmap;
+
+ entry = rb_entry(node, struct btrfs_free_space, offset_index);
+ node = rb_next(&entry->offset_index);
+ rb_erase(&entry->offset_index, &cluster->root);
+ RB_CLEAR_NODE(&entry->offset_index);
+
+ bitmap = (entry->bitmap != NULL);
+ if (!bitmap) {
+ /* Merging treats extents as if they were new */
+ if (!btrfs_free_space_trimmed(entry)) {
+ ctl->discardable_extents[BTRFS_STAT_CURR]--;
+ ctl->discardable_bytes[BTRFS_STAT_CURR] -=
+ entry->bytes;
+ }
+
+ try_merge_free_space(ctl, entry, false);
+ steal_from_bitmap(ctl, entry, false);
+
+ /* As we insert directly, update these statistics */
+ if (!btrfs_free_space_trimmed(entry)) {
+ ctl->discardable_extents[BTRFS_STAT_CURR]++;
+ ctl->discardable_bytes[BTRFS_STAT_CURR] +=
+ entry->bytes;
+ }
+ }
+ tree_insert_offset(&ctl->free_space_offset,
+ entry->offset, &entry->offset_index, bitmap);
+ rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes,
+ entry_less);
+ }
+ cluster->root = RB_ROOT;
+ spin_unlock(&cluster->lock);
+ btrfs_put_block_group(block_group);
+}
+
+void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_cluster *cluster;
+ struct list_head *head;
+
+ spin_lock(&ctl->tree_lock);
+ while ((head = block_group->cluster_list.next) !=
+ &block_group->cluster_list) {
+ cluster = list_entry(head, struct btrfs_free_cluster,
+ block_group_list);
+
+ WARN_ON(cluster->block_group != block_group);
+ __btrfs_return_cluster_to_free_space(block_group, cluster);
+
+ cond_resched_lock(&ctl->tree_lock);
+ }
+ __btrfs_remove_free_space_cache(ctl);
+ btrfs_discard_update_discardable(block_group);
+ spin_unlock(&ctl->tree_lock);
+
+}
+
+/**
+ * btrfs_is_free_space_trimmed - see if everything is trimmed
+ * @block_group: block_group of interest
+ *
+ * Walk @block_group's free space rb_tree to determine if everything is trimmed.
+ */
+bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *info;
+ struct rb_node *node;
+ bool ret = true;
+
+ spin_lock(&ctl->tree_lock);
+ node = rb_first(&ctl->free_space_offset);
+
+ while (node) {
+ info = rb_entry(node, struct btrfs_free_space, offset_index);
+
+ if (!btrfs_free_space_trimmed(info)) {
+ ret = false;
+ break;
+ }
+
+ node = rb_next(node);
+ }
+
+ spin_unlock(&ctl->tree_lock);
+ return ret;
+}
+
+u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
+ u64 offset, u64 bytes, u64 empty_size,
+ u64 *max_extent_size)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_discard_ctl *discard_ctl =
+ &block_group->fs_info->discard_ctl;
+ struct btrfs_free_space *entry = NULL;
+ u64 bytes_search = bytes + empty_size;
+ u64 ret = 0;
+ u64 align_gap = 0;
+ u64 align_gap_len = 0;
+ enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+ bool use_bytes_index = (offset == block_group->start);
+
+ ASSERT(!btrfs_is_zoned(block_group->fs_info));
+
+ spin_lock(&ctl->tree_lock);
+ entry = find_free_space(ctl, &offset, &bytes_search,
+ block_group->full_stripe_len, max_extent_size,
+ use_bytes_index);
+ if (!entry)
+ goto out;
+
+ ret = offset;
+ if (entry->bitmap) {
+ bitmap_clear_bits(ctl, entry, offset, bytes, true);
+
+ if (!btrfs_free_space_trimmed(entry))
+ atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
+
+ if (!entry->bytes)
+ free_bitmap(ctl, entry);
+ } else {
+ unlink_free_space(ctl, entry, true);
+ align_gap_len = offset - entry->offset;
+ align_gap = entry->offset;
+ align_gap_trim_state = entry->trim_state;
+
+ if (!btrfs_free_space_trimmed(entry))
+ atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
+
+ entry->offset = offset + bytes;
+ WARN_ON(entry->bytes < bytes + align_gap_len);
+
+ entry->bytes -= bytes + align_gap_len;
+ if (!entry->bytes)
+ kmem_cache_free(btrfs_free_space_cachep, entry);
+ else
+ link_free_space(ctl, entry);
+ }
+out:
+ btrfs_discard_update_discardable(block_group);
+ spin_unlock(&ctl->tree_lock);
+
+ if (align_gap_len)
+ __btrfs_add_free_space(block_group, align_gap, align_gap_len,
+ align_gap_trim_state);
+ return ret;
+}
+
+/*
+ * given a cluster, put all of its extents back into the free space
+ * cache. If a block group is passed, this function will only free
+ * a cluster that belongs to the passed block group.
+ *
+ * Otherwise, it'll get a reference on the block group pointed to by the
+ * cluster and remove the cluster from it.
+ */
+void btrfs_return_cluster_to_free_space(
+ struct btrfs_block_group *block_group,
+ struct btrfs_free_cluster *cluster)
+{
+ struct btrfs_free_space_ctl *ctl;
+
+ /* first, get a safe pointer to the block group */
+ spin_lock(&cluster->lock);
+ if (!block_group) {
+ block_group = cluster->block_group;
+ if (!block_group) {
+ spin_unlock(&cluster->lock);
+ return;
+ }
+ } else if (cluster->block_group != block_group) {
+ /* someone else has already freed it don't redo their work */
+ spin_unlock(&cluster->lock);
+ return;
+ }
+ btrfs_get_block_group(block_group);
+ spin_unlock(&cluster->lock);
+
+ ctl = block_group->free_space_ctl;
+
+ /* now return any extents the cluster had on it */
+ spin_lock(&ctl->tree_lock);
+ __btrfs_return_cluster_to_free_space(block_group, cluster);
+ spin_unlock(&ctl->tree_lock);
+
+ btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group);
+
+ /* finally drop our ref */
+ btrfs_put_block_group(block_group);
+}
+
+static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
+ struct btrfs_free_cluster *cluster,
+ struct btrfs_free_space *entry,
+ u64 bytes, u64 min_start,
+ u64 *max_extent_size)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ int err;
+ u64 search_start = cluster->window_start;
+ u64 search_bytes = bytes;
+ u64 ret = 0;
+
+ search_start = min_start;
+ search_bytes = bytes;
+
+ err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
+ if (err) {
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
+ return 0;
+ }
+
+ ret = search_start;
+ bitmap_clear_bits(ctl, entry, ret, bytes, false);
+
+ return ret;
+}
+
+/*
+ * given a cluster, try to allocate 'bytes' from it, returns 0
+ * if it couldn't find anything suitably large, or a logical disk offset
+ * if things worked out
+ */
+u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
+ struct btrfs_free_cluster *cluster, u64 bytes,
+ u64 min_start, u64 *max_extent_size)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_discard_ctl *discard_ctl =
+ &block_group->fs_info->discard_ctl;
+ struct btrfs_free_space *entry = NULL;
+ struct rb_node *node;
+ u64 ret = 0;
+
+ ASSERT(!btrfs_is_zoned(block_group->fs_info));
+
+ spin_lock(&cluster->lock);
+ if (bytes > cluster->max_size)
+ goto out;
+
+ if (cluster->block_group != block_group)
+ goto out;
+
+ node = rb_first(&cluster->root);
+ if (!node)
+ goto out;
+
+ entry = rb_entry(node, struct btrfs_free_space, offset_index);
+ while (1) {
+ if (entry->bytes < bytes)
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
+
+ if (entry->bytes < bytes ||
+ (!entry->bitmap && entry->offset < min_start)) {
+ node = rb_next(&entry->offset_index);
+ if (!node)
+ break;
+ entry = rb_entry(node, struct btrfs_free_space,
+ offset_index);
+ continue;
+ }
+
+ if (entry->bitmap) {
+ ret = btrfs_alloc_from_bitmap(block_group,
+ cluster, entry, bytes,
+ cluster->window_start,
+ max_extent_size);
+ if (ret == 0) {
+ node = rb_next(&entry->offset_index);
+ if (!node)
+ break;
+ entry = rb_entry(node, struct btrfs_free_space,
+ offset_index);
+ continue;
+ }
+ cluster->window_start += bytes;
+ } else {
+ ret = entry->offset;
+
+ entry->offset += bytes;
+ entry->bytes -= bytes;
+ }
+
+ break;
+ }
+out:
+ spin_unlock(&cluster->lock);
+
+ if (!ret)
+ return 0;
+
+ spin_lock(&ctl->tree_lock);
+
+ if (!btrfs_free_space_trimmed(entry))
+ atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
+
+ ctl->free_space -= bytes;
+ if (!entry->bitmap && !btrfs_free_space_trimmed(entry))
+ ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
+
+ spin_lock(&cluster->lock);
+ if (entry->bytes == 0) {
+ rb_erase(&entry->offset_index, &cluster->root);
+ ctl->free_extents--;
+ if (entry->bitmap) {
+ kmem_cache_free(btrfs_free_space_bitmap_cachep,
+ entry->bitmap);
+ ctl->total_bitmaps--;
+ recalculate_thresholds(ctl);
+ } else if (!btrfs_free_space_trimmed(entry)) {
+ ctl->discardable_extents[BTRFS_STAT_CURR]--;
+ }
+ kmem_cache_free(btrfs_free_space_cachep, entry);
+ }
+
+ spin_unlock(&cluster->lock);
+ spin_unlock(&ctl->tree_lock);
+
+ return ret;
+}
+
+static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group,
+ struct btrfs_free_space *entry,
+ struct btrfs_free_cluster *cluster,
+ u64 offset, u64 bytes,
+ u64 cont1_bytes, u64 min_bytes)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ unsigned long next_zero;
+ unsigned long i;
+ unsigned long want_bits;
+ unsigned long min_bits;
+ unsigned long found_bits;
+ unsigned long max_bits = 0;
+ unsigned long start = 0;
+ unsigned long total_found = 0;
+ int ret;
+
+ i = offset_to_bit(entry->offset, ctl->unit,
+ max_t(u64, offset, entry->offset));
+ want_bits = bytes_to_bits(bytes, ctl->unit);
+ min_bits = bytes_to_bits(min_bytes, ctl->unit);
+
+ /*
+ * Don't bother looking for a cluster in this bitmap if it's heavily
+ * fragmented.
+ */
+ if (entry->max_extent_size &&
+ entry->max_extent_size < cont1_bytes)
+ return -ENOSPC;
+again:
+ found_bits = 0;
+ for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
+ next_zero = find_next_zero_bit(entry->bitmap,
+ BITS_PER_BITMAP, i);
+ if (next_zero - i >= min_bits) {
+ found_bits = next_zero - i;
+ if (found_bits > max_bits)
+ max_bits = found_bits;
+ break;
+ }
+ if (next_zero - i > max_bits)
+ max_bits = next_zero - i;
+ i = next_zero;
+ }
+
+ if (!found_bits) {
+ entry->max_extent_size = (u64)max_bits * ctl->unit;
+ return -ENOSPC;
+ }
+
+ if (!total_found) {
+ start = i;
+ cluster->max_size = 0;
+ }
+
+ total_found += found_bits;
+
+ if (cluster->max_size < found_bits * ctl->unit)
+ cluster->max_size = found_bits * ctl->unit;
+
+ if (total_found < want_bits || cluster->max_size < cont1_bytes) {
+ i = next_zero + 1;
+ goto again;
+ }
+
+ cluster->window_start = start * ctl->unit + entry->offset;
+ rb_erase(&entry->offset_index, &ctl->free_space_offset);
+ rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes);
+
+ /*
+ * We need to know if we're currently on the normal space index when we
+ * manipulate the bitmap so that we know we need to remove and re-insert
+ * it into the space_index tree. Clear the bytes_index node here so the
+ * bitmap manipulation helpers know not to mess with the space_index
+ * until this bitmap entry is added back into the normal cache.
+ */
+ RB_CLEAR_NODE(&entry->bytes_index);
+
+ ret = tree_insert_offset(&cluster->root, entry->offset,
+ &entry->offset_index, 1);
+ ASSERT(!ret); /* -EEXIST; Logic error */
+
+ trace_btrfs_setup_cluster(block_group, cluster,
+ total_found * ctl->unit, 1);
+ return 0;
+}
+
+/*
+ * This searches the block group for just extents to fill the cluster with.
+ * Try to find a cluster with at least bytes total bytes, at least one
+ * extent of cont1_bytes, and other clusters of at least min_bytes.
+ */
+static noinline int
+setup_cluster_no_bitmap(struct btrfs_block_group *block_group,
+ struct btrfs_free_cluster *cluster,
+ struct list_head *bitmaps, u64 offset, u64 bytes,
+ u64 cont1_bytes, u64 min_bytes)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *first = NULL;
+ struct btrfs_free_space *entry = NULL;
+ struct btrfs_free_space *last;
+ struct rb_node *node;
+ u64 window_free;
+ u64 max_extent;
+ u64 total_size = 0;
+
+ entry = tree_search_offset(ctl, offset, 0, 1);
+ if (!entry)
+ return -ENOSPC;
+
+ /*
+ * We don't want bitmaps, so just move along until we find a normal
+ * extent entry.
+ */
+ while (entry->bitmap || entry->bytes < min_bytes) {
+ if (entry->bitmap && list_empty(&entry->list))
+ list_add_tail(&entry->list, bitmaps);
+ node = rb_next(&entry->offset_index);
+ if (!node)
+ return -ENOSPC;
+ entry = rb_entry(node, struct btrfs_free_space, offset_index);
+ }
+
+ window_free = entry->bytes;
+ max_extent = entry->bytes;
+ first = entry;
+ last = entry;
+
+ for (node = rb_next(&entry->offset_index); node;
+ node = rb_next(&entry->offset_index)) {
+ entry = rb_entry(node, struct btrfs_free_space, offset_index);
+
+ if (entry->bitmap) {
+ if (list_empty(&entry->list))
+ list_add_tail(&entry->list, bitmaps);
+ continue;
+ }
+
+ if (entry->bytes < min_bytes)
+ continue;
+
+ last = entry;
+ window_free += entry->bytes;
+ if (entry->bytes > max_extent)
+ max_extent = entry->bytes;
+ }
+
+ if (window_free < bytes || max_extent < cont1_bytes)
+ return -ENOSPC;
+
+ cluster->window_start = first->offset;
+
+ node = &first->offset_index;
+
+ /*
+ * now we've found our entries, pull them out of the free space
+ * cache and put them into the cluster rbtree
+ */
+ do {
+ int ret;
+
+ entry = rb_entry(node, struct btrfs_free_space, offset_index);
+ node = rb_next(&entry->offset_index);
+ if (entry->bitmap || entry->bytes < min_bytes)
+ continue;
+
+ rb_erase(&entry->offset_index, &ctl->free_space_offset);
+ rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes);
+ ret = tree_insert_offset(&cluster->root, entry->offset,
+ &entry->offset_index, 0);
+ total_size += entry->bytes;
+ ASSERT(!ret); /* -EEXIST; Logic error */
+ } while (node && entry != last);
+
+ cluster->max_size = max_extent;
+ trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
+ return 0;
+}
+
+/*
+ * This specifically looks for bitmaps that may work in the cluster, we assume
+ * that we have already failed to find extents that will work.
+ */
+static noinline int
+setup_cluster_bitmap(struct btrfs_block_group *block_group,
+ struct btrfs_free_cluster *cluster,
+ struct list_head *bitmaps, u64 offset, u64 bytes,
+ u64 cont1_bytes, u64 min_bytes)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *entry = NULL;
+ int ret = -ENOSPC;
+ u64 bitmap_offset = offset_to_bitmap(ctl, offset);
+
+ if (ctl->total_bitmaps == 0)
+ return -ENOSPC;
+
+ /*
+ * The bitmap that covers offset won't be in the list unless offset
+ * is just its start offset.
+ */
+ if (!list_empty(bitmaps))
+ entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
+
+ if (!entry || entry->offset != bitmap_offset) {
+ entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
+ if (entry && list_empty(&entry->list))
+ list_add(&entry->list, bitmaps);
+ }
+
+ list_for_each_entry(entry, bitmaps, list) {
+ if (entry->bytes < bytes)
+ continue;
+ ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
+ bytes, cont1_bytes, min_bytes);
+ if (!ret)
+ return 0;
+ }
+
+ /*
+ * The bitmaps list has all the bitmaps that record free space
+ * starting after offset, so no more search is required.
+ */
+ return -ENOSPC;
+}
+
+/*
+ * here we try to find a cluster of blocks in a block group. The goal
+ * is to find at least bytes+empty_size.
+ * We might not find them all in one contiguous area.
+ *
+ * returns zero and sets up cluster if things worked out, otherwise
+ * it returns -enospc
+ */
+int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
+ struct btrfs_free_cluster *cluster,
+ u64 offset, u64 bytes, u64 empty_size)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *entry, *tmp;
+ LIST_HEAD(bitmaps);
+ u64 min_bytes;
+ u64 cont1_bytes;
+ int ret;
+
+ /*
+ * Choose the minimum extent size we'll require for this
+ * cluster. For SSD_SPREAD, don't allow any fragmentation.
+ * For metadata, allow allocates with smaller extents. For
+ * data, keep it dense.
+ */
+ if (btrfs_test_opt(fs_info, SSD_SPREAD)) {
+ cont1_bytes = bytes + empty_size;
+ min_bytes = cont1_bytes;
+ } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
+ cont1_bytes = bytes;
+ min_bytes = fs_info->sectorsize;
+ } else {
+ cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
+ min_bytes = fs_info->sectorsize;
+ }
+
+ spin_lock(&ctl->tree_lock);
+
+ /*
+ * If we know we don't have enough space to make a cluster don't even
+ * bother doing all the work to try and find one.
+ */
+ if (ctl->free_space < bytes) {
+ spin_unlock(&ctl->tree_lock);
+ return -ENOSPC;
+ }
+
+ spin_lock(&cluster->lock);
+
+ /* someone already found a cluster, hooray */
+ if (cluster->block_group) {
+ ret = 0;
+ goto out;
+ }
+
+ trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
+ min_bytes);
+
+ ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
+ bytes + empty_size,
+ cont1_bytes, min_bytes);
+ if (ret)
+ ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
+ offset, bytes + empty_size,
+ cont1_bytes, min_bytes);
+
+ /* Clear our temporary list */
+ list_for_each_entry_safe(entry, tmp, &bitmaps, list)
+ list_del_init(&entry->list);
+
+ if (!ret) {
+ btrfs_get_block_group(block_group);
+ list_add_tail(&cluster->block_group_list,
+ &block_group->cluster_list);
+ cluster->block_group = block_group;
+ } else {
+ trace_btrfs_failed_cluster_setup(block_group);
+ }
+out:
+ spin_unlock(&cluster->lock);
+ spin_unlock(&ctl->tree_lock);
+
+ return ret;
+}
+
+/*
+ * simple code to zero out a cluster
+ */
+void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
+{
+ spin_lock_init(&cluster->lock);
+ spin_lock_init(&cluster->refill_lock);
+ cluster->root = RB_ROOT;
+ cluster->max_size = 0;
+ cluster->fragmented = false;
+ INIT_LIST_HEAD(&cluster->block_group_list);
+ cluster->block_group = NULL;
+}
+
+static int do_trimming(struct btrfs_block_group *block_group,
+ u64 *total_trimmed, u64 start, u64 bytes,
+ u64 reserved_start, u64 reserved_bytes,
+ enum btrfs_trim_state reserved_trim_state,
+ struct btrfs_trim_range *trim_entry)
+{
+ struct btrfs_space_info *space_info = block_group->space_info;
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ int ret;
+ int update = 0;
+ const u64 end = start + bytes;
+ const u64 reserved_end = reserved_start + reserved_bytes;
+ enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+ u64 trimmed = 0;
+
+ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+ if (!block_group->ro) {
+ block_group->reserved += reserved_bytes;
+ space_info->bytes_reserved += reserved_bytes;
+ update = 1;
+ }
+ spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
+
+ ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed);
+ if (!ret) {
+ *total_trimmed += trimmed;
+ trim_state = BTRFS_TRIM_STATE_TRIMMED;
+ }
+
+ mutex_lock(&ctl->cache_writeout_mutex);
+ if (reserved_start < start)
+ __btrfs_add_free_space(block_group, reserved_start,
+ start - reserved_start,
+ reserved_trim_state);
+ if (start + bytes < reserved_start + reserved_bytes)
+ __btrfs_add_free_space(block_group, end, reserved_end - end,
+ reserved_trim_state);
+ __btrfs_add_free_space(block_group, start, bytes, trim_state);
+ list_del(&trim_entry->list);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+
+ if (update) {
+ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+ if (block_group->ro)
+ space_info->bytes_readonly += reserved_bytes;
+ block_group->reserved -= reserved_bytes;
+ space_info->bytes_reserved -= reserved_bytes;
+ spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
+ }
+
+ return ret;
+}
+
+/*
+ * If @async is set, then we will trim 1 region and return.
+ */
+static int trim_no_bitmap(struct btrfs_block_group *block_group,
+ u64 *total_trimmed, u64 start, u64 end, u64 minlen,
+ bool async)
+{
+ struct btrfs_discard_ctl *discard_ctl =
+ &block_group->fs_info->discard_ctl;
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *entry;
+ struct rb_node *node;
+ int ret = 0;
+ u64 extent_start;
+ u64 extent_bytes;
+ enum btrfs_trim_state extent_trim_state;
+ u64 bytes;
+ const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
+
+ while (start < end) {
+ struct btrfs_trim_range trim_entry;
+
+ mutex_lock(&ctl->cache_writeout_mutex);
+ spin_lock(&ctl->tree_lock);
+
+ if (ctl->free_space < minlen)
+ goto out_unlock;
+
+ entry = tree_search_offset(ctl, start, 0, 1);
+ if (!entry)
+ goto out_unlock;
+
+ /* Skip bitmaps and if async, already trimmed entries */
+ while (entry->bitmap ||
+ (async && btrfs_free_space_trimmed(entry))) {
+ node = rb_next(&entry->offset_index);
+ if (!node)
+ goto out_unlock;
+ entry = rb_entry(node, struct btrfs_free_space,
+ offset_index);
+ }
+
+ if (entry->offset >= end)
+ goto out_unlock;
+
+ extent_start = entry->offset;
+ extent_bytes = entry->bytes;
+ extent_trim_state = entry->trim_state;
+ if (async) {
+ start = entry->offset;
+ bytes = entry->bytes;
+ if (bytes < minlen) {
+ spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+ goto next;
+ }
+ unlink_free_space(ctl, entry, true);
+ /*
+ * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
+ * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim
+ * X when we come back around. So trim it now.
+ */
+ if (max_discard_size &&
+ bytes >= (max_discard_size +
+ BTRFS_ASYNC_DISCARD_MIN_FILTER)) {
+ bytes = max_discard_size;
+ extent_bytes = max_discard_size;
+ entry->offset += max_discard_size;
+ entry->bytes -= max_discard_size;
+ link_free_space(ctl, entry);
+ } else {
+ kmem_cache_free(btrfs_free_space_cachep, entry);
+ }
+ } else {
+ start = max(start, extent_start);
+ bytes = min(extent_start + extent_bytes, end) - start;
+ if (bytes < minlen) {
+ spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+ goto next;
+ }
+
+ unlink_free_space(ctl, entry, true);
+ kmem_cache_free(btrfs_free_space_cachep, entry);
+ }
+
+ spin_unlock(&ctl->tree_lock);
+ trim_entry.start = extent_start;
+ trim_entry.bytes = extent_bytes;
+ list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+
+ ret = do_trimming(block_group, total_trimmed, start, bytes,
+ extent_start, extent_bytes, extent_trim_state,
+ &trim_entry);
+ if (ret) {
+ block_group->discard_cursor = start + bytes;
+ break;
+ }
+next:
+ start += bytes;
+ block_group->discard_cursor = start;
+ if (async && *total_trimmed)
+ break;
+
+ if (fatal_signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+
+ cond_resched();
+ }
+
+ return ret;
+
+out_unlock:
+ block_group->discard_cursor = btrfs_block_group_end(block_group);
+ spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+
+ return ret;
+}
+
+/*
+ * If we break out of trimming a bitmap prematurely, we should reset the
+ * trimming bit. In a rather contrieved case, it's possible to race here so
+ * reset the state to BTRFS_TRIM_STATE_UNTRIMMED.
+ *
+ * start = start of bitmap
+ * end = near end of bitmap
+ *
+ * Thread 1: Thread 2:
+ * trim_bitmaps(start)
+ * trim_bitmaps(end)
+ * end_trimming_bitmap()
+ * reset_trimming_bitmap()
+ */
+static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset)
+{
+ struct btrfs_free_space *entry;
+
+ spin_lock(&ctl->tree_lock);
+ entry = tree_search_offset(ctl, offset, 1, 0);
+ if (entry) {
+ if (btrfs_free_space_trimmed(entry)) {
+ ctl->discardable_extents[BTRFS_STAT_CURR] +=
+ entry->bitmap_extents;
+ ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes;
+ }
+ entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+ }
+
+ spin_unlock(&ctl->tree_lock);
+}
+
+static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *entry)
+{
+ if (btrfs_free_space_trimming_bitmap(entry)) {
+ entry->trim_state = BTRFS_TRIM_STATE_TRIMMED;
+ ctl->discardable_extents[BTRFS_STAT_CURR] -=
+ entry->bitmap_extents;
+ ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes;
+ }
+}
+
+/*
+ * If @async is set, then we will trim 1 region and return.
+ */
+static int trim_bitmaps(struct btrfs_block_group *block_group,
+ u64 *total_trimmed, u64 start, u64 end, u64 minlen,
+ u64 maxlen, bool async)
+{
+ struct btrfs_discard_ctl *discard_ctl =
+ &block_group->fs_info->discard_ctl;
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *entry;
+ int ret = 0;
+ int ret2;
+ u64 bytes;
+ u64 offset = offset_to_bitmap(ctl, start);
+ const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
+
+ while (offset < end) {
+ bool next_bitmap = false;
+ struct btrfs_trim_range trim_entry;
+
+ mutex_lock(&ctl->cache_writeout_mutex);
+ spin_lock(&ctl->tree_lock);
+
+ if (ctl->free_space < minlen) {
+ block_group->discard_cursor =
+ btrfs_block_group_end(block_group);
+ spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+ break;
+ }
+
+ entry = tree_search_offset(ctl, offset, 1, 0);
+ /*
+ * Bitmaps are marked trimmed lossily now to prevent constant
+ * discarding of the same bitmap (the reason why we are bound
+ * by the filters). So, retrim the block group bitmaps when we
+ * are preparing to punt to the unused_bgs list. This uses
+ * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED
+ * which is the only discard index which sets minlen to 0.
+ */
+ if (!entry || (async && minlen && start == offset &&
+ btrfs_free_space_trimmed(entry))) {
+ spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+ next_bitmap = true;
+ goto next;
+ }
+
+ /*
+ * Async discard bitmap trimming begins at by setting the start
+ * to be key.objectid and the offset_to_bitmap() aligns to the
+ * start of the bitmap. This lets us know we are fully
+ * scanning the bitmap rather than only some portion of it.
+ */
+ if (start == offset)
+ entry->trim_state = BTRFS_TRIM_STATE_TRIMMING;
+
+ bytes = minlen;
+ ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
+ if (ret2 || start >= end) {
+ /*
+ * We lossily consider a bitmap trimmed if we only skip
+ * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER.
+ */
+ if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER)
+ end_trimming_bitmap(ctl, entry);
+ else
+ entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+ spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+ next_bitmap = true;
+ goto next;
+ }
+
+ /*
+ * We already trimmed a region, but are using the locking above
+ * to reset the trim_state.
+ */
+ if (async && *total_trimmed) {
+ spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+ goto out;
+ }
+
+ bytes = min(bytes, end - start);
+ if (bytes < minlen || (async && maxlen && bytes > maxlen)) {
+ spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+ goto next;
+ }
+
+ /*
+ * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
+ * If X < @minlen, we won't trim X when we come back around.
+ * So trim it now. We differ here from trimming extents as we
+ * don't keep individual state per bit.
+ */
+ if (async &&
+ max_discard_size &&
+ bytes > (max_discard_size + minlen))
+ bytes = max_discard_size;
+
+ bitmap_clear_bits(ctl, entry, start, bytes, true);
+ if (entry->bytes == 0)
+ free_bitmap(ctl, entry);
+
+ spin_unlock(&ctl->tree_lock);
+ trim_entry.start = start;
+ trim_entry.bytes = bytes;
+ list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+
+ ret = do_trimming(block_group, total_trimmed, start, bytes,
+ start, bytes, 0, &trim_entry);
+ if (ret) {
+ reset_trimming_bitmap(ctl, offset);
+ block_group->discard_cursor =
+ btrfs_block_group_end(block_group);
+ break;
+ }
+next:
+ if (next_bitmap) {
+ offset += BITS_PER_BITMAP * ctl->unit;
+ start = offset;
+ } else {
+ start += bytes;
+ }
+ block_group->discard_cursor = start;
+
+ if (fatal_signal_pending(current)) {
+ if (start != offset)
+ reset_trimming_bitmap(ctl, offset);
+ ret = -ERESTARTSYS;
+ break;
+ }
+
+ cond_resched();
+ }
+
+ if (offset >= end)
+ block_group->discard_cursor = end;
+
+out:
+ return ret;
+}
+
+int btrfs_trim_block_group(struct btrfs_block_group *block_group,
+ u64 *trimmed, u64 start, u64 end, u64 minlen)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ int ret;
+ u64 rem = 0;
+
+ ASSERT(!btrfs_is_zoned(block_group->fs_info));
+
+ *trimmed = 0;
+
+ spin_lock(&block_group->lock);
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
+ btrfs_freeze_block_group(block_group);
+ spin_unlock(&block_group->lock);
+
+ ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false);
+ if (ret)
+ goto out;
+
+ ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false);
+ div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem);
+ /* If we ended in the middle of a bitmap, reset the trimming flag */
+ if (rem)
+ reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
+out:
+ btrfs_unfreeze_block_group(block_group);
+ return ret;
+}
+
+int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
+ u64 *trimmed, u64 start, u64 end, u64 minlen,
+ bool async)
+{
+ int ret;
+
+ *trimmed = 0;
+
+ spin_lock(&block_group->lock);
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
+ btrfs_freeze_block_group(block_group);
+ spin_unlock(&block_group->lock);
+
+ ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async);
+ btrfs_unfreeze_block_group(block_group);
+
+ return ret;
+}
+
+int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
+ u64 *trimmed, u64 start, u64 end, u64 minlen,
+ u64 maxlen, bool async)
+{
+ int ret;
+
+ *trimmed = 0;
+
+ spin_lock(&block_group->lock);
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
+ btrfs_freeze_block_group(block_group);
+ spin_unlock(&block_group->lock);
+
+ ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen,
+ async);
+
+ btrfs_unfreeze_block_group(block_group);
+
+ return ret;
+}
+
+bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info)
+{
+ return btrfs_super_cache_generation(fs_info->super_copy);
+}
+
+static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans)
+{
+ struct btrfs_block_group *block_group;
+ struct rb_node *node;
+ int ret = 0;
+
+ btrfs_info(fs_info, "cleaning free space cache v1");
+
+ node = rb_first_cached(&fs_info->block_group_cache_tree);
+ while (node) {
+ block_group = rb_entry(node, struct btrfs_block_group, cache_node);
+ ret = btrfs_remove_free_space_inode(trans, NULL, block_group);
+ if (ret)
+ goto out;
+ node = rb_next(node);
+ }
+out:
+ return ret;
+}
+
+int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active)
+{
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ /*
+ * update_super_roots will appropriately set or unset
+ * super_copy->cache_generation based on SPACE_CACHE and
+ * BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a
+ * transaction commit whether we are enabling space cache v1 and don't
+ * have any other work to do, or are disabling it and removing free
+ * space inodes.
+ */
+ trans = btrfs_start_transaction(fs_info->tree_root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ if (!active) {
+ set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
+ ret = cleanup_free_space_cache_v1(fs_info, trans);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ goto out;
+ }
+ }
+
+ ret = btrfs_commit_transaction(trans);
+out:
+ clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
+
+ return ret;
+}
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+/*
+ * Use this if you need to make a bitmap or extent entry specifically, it
+ * doesn't do any of the merging that add_free_space does, this acts a lot like
+ * how the free space cache loading stuff works, so you can get really weird
+ * configurations.
+ */
+int test_add_free_space_entry(struct btrfs_block_group *cache,
+ u64 offset, u64 bytes, bool bitmap)
+{
+ struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
+ struct btrfs_free_space *info = NULL, *bitmap_info;
+ void *map = NULL;
+ enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED;
+ u64 bytes_added;
+ int ret;
+
+again:
+ if (!info) {
+ info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
+ if (!info)
+ return -ENOMEM;
+ }
+
+ if (!bitmap) {
+ spin_lock(&ctl->tree_lock);
+ info->offset = offset;
+ info->bytes = bytes;
+ info->max_extent_size = 0;
+ ret = link_free_space(ctl, info);
+ spin_unlock(&ctl->tree_lock);
+ if (ret)
+ kmem_cache_free(btrfs_free_space_cachep, info);
+ return ret;
+ }
+
+ if (!map) {
+ map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS);
+ if (!map) {
+ kmem_cache_free(btrfs_free_space_cachep, info);
+ return -ENOMEM;
+ }
+ }
+
+ spin_lock(&ctl->tree_lock);
+ bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
+ 1, 0);
+ if (!bitmap_info) {
+ info->bitmap = map;
+ map = NULL;
+ add_new_bitmap(ctl, info, offset);
+ bitmap_info = info;
+ info = NULL;
+ }
+
+ bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
+ trim_state);
+
+ bytes -= bytes_added;
+ offset += bytes_added;
+ spin_unlock(&ctl->tree_lock);
+
+ if (bytes)
+ goto again;
+
+ if (info)
+ kmem_cache_free(btrfs_free_space_cachep, info);
+ if (map)
+ kmem_cache_free(btrfs_free_space_bitmap_cachep, map);
+ return 0;
+}
+
+/*
+ * Checks to see if the given range is in the free space cache. This is really
+ * just used to check the absence of space, so if there is free space in the
+ * range at all we will return 1.
+ */
+int test_check_exists(struct btrfs_block_group *cache,
+ u64 offset, u64 bytes)
+{
+ struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
+ struct btrfs_free_space *info;
+ int ret = 0;
+
+ spin_lock(&ctl->tree_lock);
+ info = tree_search_offset(ctl, offset, 0, 0);
+ if (!info) {
+ info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
+ 1, 0);
+ if (!info)
+ goto out;
+ }
+
+have_info:
+ if (info->bitmap) {
+ u64 bit_off, bit_bytes;
+ struct rb_node *n;
+ struct btrfs_free_space *tmp;
+
+ bit_off = offset;
+ bit_bytes = ctl->unit;
+ ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
+ if (!ret) {
+ if (bit_off == offset) {
+ ret = 1;
+ goto out;
+ } else if (bit_off > offset &&
+ offset + bytes > bit_off) {
+ ret = 1;
+ goto out;
+ }
+ }
+
+ n = rb_prev(&info->offset_index);
+ while (n) {
+ tmp = rb_entry(n, struct btrfs_free_space,
+ offset_index);
+ if (tmp->offset + tmp->bytes < offset)
+ break;
+ if (offset + bytes < tmp->offset) {
+ n = rb_prev(&tmp->offset_index);
+ continue;
+ }
+ info = tmp;
+ goto have_info;
+ }
+
+ n = rb_next(&info->offset_index);
+ while (n) {
+ tmp = rb_entry(n, struct btrfs_free_space,
+ offset_index);
+ if (offset + bytes < tmp->offset)
+ break;
+ if (tmp->offset + tmp->bytes < offset) {
+ n = rb_next(&tmp->offset_index);
+ continue;
+ }
+ info = tmp;
+ goto have_info;
+ }
+
+ ret = 0;
+ goto out;
+ }
+
+ if (info->offset == offset) {
+ ret = 1;
+ goto out;
+ }
+
+ if (offset > info->offset && offset < info->offset + info->bytes)
+ ret = 1;
+out:
+ spin_unlock(&ctl->tree_lock);
+ return ret;
+}
+#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
new file mode 100644
index 000000000..6d419ba53
--- /dev/null
+++ b/fs/btrfs/free-space-cache.h
@@ -0,0 +1,151 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2009 Oracle. All rights reserved.
+ */
+
+#ifndef BTRFS_FREE_SPACE_CACHE_H
+#define BTRFS_FREE_SPACE_CACHE_H
+
+/*
+ * This is the trim state of an extent or bitmap.
+ *
+ * BTRFS_TRIM_STATE_TRIMMING is special and used to maintain the state of a
+ * bitmap as we may need several trims to fully trim a single bitmap entry.
+ * This is reset should any free space other than trimmed space be added to the
+ * bitmap.
+ */
+enum btrfs_trim_state {
+ BTRFS_TRIM_STATE_UNTRIMMED,
+ BTRFS_TRIM_STATE_TRIMMED,
+ BTRFS_TRIM_STATE_TRIMMING,
+};
+
+struct btrfs_free_space {
+ struct rb_node offset_index;
+ struct rb_node bytes_index;
+ u64 offset;
+ u64 bytes;
+ u64 max_extent_size;
+ unsigned long *bitmap;
+ struct list_head list;
+ enum btrfs_trim_state trim_state;
+ s32 bitmap_extents;
+};
+
+static inline bool btrfs_free_space_trimmed(struct btrfs_free_space *info)
+{
+ return (info->trim_state == BTRFS_TRIM_STATE_TRIMMED);
+}
+
+static inline bool btrfs_free_space_trimming_bitmap(
+ struct btrfs_free_space *info)
+{
+ return (info->trim_state == BTRFS_TRIM_STATE_TRIMMING);
+}
+
+struct btrfs_free_space_ctl {
+ spinlock_t tree_lock;
+ struct rb_root free_space_offset;
+ struct rb_root_cached free_space_bytes;
+ u64 free_space;
+ int extents_thresh;
+ int free_extents;
+ int total_bitmaps;
+ int unit;
+ u64 start;
+ s32 discardable_extents[BTRFS_STAT_NR_ENTRIES];
+ s64 discardable_bytes[BTRFS_STAT_NR_ENTRIES];
+ const struct btrfs_free_space_op *op;
+ struct btrfs_block_group *block_group;
+ struct mutex cache_writeout_mutex;
+ struct list_head trimming_ranges;
+};
+
+struct btrfs_free_space_op {
+ bool (*use_bitmap)(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info);
+};
+
+struct btrfs_io_ctl {
+ void *cur, *orig;
+ struct page *page;
+ struct page **pages;
+ struct btrfs_fs_info *fs_info;
+ struct inode *inode;
+ unsigned long size;
+ int index;
+ int num_pages;
+ int entries;
+ int bitmaps;
+};
+
+struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
+ struct btrfs_path *path);
+int create_free_space_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path);
+int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct btrfs_block_group *block_group);
+
+int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *rsv);
+int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct inode *inode);
+int load_free_space_cache(struct btrfs_block_group *block_group);
+int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path);
+int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path);
+
+void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
+ struct btrfs_free_space_ctl *ctl);
+int __btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr,
+ u64 size, enum btrfs_trim_state trim_state);
+int btrfs_add_free_space(struct btrfs_block_group *block_group,
+ u64 bytenr, u64 size);
+int btrfs_add_free_space_unused(struct btrfs_block_group *block_group,
+ u64 bytenr, u64 size);
+int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
+ u64 bytenr, u64 size);
+int btrfs_remove_free_space(struct btrfs_block_group *block_group,
+ u64 bytenr, u64 size);
+void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group);
+bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group);
+u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
+ u64 offset, u64 bytes, u64 empty_size,
+ u64 *max_extent_size);
+void btrfs_dump_free_space(struct btrfs_block_group *block_group,
+ u64 bytes);
+int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
+ struct btrfs_free_cluster *cluster,
+ u64 offset, u64 bytes, u64 empty_size);
+void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster);
+u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
+ struct btrfs_free_cluster *cluster, u64 bytes,
+ u64 min_start, u64 *max_extent_size);
+void btrfs_return_cluster_to_free_space(
+ struct btrfs_block_group *block_group,
+ struct btrfs_free_cluster *cluster);
+int btrfs_trim_block_group(struct btrfs_block_group *block_group,
+ u64 *trimmed, u64 start, u64 end, u64 minlen);
+int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
+ u64 *trimmed, u64 start, u64 end, u64 minlen,
+ bool async);
+int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
+ u64 *trimmed, u64 start, u64 end, u64 minlen,
+ u64 maxlen, bool async);
+
+bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info);
+int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active);
+/* Support functions for running our sanity tests */
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+int test_add_free_space_entry(struct btrfs_block_group *cache,
+ u64 offset, u64 bytes, bool bitmap);
+int test_check_exists(struct btrfs_block_group *cache, u64 offset, u64 bytes);
+#endif
+
+#endif
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
new file mode 100644
index 000000000..6a44733a9
--- /dev/null
+++ b/fs/btrfs/free-space-tree.c
@@ -0,0 +1,1658 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2015 Facebook. All rights reserved.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched/mm.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "locking.h"
+#include "free-space-tree.h"
+#include "transaction.h"
+#include "block-group.h"
+
+static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path);
+
+static struct btrfs_root *btrfs_free_space_root(
+ struct btrfs_block_group *block_group)
+{
+ struct btrfs_key key = {
+ .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID,
+ .type = BTRFS_ROOT_ITEM_KEY,
+ .offset = 0,
+ };
+
+ if (btrfs_fs_incompat(block_group->fs_info, EXTENT_TREE_V2))
+ key.offset = block_group->global_root_id;
+ return btrfs_global_root(block_group->fs_info, &key);
+}
+
+void set_free_space_tree_thresholds(struct btrfs_block_group *cache)
+{
+ u32 bitmap_range;
+ size_t bitmap_size;
+ u64 num_bitmaps, total_bitmap_size;
+
+ if (WARN_ON(cache->length == 0))
+ btrfs_warn(cache->fs_info, "block group %llu length is zero",
+ cache->start);
+
+ /*
+ * We convert to bitmaps when the disk space required for using extents
+ * exceeds that required for using bitmaps.
+ */
+ bitmap_range = cache->fs_info->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS;
+ num_bitmaps = div_u64(cache->length + bitmap_range - 1, bitmap_range);
+ bitmap_size = sizeof(struct btrfs_item) + BTRFS_FREE_SPACE_BITMAP_SIZE;
+ total_bitmap_size = num_bitmaps * bitmap_size;
+ cache->bitmap_high_thresh = div_u64(total_bitmap_size,
+ sizeof(struct btrfs_item));
+
+ /*
+ * We allow for a small buffer between the high threshold and low
+ * threshold to avoid thrashing back and forth between the two formats.
+ */
+ if (cache->bitmap_high_thresh > 100)
+ cache->bitmap_low_thresh = cache->bitmap_high_thresh - 100;
+ else
+ cache->bitmap_low_thresh = 0;
+}
+
+static int add_new_free_space_info(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path)
+{
+ struct btrfs_root *root = btrfs_free_space_root(block_group);
+ struct btrfs_free_space_info *info;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ int ret;
+
+ key.objectid = block_group->start;
+ key.type = BTRFS_FREE_SPACE_INFO_KEY;
+ key.offset = block_group->length;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*info));
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ info = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_free_space_info);
+ btrfs_set_free_space_extent_count(leaf, info, 0);
+ btrfs_set_free_space_flags(leaf, info, 0);
+ btrfs_mark_buffer_dirty(leaf);
+
+ ret = 0;
+out:
+ btrfs_release_path(path);
+ return ret;
+}
+
+EXPORT_FOR_TESTS
+struct btrfs_free_space_info *search_free_space_info(
+ struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path, int cow)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_root *root = btrfs_free_space_root(block_group);
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = block_group->start;
+ key.type = BTRFS_FREE_SPACE_INFO_KEY;
+ key.offset = block_group->length;
+
+ ret = btrfs_search_slot(trans, root, &key, path, 0, cow);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret != 0) {
+ btrfs_warn(fs_info, "missing free space info for %llu",
+ block_group->start);
+ ASSERT(0);
+ return ERR_PTR(-ENOENT);
+ }
+
+ return btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_free_space_info);
+}
+
+/*
+ * btrfs_search_slot() but we're looking for the greatest key less than the
+ * passed key.
+ */
+static int btrfs_search_prev_slot(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_key *key, struct btrfs_path *p,
+ int ins_len, int cow)
+{
+ int ret;
+
+ ret = btrfs_search_slot(trans, root, key, p, ins_len, cow);
+ if (ret < 0)
+ return ret;
+
+ if (ret == 0) {
+ ASSERT(0);
+ return -EIO;
+ }
+
+ if (p->slots[0] == 0) {
+ ASSERT(0);
+ return -EIO;
+ }
+ p->slots[0]--;
+
+ return 0;
+}
+
+static inline u32 free_space_bitmap_size(const struct btrfs_fs_info *fs_info,
+ u64 size)
+{
+ return DIV_ROUND_UP(size >> fs_info->sectorsize_bits, BITS_PER_BYTE);
+}
+
+static unsigned long *alloc_bitmap(u32 bitmap_size)
+{
+ unsigned long *ret;
+ unsigned int nofs_flag;
+ u32 bitmap_rounded_size = round_up(bitmap_size, sizeof(unsigned long));
+
+ /*
+ * GFP_NOFS doesn't work with kvmalloc(), but we really can't recurse
+ * into the filesystem as the free space bitmap can be modified in the
+ * critical section of a transaction commit.
+ *
+ * TODO: push the memalloc_nofs_{save,restore}() to the caller where we
+ * know that recursion is unsafe.
+ */
+ nofs_flag = memalloc_nofs_save();
+ ret = kvzalloc(bitmap_rounded_size, GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
+ return ret;
+}
+
+static void le_bitmap_set(unsigned long *map, unsigned int start, int len)
+{
+ u8 *p = ((u8 *)map) + BIT_BYTE(start);
+ const unsigned int size = start + len;
+ int bits_to_set = BITS_PER_BYTE - (start % BITS_PER_BYTE);
+ u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(start);
+
+ while (len - bits_to_set >= 0) {
+ *p |= mask_to_set;
+ len -= bits_to_set;
+ bits_to_set = BITS_PER_BYTE;
+ mask_to_set = ~0;
+ p++;
+ }
+ if (len) {
+ mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
+ *p |= mask_to_set;
+ }
+}
+
+EXPORT_FOR_TESTS
+int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *root = btrfs_free_space_root(block_group);
+ struct btrfs_free_space_info *info;
+ struct btrfs_key key, found_key;
+ struct extent_buffer *leaf;
+ unsigned long *bitmap;
+ char *bitmap_cursor;
+ u64 start, end;
+ u64 bitmap_range, i;
+ u32 bitmap_size, flags, expected_extent_count;
+ u32 extent_count = 0;
+ int done = 0, nr;
+ int ret;
+
+ bitmap_size = free_space_bitmap_size(fs_info, block_group->length);
+ bitmap = alloc_bitmap(bitmap_size);
+ if (!bitmap) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ start = block_group->start;
+ end = block_group->start + block_group->length;
+
+ key.objectid = end - 1;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ while (!done) {
+ ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ nr = 0;
+ path->slots[0]++;
+ while (path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1);
+
+ if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
+ ASSERT(found_key.objectid == block_group->start);
+ ASSERT(found_key.offset == block_group->length);
+ done = 1;
+ break;
+ } else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY) {
+ u64 first, last;
+
+ ASSERT(found_key.objectid >= start);
+ ASSERT(found_key.objectid < end);
+ ASSERT(found_key.objectid + found_key.offset <= end);
+
+ first = div_u64(found_key.objectid - start,
+ fs_info->sectorsize);
+ last = div_u64(found_key.objectid + found_key.offset - start,
+ fs_info->sectorsize);
+ le_bitmap_set(bitmap, first, last - first);
+
+ extent_count++;
+ nr++;
+ path->slots[0]--;
+ } else {
+ ASSERT(0);
+ }
+ }
+
+ ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+ }
+
+ info = search_free_space_info(trans, block_group, path, 1);
+ if (IS_ERR(info)) {
+ ret = PTR_ERR(info);
+ goto out;
+ }
+ leaf = path->nodes[0];
+ flags = btrfs_free_space_flags(leaf, info);
+ flags |= BTRFS_FREE_SPACE_USING_BITMAPS;
+ btrfs_set_free_space_flags(leaf, info, flags);
+ expected_extent_count = btrfs_free_space_extent_count(leaf, info);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
+ if (extent_count != expected_extent_count) {
+ btrfs_err(fs_info,
+ "incorrect extent count for %llu; counted %u, expected %u",
+ block_group->start, extent_count,
+ expected_extent_count);
+ ASSERT(0);
+ ret = -EIO;
+ goto out;
+ }
+
+ bitmap_cursor = (char *)bitmap;
+ bitmap_range = fs_info->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS;
+ i = start;
+ while (i < end) {
+ unsigned long ptr;
+ u64 extent_size;
+ u32 data_size;
+
+ extent_size = min(end - i, bitmap_range);
+ data_size = free_space_bitmap_size(fs_info, extent_size);
+
+ key.objectid = i;
+ key.type = BTRFS_FREE_SPACE_BITMAP_KEY;
+ key.offset = extent_size;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ data_size);
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ write_extent_buffer(leaf, bitmap_cursor, ptr,
+ data_size);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
+ i += extent_size;
+ bitmap_cursor += data_size;
+ }
+
+ ret = 0;
+out:
+ kvfree(bitmap);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+}
+
+EXPORT_FOR_TESTS
+int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *root = btrfs_free_space_root(block_group);
+ struct btrfs_free_space_info *info;
+ struct btrfs_key key, found_key;
+ struct extent_buffer *leaf;
+ unsigned long *bitmap;
+ u64 start, end;
+ u32 bitmap_size, flags, expected_extent_count;
+ unsigned long nrbits, start_bit, end_bit;
+ u32 extent_count = 0;
+ int done = 0, nr;
+ int ret;
+
+ bitmap_size = free_space_bitmap_size(fs_info, block_group->length);
+ bitmap = alloc_bitmap(bitmap_size);
+ if (!bitmap) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ start = block_group->start;
+ end = block_group->start + block_group->length;
+
+ key.objectid = end - 1;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ while (!done) {
+ ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ nr = 0;
+ path->slots[0]++;
+ while (path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1);
+
+ if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
+ ASSERT(found_key.objectid == block_group->start);
+ ASSERT(found_key.offset == block_group->length);
+ done = 1;
+ break;
+ } else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) {
+ unsigned long ptr;
+ char *bitmap_cursor;
+ u32 bitmap_pos, data_size;
+
+ ASSERT(found_key.objectid >= start);
+ ASSERT(found_key.objectid < end);
+ ASSERT(found_key.objectid + found_key.offset <= end);
+
+ bitmap_pos = div_u64(found_key.objectid - start,
+ fs_info->sectorsize *
+ BITS_PER_BYTE);
+ bitmap_cursor = ((char *)bitmap) + bitmap_pos;
+ data_size = free_space_bitmap_size(fs_info,
+ found_key.offset);
+
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0] - 1);
+ read_extent_buffer(leaf, bitmap_cursor, ptr,
+ data_size);
+
+ nr++;
+ path->slots[0]--;
+ } else {
+ ASSERT(0);
+ }
+ }
+
+ ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+ }
+
+ info = search_free_space_info(trans, block_group, path, 1);
+ if (IS_ERR(info)) {
+ ret = PTR_ERR(info);
+ goto out;
+ }
+ leaf = path->nodes[0];
+ flags = btrfs_free_space_flags(leaf, info);
+ flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS;
+ btrfs_set_free_space_flags(leaf, info, flags);
+ expected_extent_count = btrfs_free_space_extent_count(leaf, info);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
+ nrbits = block_group->length >> block_group->fs_info->sectorsize_bits;
+ start_bit = find_next_bit_le(bitmap, nrbits, 0);
+
+ while (start_bit < nrbits) {
+ end_bit = find_next_zero_bit_le(bitmap, nrbits, start_bit);
+ ASSERT(start_bit < end_bit);
+
+ key.objectid = start + start_bit * block_group->fs_info->sectorsize;
+ key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
+ key.offset = (end_bit - start_bit) * block_group->fs_info->sectorsize;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+
+ extent_count++;
+
+ start_bit = find_next_bit_le(bitmap, nrbits, end_bit);
+ }
+
+ if (extent_count != expected_extent_count) {
+ btrfs_err(fs_info,
+ "incorrect extent count for %llu; counted %u, expected %u",
+ block_group->start, extent_count,
+ expected_extent_count);
+ ASSERT(0);
+ ret = -EIO;
+ goto out;
+ }
+
+ ret = 0;
+out:
+ kvfree(bitmap);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+}
+
+static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path,
+ int new_extents)
+{
+ struct btrfs_free_space_info *info;
+ u32 flags;
+ u32 extent_count;
+ int ret = 0;
+
+ if (new_extents == 0)
+ return 0;
+
+ info = search_free_space_info(trans, block_group, path, 1);
+ if (IS_ERR(info)) {
+ ret = PTR_ERR(info);
+ goto out;
+ }
+ flags = btrfs_free_space_flags(path->nodes[0], info);
+ extent_count = btrfs_free_space_extent_count(path->nodes[0], info);
+
+ extent_count += new_extents;
+ btrfs_set_free_space_extent_count(path->nodes[0], info, extent_count);
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_release_path(path);
+
+ if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) &&
+ extent_count > block_group->bitmap_high_thresh) {
+ ret = convert_free_space_to_bitmaps(trans, block_group, path);
+ } else if ((flags & BTRFS_FREE_SPACE_USING_BITMAPS) &&
+ extent_count < block_group->bitmap_low_thresh) {
+ ret = convert_free_space_to_extents(trans, block_group, path);
+ }
+
+out:
+ return ret;
+}
+
+EXPORT_FOR_TESTS
+int free_space_test_bit(struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 offset)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ u64 found_start, found_end;
+ unsigned long ptr, i;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY);
+
+ found_start = key.objectid;
+ found_end = key.objectid + key.offset;
+ ASSERT(offset >= found_start && offset < found_end);
+
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ i = div_u64(offset - found_start,
+ block_group->fs_info->sectorsize);
+ return !!extent_buffer_test_bit(leaf, ptr, i);
+}
+
+static void free_space_set_bits(struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 *start, u64 *size,
+ int bit)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ u64 end = *start + *size;
+ u64 found_start, found_end;
+ unsigned long ptr, first, last;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY);
+
+ found_start = key.objectid;
+ found_end = key.objectid + key.offset;
+ ASSERT(*start >= found_start && *start < found_end);
+ ASSERT(end > found_start);
+
+ if (end > found_end)
+ end = found_end;
+
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ first = (*start - found_start) >> fs_info->sectorsize_bits;
+ last = (end - found_start) >> fs_info->sectorsize_bits;
+ if (bit)
+ extent_buffer_bitmap_set(leaf, ptr, first, last - first);
+ else
+ extent_buffer_bitmap_clear(leaf, ptr, first, last - first);
+ btrfs_mark_buffer_dirty(leaf);
+
+ *size -= end - *start;
+ *start = end;
+}
+
+/*
+ * We can't use btrfs_next_item() in modify_free_space_bitmap() because
+ * btrfs_next_leaf() doesn't get the path for writing. We can forgo the fancy
+ * tree walking in btrfs_next_leaf() anyways because we know exactly what we're
+ * looking for.
+ */
+static int free_space_next_bitmap(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *p)
+{
+ struct btrfs_key key;
+
+ if (p->slots[0] + 1 < btrfs_header_nritems(p->nodes[0])) {
+ p->slots[0]++;
+ return 0;
+ }
+
+ btrfs_item_key_to_cpu(p->nodes[0], &key, p->slots[0]);
+ btrfs_release_path(p);
+
+ key.objectid += key.offset;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ return btrfs_search_prev_slot(trans, root, &key, p, 0, 1);
+}
+
+/*
+ * If remove is 1, then we are removing free space, thus clearing bits in the
+ * bitmap. If remove is 0, then we are adding free space, thus setting bits in
+ * the bitmap.
+ */
+static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path,
+ u64 start, u64 size, int remove)
+{
+ struct btrfs_root *root = btrfs_free_space_root(block_group);
+ struct btrfs_key key;
+ u64 end = start + size;
+ u64 cur_start, cur_size;
+ int prev_bit, next_bit;
+ int new_extents;
+ int ret;
+
+ /*
+ * Read the bit for the block immediately before the extent of space if
+ * that block is within the block group.
+ */
+ if (start > block_group->start) {
+ u64 prev_block = start - block_group->fs_info->sectorsize;
+
+ key.objectid = prev_block;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1);
+ if (ret)
+ goto out;
+
+ prev_bit = free_space_test_bit(block_group, path, prev_block);
+
+ /* The previous block may have been in the previous bitmap. */
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (start >= key.objectid + key.offset) {
+ ret = free_space_next_bitmap(trans, root, path);
+ if (ret)
+ goto out;
+ }
+ } else {
+ key.objectid = start;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1);
+ if (ret)
+ goto out;
+
+ prev_bit = -1;
+ }
+
+ /*
+ * Iterate over all of the bitmaps overlapped by the extent of space,
+ * clearing/setting bits as required.
+ */
+ cur_start = start;
+ cur_size = size;
+ while (1) {
+ free_space_set_bits(block_group, path, &cur_start, &cur_size,
+ !remove);
+ if (cur_size == 0)
+ break;
+ ret = free_space_next_bitmap(trans, root, path);
+ if (ret)
+ goto out;
+ }
+
+ /*
+ * Read the bit for the block immediately after the extent of space if
+ * that block is within the block group.
+ */
+ if (end < block_group->start + block_group->length) {
+ /* The next block may be in the next bitmap. */
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (end >= key.objectid + key.offset) {
+ ret = free_space_next_bitmap(trans, root, path);
+ if (ret)
+ goto out;
+ }
+
+ next_bit = free_space_test_bit(block_group, path, end);
+ } else {
+ next_bit = -1;
+ }
+
+ if (remove) {
+ new_extents = -1;
+ if (prev_bit == 1) {
+ /* Leftover on the left. */
+ new_extents++;
+ }
+ if (next_bit == 1) {
+ /* Leftover on the right. */
+ new_extents++;
+ }
+ } else {
+ new_extents = 1;
+ if (prev_bit == 1) {
+ /* Merging with neighbor on the left. */
+ new_extents--;
+ }
+ if (next_bit == 1) {
+ /* Merging with neighbor on the right. */
+ new_extents--;
+ }
+ }
+
+ btrfs_release_path(path);
+ ret = update_free_space_extent_count(trans, block_group, path,
+ new_extents);
+
+out:
+ return ret;
+}
+
+static int remove_free_space_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path,
+ u64 start, u64 size)
+{
+ struct btrfs_root *root = btrfs_free_space_root(block_group);
+ struct btrfs_key key;
+ u64 found_start, found_end;
+ u64 end = start + size;
+ int new_extents = -1;
+ int ret;
+
+ key.objectid = start;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+ if (ret)
+ goto out;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY);
+
+ found_start = key.objectid;
+ found_end = key.objectid + key.offset;
+ ASSERT(start >= found_start && end <= found_end);
+
+ /*
+ * Okay, now that we've found the free space extent which contains the
+ * free space that we are removing, there are four cases:
+ *
+ * 1. We're using the whole extent: delete the key we found and
+ * decrement the free space extent count.
+ * 2. We are using part of the extent starting at the beginning: delete
+ * the key we found and insert a new key representing the leftover at
+ * the end. There is no net change in the number of extents.
+ * 3. We are using part of the extent ending at the end: delete the key
+ * we found and insert a new key representing the leftover at the
+ * beginning. There is no net change in the number of extents.
+ * 4. We are using part of the extent in the middle: delete the key we
+ * found and insert two new keys representing the leftovers on each
+ * side. Where we used to have one extent, we now have two, so increment
+ * the extent count. We may need to convert the block group to bitmaps
+ * as a result.
+ */
+
+ /* Delete the existing key (cases 1-4). */
+ ret = btrfs_del_item(trans, root, path);
+ if (ret)
+ goto out;
+
+ /* Add a key for leftovers at the beginning (cases 3 and 4). */
+ if (start > found_start) {
+ key.objectid = found_start;
+ key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
+ key.offset = start - found_start;
+
+ btrfs_release_path(path);
+ ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+ if (ret)
+ goto out;
+ new_extents++;
+ }
+
+ /* Add a key for leftovers at the end (cases 2 and 4). */
+ if (end < found_end) {
+ key.objectid = end;
+ key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
+ key.offset = found_end - end;
+
+ btrfs_release_path(path);
+ ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+ if (ret)
+ goto out;
+ new_extents++;
+ }
+
+ btrfs_release_path(path);
+ ret = update_free_space_extent_count(trans, block_group, path,
+ new_extents);
+
+out:
+ return ret;
+}
+
+EXPORT_FOR_TESTS
+int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 start, u64 size)
+{
+ struct btrfs_free_space_info *info;
+ u32 flags;
+ int ret;
+
+ if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) {
+ ret = __add_block_group_free_space(trans, block_group, path);
+ if (ret)
+ return ret;
+ }
+
+ info = search_free_space_info(NULL, block_group, path, 0);
+ if (IS_ERR(info))
+ return PTR_ERR(info);
+ flags = btrfs_free_space_flags(path->nodes[0], info);
+ btrfs_release_path(path);
+
+ if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
+ return modify_free_space_bitmap(trans, block_group, path,
+ start, size, 1);
+ } else {
+ return remove_free_space_extent(trans, block_group, path,
+ start, size);
+ }
+}
+
+int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ u64 start, u64 size)
+{
+ struct btrfs_block_group *block_group;
+ struct btrfs_path *path;
+ int ret;
+
+ if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ block_group = btrfs_lookup_block_group(trans->fs_info, start);
+ if (!block_group) {
+ ASSERT(0);
+ ret = -ENOENT;
+ goto out;
+ }
+
+ mutex_lock(&block_group->free_space_lock);
+ ret = __remove_from_free_space_tree(trans, block_group, path, start,
+ size);
+ mutex_unlock(&block_group->free_space_lock);
+
+ btrfs_put_block_group(block_group);
+out:
+ btrfs_free_path(path);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+}
+
+static int add_free_space_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path,
+ u64 start, u64 size)
+{
+ struct btrfs_root *root = btrfs_free_space_root(block_group);
+ struct btrfs_key key, new_key;
+ u64 found_start, found_end;
+ u64 end = start + size;
+ int new_extents = 1;
+ int ret;
+
+ /*
+ * We are adding a new extent of free space, but we need to merge
+ * extents. There are four cases here:
+ *
+ * 1. The new extent does not have any immediate neighbors to merge
+ * with: add the new key and increment the free space extent count. We
+ * may need to convert the block group to bitmaps as a result.
+ * 2. The new extent has an immediate neighbor before it: remove the
+ * previous key and insert a new key combining both of them. There is no
+ * net change in the number of extents.
+ * 3. The new extent has an immediate neighbor after it: remove the next
+ * key and insert a new key combining both of them. There is no net
+ * change in the number of extents.
+ * 4. The new extent has immediate neighbors on both sides: remove both
+ * of the keys and insert a new key combining all of them. Where we used
+ * to have two extents, we now have one, so decrement the extent count.
+ */
+
+ new_key.objectid = start;
+ new_key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
+ new_key.offset = size;
+
+ /* Search for a neighbor on the left. */
+ if (start == block_group->start)
+ goto right;
+ key.objectid = start - 1;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+ if (ret)
+ goto out;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY) {
+ ASSERT(key.type == BTRFS_FREE_SPACE_INFO_KEY);
+ btrfs_release_path(path);
+ goto right;
+ }
+
+ found_start = key.objectid;
+ found_end = key.objectid + key.offset;
+ ASSERT(found_start >= block_group->start &&
+ found_end > block_group->start);
+ ASSERT(found_start < start && found_end <= start);
+
+ /*
+ * Delete the neighbor on the left and absorb it into the new key (cases
+ * 2 and 4).
+ */
+ if (found_end == start) {
+ ret = btrfs_del_item(trans, root, path);
+ if (ret)
+ goto out;
+ new_key.objectid = found_start;
+ new_key.offset += key.offset;
+ new_extents--;
+ }
+ btrfs_release_path(path);
+
+right:
+ /* Search for a neighbor on the right. */
+ if (end == block_group->start + block_group->length)
+ goto insert;
+ key.objectid = end;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+ if (ret)
+ goto out;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY) {
+ ASSERT(key.type == BTRFS_FREE_SPACE_INFO_KEY);
+ btrfs_release_path(path);
+ goto insert;
+ }
+
+ found_start = key.objectid;
+ found_end = key.objectid + key.offset;
+ ASSERT(found_start >= block_group->start &&
+ found_end > block_group->start);
+ ASSERT((found_start < start && found_end <= start) ||
+ (found_start >= end && found_end > end));
+
+ /*
+ * Delete the neighbor on the right and absorb it into the new key
+ * (cases 3 and 4).
+ */
+ if (found_start == end) {
+ ret = btrfs_del_item(trans, root, path);
+ if (ret)
+ goto out;
+ new_key.offset += key.offset;
+ new_extents--;
+ }
+ btrfs_release_path(path);
+
+insert:
+ /* Insert the new key (cases 1-4). */
+ ret = btrfs_insert_empty_item(trans, root, path, &new_key, 0);
+ if (ret)
+ goto out;
+
+ btrfs_release_path(path);
+ ret = update_free_space_extent_count(trans, block_group, path,
+ new_extents);
+
+out:
+ return ret;
+}
+
+EXPORT_FOR_TESTS
+int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 start, u64 size)
+{
+ struct btrfs_free_space_info *info;
+ u32 flags;
+ int ret;
+
+ if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) {
+ ret = __add_block_group_free_space(trans, block_group, path);
+ if (ret)
+ return ret;
+ }
+
+ info = search_free_space_info(NULL, block_group, path, 0);
+ if (IS_ERR(info))
+ return PTR_ERR(info);
+ flags = btrfs_free_space_flags(path->nodes[0], info);
+ btrfs_release_path(path);
+
+ if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
+ return modify_free_space_bitmap(trans, block_group, path,
+ start, size, 0);
+ } else {
+ return add_free_space_extent(trans, block_group, path, start,
+ size);
+ }
+}
+
+int add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ u64 start, u64 size)
+{
+ struct btrfs_block_group *block_group;
+ struct btrfs_path *path;
+ int ret;
+
+ if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ block_group = btrfs_lookup_block_group(trans->fs_info, start);
+ if (!block_group) {
+ ASSERT(0);
+ ret = -ENOENT;
+ goto out;
+ }
+
+ mutex_lock(&block_group->free_space_lock);
+ ret = __add_to_free_space_tree(trans, block_group, path, start, size);
+ mutex_unlock(&block_group->free_space_lock);
+
+ btrfs_put_block_group(block_group);
+out:
+ btrfs_free_path(path);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+}
+
+/*
+ * Populate the free space tree by walking the extent tree. Operations on the
+ * extent tree that happen as a result of writes to the free space tree will go
+ * through the normal add/remove hooks.
+ */
+static int populate_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group)
+{
+ struct btrfs_root *extent_root;
+ struct btrfs_path *path, *path2;
+ struct btrfs_key key;
+ u64 start, end;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = READA_FORWARD;
+
+ path2 = btrfs_alloc_path();
+ if (!path2) {
+ btrfs_free_path(path);
+ return -ENOMEM;
+ }
+
+ ret = add_new_free_space_info(trans, block_group, path2);
+ if (ret)
+ goto out;
+
+ mutex_lock(&block_group->free_space_lock);
+
+ /*
+ * Iterate through all of the extent and metadata items in this block
+ * group, adding the free space between them and the free space at the
+ * end. Note that EXTENT_ITEM and METADATA_ITEM are less than
+ * BLOCK_GROUP_ITEM, so an extent may precede the block group that it's
+ * contained in.
+ */
+ key.objectid = block_group->start;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = 0;
+
+ extent_root = btrfs_extent_root(trans->fs_info, key.objectid);
+ ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0);
+ if (ret < 0)
+ goto out_locked;
+ ASSERT(ret == 0);
+
+ start = block_group->start;
+ end = block_group->start + block_group->length;
+ while (1) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ if (key.type == BTRFS_EXTENT_ITEM_KEY ||
+ key.type == BTRFS_METADATA_ITEM_KEY) {
+ if (key.objectid >= end)
+ break;
+
+ if (start < key.objectid) {
+ ret = __add_to_free_space_tree(trans,
+ block_group,
+ path2, start,
+ key.objectid -
+ start);
+ if (ret)
+ goto out_locked;
+ }
+ start = key.objectid;
+ if (key.type == BTRFS_METADATA_ITEM_KEY)
+ start += trans->fs_info->nodesize;
+ else
+ start += key.offset;
+ } else if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
+ if (key.objectid != block_group->start)
+ break;
+ }
+
+ ret = btrfs_next_item(extent_root, path);
+ if (ret < 0)
+ goto out_locked;
+ if (ret)
+ break;
+ }
+ if (start < end) {
+ ret = __add_to_free_space_tree(trans, block_group, path2,
+ start, end - start);
+ if (ret)
+ goto out_locked;
+ }
+
+ ret = 0;
+out_locked:
+ mutex_unlock(&block_group->free_space_lock);
+out:
+ btrfs_free_path(path2);
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_root *free_space_root;
+ struct btrfs_block_group *block_group;
+ struct rb_node *node;
+ int ret;
+
+ trans = btrfs_start_transaction(tree_root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ set_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
+ set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
+ free_space_root = btrfs_create_tree(trans,
+ BTRFS_FREE_SPACE_TREE_OBJECTID);
+ if (IS_ERR(free_space_root)) {
+ ret = PTR_ERR(free_space_root);
+ goto abort;
+ }
+ ret = btrfs_global_root_insert(free_space_root);
+ if (ret) {
+ btrfs_put_root(free_space_root);
+ goto abort;
+ }
+
+ node = rb_first_cached(&fs_info->block_group_cache_tree);
+ while (node) {
+ block_group = rb_entry(node, struct btrfs_block_group,
+ cache_node);
+ ret = populate_free_space_tree(trans, block_group);
+ if (ret)
+ goto abort;
+ node = rb_next(node);
+ }
+
+ btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE);
+ btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID);
+ clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
+ ret = btrfs_commit_transaction(trans);
+
+ /*
+ * Now that we've committed the transaction any reading of our commit
+ * root will be safe, so we can cache from the free space tree now.
+ */
+ clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
+ return ret;
+
+abort:
+ clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
+ clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ return ret;
+}
+
+static int clear_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int nr;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = 0;
+ key.type = 0;
+ key.offset = 0;
+
+ while (1) {
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+
+ nr = btrfs_header_nritems(path->nodes[0]);
+ if (!nr)
+ break;
+
+ path->slots[0] = 0;
+ ret = btrfs_del_items(trans, root, path, 0, nr);
+ if (ret)
+ goto out;
+
+ btrfs_release_path(path);
+ }
+
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_key key = {
+ .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID,
+ .type = BTRFS_ROOT_ITEM_KEY,
+ .offset = 0,
+ };
+ struct btrfs_root *free_space_root = btrfs_global_root(fs_info, &key);
+ int ret;
+
+ trans = btrfs_start_transaction(tree_root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE);
+ btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID);
+
+ ret = clear_free_space_tree(trans, free_space_root);
+ if (ret)
+ goto abort;
+
+ ret = btrfs_del_root(trans, &free_space_root->root_key);
+ if (ret)
+ goto abort;
+
+ btrfs_global_root_delete(free_space_root);
+
+ spin_lock(&fs_info->trans_lock);
+ list_del(&free_space_root->dirty_list);
+ spin_unlock(&fs_info->trans_lock);
+
+ btrfs_tree_lock(free_space_root->node);
+ btrfs_clean_tree_block(free_space_root->node);
+ btrfs_tree_unlock(free_space_root->node);
+ btrfs_free_tree_block(trans, btrfs_root_id(free_space_root),
+ free_space_root->node, 0, 1);
+
+ btrfs_put_root(free_space_root);
+
+ return btrfs_commit_transaction(trans);
+
+abort:
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ return ret;
+}
+
+int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_key key = {
+ .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID,
+ .type = BTRFS_ROOT_ITEM_KEY,
+ .offset = 0,
+ };
+ struct btrfs_root *free_space_root = btrfs_global_root(fs_info, &key);
+ struct rb_node *node;
+ int ret;
+
+ trans = btrfs_start_transaction(free_space_root, 1);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ set_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
+ set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
+
+ ret = clear_free_space_tree(trans, free_space_root);
+ if (ret)
+ goto abort;
+
+ node = rb_first_cached(&fs_info->block_group_cache_tree);
+ while (node) {
+ struct btrfs_block_group *block_group;
+
+ block_group = rb_entry(node, struct btrfs_block_group,
+ cache_node);
+ ret = populate_free_space_tree(trans, block_group);
+ if (ret)
+ goto abort;
+ node = rb_next(node);
+ }
+
+ btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE);
+ btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID);
+ clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
+
+ ret = btrfs_commit_transaction(trans);
+ clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
+ return ret;
+abort:
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ return ret;
+}
+
+static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path)
+{
+ int ret;
+
+ clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags);
+
+ ret = add_new_free_space_info(trans, block_group, path);
+ if (ret)
+ return ret;
+
+ return __add_to_free_space_tree(trans, block_group, path,
+ block_group->start,
+ block_group->length);
+}
+
+int add_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_path *path = NULL;
+ int ret = 0;
+
+ if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ return 0;
+
+ mutex_lock(&block_group->free_space_lock);
+ if (!test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags))
+ goto out;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = __add_block_group_free_space(trans, block_group, path);
+
+out:
+ btrfs_free_path(path);
+ mutex_unlock(&block_group->free_space_lock);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+}
+
+int remove_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group)
+{
+ struct btrfs_root *root = btrfs_free_space_root(block_group);
+ struct btrfs_path *path;
+ struct btrfs_key key, found_key;
+ struct extent_buffer *leaf;
+ u64 start, end;
+ int done = 0, nr;
+ int ret;
+
+ if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
+ return 0;
+
+ if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) {
+ /* We never added this block group to the free space tree. */
+ return 0;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ start = block_group->start;
+ end = block_group->start + block_group->length;
+
+ key.objectid = end - 1;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ while (!done) {
+ ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ nr = 0;
+ path->slots[0]++;
+ while (path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1);
+
+ if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
+ ASSERT(found_key.objectid == block_group->start);
+ ASSERT(found_key.offset == block_group->length);
+ done = 1;
+ nr++;
+ path->slots[0]--;
+ break;
+ } else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY ||
+ found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) {
+ ASSERT(found_key.objectid >= start);
+ ASSERT(found_key.objectid < end);
+ ASSERT(found_key.objectid + found_key.offset <= end);
+ nr++;
+ path->slots[0]--;
+ } else {
+ ASSERT(0);
+ }
+ }
+
+ ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+ }
+
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+}
+
+static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
+ struct btrfs_path *path,
+ u32 expected_extent_count)
+{
+ struct btrfs_block_group *block_group;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_root *root;
+ struct btrfs_key key;
+ int prev_bit = 0, bit;
+ /* Initialize to silence GCC. */
+ u64 extent_start = 0;
+ u64 end, offset;
+ u64 total_found = 0;
+ u32 extent_count = 0;
+ int ret;
+
+ block_group = caching_ctl->block_group;
+ fs_info = block_group->fs_info;
+ root = btrfs_free_space_root(block_group);
+
+ end = block_group->start + block_group->length;
+
+ while (1) {
+ ret = btrfs_next_item(root, path);
+ if (ret < 0)
+ goto out;
+ if (ret)
+ break;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ if (key.type == BTRFS_FREE_SPACE_INFO_KEY)
+ break;
+
+ ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY);
+ ASSERT(key.objectid < end && key.objectid + key.offset <= end);
+
+ offset = key.objectid;
+ while (offset < key.objectid + key.offset) {
+ bit = free_space_test_bit(block_group, path, offset);
+ if (prev_bit == 0 && bit == 1) {
+ extent_start = offset;
+ } else if (prev_bit == 1 && bit == 0) {
+ u64 space_added;
+
+ ret = add_new_free_space(block_group, extent_start,
+ offset, &space_added);
+ if (ret)
+ goto out;
+ total_found += space_added;
+ if (total_found > CACHING_CTL_WAKE_UP) {
+ total_found = 0;
+ wake_up(&caching_ctl->wait);
+ }
+ extent_count++;
+ }
+ prev_bit = bit;
+ offset += fs_info->sectorsize;
+ }
+ }
+ if (prev_bit == 1) {
+ ret = add_new_free_space(block_group, extent_start, end, NULL);
+ if (ret)
+ goto out;
+ extent_count++;
+ }
+
+ if (extent_count != expected_extent_count) {
+ btrfs_err(fs_info,
+ "incorrect extent count for %llu; counted %u, expected %u",
+ block_group->start, extent_count,
+ expected_extent_count);
+ ASSERT(0);
+ ret = -EIO;
+ goto out;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
+ struct btrfs_path *path,
+ u32 expected_extent_count)
+{
+ struct btrfs_block_group *block_group;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_root *root;
+ struct btrfs_key key;
+ u64 end;
+ u64 total_found = 0;
+ u32 extent_count = 0;
+ int ret;
+
+ block_group = caching_ctl->block_group;
+ fs_info = block_group->fs_info;
+ root = btrfs_free_space_root(block_group);
+
+ end = block_group->start + block_group->length;
+
+ while (1) {
+ u64 space_added;
+
+ ret = btrfs_next_item(root, path);
+ if (ret < 0)
+ goto out;
+ if (ret)
+ break;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ if (key.type == BTRFS_FREE_SPACE_INFO_KEY)
+ break;
+
+ ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY);
+ ASSERT(key.objectid < end && key.objectid + key.offset <= end);
+
+ ret = add_new_free_space(block_group, key.objectid,
+ key.objectid + key.offset, &space_added);
+ if (ret)
+ goto out;
+ total_found += space_added;
+ if (total_found > CACHING_CTL_WAKE_UP) {
+ total_found = 0;
+ wake_up(&caching_ctl->wait);
+ }
+ extent_count++;
+ }
+
+ if (extent_count != expected_extent_count) {
+ btrfs_err(fs_info,
+ "incorrect extent count for %llu; counted %u, expected %u",
+ block_group->start, extent_count,
+ expected_extent_count);
+ ASSERT(0);
+ ret = -EIO;
+ goto out;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+int load_free_space_tree(struct btrfs_caching_control *caching_ctl)
+{
+ struct btrfs_block_group *block_group;
+ struct btrfs_free_space_info *info;
+ struct btrfs_path *path;
+ u32 extent_count, flags;
+ int ret;
+
+ block_group = caching_ctl->block_group;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /*
+ * Just like caching_thread() doesn't want to deadlock on the extent
+ * tree, we don't want to deadlock on the free space tree.
+ */
+ path->skip_locking = 1;
+ path->search_commit_root = 1;
+ path->reada = READA_FORWARD;
+
+ info = search_free_space_info(NULL, block_group, path, 0);
+ if (IS_ERR(info)) {
+ ret = PTR_ERR(info);
+ goto out;
+ }
+ extent_count = btrfs_free_space_extent_count(path->nodes[0], info);
+ flags = btrfs_free_space_flags(path->nodes[0], info);
+
+ /*
+ * We left path pointing to the free space info item, so now
+ * load_free_space_foo can just iterate through the free space tree from
+ * there.
+ */
+ if (flags & BTRFS_FREE_SPACE_USING_BITMAPS)
+ ret = load_free_space_bitmaps(caching_ctl, path, extent_count);
+ else
+ ret = load_free_space_extents(caching_ctl, path, extent_count);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h
new file mode 100644
index 000000000..6d5551d0c
--- /dev/null
+++ b/fs/btrfs/free-space-tree.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2015 Facebook. All rights reserved.
+ */
+
+#ifndef BTRFS_FREE_SPACE_TREE_H
+#define BTRFS_FREE_SPACE_TREE_H
+
+struct btrfs_caching_control;
+
+/*
+ * The default size for new free space bitmap items. The last bitmap in a block
+ * group may be truncated, and none of the free space tree code assumes that
+ * existing bitmaps are this size.
+ */
+#define BTRFS_FREE_SPACE_BITMAP_SIZE 256
+#define BTRFS_FREE_SPACE_BITMAP_BITS (BTRFS_FREE_SPACE_BITMAP_SIZE * BITS_PER_BYTE)
+
+void set_free_space_tree_thresholds(struct btrfs_block_group *block_group);
+int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info);
+int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info);
+int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info);
+int load_free_space_tree(struct btrfs_caching_control *caching_ctl);
+int add_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group);
+int remove_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group);
+int add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ u64 start, u64 size);
+int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ u64 start, u64 size);
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+struct btrfs_free_space_info *
+search_free_space_info(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path, int cow);
+int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 start, u64 size);
+int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 start, u64 size);
+int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path);
+int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path);
+int free_space_test_bit(struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 offset);
+#endif
+
+#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
new file mode 100644
index 000000000..5add022d3
--- /dev/null
+++ b/fs/btrfs/inode-item.c
@@ -0,0 +1,748 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ */
+
+#include "ctree.h"
+#include "inode-item.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+
+struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
+ int slot,
+ const struct fscrypt_str *name)
+{
+ struct btrfs_inode_ref *ref;
+ unsigned long ptr;
+ unsigned long name_ptr;
+ u32 item_size;
+ u32 cur_offset = 0;
+ int len;
+
+ item_size = btrfs_item_size(leaf, slot);
+ ptr = btrfs_item_ptr_offset(leaf, slot);
+ while (cur_offset < item_size) {
+ ref = (struct btrfs_inode_ref *)(ptr + cur_offset);
+ len = btrfs_inode_ref_name_len(leaf, ref);
+ name_ptr = (unsigned long)(ref + 1);
+ cur_offset += len + sizeof(*ref);
+ if (len != name->len)
+ continue;
+ if (memcmp_extent_buffer(leaf, name->name, name_ptr,
+ name->len) == 0)
+ return ref;
+ }
+ return NULL;
+}
+
+struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
+ struct extent_buffer *leaf, int slot, u64 ref_objectid,
+ const struct fscrypt_str *name)
+{
+ struct btrfs_inode_extref *extref;
+ unsigned long ptr;
+ unsigned long name_ptr;
+ u32 item_size;
+ u32 cur_offset = 0;
+ int ref_name_len;
+
+ item_size = btrfs_item_size(leaf, slot);
+ ptr = btrfs_item_ptr_offset(leaf, slot);
+
+ /*
+ * Search all extended backrefs in this item. We're only
+ * looking through any collisions so most of the time this is
+ * just going to compare against one buffer. If all is well,
+ * we'll return success and the inode ref object.
+ */
+ while (cur_offset < item_size) {
+ extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
+ name_ptr = (unsigned long)(&extref->name);
+ ref_name_len = btrfs_inode_extref_name_len(leaf, extref);
+
+ if (ref_name_len == name->len &&
+ btrfs_inode_extref_parent(leaf, extref) == ref_objectid &&
+ (memcmp_extent_buffer(leaf, name->name, name_ptr,
+ name->len) == 0))
+ return extref;
+
+ cur_offset += ref_name_len + sizeof(*extref);
+ }
+ return NULL;
+}
+
+/* Returns NULL if no extref found */
+struct btrfs_inode_extref *
+btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct fscrypt_str *name,
+ u64 inode_objectid, u64 ref_objectid, int ins_len,
+ int cow)
+{
+ int ret;
+ struct btrfs_key key;
+
+ key.objectid = inode_objectid;
+ key.type = BTRFS_INODE_EXTREF_KEY;
+ key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len);
+
+ ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0)
+ return NULL;
+ return btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0],
+ ref_objectid, name);
+
+}
+
+static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const struct fscrypt_str *name,
+ u64 inode_objectid, u64 ref_objectid,
+ u64 *index)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_inode_extref *extref;
+ struct extent_buffer *leaf;
+ int ret;
+ int del_len = name->len + sizeof(*extref);
+ unsigned long ptr;
+ unsigned long item_start;
+ u32 item_size;
+
+ key.objectid = inode_objectid;
+ key.type = BTRFS_INODE_EXTREF_KEY;
+ key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret < 0)
+ goto out;
+
+ /*
+ * Sanity check - did we find the right item for this name?
+ * This should always succeed so error here will make the FS
+ * readonly.
+ */
+ extref = btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0],
+ ref_objectid, name);
+ if (!extref) {
+ btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL);
+ ret = -EROFS;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size(leaf, path->slots[0]);
+ if (index)
+ *index = btrfs_inode_extref_index(leaf, extref);
+
+ if (del_len == item_size) {
+ /*
+ * Common case only one ref in the item, remove the
+ * whole item.
+ */
+ ret = btrfs_del_item(trans, root, path);
+ goto out;
+ }
+
+ ptr = (unsigned long)extref;
+ item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
+
+ memmove_extent_buffer(leaf, ptr, ptr + del_len,
+ item_size - (ptr + del_len - item_start));
+
+ btrfs_truncate_item(path, item_size - del_len, 1);
+
+out:
+ btrfs_free_path(path);
+
+ return ret;
+}
+
+int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, const struct fscrypt_str *name,
+ u64 inode_objectid, u64 ref_objectid, u64 *index)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_inode_ref *ref;
+ struct extent_buffer *leaf;
+ unsigned long ptr;
+ unsigned long item_start;
+ u32 item_size;
+ u32 sub_item_len;
+ int ret;
+ int search_ext_refs = 0;
+ int del_len = name->len + sizeof(*ref);
+
+ key.objectid = inode_objectid;
+ key.offset = ref_objectid;
+ key.type = BTRFS_INODE_REF_KEY;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0) {
+ ret = -ENOENT;
+ search_ext_refs = 1;
+ goto out;
+ } else if (ret < 0) {
+ goto out;
+ }
+
+ ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], name);
+ if (!ref) {
+ ret = -ENOENT;
+ search_ext_refs = 1;
+ goto out;
+ }
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size(leaf, path->slots[0]);
+
+ if (index)
+ *index = btrfs_inode_ref_index(leaf, ref);
+
+ if (del_len == item_size) {
+ ret = btrfs_del_item(trans, root, path);
+ goto out;
+ }
+ ptr = (unsigned long)ref;
+ sub_item_len = name->len + sizeof(*ref);
+ item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
+ item_size - (ptr + sub_item_len - item_start));
+ btrfs_truncate_item(path, item_size - sub_item_len, 1);
+out:
+ btrfs_free_path(path);
+
+ if (search_ext_refs) {
+ /*
+ * No refs were found, or we could not find the
+ * name in our ref array. Find and remove the extended
+ * inode ref then.
+ */
+ return btrfs_del_inode_extref(trans, root, name,
+ inode_objectid, ref_objectid, index);
+ }
+
+ return ret;
+}
+
+/*
+ * btrfs_insert_inode_extref() - Inserts an extended inode ref into a tree.
+ *
+ * The caller must have checked against BTRFS_LINK_MAX already.
+ */
+static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const struct fscrypt_str *name,
+ u64 inode_objectid, u64 ref_objectid,
+ u64 index)
+{
+ struct btrfs_inode_extref *extref;
+ int ret;
+ int ins_len = name->len + sizeof(*extref);
+ unsigned long ptr;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+
+ key.objectid = inode_objectid;
+ key.type = BTRFS_INODE_EXTREF_KEY;
+ key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ ins_len);
+ if (ret == -EEXIST) {
+ if (btrfs_find_name_in_ext_backref(path->nodes[0],
+ path->slots[0],
+ ref_objectid,
+ name))
+ goto out;
+
+ btrfs_extend_item(path, ins_len);
+ ret = 0;
+ }
+ if (ret < 0)
+ goto out;
+
+ leaf = path->nodes[0];
+ ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char);
+ ptr += btrfs_item_size(leaf, path->slots[0]) - ins_len;
+ extref = (struct btrfs_inode_extref *)ptr;
+
+ btrfs_set_inode_extref_name_len(path->nodes[0], extref, name->len);
+ btrfs_set_inode_extref_index(path->nodes[0], extref, index);
+ btrfs_set_inode_extref_parent(path->nodes[0], extref, ref_objectid);
+
+ ptr = (unsigned long)&extref->name;
+ write_extent_buffer(path->nodes[0], name->name, ptr, name->len);
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */
+int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, const struct fscrypt_str *name,
+ u64 inode_objectid, u64 ref_objectid, u64 index)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_inode_ref *ref;
+ unsigned long ptr;
+ int ret;
+ int ins_len = name->len + sizeof(*ref);
+
+ key.objectid = inode_objectid;
+ key.offset = ref_objectid;
+ key.type = BTRFS_INODE_REF_KEY;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->skip_release_on_error = 1;
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ ins_len);
+ if (ret == -EEXIST) {
+ u32 old_size;
+ ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
+ name);
+ if (ref)
+ goto out;
+
+ old_size = btrfs_item_size(path->nodes[0], path->slots[0]);
+ btrfs_extend_item(path, ins_len);
+ ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_ref);
+ ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
+ btrfs_set_inode_ref_name_len(path->nodes[0], ref, name->len);
+ btrfs_set_inode_ref_index(path->nodes[0], ref, index);
+ ptr = (unsigned long)(ref + 1);
+ ret = 0;
+ } else if (ret < 0) {
+ if (ret == -EOVERFLOW) {
+ if (btrfs_find_name_in_backref(path->nodes[0],
+ path->slots[0],
+ name))
+ ret = -EEXIST;
+ else
+ ret = -EMLINK;
+ }
+ goto out;
+ } else {
+ ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_ref);
+ btrfs_set_inode_ref_name_len(path->nodes[0], ref, name->len);
+ btrfs_set_inode_ref_index(path->nodes[0], ref, index);
+ ptr = (unsigned long)(ref + 1);
+ }
+ write_extent_buffer(path->nodes[0], name->name, ptr, name->len);
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+
+out:
+ btrfs_free_path(path);
+
+ if (ret == -EMLINK) {
+ struct btrfs_super_block *disk_super = fs_info->super_copy;
+ /* We ran out of space in the ref array. Need to
+ * add an extended ref. */
+ if (btrfs_super_incompat_flags(disk_super)
+ & BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
+ ret = btrfs_insert_inode_extref(trans, root, name,
+ inode_objectid,
+ ref_objectid, index);
+ }
+
+ return ret;
+}
+
+int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid)
+{
+ struct btrfs_key key;
+ int ret;
+ key.objectid = objectid;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ sizeof(struct btrfs_inode_item));
+ return ret;
+}
+
+int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_path *path,
+ struct btrfs_key *location, int mod)
+{
+ int ins_len = mod < 0 ? -1 : 0;
+ int cow = mod != 0;
+ int ret;
+ int slot;
+ struct extent_buffer *leaf;
+ struct btrfs_key found_key;
+
+ ret = btrfs_search_slot(trans, root, location, path, ins_len, cow);
+ if (ret > 0 && location->type == BTRFS_ROOT_ITEM_KEY &&
+ location->offset == (u64)-1 && path->slots[0] != 0) {
+ slot = path->slots[0] - 1;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ if (found_key.objectid == location->objectid &&
+ found_key.type == location->type) {
+ path->slots[0]--;
+ return 0;
+ }
+ }
+ return ret;
+}
+
+static inline void btrfs_trace_truncate(struct btrfs_inode *inode,
+ struct extent_buffer *leaf,
+ struct btrfs_file_extent_item *fi,
+ u64 offset, int extent_type, int slot)
+{
+ if (!inode)
+ return;
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+ trace_btrfs_truncate_show_fi_inline(inode, leaf, fi, slot,
+ offset);
+ else
+ trace_btrfs_truncate_show_fi_regular(inode, leaf, fi, offset);
+}
+
+/*
+ * Remove inode items from a given root.
+ *
+ * @trans: A transaction handle.
+ * @root: The root from which to remove items.
+ * @inode: The inode whose items we want to remove.
+ * @control: The btrfs_truncate_control to control how and what we
+ * are truncating.
+ *
+ * Remove all keys associated with the inode from the given root that have a key
+ * with a type greater than or equals to @min_type. When @min_type has a value of
+ * BTRFS_EXTENT_DATA_KEY, only remove file extent items that have an offset value
+ * greater than or equals to @new_size. If a file extent item that starts before
+ * @new_size and ends after it is found, its length is adjusted.
+ *
+ * Returns: 0 on success, < 0 on error and NEED_TRUNCATE_BLOCK when @min_type is
+ * BTRFS_EXTENT_DATA_KEY and the caller must truncate the last block.
+ */
+int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_truncate_control *control)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ u64 new_size = control->new_size;
+ u64 extent_num_bytes = 0;
+ u64 extent_offset = 0;
+ u64 item_end = 0;
+ u32 found_type = (u8)-1;
+ int del_item;
+ int pending_del_nr = 0;
+ int pending_del_slot = 0;
+ int extent_type = -1;
+ int ret;
+ u64 bytes_deleted = 0;
+ bool be_nice = false;
+
+ ASSERT(control->inode || !control->clear_extent_range);
+ ASSERT(new_size == 0 || control->min_type == BTRFS_EXTENT_DATA_KEY);
+
+ control->last_size = new_size;
+ control->sub_bytes = 0;
+
+ /*
+ * For shareable roots we want to back off from time to time, this turns
+ * out to be subvolume roots, reloc roots, and data reloc roots.
+ */
+ if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
+ be_nice = true;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = READA_BACK;
+
+ key.objectid = control->ino;
+ key.offset = (u64)-1;
+ key.type = (u8)-1;
+
+search_again:
+ /*
+ * With a 16K leaf size and 128MiB extents, you can actually queue up a
+ * huge file in a single leaf. Most of the time that bytes_deleted is
+ * > 0, it will be huge by the time we get here
+ */
+ if (be_nice && bytes_deleted > SZ_32M &&
+ btrfs_should_end_transaction(trans)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+
+ if (ret > 0) {
+ ret = 0;
+ /* There are no items in the tree for us to truncate, we're done */
+ if (path->slots[0] == 0)
+ goto out;
+ path->slots[0]--;
+ }
+
+ while (1) {
+ u64 clear_start = 0, clear_len = 0, extent_start = 0;
+ bool should_throttle = false;
+
+ fi = NULL;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ found_type = found_key.type;
+
+ if (found_key.objectid != control->ino)
+ break;
+
+ if (found_type < control->min_type)
+ break;
+
+ item_end = found_key.offset;
+ if (found_type == BTRFS_EXTENT_DATA_KEY) {
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(leaf, fi);
+ if (extent_type != BTRFS_FILE_EXTENT_INLINE)
+ item_end +=
+ btrfs_file_extent_num_bytes(leaf, fi);
+ else if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+ item_end += btrfs_file_extent_ram_bytes(leaf, fi);
+
+ btrfs_trace_truncate(control->inode, leaf, fi,
+ found_key.offset, extent_type,
+ path->slots[0]);
+ item_end--;
+ }
+ if (found_type > control->min_type) {
+ del_item = 1;
+ } else {
+ if (item_end < new_size)
+ break;
+ if (found_key.offset >= new_size)
+ del_item = 1;
+ else
+ del_item = 0;
+ }
+
+ /* FIXME, shrink the extent if the ref count is only 1 */
+ if (found_type != BTRFS_EXTENT_DATA_KEY)
+ goto delete;
+
+ control->extents_found++;
+
+ if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
+ u64 num_dec;
+
+ clear_start = found_key.offset;
+ extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
+ if (!del_item) {
+ u64 orig_num_bytes =
+ btrfs_file_extent_num_bytes(leaf, fi);
+ extent_num_bytes = ALIGN(new_size -
+ found_key.offset,
+ fs_info->sectorsize);
+ clear_start = ALIGN(new_size, fs_info->sectorsize);
+
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_num_bytes);
+ num_dec = (orig_num_bytes - extent_num_bytes);
+ if (extent_start != 0)
+ control->sub_bytes += num_dec;
+ btrfs_mark_buffer_dirty(leaf);
+ } else {
+ extent_num_bytes =
+ btrfs_file_extent_disk_num_bytes(leaf, fi);
+ extent_offset = found_key.offset -
+ btrfs_file_extent_offset(leaf, fi);
+
+ /* FIXME blocksize != 4096 */
+ num_dec = btrfs_file_extent_num_bytes(leaf, fi);
+ if (extent_start != 0)
+ control->sub_bytes += num_dec;
+ }
+ clear_len = num_dec;
+ } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ /*
+ * We can't truncate inline items that have had
+ * special encodings
+ */
+ if (!del_item &&
+ btrfs_file_extent_encryption(leaf, fi) == 0 &&
+ btrfs_file_extent_other_encoding(leaf, fi) == 0 &&
+ btrfs_file_extent_compression(leaf, fi) == 0) {
+ u32 size = (u32)(new_size - found_key.offset);
+
+ btrfs_set_file_extent_ram_bytes(leaf, fi, size);
+ size = btrfs_file_extent_calc_inline_size(size);
+ btrfs_truncate_item(path, size, 1);
+ } else if (!del_item) {
+ /*
+ * We have to bail so the last_size is set to
+ * just before this extent.
+ */
+ ret = BTRFS_NEED_TRUNCATE_BLOCK;
+ break;
+ } else {
+ /*
+ * Inline extents are special, we just treat
+ * them as a full sector worth in the file
+ * extent tree just for simplicity sake.
+ */
+ clear_len = fs_info->sectorsize;
+ }
+
+ control->sub_bytes += item_end + 1 - new_size;
+ }
+delete:
+ /*
+ * We only want to clear the file extent range if we're
+ * modifying the actual inode's mapping, which is just the
+ * normal truncate path.
+ */
+ if (control->clear_extent_range) {
+ ret = btrfs_inode_clear_file_extent_range(control->inode,
+ clear_start, clear_len);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+ }
+
+ if (del_item) {
+ ASSERT(!pending_del_nr ||
+ ((path->slots[0] + 1) == pending_del_slot));
+
+ control->last_size = found_key.offset;
+ if (!pending_del_nr) {
+ /* No pending yet, add ourselves */
+ pending_del_slot = path->slots[0];
+ pending_del_nr = 1;
+ } else if (pending_del_nr &&
+ path->slots[0] + 1 == pending_del_slot) {
+ /* Hop on the pending chunk */
+ pending_del_nr++;
+ pending_del_slot = path->slots[0];
+ }
+ } else {
+ control->last_size = new_size;
+ break;
+ }
+
+ if (del_item && extent_start != 0 && !control->skip_ref_updates) {
+ struct btrfs_ref ref = { 0 };
+
+ bytes_deleted += extent_num_bytes;
+
+ btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
+ extent_start, extent_num_bytes, 0);
+ btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
+ control->ino, extent_offset,
+ root->root_key.objectid, false);
+ ret = btrfs_free_extent(trans, &ref);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+ if (be_nice) {
+ if (btrfs_should_throttle_delayed_refs(trans))
+ should_throttle = true;
+ }
+ }
+
+ if (found_type == BTRFS_INODE_ITEM_KEY)
+ break;
+
+ if (path->slots[0] == 0 ||
+ path->slots[0] != pending_del_slot ||
+ should_throttle) {
+ if (pending_del_nr) {
+ ret = btrfs_del_items(trans, root, path,
+ pending_del_slot,
+ pending_del_nr);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+ pending_del_nr = 0;
+ }
+ btrfs_release_path(path);
+
+ /*
+ * We can generate a lot of delayed refs, so we need to
+ * throttle every once and a while and make sure we're
+ * adding enough space to keep up with the work we are
+ * generating. Since we hold a transaction here we
+ * can't flush, and we don't want to FLUSH_LIMIT because
+ * we could have generated too many delayed refs to
+ * actually allocate, so just bail if we're short and
+ * let the normal reservation dance happen higher up.
+ */
+ if (should_throttle) {
+ ret = btrfs_delayed_refs_rsv_refill(fs_info,
+ BTRFS_RESERVE_NO_FLUSH);
+ if (ret) {
+ ret = -EAGAIN;
+ break;
+ }
+ }
+ goto search_again;
+ } else {
+ path->slots[0]--;
+ }
+ }
+out:
+ if (ret >= 0 && pending_del_nr) {
+ int err;
+
+ err = btrfs_del_items(trans, root, path, pending_del_slot,
+ pending_del_nr);
+ if (err) {
+ btrfs_abort_transaction(trans, err);
+ ret = err;
+ }
+ }
+
+ ASSERT(control->last_size >= new_size);
+ if (!ret && control->last_size > new_size)
+ control->last_size = new_size;
+
+ btrfs_free_path(path);
+ return ret;
+}
diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h
new file mode 100644
index 000000000..b80aeb715
--- /dev/null
+++ b/fs/btrfs/inode-item.h
@@ -0,0 +1,94 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_INODE_ITEM_H
+#define BTRFS_INODE_ITEM_H
+
+#include <linux/types.h>
+
+struct btrfs_trans_handle;
+struct btrfs_root;
+struct btrfs_path;
+struct btrfs_key;
+struct btrfs_inode_extref;
+struct btrfs_inode;
+struct extent_buffer;
+
+/*
+ * Return this if we need to call truncate_block for the last bit of the
+ * truncate.
+ */
+#define BTRFS_NEED_TRUNCATE_BLOCK 1
+
+struct btrfs_truncate_control {
+ /*
+ * IN: the inode we're operating on, this can be NULL if
+ * ->clear_extent_range is false.
+ */
+ struct btrfs_inode *inode;
+
+ /* IN: the size we're truncating to. */
+ u64 new_size;
+
+ /* OUT: the number of extents truncated. */
+ u64 extents_found;
+
+ /* OUT: the last size we truncated this inode to. */
+ u64 last_size;
+
+ /* OUT: the number of bytes to sub from this inode. */
+ u64 sub_bytes;
+
+ /* IN: the ino we are truncating. */
+ u64 ino;
+
+ /*
+ * IN: minimum key type to remove. All key types with this type are
+ * removed only if their offset >= new_size.
+ */
+ u32 min_type;
+
+ /*
+ * IN: true if we don't want to do extent reference updates for any file
+ * extents we drop.
+ */
+ bool skip_ref_updates;
+
+ /*
+ * IN: true if we need to clear the file extent range for the inode as
+ * we drop the file extent items.
+ */
+ bool clear_extent_range;
+};
+
+int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_truncate_control *control);
+int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, const struct fscrypt_str *name,
+ u64 inode_objectid, u64 ref_objectid, u64 index);
+int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, const struct fscrypt_str *name,
+ u64 inode_objectid, u64 ref_objectid, u64 *index);
+int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid);
+int btrfs_lookup_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *location, int mod);
+
+struct btrfs_inode_extref *btrfs_lookup_inode_extref(
+ struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct fscrypt_str *name,
+ u64 inode_objectid, u64 ref_objectid, int ins_len,
+ int cow);
+
+struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
+ int slot,
+ const struct fscrypt_str *name);
+struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
+ struct extent_buffer *leaf, int slot, u64 ref_objectid,
+ const struct fscrypt_str *name);
+
+#endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
new file mode 100644
index 000000000..82f92b565
--- /dev/null
+++ b/fs/btrfs/inode.c
@@ -0,0 +1,11522 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ */
+
+#include <crypto/hash.h>
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/blk-cgroup.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/backing-dev.h>
+#include <linux/writeback.h>
+#include <linux/compat.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl.h>
+#include <linux/falloc.h>
+#include <linux/slab.h>
+#include <linux/ratelimit.h>
+#include <linux/btrfs.h>
+#include <linux/blkdev.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/uio.h>
+#include <linux/magic.h>
+#include <linux/iversion.h>
+#include <linux/swap.h>
+#include <linux/migrate.h>
+#include <linux/sched/mm.h>
+#include <linux/iomap.h>
+#include <asm/unaligned.h>
+#include <linux/fsverity.h>
+#include "misc.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "print-tree.h"
+#include "ordered-data.h"
+#include "xattr.h"
+#include "tree-log.h"
+#include "volumes.h"
+#include "compression.h"
+#include "locking.h"
+#include "free-space-cache.h"
+#include "props.h"
+#include "qgroup.h"
+#include "delalloc-space.h"
+#include "block-group.h"
+#include "space-info.h"
+#include "zoned.h"
+#include "subpage.h"
+#include "inode-item.h"
+
+struct btrfs_iget_args {
+ u64 ino;
+ struct btrfs_root *root;
+};
+
+struct btrfs_dio_data {
+ ssize_t submitted;
+ struct extent_changeset *data_reserved;
+ bool data_space_reserved;
+ bool nocow_done;
+};
+
+struct btrfs_dio_private {
+ struct inode *inode;
+
+ /*
+ * Since DIO can use anonymous page, we cannot use page_offset() to
+ * grab the file offset, thus need a dedicated member for file offset.
+ */
+ u64 file_offset;
+ /* Used for bio::bi_size */
+ u32 bytes;
+
+ /*
+ * References to this structure. There is one reference per in-flight
+ * bio plus one while we're still setting up.
+ */
+ refcount_t refs;
+
+ /* Array of checksums */
+ u8 *csums;
+
+ /* This must be last */
+ struct bio bio;
+};
+
+static struct bio_set btrfs_dio_bioset;
+
+struct btrfs_rename_ctx {
+ /* Output field. Stores the index number of the old directory entry. */
+ u64 index;
+};
+
+static const struct inode_operations btrfs_dir_inode_operations;
+static const struct inode_operations btrfs_symlink_inode_operations;
+static const struct inode_operations btrfs_special_inode_operations;
+static const struct inode_operations btrfs_file_inode_operations;
+static const struct address_space_operations btrfs_aops;
+static const struct file_operations btrfs_dir_file_operations;
+
+static struct kmem_cache *btrfs_inode_cachep;
+struct kmem_cache *btrfs_trans_handle_cachep;
+struct kmem_cache *btrfs_path_cachep;
+struct kmem_cache *btrfs_free_space_cachep;
+struct kmem_cache *btrfs_free_space_bitmap_cachep;
+
+static int btrfs_setsize(struct inode *inode, struct iattr *attr);
+static int btrfs_truncate(struct inode *inode, bool skip_writeback);
+static noinline int cow_file_range(struct btrfs_inode *inode,
+ struct page *locked_page,
+ u64 start, u64 end, int *page_started,
+ unsigned long *nr_written, int unlock,
+ u64 *done_offset);
+static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
+ u64 len, u64 orig_start, u64 block_start,
+ u64 block_len, u64 orig_block_len,
+ u64 ram_bytes, int compress_type,
+ int type);
+
+/*
+ * btrfs_inode_lock - lock inode i_rwsem based on arguments passed
+ *
+ * ilock_flags can have the following bit set:
+ *
+ * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
+ * BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt
+ * return -EAGAIN
+ * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
+ */
+int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags)
+{
+ if (ilock_flags & BTRFS_ILOCK_SHARED) {
+ if (ilock_flags & BTRFS_ILOCK_TRY) {
+ if (!inode_trylock_shared(inode))
+ return -EAGAIN;
+ else
+ return 0;
+ }
+ inode_lock_shared(inode);
+ } else {
+ if (ilock_flags & BTRFS_ILOCK_TRY) {
+ if (!inode_trylock(inode))
+ return -EAGAIN;
+ else
+ return 0;
+ }
+ inode_lock(inode);
+ }
+ if (ilock_flags & BTRFS_ILOCK_MMAP)
+ down_write(&BTRFS_I(inode)->i_mmap_lock);
+ return 0;
+}
+
+/*
+ * btrfs_inode_unlock - unock inode i_rwsem
+ *
+ * ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
+ * to decide whether the lock acquired is shared or exclusive.
+ */
+void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags)
+{
+ if (ilock_flags & BTRFS_ILOCK_MMAP)
+ up_write(&BTRFS_I(inode)->i_mmap_lock);
+ if (ilock_flags & BTRFS_ILOCK_SHARED)
+ inode_unlock_shared(inode);
+ else
+ inode_unlock(inode);
+}
+
+/*
+ * Cleanup all submitted ordered extents in specified range to handle errors
+ * from the btrfs_run_delalloc_range() callback.
+ *
+ * NOTE: caller must ensure that when an error happens, it can not call
+ * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
+ * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
+ * to be released, which we want to happen only when finishing the ordered
+ * extent (btrfs_finish_ordered_io()).
+ */
+static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
+ struct page *locked_page,
+ u64 offset, u64 bytes)
+{
+ unsigned long index = offset >> PAGE_SHIFT;
+ unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
+ u64 page_start, page_end;
+ struct page *page;
+
+ if (locked_page) {
+ page_start = page_offset(locked_page);
+ page_end = page_start + PAGE_SIZE - 1;
+ }
+
+ while (index <= end_index) {
+ /*
+ * For locked page, we will call end_extent_writepage() on it
+ * in run_delalloc_range() for the error handling. That
+ * end_extent_writepage() function will call
+ * btrfs_mark_ordered_io_finished() to clear page Ordered and
+ * run the ordered extent accounting.
+ *
+ * Here we can't just clear the Ordered bit, or
+ * btrfs_mark_ordered_io_finished() would skip the accounting
+ * for the page range, and the ordered extent will never finish.
+ */
+ if (locked_page && index == (page_start >> PAGE_SHIFT)) {
+ index++;
+ continue;
+ }
+ page = find_get_page(inode->vfs_inode.i_mapping, index);
+ index++;
+ if (!page)
+ continue;
+
+ /*
+ * Here we just clear all Ordered bits for every page in the
+ * range, then btrfs_mark_ordered_io_finished() will handle
+ * the ordered extent accounting for the range.
+ */
+ btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
+ offset, bytes);
+ put_page(page);
+ }
+
+ if (locked_page) {
+ /* The locked page covers the full range, nothing needs to be done */
+ if (bytes + offset <= page_start + PAGE_SIZE)
+ return;
+ /*
+ * In case this page belongs to the delalloc range being
+ * instantiated then skip it, since the first page of a range is
+ * going to be properly cleaned up by the caller of
+ * run_delalloc_range
+ */
+ if (page_start >= offset && page_end <= (offset + bytes - 1)) {
+ bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
+ offset = page_offset(locked_page) + PAGE_SIZE;
+ }
+ }
+
+ return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false);
+}
+
+static int btrfs_dirty_inode(struct inode *inode);
+
+static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
+ struct btrfs_new_inode_args *args)
+{
+ int err;
+
+ if (args->default_acl) {
+ err = __btrfs_set_acl(trans, args->inode, args->default_acl,
+ ACL_TYPE_DEFAULT);
+ if (err)
+ return err;
+ }
+ if (args->acl) {
+ err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS);
+ if (err)
+ return err;
+ }
+ if (!args->default_acl && !args->acl)
+ cache_no_acl(args->inode);
+ return btrfs_xattr_security_init(trans, args->inode, args->dir,
+ &args->dentry->d_name);
+}
+
+/*
+ * this does all the hard work for inserting an inline extent into
+ * the btree. The caller should have done a btrfs_drop_extents so that
+ * no overlapping inline items exist in the btree
+ */
+static int insert_inline_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_inode *inode, bool extent_inserted,
+ size_t size, size_t compressed_size,
+ int compress_type,
+ struct page **compressed_pages,
+ bool update_i_size)
+{
+ struct btrfs_root *root = inode->root;
+ struct extent_buffer *leaf;
+ struct page *page = NULL;
+ char *kaddr;
+ unsigned long ptr;
+ struct btrfs_file_extent_item *ei;
+ int ret;
+ size_t cur_size = size;
+ u64 i_size;
+
+ ASSERT((compressed_size > 0 && compressed_pages) ||
+ (compressed_size == 0 && !compressed_pages));
+
+ if (compressed_size && compressed_pages)
+ cur_size = compressed_size;
+
+ if (!extent_inserted) {
+ struct btrfs_key key;
+ size_t datasize;
+
+ key.objectid = btrfs_ino(inode);
+ key.offset = 0;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+
+ datasize = btrfs_file_extent_calc_inline_size(cur_size);
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ datasize);
+ if (ret)
+ goto fail;
+ }
+ leaf = path->nodes[0];
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, ei, trans->transid);
+ btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
+ btrfs_set_file_extent_encryption(leaf, ei, 0);
+ btrfs_set_file_extent_other_encoding(leaf, ei, 0);
+ btrfs_set_file_extent_ram_bytes(leaf, ei, size);
+ ptr = btrfs_file_extent_inline_start(ei);
+
+ if (compress_type != BTRFS_COMPRESS_NONE) {
+ struct page *cpage;
+ int i = 0;
+ while (compressed_size > 0) {
+ cpage = compressed_pages[i];
+ cur_size = min_t(unsigned long, compressed_size,
+ PAGE_SIZE);
+
+ kaddr = kmap_local_page(cpage);
+ write_extent_buffer(leaf, kaddr, ptr, cur_size);
+ kunmap_local(kaddr);
+
+ i++;
+ ptr += cur_size;
+ compressed_size -= cur_size;
+ }
+ btrfs_set_file_extent_compression(leaf, ei,
+ compress_type);
+ } else {
+ page = find_get_page(inode->vfs_inode.i_mapping, 0);
+ btrfs_set_file_extent_compression(leaf, ei, 0);
+ kaddr = kmap_local_page(page);
+ write_extent_buffer(leaf, kaddr, ptr, size);
+ kunmap_local(kaddr);
+ put_page(page);
+ }
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
+ /*
+ * We align size to sectorsize for inline extents just for simplicity
+ * sake.
+ */
+ ret = btrfs_inode_set_file_extent_range(inode, 0,
+ ALIGN(size, root->fs_info->sectorsize));
+ if (ret)
+ goto fail;
+
+ /*
+ * We're an inline extent, so nobody can extend the file past i_size
+ * without locking a page we already have locked.
+ *
+ * We must do any i_size and inode updates before we unlock the pages.
+ * Otherwise we could end up racing with unlink.
+ */
+ i_size = i_size_read(&inode->vfs_inode);
+ if (update_i_size && size > i_size) {
+ i_size_write(&inode->vfs_inode, size);
+ i_size = size;
+ }
+ inode->disk_i_size = i_size;
+
+fail:
+ return ret;
+}
+
+
+/*
+ * conditionally insert an inline extent into the file. This
+ * does the checks required to make sure the data is small enough
+ * to fit as an inline extent.
+ */
+static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
+ size_t compressed_size,
+ int compress_type,
+ struct page **compressed_pages,
+ bool update_i_size)
+{
+ struct btrfs_drop_extents_args drop_args = { 0 };
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_trans_handle *trans;
+ u64 data_len = (compressed_size ?: size);
+ int ret;
+ struct btrfs_path *path;
+
+ /*
+ * We can create an inline extent if it ends at or beyond the current
+ * i_size, is no larger than a sector (decompressed), and the (possibly
+ * compressed) data fits in a leaf and the configured maximum inline
+ * size.
+ */
+ if (size < i_size_read(&inode->vfs_inode) ||
+ size > fs_info->sectorsize ||
+ data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
+ data_len > fs_info->max_inline)
+ return 1;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ btrfs_free_path(path);
+ return PTR_ERR(trans);
+ }
+ trans->block_rsv = &inode->block_rsv;
+
+ drop_args.path = path;
+ drop_args.start = 0;
+ drop_args.end = fs_info->sectorsize;
+ drop_args.drop_cache = true;
+ drop_args.replace_extent = true;
+ drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
+ size, compressed_size, compress_type,
+ compressed_pages, update_i_size);
+ if (ret && ret != -ENOSPC) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ } else if (ret == -ENOSPC) {
+ ret = 1;
+ goto out;
+ }
+
+ btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret && ret != -ENOSPC) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ } else if (ret == -ENOSPC) {
+ ret = 1;
+ goto out;
+ }
+
+ btrfs_set_inode_full_sync(inode);
+out:
+ /*
+ * Don't forget to free the reserved space, as for inlined extent
+ * it won't count as data extent, free them directly here.
+ * And at reserve time, it's always aligned to page size, so
+ * just free one page here.
+ */
+ btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL);
+ btrfs_free_path(path);
+ btrfs_end_transaction(trans);
+ return ret;
+}
+
+struct async_extent {
+ u64 start;
+ u64 ram_size;
+ u64 compressed_size;
+ struct page **pages;
+ unsigned long nr_pages;
+ int compress_type;
+ struct list_head list;
+};
+
+struct async_chunk {
+ struct inode *inode;
+ struct page *locked_page;
+ u64 start;
+ u64 end;
+ blk_opf_t write_flags;
+ struct list_head extents;
+ struct cgroup_subsys_state *blkcg_css;
+ struct btrfs_work work;
+ struct async_cow *async_cow;
+};
+
+struct async_cow {
+ atomic_t num_chunks;
+ struct async_chunk chunks[];
+};
+
+static noinline int add_async_extent(struct async_chunk *cow,
+ u64 start, u64 ram_size,
+ u64 compressed_size,
+ struct page **pages,
+ unsigned long nr_pages,
+ int compress_type)
+{
+ struct async_extent *async_extent;
+
+ async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
+ BUG_ON(!async_extent); /* -ENOMEM */
+ async_extent->start = start;
+ async_extent->ram_size = ram_size;
+ async_extent->compressed_size = compressed_size;
+ async_extent->pages = pages;
+ async_extent->nr_pages = nr_pages;
+ async_extent->compress_type = compress_type;
+ list_add_tail(&async_extent->list, &cow->extents);
+ return 0;
+}
+
+/*
+ * Check if the inode needs to be submitted to compression, based on mount
+ * options, defragmentation, properties or heuristics.
+ */
+static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
+ u64 end)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+
+ if (!btrfs_inode_can_compress(inode)) {
+ WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
+ KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
+ btrfs_ino(inode));
+ return 0;
+ }
+ /*
+ * Special check for subpage.
+ *
+ * We lock the full page then run each delalloc range in the page, thus
+ * for the following case, we will hit some subpage specific corner case:
+ *
+ * 0 32K 64K
+ * | |///////| |///////|
+ * \- A \- B
+ *
+ * In above case, both range A and range B will try to unlock the full
+ * page [0, 64K), causing the one finished later will have page
+ * unlocked already, triggering various page lock requirement BUG_ON()s.
+ *
+ * So here we add an artificial limit that subpage compression can only
+ * if the range is fully page aligned.
+ *
+ * In theory we only need to ensure the first page is fully covered, but
+ * the tailing partial page will be locked until the full compression
+ * finishes, delaying the write of other range.
+ *
+ * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range
+ * first to prevent any submitted async extent to unlock the full page.
+ * By this, we can ensure for subpage case that only the last async_cow
+ * will unlock the full page.
+ */
+ if (fs_info->sectorsize < PAGE_SIZE) {
+ if (!PAGE_ALIGNED(start) ||
+ !PAGE_ALIGNED(end + 1))
+ return 0;
+ }
+
+ /* force compress */
+ if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
+ return 1;
+ /* defrag ioctl */
+ if (inode->defrag_compress)
+ return 1;
+ /* bad compression ratios */
+ if (inode->flags & BTRFS_INODE_NOCOMPRESS)
+ return 0;
+ if (btrfs_test_opt(fs_info, COMPRESS) ||
+ inode->flags & BTRFS_INODE_COMPRESS ||
+ inode->prop_compress)
+ return btrfs_compress_heuristic(&inode->vfs_inode, start, end);
+ return 0;
+}
+
+static inline void inode_should_defrag(struct btrfs_inode *inode,
+ u64 start, u64 end, u64 num_bytes, u32 small_write)
+{
+ /* If this is a small write inside eof, kick off a defrag */
+ if (num_bytes < small_write &&
+ (start > 0 || end + 1 < inode->disk_i_size))
+ btrfs_add_inode_defrag(NULL, inode, small_write);
+}
+
+/*
+ * we create compressed extents in two phases. The first
+ * phase compresses a range of pages that have already been
+ * locked (both pages and state bits are locked).
+ *
+ * This is done inside an ordered work queue, and the compression
+ * is spread across many cpus. The actual IO submission is step
+ * two, and the ordered work queue takes care of making sure that
+ * happens in the same order things were put onto the queue by
+ * writepages and friends.
+ *
+ * If this code finds it can't get good compression, it puts an
+ * entry onto the work queue to write the uncompressed bytes. This
+ * makes sure that both compressed inodes and uncompressed inodes
+ * are written in the same order that the flusher thread sent them
+ * down.
+ */
+static noinline int compress_file_range(struct async_chunk *async_chunk)
+{
+ struct inode *inode = async_chunk->inode;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ u64 blocksize = fs_info->sectorsize;
+ u64 start = async_chunk->start;
+ u64 end = async_chunk->end;
+ u64 actual_end;
+ u64 i_size;
+ int ret = 0;
+ struct page **pages = NULL;
+ unsigned long nr_pages;
+ unsigned long total_compressed = 0;
+ unsigned long total_in = 0;
+ int i;
+ int will_compress;
+ int compress_type = fs_info->compress_type;
+ int compressed_extents = 0;
+ int redirty = 0;
+
+ inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
+ SZ_16K);
+
+ /*
+ * We need to save i_size before now because it could change in between
+ * us evaluating the size and assigning it. This is because we lock and
+ * unlock the page in truncate and fallocate, and then modify the i_size
+ * later on.
+ *
+ * The barriers are to emulate READ_ONCE, remove that once i_size_read
+ * does that for us.
+ */
+ barrier();
+ i_size = i_size_read(inode);
+ barrier();
+ actual_end = min_t(u64, i_size, end + 1);
+again:
+ will_compress = 0;
+ nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
+ nr_pages = min_t(unsigned long, nr_pages,
+ BTRFS_MAX_COMPRESSED / PAGE_SIZE);
+
+ /*
+ * we don't want to send crud past the end of i_size through
+ * compression, that's just a waste of CPU time. So, if the
+ * end of the file is before the start of our current
+ * requested range of bytes, we bail out to the uncompressed
+ * cleanup code that can deal with all of this.
+ *
+ * It isn't really the fastest way to fix things, but this is a
+ * very uncommon corner.
+ */
+ if (actual_end <= start)
+ goto cleanup_and_bail_uncompressed;
+
+ total_compressed = actual_end - start;
+
+ /*
+ * Skip compression for a small file range(<=blocksize) that
+ * isn't an inline extent, since it doesn't save disk space at all.
+ */
+ if (total_compressed <= blocksize &&
+ (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
+ goto cleanup_and_bail_uncompressed;
+
+ /*
+ * For subpage case, we require full page alignment for the sector
+ * aligned range.
+ * Thus we must also check against @actual_end, not just @end.
+ */
+ if (blocksize < PAGE_SIZE) {
+ if (!PAGE_ALIGNED(start) ||
+ !PAGE_ALIGNED(round_up(actual_end, blocksize)))
+ goto cleanup_and_bail_uncompressed;
+ }
+
+ total_compressed = min_t(unsigned long, total_compressed,
+ BTRFS_MAX_UNCOMPRESSED);
+ total_in = 0;
+ ret = 0;
+
+ /*
+ * we do compression for mount -o compress and when the
+ * inode has not been flagged as nocompress. This flag can
+ * change at any time if we discover bad compression ratios.
+ */
+ if (inode_need_compress(BTRFS_I(inode), start, end)) {
+ WARN_ON(pages);
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+ if (!pages) {
+ /* just bail out to the uncompressed code */
+ nr_pages = 0;
+ goto cont;
+ }
+
+ if (BTRFS_I(inode)->defrag_compress)
+ compress_type = BTRFS_I(inode)->defrag_compress;
+ else if (BTRFS_I(inode)->prop_compress)
+ compress_type = BTRFS_I(inode)->prop_compress;
+
+ /*
+ * we need to call clear_page_dirty_for_io on each
+ * page in the range. Otherwise applications with the file
+ * mmap'd can wander in and change the page contents while
+ * we are compressing them.
+ *
+ * If the compression fails for any reason, we set the pages
+ * dirty again later on.
+ *
+ * Note that the remaining part is redirtied, the start pointer
+ * has moved, the end is the original one.
+ */
+ if (!redirty) {
+ extent_range_clear_dirty_for_io(inode, start, end);
+ redirty = 1;
+ }
+
+ /* Compression level is applied here and only here */
+ ret = btrfs_compress_pages(
+ compress_type | (fs_info->compress_level << 4),
+ inode->i_mapping, start,
+ pages,
+ &nr_pages,
+ &total_in,
+ &total_compressed);
+
+ if (!ret) {
+ unsigned long offset = offset_in_page(total_compressed);
+ struct page *page = pages[nr_pages - 1];
+
+ /* zero the tail end of the last page, we might be
+ * sending it down to disk
+ */
+ if (offset)
+ memzero_page(page, offset, PAGE_SIZE - offset);
+ will_compress = 1;
+ }
+ }
+cont:
+ /*
+ * Check cow_file_range() for why we don't even try to create inline
+ * extent for subpage case.
+ */
+ if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
+ /* lets try to make an inline extent */
+ if (ret || total_in < actual_end) {
+ /* we didn't compress the entire range, try
+ * to make an uncompressed inline extent.
+ */
+ ret = cow_file_range_inline(BTRFS_I(inode), actual_end,
+ 0, BTRFS_COMPRESS_NONE,
+ NULL, false);
+ } else {
+ /* try making a compressed inline extent */
+ ret = cow_file_range_inline(BTRFS_I(inode), actual_end,
+ total_compressed,
+ compress_type, pages,
+ false);
+ }
+ if (ret <= 0) {
+ unsigned long clear_flags = EXTENT_DELALLOC |
+ EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
+ EXTENT_DO_ACCOUNTING;
+ unsigned long page_error_op;
+
+ page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
+
+ /*
+ * inline extent creation worked or returned error,
+ * we don't need to create any more async work items.
+ * Unlock and free up our temp pages.
+ *
+ * We use DO_ACCOUNTING here because we need the
+ * delalloc_release_metadata to be done _after_ we drop
+ * our outstanding extent for clearing delalloc for this
+ * range.
+ */
+ extent_clear_unlock_delalloc(BTRFS_I(inode), start, end,
+ NULL,
+ clear_flags,
+ PAGE_UNLOCK |
+ PAGE_START_WRITEBACK |
+ page_error_op |
+ PAGE_END_WRITEBACK);
+
+ /*
+ * Ensure we only free the compressed pages if we have
+ * them allocated, as we can still reach here with
+ * inode_need_compress() == false.
+ */
+ if (pages) {
+ for (i = 0; i < nr_pages; i++) {
+ WARN_ON(pages[i]->mapping);
+ put_page(pages[i]);
+ }
+ kfree(pages);
+ }
+ return 0;
+ }
+ }
+
+ if (will_compress) {
+ /*
+ * we aren't doing an inline extent round the compressed size
+ * up to a block size boundary so the allocator does sane
+ * things
+ */
+ total_compressed = ALIGN(total_compressed, blocksize);
+
+ /*
+ * one last check to make sure the compression is really a
+ * win, compare the page count read with the blocks on disk,
+ * compression must free at least one sector size
+ */
+ total_in = round_up(total_in, fs_info->sectorsize);
+ if (total_compressed + blocksize <= total_in) {
+ compressed_extents++;
+
+ /*
+ * The async work queues will take care of doing actual
+ * allocation on disk for these compressed pages, and
+ * will submit them to the elevator.
+ */
+ add_async_extent(async_chunk, start, total_in,
+ total_compressed, pages, nr_pages,
+ compress_type);
+
+ if (start + total_in < end) {
+ start += total_in;
+ pages = NULL;
+ cond_resched();
+ goto again;
+ }
+ return compressed_extents;
+ }
+ }
+ if (pages) {
+ /*
+ * the compression code ran but failed to make things smaller,
+ * free any pages it allocated and our page pointer array
+ */
+ for (i = 0; i < nr_pages; i++) {
+ WARN_ON(pages[i]->mapping);
+ put_page(pages[i]);
+ }
+ kfree(pages);
+ pages = NULL;
+ total_compressed = 0;
+ nr_pages = 0;
+
+ /* flag the file so we don't compress in the future */
+ if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
+ !(BTRFS_I(inode)->prop_compress)) {
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
+ }
+ }
+cleanup_and_bail_uncompressed:
+ /*
+ * No compression, but we still need to write the pages in the file
+ * we've been given so far. redirty the locked page if it corresponds
+ * to our extent and set things up for the async work queue to run
+ * cow_file_range to do the normal delalloc dance.
+ */
+ if (async_chunk->locked_page &&
+ (page_offset(async_chunk->locked_page) >= start &&
+ page_offset(async_chunk->locked_page)) <= end) {
+ __set_page_dirty_nobuffers(async_chunk->locked_page);
+ /* unlocked later on in the async handlers */
+ }
+
+ if (redirty)
+ extent_range_redirty_for_io(inode, start, end);
+ add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
+ BTRFS_COMPRESS_NONE);
+ compressed_extents++;
+
+ return compressed_extents;
+}
+
+static void free_async_extent_pages(struct async_extent *async_extent)
+{
+ int i;
+
+ if (!async_extent->pages)
+ return;
+
+ for (i = 0; i < async_extent->nr_pages; i++) {
+ WARN_ON(async_extent->pages[i]->mapping);
+ put_page(async_extent->pages[i]);
+ }
+ kfree(async_extent->pages);
+ async_extent->nr_pages = 0;
+ async_extent->pages = NULL;
+}
+
+static int submit_uncompressed_range(struct btrfs_inode *inode,
+ struct async_extent *async_extent,
+ struct page *locked_page)
+{
+ u64 start = async_extent->start;
+ u64 end = async_extent->start + async_extent->ram_size - 1;
+ unsigned long nr_written = 0;
+ int page_started = 0;
+ int ret;
+
+ /*
+ * Call cow_file_range() to run the delalloc range directly, since we
+ * won't go to NOCOW or async path again.
+ *
+ * Also we call cow_file_range() with @unlock_page == 0, so that we
+ * can directly submit them without interruption.
+ */
+ ret = cow_file_range(inode, locked_page, start, end, &page_started,
+ &nr_written, 0, NULL);
+ /* Inline extent inserted, page gets unlocked and everything is done */
+ if (page_started) {
+ ret = 0;
+ goto out;
+ }
+ if (ret < 0) {
+ btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1);
+ if (locked_page) {
+ const u64 page_start = page_offset(locked_page);
+ const u64 page_end = page_start + PAGE_SIZE - 1;
+
+ btrfs_page_set_error(inode->root->fs_info, locked_page,
+ page_start, PAGE_SIZE);
+ set_page_writeback(locked_page);
+ end_page_writeback(locked_page);
+ end_extent_writepage(locked_page, ret, page_start, page_end);
+ unlock_page(locked_page);
+ }
+ goto out;
+ }
+
+ ret = extent_write_locked_range(&inode->vfs_inode, start, end);
+ /* All pages will be unlocked, including @locked_page */
+out:
+ kfree(async_extent);
+ return ret;
+}
+
+static int submit_one_async_extent(struct btrfs_inode *inode,
+ struct async_chunk *async_chunk,
+ struct async_extent *async_extent,
+ u64 *alloc_hint)
+{
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_key ins;
+ struct page *locked_page = NULL;
+ struct extent_map *em;
+ int ret = 0;
+ u64 start = async_extent->start;
+ u64 end = async_extent->start + async_extent->ram_size - 1;
+
+ /*
+ * If async_chunk->locked_page is in the async_extent range, we need to
+ * handle it.
+ */
+ if (async_chunk->locked_page) {
+ u64 locked_page_start = page_offset(async_chunk->locked_page);
+ u64 locked_page_end = locked_page_start + PAGE_SIZE - 1;
+
+ if (!(start >= locked_page_end || end <= locked_page_start))
+ locked_page = async_chunk->locked_page;
+ }
+ lock_extent(io_tree, start, end, NULL);
+
+ /* We have fall back to uncompressed write */
+ if (!async_extent->pages)
+ return submit_uncompressed_range(inode, async_extent, locked_page);
+
+ ret = btrfs_reserve_extent(root, async_extent->ram_size,
+ async_extent->compressed_size,
+ async_extent->compressed_size,
+ 0, *alloc_hint, &ins, 1, 1);
+ if (ret) {
+ free_async_extent_pages(async_extent);
+ /*
+ * Here we used to try again by going back to non-compressed
+ * path for ENOSPC. But we can't reserve space even for
+ * compressed size, how could it work for uncompressed size
+ * which requires larger size? So here we directly go error
+ * path.
+ */
+ goto out_free;
+ }
+
+ /* Here we're doing allocation and writeback of the compressed pages */
+ em = create_io_em(inode, start,
+ async_extent->ram_size, /* len */
+ start, /* orig_start */
+ ins.objectid, /* block_start */
+ ins.offset, /* block_len */
+ ins.offset, /* orig_block_len */
+ async_extent->ram_size, /* ram_bytes */
+ async_extent->compress_type,
+ BTRFS_ORDERED_COMPRESSED);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out_free_reserve;
+ }
+ free_extent_map(em);
+
+ ret = btrfs_add_ordered_extent(inode, start, /* file_offset */
+ async_extent->ram_size, /* num_bytes */
+ async_extent->ram_size, /* ram_bytes */
+ ins.objectid, /* disk_bytenr */
+ ins.offset, /* disk_num_bytes */
+ 0, /* offset */
+ 1 << BTRFS_ORDERED_COMPRESSED,
+ async_extent->compress_type);
+ if (ret) {
+ btrfs_drop_extent_map_range(inode, start, end, false);
+ goto out_free_reserve;
+ }
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+
+ /* Clear dirty, set writeback and unlock the pages. */
+ extent_clear_unlock_delalloc(inode, start, end,
+ NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
+ PAGE_UNLOCK | PAGE_START_WRITEBACK);
+ if (btrfs_submit_compressed_write(inode, start, /* file_offset */
+ async_extent->ram_size, /* num_bytes */
+ ins.objectid, /* disk_bytenr */
+ ins.offset, /* compressed_len */
+ async_extent->pages, /* compressed_pages */
+ async_extent->nr_pages,
+ async_chunk->write_flags,
+ async_chunk->blkcg_css, true)) {
+ const u64 start = async_extent->start;
+ const u64 end = start + async_extent->ram_size - 1;
+
+ btrfs_writepage_endio_finish_ordered(inode, NULL, start, end, 0);
+
+ extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
+ PAGE_END_WRITEBACK | PAGE_SET_ERROR);
+ free_async_extent_pages(async_extent);
+ }
+ *alloc_hint = ins.objectid + ins.offset;
+ kfree(async_extent);
+ return ret;
+
+out_free_reserve:
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
+out_free:
+ extent_clear_unlock_delalloc(inode, start, end,
+ NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_DELALLOC_NEW |
+ EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
+ PAGE_UNLOCK | PAGE_START_WRITEBACK |
+ PAGE_END_WRITEBACK | PAGE_SET_ERROR);
+ free_async_extent_pages(async_extent);
+ kfree(async_extent);
+ return ret;
+}
+
+/*
+ * Phase two of compressed writeback. This is the ordered portion of the code,
+ * which only gets called in the order the work was queued. We walk all the
+ * async extents created by compress_file_range and send them down to the disk.
+ */
+static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
+{
+ struct btrfs_inode *inode = BTRFS_I(async_chunk->inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct async_extent *async_extent;
+ u64 alloc_hint = 0;
+ int ret = 0;
+
+ while (!list_empty(&async_chunk->extents)) {
+ u64 extent_start;
+ u64 ram_size;
+
+ async_extent = list_entry(async_chunk->extents.next,
+ struct async_extent, list);
+ list_del(&async_extent->list);
+ extent_start = async_extent->start;
+ ram_size = async_extent->ram_size;
+
+ ret = submit_one_async_extent(inode, async_chunk, async_extent,
+ &alloc_hint);
+ btrfs_debug(fs_info,
+"async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
+ inode->root->root_key.objectid,
+ btrfs_ino(inode), extent_start, ram_size, ret);
+ }
+}
+
+static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
+ u64 num_bytes)
+{
+ struct extent_map_tree *em_tree = &inode->extent_tree;
+ struct extent_map *em;
+ u64 alloc_hint = 0;
+
+ read_lock(&em_tree->lock);
+ em = search_extent_mapping(em_tree, start, num_bytes);
+ if (em) {
+ /*
+ * if block start isn't an actual block number then find the
+ * first block in this inode and use that as a hint. If that
+ * block is also bogus then just don't worry about it.
+ */
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ free_extent_map(em);
+ em = search_extent_mapping(em_tree, 0, 0);
+ if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
+ alloc_hint = em->block_start;
+ if (em)
+ free_extent_map(em);
+ } else {
+ alloc_hint = em->block_start;
+ free_extent_map(em);
+ }
+ }
+ read_unlock(&em_tree->lock);
+
+ return alloc_hint;
+}
+
+/*
+ * when extent_io.c finds a delayed allocation range in the file,
+ * the call backs end up in this code. The basic idea is to
+ * allocate extents on disk for the range, and create ordered data structs
+ * in ram to track those extents.
+ *
+ * locked_page is the page that writepage had locked already. We use
+ * it to make sure we don't do extra locks or unlocks.
+ *
+ * *page_started is set to one if we unlock locked_page and do everything
+ * required to start IO on it. It may be clean and already done with
+ * IO when we return.
+ *
+ * When unlock == 1, we unlock the pages in successfully allocated regions.
+ * When unlock == 0, we leave them locked for writing them out.
+ *
+ * However, we unlock all the pages except @locked_page in case of failure.
+ *
+ * In summary, page locking state will be as follow:
+ *
+ * - page_started == 1 (return value)
+ * - All the pages are unlocked. IO is started.
+ * - Note that this can happen only on success
+ * - unlock == 1
+ * - All the pages except @locked_page are unlocked in any case
+ * - unlock == 0
+ * - On success, all the pages are locked for writing out them
+ * - On failure, all the pages except @locked_page are unlocked
+ *
+ * When a failure happens in the second or later iteration of the
+ * while-loop, the ordered extents created in previous iterations are kept
+ * intact. So, the caller must clean them up by calling
+ * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for
+ * example.
+ */
+static noinline int cow_file_range(struct btrfs_inode *inode,
+ struct page *locked_page,
+ u64 start, u64 end, int *page_started,
+ unsigned long *nr_written, int unlock,
+ u64 *done_offset)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 alloc_hint = 0;
+ u64 orig_start = start;
+ u64 num_bytes;
+ unsigned long ram_size;
+ u64 cur_alloc_size = 0;
+ u64 min_alloc_size;
+ u64 blocksize = fs_info->sectorsize;
+ struct btrfs_key ins;
+ struct extent_map *em;
+ unsigned clear_bits;
+ unsigned long page_ops;
+ bool extent_reserved = false;
+ int ret = 0;
+
+ if (btrfs_is_free_space_inode(inode)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ num_bytes = ALIGN(end - start + 1, blocksize);
+ num_bytes = max(blocksize, num_bytes);
+ ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
+
+ inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
+
+ /*
+ * Due to the page size limit, for subpage we can only trigger the
+ * writeback for the dirty sectors of page, that means data writeback
+ * is doing more writeback than what we want.
+ *
+ * This is especially unexpected for some call sites like fallocate,
+ * where we only increase i_size after everything is done.
+ * This means we can trigger inline extent even if we didn't want to.
+ * So here we skip inline extent creation completely.
+ */
+ if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
+ u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode),
+ end + 1);
+
+ /* lets try to make an inline extent */
+ ret = cow_file_range_inline(inode, actual_end, 0,
+ BTRFS_COMPRESS_NONE, NULL, false);
+ if (ret == 0) {
+ /*
+ * We use DO_ACCOUNTING here because we need the
+ * delalloc_release_metadata to be run _after_ we drop
+ * our outstanding extent for clearing delalloc for this
+ * range.
+ */
+ extent_clear_unlock_delalloc(inode, start, end,
+ locked_page,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
+ EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
+ PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
+ *nr_written = *nr_written +
+ (end - start + PAGE_SIZE) / PAGE_SIZE;
+ *page_started = 1;
+ /*
+ * locked_page is locked by the caller of
+ * writepage_delalloc(), not locked by
+ * __process_pages_contig().
+ *
+ * We can't let __process_pages_contig() to unlock it,
+ * as it doesn't have any subpage::writers recorded.
+ *
+ * Here we manually unlock the page, since the caller
+ * can't use page_started to determine if it's an
+ * inline extent or a compressed extent.
+ */
+ unlock_page(locked_page);
+ goto out;
+ } else if (ret < 0) {
+ goto out_unlock;
+ }
+ }
+
+ alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
+
+ /*
+ * Relocation relies on the relocated extents to have exactly the same
+ * size as the original extents. Normally writeback for relocation data
+ * extents follows a NOCOW path because relocation preallocates the
+ * extents. However, due to an operation such as scrub turning a block
+ * group to RO mode, it may fallback to COW mode, so we must make sure
+ * an extent allocated during COW has exactly the requested size and can
+ * not be split into smaller extents, otherwise relocation breaks and
+ * fails during the stage where it updates the bytenr of file extent
+ * items.
+ */
+ if (btrfs_is_data_reloc_root(root))
+ min_alloc_size = num_bytes;
+ else
+ min_alloc_size = fs_info->sectorsize;
+
+ while (num_bytes > 0) {
+ cur_alloc_size = num_bytes;
+ ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
+ min_alloc_size, 0, alloc_hint,
+ &ins, 1, 1);
+ if (ret < 0)
+ goto out_unlock;
+ cur_alloc_size = ins.offset;
+ extent_reserved = true;
+
+ ram_size = ins.offset;
+ em = create_io_em(inode, start, ins.offset, /* len */
+ start, /* orig_start */
+ ins.objectid, /* block_start */
+ ins.offset, /* block_len */
+ ins.offset, /* orig_block_len */
+ ram_size, /* ram_bytes */
+ BTRFS_COMPRESS_NONE, /* compress_type */
+ BTRFS_ORDERED_REGULAR /* type */);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out_reserve;
+ }
+ free_extent_map(em);
+
+ ret = btrfs_add_ordered_extent(inode, start, ram_size, ram_size,
+ ins.objectid, cur_alloc_size, 0,
+ 1 << BTRFS_ORDERED_REGULAR,
+ BTRFS_COMPRESS_NONE);
+ if (ret)
+ goto out_drop_extent_cache;
+
+ if (btrfs_is_data_reloc_root(root)) {
+ ret = btrfs_reloc_clone_csums(inode, start,
+ cur_alloc_size);
+ /*
+ * Only drop cache here, and process as normal.
+ *
+ * We must not allow extent_clear_unlock_delalloc()
+ * at out_unlock label to free meta of this ordered
+ * extent, as its meta should be freed by
+ * btrfs_finish_ordered_io().
+ *
+ * So we must continue until @start is increased to
+ * skip current ordered extent.
+ */
+ if (ret)
+ btrfs_drop_extent_map_range(inode, start,
+ start + ram_size - 1,
+ false);
+ }
+
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+
+ /*
+ * We're not doing compressed IO, don't unlock the first page
+ * (which the caller expects to stay locked), don't clear any
+ * dirty bits and don't set any writeback bits
+ *
+ * Do set the Ordered (Private2) bit so we know this page was
+ * properly setup for writepage.
+ */
+ page_ops = unlock ? PAGE_UNLOCK : 0;
+ page_ops |= PAGE_SET_ORDERED;
+
+ extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
+ locked_page,
+ EXTENT_LOCKED | EXTENT_DELALLOC,
+ page_ops);
+ if (num_bytes < cur_alloc_size)
+ num_bytes = 0;
+ else
+ num_bytes -= cur_alloc_size;
+ alloc_hint = ins.objectid + ins.offset;
+ start += cur_alloc_size;
+ extent_reserved = false;
+
+ /*
+ * btrfs_reloc_clone_csums() error, since start is increased
+ * extent_clear_unlock_delalloc() at out_unlock label won't
+ * free metadata of current ordered extent, we're OK to exit.
+ */
+ if (ret)
+ goto out_unlock;
+ }
+out:
+ return ret;
+
+out_drop_extent_cache:
+ btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false);
+out_reserve:
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
+out_unlock:
+ /*
+ * If done_offset is non-NULL and ret == -EAGAIN, we expect the
+ * caller to write out the successfully allocated region and retry.
+ */
+ if (done_offset && ret == -EAGAIN) {
+ if (orig_start < start)
+ *done_offset = start - 1;
+ else
+ *done_offset = start;
+ return ret;
+ } else if (ret == -EAGAIN) {
+ /* Convert to -ENOSPC since the caller cannot retry. */
+ ret = -ENOSPC;
+ }
+
+ /*
+ * Now, we have three regions to clean up:
+ *
+ * |-------(1)----|---(2)---|-------------(3)----------|
+ * `- orig_start `- start `- start + cur_alloc_size `- end
+ *
+ * We process each region below.
+ */
+
+ clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
+ page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
+
+ /*
+ * For the range (1). We have already instantiated the ordered extents
+ * for this region. They are cleaned up by
+ * btrfs_cleanup_ordered_extents() in e.g,
+ * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are
+ * already cleared in the above loop. And, EXTENT_DELALLOC_NEW |
+ * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup
+ * function.
+ *
+ * However, in case of unlock == 0, we still need to unlock the pages
+ * (except @locked_page) to ensure all the pages are unlocked.
+ */
+ if (!unlock && orig_start < start) {
+ if (!locked_page)
+ mapping_set_error(inode->vfs_inode.i_mapping, ret);
+ extent_clear_unlock_delalloc(inode, orig_start, start - 1,
+ locked_page, 0, page_ops);
+ }
+
+ /*
+ * For the range (2). If we reserved an extent for our delalloc range
+ * (or a subrange) and failed to create the respective ordered extent,
+ * then it means that when we reserved the extent we decremented the
+ * extent's size from the data space_info's bytes_may_use counter and
+ * incremented the space_info's bytes_reserved counter by the same
+ * amount. We must make sure extent_clear_unlock_delalloc() does not try
+ * to decrement again the data space_info's bytes_may_use counter,
+ * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
+ */
+ if (extent_reserved) {
+ extent_clear_unlock_delalloc(inode, start,
+ start + cur_alloc_size - 1,
+ locked_page,
+ clear_bits,
+ page_ops);
+ start += cur_alloc_size;
+ }
+
+ /*
+ * For the range (3). We never touched the region. In addition to the
+ * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data
+ * space_info's bytes_may_use counter, reserved in
+ * btrfs_check_data_free_space().
+ */
+ if (start < end) {
+ clear_bits |= EXTENT_CLEAR_DATA_RESV;
+ extent_clear_unlock_delalloc(inode, start, end, locked_page,
+ clear_bits, page_ops);
+ }
+ return ret;
+}
+
+/*
+ * work queue call back to started compression on a file and pages
+ */
+static noinline void async_cow_start(struct btrfs_work *work)
+{
+ struct async_chunk *async_chunk;
+ int compressed_extents;
+
+ async_chunk = container_of(work, struct async_chunk, work);
+
+ compressed_extents = compress_file_range(async_chunk);
+ if (compressed_extents == 0) {
+ btrfs_add_delayed_iput(async_chunk->inode);
+ async_chunk->inode = NULL;
+ }
+}
+
+/*
+ * work queue call back to submit previously compressed pages
+ */
+static noinline void async_cow_submit(struct btrfs_work *work)
+{
+ struct async_chunk *async_chunk = container_of(work, struct async_chunk,
+ work);
+ struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
+ unsigned long nr_pages;
+
+ nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
+ PAGE_SHIFT;
+
+ /*
+ * ->inode could be NULL if async_chunk_start has failed to compress,
+ * in which case we don't have anything to submit, yet we need to
+ * always adjust ->async_delalloc_pages as its paired with the init
+ * happening in cow_file_range_async
+ */
+ if (async_chunk->inode)
+ submit_compressed_extents(async_chunk);
+
+ /* atomic_sub_return implies a barrier */
+ if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
+ 5 * SZ_1M)
+ cond_wake_up_nomb(&fs_info->async_submit_wait);
+}
+
+static noinline void async_cow_free(struct btrfs_work *work)
+{
+ struct async_chunk *async_chunk;
+ struct async_cow *async_cow;
+
+ async_chunk = container_of(work, struct async_chunk, work);
+ if (async_chunk->inode)
+ btrfs_add_delayed_iput(async_chunk->inode);
+ if (async_chunk->blkcg_css)
+ css_put(async_chunk->blkcg_css);
+
+ async_cow = async_chunk->async_cow;
+ if (atomic_dec_and_test(&async_cow->num_chunks))
+ kvfree(async_cow);
+}
+
+static int cow_file_range_async(struct btrfs_inode *inode,
+ struct writeback_control *wbc,
+ struct page *locked_page,
+ u64 start, u64 end, int *page_started,
+ unsigned long *nr_written)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
+ struct async_cow *ctx;
+ struct async_chunk *async_chunk;
+ unsigned long nr_pages;
+ u64 cur_end;
+ u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
+ int i;
+ bool should_compress;
+ unsigned nofs_flag;
+ const blk_opf_t write_flags = wbc_to_write_flags(wbc);
+
+ unlock_extent(&inode->io_tree, start, end, NULL);
+
+ if (inode->flags & BTRFS_INODE_NOCOMPRESS &&
+ !btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
+ num_chunks = 1;
+ should_compress = false;
+ } else {
+ should_compress = true;
+ }
+
+ nofs_flag = memalloc_nofs_save();
+ ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
+
+ if (!ctx) {
+ unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
+ EXTENT_DO_ACCOUNTING;
+ unsigned long page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK |
+ PAGE_END_WRITEBACK | PAGE_SET_ERROR;
+
+ extent_clear_unlock_delalloc(inode, start, end, locked_page,
+ clear_bits, page_ops);
+ return -ENOMEM;
+ }
+
+ async_chunk = ctx->chunks;
+ atomic_set(&ctx->num_chunks, num_chunks);
+
+ for (i = 0; i < num_chunks; i++) {
+ if (should_compress)
+ cur_end = min(end, start + SZ_512K - 1);
+ else
+ cur_end = end;
+
+ /*
+ * igrab is called higher up in the call chain, take only the
+ * lightweight reference for the callback lifetime
+ */
+ ihold(&inode->vfs_inode);
+ async_chunk[i].async_cow = ctx;
+ async_chunk[i].inode = &inode->vfs_inode;
+ async_chunk[i].start = start;
+ async_chunk[i].end = cur_end;
+ async_chunk[i].write_flags = write_flags;
+ INIT_LIST_HEAD(&async_chunk[i].extents);
+
+ /*
+ * The locked_page comes all the way from writepage and its
+ * the original page we were actually given. As we spread
+ * this large delalloc region across multiple async_chunk
+ * structs, only the first struct needs a pointer to locked_page
+ *
+ * This way we don't need racey decisions about who is supposed
+ * to unlock it.
+ */
+ if (locked_page) {
+ /*
+ * Depending on the compressibility, the pages might or
+ * might not go through async. We want all of them to
+ * be accounted against wbc once. Let's do it here
+ * before the paths diverge. wbc accounting is used
+ * only for foreign writeback detection and doesn't
+ * need full accuracy. Just account the whole thing
+ * against the first page.
+ */
+ wbc_account_cgroup_owner(wbc, locked_page,
+ cur_end - start);
+ async_chunk[i].locked_page = locked_page;
+ locked_page = NULL;
+ } else {
+ async_chunk[i].locked_page = NULL;
+ }
+
+ if (blkcg_css != blkcg_root_css) {
+ css_get(blkcg_css);
+ async_chunk[i].blkcg_css = blkcg_css;
+ } else {
+ async_chunk[i].blkcg_css = NULL;
+ }
+
+ btrfs_init_work(&async_chunk[i].work, async_cow_start,
+ async_cow_submit, async_cow_free);
+
+ nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
+ atomic_add(nr_pages, &fs_info->async_delalloc_pages);
+
+ btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);
+
+ *nr_written += nr_pages;
+ start = cur_end + 1;
+ }
+ *page_started = 1;
+ return 0;
+}
+
+static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
+ struct page *locked_page, u64 start,
+ u64 end, int *page_started,
+ unsigned long *nr_written)
+{
+ u64 done_offset = end;
+ int ret;
+ bool locked_page_done = false;
+
+ while (start <= end) {
+ ret = cow_file_range(inode, locked_page, start, end, page_started,
+ nr_written, 0, &done_offset);
+ if (ret && ret != -EAGAIN)
+ return ret;
+
+ if (*page_started) {
+ ASSERT(ret == 0);
+ return 0;
+ }
+
+ if (ret == 0)
+ done_offset = end;
+
+ if (done_offset == start) {
+ wait_on_bit_io(&inode->root->fs_info->flags,
+ BTRFS_FS_NEED_ZONE_FINISH,
+ TASK_UNINTERRUPTIBLE);
+ continue;
+ }
+
+ if (!locked_page_done) {
+ __set_page_dirty_nobuffers(locked_page);
+ account_page_redirty(locked_page);
+ }
+ locked_page_done = true;
+ extent_write_locked_range(&inode->vfs_inode, start, done_offset);
+
+ start = done_offset + 1;
+ }
+
+ *page_started = 1;
+
+ return 0;
+}
+
+static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
+ u64 bytenr, u64 num_bytes, bool nowait)
+{
+ struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr);
+ struct btrfs_ordered_sum *sums;
+ int ret;
+ LIST_HEAD(list);
+
+ ret = btrfs_lookup_csums_range(csum_root, bytenr,
+ bytenr + num_bytes - 1, &list, 0,
+ nowait);
+ if (ret == 0 && list_empty(&list))
+ return 0;
+
+ while (!list_empty(&list)) {
+ sums = list_entry(list.next, struct btrfs_ordered_sum, list);
+ list_del(&sums->list);
+ kfree(sums);
+ }
+ if (ret < 0)
+ return ret;
+ return 1;
+}
+
+static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
+ const u64 start, const u64 end,
+ int *page_started, unsigned long *nr_written)
+{
+ const bool is_space_ino = btrfs_is_free_space_inode(inode);
+ const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
+ const u64 range_bytes = end + 1 - start;
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ u64 range_start = start;
+ u64 count;
+
+ /*
+ * If EXTENT_NORESERVE is set it means that when the buffered write was
+ * made we had not enough available data space and therefore we did not
+ * reserve data space for it, since we though we could do NOCOW for the
+ * respective file range (either there is prealloc extent or the inode
+ * has the NOCOW bit set).
+ *
+ * However when we need to fallback to COW mode (because for example the
+ * block group for the corresponding extent was turned to RO mode by a
+ * scrub or relocation) we need to do the following:
+ *
+ * 1) We increment the bytes_may_use counter of the data space info.
+ * If COW succeeds, it allocates a new data extent and after doing
+ * that it decrements the space info's bytes_may_use counter and
+ * increments its bytes_reserved counter by the same amount (we do
+ * this at btrfs_add_reserved_bytes()). So we need to increment the
+ * bytes_may_use counter to compensate (when space is reserved at
+ * buffered write time, the bytes_may_use counter is incremented);
+ *
+ * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
+ * that if the COW path fails for any reason, it decrements (through
+ * extent_clear_unlock_delalloc()) the bytes_may_use counter of the
+ * data space info, which we incremented in the step above.
+ *
+ * If we need to fallback to cow and the inode corresponds to a free
+ * space cache inode or an inode of the data relocation tree, we must
+ * also increment bytes_may_use of the data space_info for the same
+ * reason. Space caches and relocated data extents always get a prealloc
+ * extent for them, however scrub or balance may have set the block
+ * group that contains that extent to RO mode and therefore force COW
+ * when starting writeback.
+ */
+ count = count_range_bits(io_tree, &range_start, end, range_bytes,
+ EXTENT_NORESERVE, 0);
+ if (count > 0 || is_space_ino || is_reloc_ino) {
+ u64 bytes = count;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_space_info *sinfo = fs_info->data_sinfo;
+
+ if (is_space_ino || is_reloc_ino)
+ bytes = range_bytes;
+
+ spin_lock(&sinfo->lock);
+ btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
+ spin_unlock(&sinfo->lock);
+
+ if (count > 0)
+ clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
+ NULL);
+ }
+
+ return cow_file_range(inode, locked_page, start, end, page_started,
+ nr_written, 1, NULL);
+}
+
+struct can_nocow_file_extent_args {
+ /* Input fields. */
+
+ /* Start file offset of the range we want to NOCOW. */
+ u64 start;
+ /* End file offset (inclusive) of the range we want to NOCOW. */
+ u64 end;
+ bool writeback_path;
+ bool strict;
+ /*
+ * Free the path passed to can_nocow_file_extent() once it's not needed
+ * anymore.
+ */
+ bool free_path;
+
+ /* Output fields. Only set when can_nocow_file_extent() returns 1. */
+
+ u64 disk_bytenr;
+ u64 disk_num_bytes;
+ u64 extent_offset;
+ /* Number of bytes that can be written to in NOCOW mode. */
+ u64 num_bytes;
+};
+
+/*
+ * Check if we can NOCOW the file extent that the path points to.
+ * This function may return with the path released, so the caller should check
+ * if path->nodes[0] is NULL or not if it needs to use the path afterwards.
+ *
+ * Returns: < 0 on error
+ * 0 if we can not NOCOW
+ * 1 if we can NOCOW
+ */
+static int can_nocow_file_extent(struct btrfs_path *path,
+ struct btrfs_key *key,
+ struct btrfs_inode *inode,
+ struct can_nocow_file_extent_args *args)
+{
+ const bool is_freespace_inode = btrfs_is_free_space_inode(inode);
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_root *root = inode->root;
+ struct btrfs_file_extent_item *fi;
+ u64 extent_end;
+ u8 extent_type;
+ int can_nocow = 0;
+ int ret = 0;
+ bool nowait = path->nowait;
+
+ fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(leaf, fi);
+
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+ goto out;
+
+ /* Can't access these fields unless we know it's not an inline extent. */
+ args->disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ args->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ args->extent_offset = btrfs_file_extent_offset(leaf, fi);
+
+ if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
+ extent_type == BTRFS_FILE_EXTENT_REG)
+ goto out;
+
+ /*
+ * If the extent was created before the generation where the last snapshot
+ * for its subvolume was created, then this implies the extent is shared,
+ * hence we must COW.
+ */
+ if (!args->strict &&
+ btrfs_file_extent_generation(leaf, fi) <=
+ btrfs_root_last_snapshot(&root->root_item))
+ goto out;
+
+ /* An explicit hole, must COW. */
+ if (args->disk_bytenr == 0)
+ goto out;
+
+ /* Compressed/encrypted/encoded extents must be COWed. */
+ if (btrfs_file_extent_compression(leaf, fi) ||
+ btrfs_file_extent_encryption(leaf, fi) ||
+ btrfs_file_extent_other_encoding(leaf, fi))
+ goto out;
+
+ extent_end = btrfs_file_extent_end(path);
+
+ /*
+ * The following checks can be expensive, as they need to take other
+ * locks and do btree or rbtree searches, so release the path to avoid
+ * blocking other tasks for too long.
+ */
+ btrfs_release_path(path);
+
+ ret = btrfs_cross_ref_exist(root, btrfs_ino(inode),
+ key->offset - args->extent_offset,
+ args->disk_bytenr, args->strict, path);
+ WARN_ON_ONCE(ret > 0 && is_freespace_inode);
+ if (ret != 0)
+ goto out;
+
+ if (args->free_path) {
+ /*
+ * We don't need the path anymore, plus through the
+ * csum_exist_in_range() call below we will end up allocating
+ * another path. So free the path to avoid unnecessary extra
+ * memory usage.
+ */
+ btrfs_free_path(path);
+ path = NULL;
+ }
+
+ /* If there are pending snapshots for this root, we must COW. */
+ if (args->writeback_path && !is_freespace_inode &&
+ atomic_read(&root->snapshot_force_cow))
+ goto out;
+
+ args->disk_bytenr += args->extent_offset;
+ args->disk_bytenr += args->start - key->offset;
+ args->num_bytes = min(args->end + 1, extent_end) - args->start;
+
+ /*
+ * Force COW if csums exist in the range. This ensures that csums for a
+ * given extent are either valid or do not exist.
+ */
+ ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes,
+ nowait);
+ WARN_ON_ONCE(ret > 0 && is_freespace_inode);
+ if (ret != 0)
+ goto out;
+
+ can_nocow = 1;
+ out:
+ if (args->free_path && path)
+ btrfs_free_path(path);
+
+ return ret < 0 ? ret : can_nocow;
+}
+
+/*
+ * when nowcow writeback call back. This checks for snapshots or COW copies
+ * of the extents that exist in the file, and COWs the file as required.
+ *
+ * If no cow copies or snapshots exist, we write directly to the existing
+ * blocks on disk
+ */
+static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
+ struct page *locked_page,
+ const u64 start, const u64 end,
+ int *page_started,
+ unsigned long *nr_written)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_path *path;
+ u64 cow_start = (u64)-1;
+ u64 cur_offset = start;
+ int ret;
+ bool check_prev = true;
+ u64 ino = btrfs_ino(inode);
+ struct btrfs_block_group *bg;
+ bool nocow = false;
+ struct can_nocow_file_extent_args nocow_args = { 0 };
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ extent_clear_unlock_delalloc(inode, start, end, locked_page,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, PAGE_UNLOCK |
+ PAGE_START_WRITEBACK |
+ PAGE_END_WRITEBACK);
+ return -ENOMEM;
+ }
+
+ nocow_args.end = end;
+ nocow_args.writeback_path = true;
+
+ while (1) {
+ struct btrfs_key found_key;
+ struct btrfs_file_extent_item *fi;
+ struct extent_buffer *leaf;
+ u64 extent_end;
+ u64 ram_bytes;
+ u64 nocow_end;
+ int extent_type;
+
+ nocow = false;
+
+ ret = btrfs_lookup_file_extent(NULL, root, path, ino,
+ cur_offset, 0);
+ if (ret < 0)
+ goto error;
+
+ /*
+ * If there is no extent for our range when doing the initial
+ * search, then go back to the previous slot as it will be the
+ * one containing the search offset
+ */
+ if (ret > 0 && path->slots[0] > 0 && check_prev) {
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key,
+ path->slots[0] - 1);
+ if (found_key.objectid == ino &&
+ found_key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
+ check_prev = false;
+next_slot:
+ /* Go to next leaf if we have exhausted the current one */
+ leaf = path->nodes[0];
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ if (cow_start != (u64)-1)
+ cur_offset = cow_start;
+ goto error;
+ }
+ if (ret > 0)
+ break;
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ /* Didn't find anything for our INO */
+ if (found_key.objectid > ino)
+ break;
+ /*
+ * Keep searching until we find an EXTENT_ITEM or there are no
+ * more extents for this inode
+ */
+ if (WARN_ON_ONCE(found_key.objectid < ino) ||
+ found_key.type < BTRFS_EXTENT_DATA_KEY) {
+ path->slots[0]++;
+ goto next_slot;
+ }
+
+ /* Found key is not EXTENT_DATA_KEY or starts after req range */
+ if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
+ found_key.offset > end)
+ break;
+
+ /*
+ * If the found extent starts after requested offset, then
+ * adjust extent_end to be right before this extent begins
+ */
+ if (found_key.offset > cur_offset) {
+ extent_end = found_key.offset;
+ extent_type = 0;
+ goto out_check;
+ }
+
+ /*
+ * Found extent which begins before our range and potentially
+ * intersect it
+ */
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(leaf, fi);
+ /* If this is triggered then we have a memory corruption. */
+ ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES);
+ if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) {
+ ret = -EUCLEAN;
+ goto error;
+ }
+ ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+ extent_end = btrfs_file_extent_end(path);
+
+ /*
+ * If the extent we got ends before our current offset, skip to
+ * the next extent.
+ */
+ if (extent_end <= cur_offset) {
+ path->slots[0]++;
+ goto next_slot;
+ }
+
+ nocow_args.start = cur_offset;
+ ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args);
+ if (ret < 0) {
+ if (cow_start != (u64)-1)
+ cur_offset = cow_start;
+ goto error;
+ } else if (ret == 0) {
+ goto out_check;
+ }
+
+ ret = 0;
+ bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr);
+ if (bg)
+ nocow = true;
+out_check:
+ /*
+ * If nocow is false then record the beginning of the range
+ * that needs to be COWed
+ */
+ if (!nocow) {
+ if (cow_start == (u64)-1)
+ cow_start = cur_offset;
+ cur_offset = extent_end;
+ if (cur_offset > end)
+ break;
+ if (!path->nodes[0])
+ continue;
+ path->slots[0]++;
+ goto next_slot;
+ }
+
+ /*
+ * COW range from cow_start to found_key.offset - 1. As the key
+ * will contain the beginning of the first extent that can be
+ * NOCOW, following one which needs to be COW'ed
+ */
+ if (cow_start != (u64)-1) {
+ ret = fallback_to_cow(inode, locked_page,
+ cow_start, found_key.offset - 1,
+ page_started, nr_written);
+ if (ret)
+ goto error;
+ cow_start = (u64)-1;
+ }
+
+ nocow_end = cur_offset + nocow_args.num_bytes - 1;
+
+ if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ u64 orig_start = found_key.offset - nocow_args.extent_offset;
+ struct extent_map *em;
+
+ em = create_io_em(inode, cur_offset, nocow_args.num_bytes,
+ orig_start,
+ nocow_args.disk_bytenr, /* block_start */
+ nocow_args.num_bytes, /* block_len */
+ nocow_args.disk_num_bytes, /* orig_block_len */
+ ram_bytes, BTRFS_COMPRESS_NONE,
+ BTRFS_ORDERED_PREALLOC);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto error;
+ }
+ free_extent_map(em);
+ ret = btrfs_add_ordered_extent(inode,
+ cur_offset, nocow_args.num_bytes,
+ nocow_args.num_bytes,
+ nocow_args.disk_bytenr,
+ nocow_args.num_bytes, 0,
+ 1 << BTRFS_ORDERED_PREALLOC,
+ BTRFS_COMPRESS_NONE);
+ if (ret) {
+ btrfs_drop_extent_map_range(inode, cur_offset,
+ nocow_end, false);
+ goto error;
+ }
+ } else {
+ ret = btrfs_add_ordered_extent(inode, cur_offset,
+ nocow_args.num_bytes,
+ nocow_args.num_bytes,
+ nocow_args.disk_bytenr,
+ nocow_args.num_bytes,
+ 0,
+ 1 << BTRFS_ORDERED_NOCOW,
+ BTRFS_COMPRESS_NONE);
+ if (ret)
+ goto error;
+ }
+
+ if (nocow) {
+ btrfs_dec_nocow_writers(bg);
+ nocow = false;
+ }
+
+ if (btrfs_is_data_reloc_root(root))
+ /*
+ * Error handled later, as we must prevent
+ * extent_clear_unlock_delalloc() in error handler
+ * from freeing metadata of created ordered extent.
+ */
+ ret = btrfs_reloc_clone_csums(inode, cur_offset,
+ nocow_args.num_bytes);
+
+ extent_clear_unlock_delalloc(inode, cur_offset, nocow_end,
+ locked_page, EXTENT_LOCKED |
+ EXTENT_DELALLOC |
+ EXTENT_CLEAR_DATA_RESV,
+ PAGE_UNLOCK | PAGE_SET_ORDERED);
+
+ cur_offset = extent_end;
+
+ /*
+ * btrfs_reloc_clone_csums() error, now we're OK to call error
+ * handler, as metadata for created ordered extent will only
+ * be freed by btrfs_finish_ordered_io().
+ */
+ if (ret)
+ goto error;
+ if (cur_offset > end)
+ break;
+ }
+ btrfs_release_path(path);
+
+ if (cur_offset <= end && cow_start == (u64)-1)
+ cow_start = cur_offset;
+
+ if (cow_start != (u64)-1) {
+ cur_offset = end;
+ ret = fallback_to_cow(inode, locked_page, cow_start, end,
+ page_started, nr_written);
+ if (ret)
+ goto error;
+ }
+
+error:
+ if (nocow)
+ btrfs_dec_nocow_writers(bg);
+
+ if (ret && cur_offset < end)
+ extent_clear_unlock_delalloc(inode, cur_offset, end,
+ locked_page, EXTENT_LOCKED |
+ EXTENT_DELALLOC | EXTENT_DEFRAG |
+ EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
+ PAGE_START_WRITEBACK |
+ PAGE_END_WRITEBACK);
+ btrfs_free_path(path);
+ return ret;
+}
+
+static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
+{
+ if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
+ if (inode->defrag_bytes &&
+ test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG,
+ 0, NULL))
+ return false;
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Function to process delayed allocation (create CoW) for ranges which are
+ * being touched for the first time.
+ */
+int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
+ u64 start, u64 end, int *page_started, unsigned long *nr_written,
+ struct writeback_control *wbc)
+{
+ int ret;
+ const bool zoned = btrfs_is_zoned(inode->root->fs_info);
+
+ /*
+ * The range must cover part of the @locked_page, or the returned
+ * @page_started can confuse the caller.
+ */
+ ASSERT(!(end <= page_offset(locked_page) ||
+ start >= page_offset(locked_page) + PAGE_SIZE));
+
+ if (should_nocow(inode, start, end)) {
+ /*
+ * Normally on a zoned device we're only doing COW writes, but
+ * in case of relocation on a zoned filesystem we have taken
+ * precaution, that we're only writing sequentially. It's safe
+ * to use run_delalloc_nocow() here, like for regular
+ * preallocated inodes.
+ */
+ ASSERT(!zoned || btrfs_is_data_reloc_root(inode->root));
+ ret = run_delalloc_nocow(inode, locked_page, start, end,
+ page_started, nr_written);
+ } else if (!btrfs_inode_can_compress(inode) ||
+ !inode_need_compress(inode, start, end)) {
+ if (zoned)
+ ret = run_delalloc_zoned(inode, locked_page, start, end,
+ page_started, nr_written);
+ else
+ ret = cow_file_range(inode, locked_page, start, end,
+ page_started, nr_written, 1, NULL);
+ } else {
+ set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
+ ret = cow_file_range_async(inode, wbc, locked_page, start, end,
+ page_started, nr_written);
+ }
+ ASSERT(ret <= 0);
+ if (ret)
+ btrfs_cleanup_ordered_extents(inode, locked_page, start,
+ end - start + 1);
+ return ret;
+}
+
+void btrfs_split_delalloc_extent(struct inode *inode,
+ struct extent_state *orig, u64 split)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ u64 size;
+
+ /* not delalloc, ignore it */
+ if (!(orig->state & EXTENT_DELALLOC))
+ return;
+
+ size = orig->end - orig->start + 1;
+ if (size > fs_info->max_extent_size) {
+ u32 num_extents;
+ u64 new_size;
+
+ /*
+ * See the explanation in btrfs_merge_delalloc_extent, the same
+ * applies here, just in reverse.
+ */
+ new_size = orig->end - split + 1;
+ num_extents = count_max_extents(fs_info, new_size);
+ new_size = split - orig->start;
+ num_extents += count_max_extents(fs_info, new_size);
+ if (count_max_extents(fs_info, size) >= num_extents)
+ return;
+ }
+
+ spin_lock(&BTRFS_I(inode)->lock);
+ btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
+ spin_unlock(&BTRFS_I(inode)->lock);
+}
+
+/*
+ * Handle merged delayed allocation extents so we can keep track of new extents
+ * that are just merged onto old extents, such as when we are doing sequential
+ * writes, so we can properly account for the metadata space we'll need.
+ */
+void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
+ struct extent_state *other)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ u64 new_size, old_size;
+ u32 num_extents;
+
+ /* not delalloc, ignore it */
+ if (!(other->state & EXTENT_DELALLOC))
+ return;
+
+ if (new->start > other->start)
+ new_size = new->end - other->start + 1;
+ else
+ new_size = other->end - new->start + 1;
+
+ /* we're not bigger than the max, unreserve the space and go */
+ if (new_size <= fs_info->max_extent_size) {
+ spin_lock(&BTRFS_I(inode)->lock);
+ btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
+ spin_unlock(&BTRFS_I(inode)->lock);
+ return;
+ }
+
+ /*
+ * We have to add up either side to figure out how many extents were
+ * accounted for before we merged into one big extent. If the number of
+ * extents we accounted for is <= the amount we need for the new range
+ * then we can return, otherwise drop. Think of it like this
+ *
+ * [ 4k][MAX_SIZE]
+ *
+ * So we've grown the extent by a MAX_SIZE extent, this would mean we
+ * need 2 outstanding extents, on one side we have 1 and the other side
+ * we have 1 so they are == and we can return. But in this case
+ *
+ * [MAX_SIZE+4k][MAX_SIZE+4k]
+ *
+ * Each range on their own accounts for 2 extents, but merged together
+ * they are only 3 extents worth of accounting, so we need to drop in
+ * this case.
+ */
+ old_size = other->end - other->start + 1;
+ num_extents = count_max_extents(fs_info, old_size);
+ old_size = new->end - new->start + 1;
+ num_extents += count_max_extents(fs_info, old_size);
+ if (count_max_extents(fs_info, new_size) >= num_extents)
+ return;
+
+ spin_lock(&BTRFS_I(inode)->lock);
+ btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
+ spin_unlock(&BTRFS_I(inode)->lock);
+}
+
+static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
+ struct inode *inode)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+
+ spin_lock(&root->delalloc_lock);
+ if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+ list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
+ &root->delalloc_inodes);
+ set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+ &BTRFS_I(inode)->runtime_flags);
+ root->nr_delalloc_inodes++;
+ if (root->nr_delalloc_inodes == 1) {
+ spin_lock(&fs_info->delalloc_root_lock);
+ BUG_ON(!list_empty(&root->delalloc_root));
+ list_add_tail(&root->delalloc_root,
+ &fs_info->delalloc_roots);
+ spin_unlock(&fs_info->delalloc_root_lock);
+ }
+ }
+ spin_unlock(&root->delalloc_lock);
+}
+
+
+void __btrfs_del_delalloc_inode(struct btrfs_root *root,
+ struct btrfs_inode *inode)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ if (!list_empty(&inode->delalloc_inodes)) {
+ list_del_init(&inode->delalloc_inodes);
+ clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+ &inode->runtime_flags);
+ root->nr_delalloc_inodes--;
+ if (!root->nr_delalloc_inodes) {
+ ASSERT(list_empty(&root->delalloc_inodes));
+ spin_lock(&fs_info->delalloc_root_lock);
+ BUG_ON(list_empty(&root->delalloc_root));
+ list_del_init(&root->delalloc_root);
+ spin_unlock(&fs_info->delalloc_root_lock);
+ }
+ }
+}
+
+static void btrfs_del_delalloc_inode(struct btrfs_root *root,
+ struct btrfs_inode *inode)
+{
+ spin_lock(&root->delalloc_lock);
+ __btrfs_del_delalloc_inode(root, inode);
+ spin_unlock(&root->delalloc_lock);
+}
+
+/*
+ * Properly track delayed allocation bytes in the inode and to maintain the
+ * list of inodes that have pending delalloc work to be done.
+ */
+void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
+ u32 bits)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+
+ if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
+ WARN_ON(1);
+ /*
+ * set_bit and clear bit hooks normally require _irqsave/restore
+ * but in this case, we are only testing for the DELALLOC
+ * bit, which is only set or cleared with irqs on
+ */
+ if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 len = state->end + 1 - state->start;
+ u32 num_extents = count_max_extents(fs_info, len);
+ bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
+
+ spin_lock(&BTRFS_I(inode)->lock);
+ btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents);
+ spin_unlock(&BTRFS_I(inode)->lock);
+
+ /* For sanity tests */
+ if (btrfs_is_testing(fs_info))
+ return;
+
+ percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
+ fs_info->delalloc_batch);
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->delalloc_bytes += len;
+ if (bits & EXTENT_DEFRAG)
+ BTRFS_I(inode)->defrag_bytes += len;
+ if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+ &BTRFS_I(inode)->runtime_flags))
+ btrfs_add_delalloc_inodes(root, inode);
+ spin_unlock(&BTRFS_I(inode)->lock);
+ }
+
+ if (!(state->state & EXTENT_DELALLOC_NEW) &&
+ (bits & EXTENT_DELALLOC_NEW)) {
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
+ state->start;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ }
+}
+
+/*
+ * Once a range is no longer delalloc this function ensures that proper
+ * accounting happens.
+ */
+void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
+ struct extent_state *state, u32 bits)
+{
+ struct btrfs_inode *inode = BTRFS_I(vfs_inode);
+ struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb);
+ u64 len = state->end + 1 - state->start;
+ u32 num_extents = count_max_extents(fs_info, len);
+
+ if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) {
+ spin_lock(&inode->lock);
+ inode->defrag_bytes -= len;
+ spin_unlock(&inode->lock);
+ }
+
+ /*
+ * set_bit and clear bit hooks normally require _irqsave/restore
+ * but in this case, we are only testing for the DELALLOC
+ * bit, which is only set or cleared with irqs on
+ */
+ if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+ struct btrfs_root *root = inode->root;
+ bool do_list = !btrfs_is_free_space_inode(inode);
+
+ spin_lock(&inode->lock);
+ btrfs_mod_outstanding_extents(inode, -num_extents);
+ spin_unlock(&inode->lock);
+
+ /*
+ * We don't reserve metadata space for space cache inodes so we
+ * don't need to call delalloc_release_metadata if there is an
+ * error.
+ */
+ if (bits & EXTENT_CLEAR_META_RESV &&
+ root != fs_info->tree_root)
+ btrfs_delalloc_release_metadata(inode, len, false);
+
+ /* For sanity tests. */
+ if (btrfs_is_testing(fs_info))
+ return;
+
+ if (!btrfs_is_data_reloc_root(root) &&
+ do_list && !(state->state & EXTENT_NORESERVE) &&
+ (bits & EXTENT_CLEAR_DATA_RESV))
+ btrfs_free_reserved_data_space_noquota(fs_info, len);
+
+ percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
+ fs_info->delalloc_batch);
+ spin_lock(&inode->lock);
+ inode->delalloc_bytes -= len;
+ if (do_list && inode->delalloc_bytes == 0 &&
+ test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+ &inode->runtime_flags))
+ btrfs_del_delalloc_inode(root, inode);
+ spin_unlock(&inode->lock);
+ }
+
+ if ((state->state & EXTENT_DELALLOC_NEW) &&
+ (bits & EXTENT_DELALLOC_NEW)) {
+ spin_lock(&inode->lock);
+ ASSERT(inode->new_delalloc_bytes >= len);
+ inode->new_delalloc_bytes -= len;
+ if (bits & EXTENT_ADD_INODE_BYTES)
+ inode_add_bytes(&inode->vfs_inode, len);
+ spin_unlock(&inode->lock);
+ }
+}
+
+/*
+ * in order to insert checksums into the metadata in large chunks,
+ * we wait until bio submission time. All the pages in the bio are
+ * checksummed and sums are attached onto the ordered extent record.
+ *
+ * At IO completion time the cums attached on the ordered extent record
+ * are inserted into the btree
+ */
+static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
+ u64 dio_file_offset)
+{
+ return btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false);
+}
+
+/*
+ * Split an extent_map at [start, start + len]
+ *
+ * This function is intended to be used only for extract_ordered_extent().
+ */
+static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
+ u64 pre, u64 post)
+{
+ struct extent_map_tree *em_tree = &inode->extent_tree;
+ struct extent_map *em;
+ struct extent_map *split_pre = NULL;
+ struct extent_map *split_mid = NULL;
+ struct extent_map *split_post = NULL;
+ int ret = 0;
+ unsigned long flags;
+
+ /* Sanity check */
+ if (pre == 0 && post == 0)
+ return 0;
+
+ split_pre = alloc_extent_map();
+ if (pre)
+ split_mid = alloc_extent_map();
+ if (post)
+ split_post = alloc_extent_map();
+ if (!split_pre || (pre && !split_mid) || (post && !split_post)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ASSERT(pre + post < len);
+
+ lock_extent(&inode->io_tree, start, start + len - 1, NULL);
+ write_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, len);
+ if (!em) {
+ ret = -EIO;
+ goto out_unlock;
+ }
+
+ ASSERT(em->len == len);
+ ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
+ ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
+ ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags));
+ ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags));
+ ASSERT(!list_empty(&em->list));
+
+ flags = em->flags;
+ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+ /* First, replace the em with a new extent_map starting from * em->start */
+ split_pre->start = em->start;
+ split_pre->len = (pre ? pre : em->len - post);
+ split_pre->orig_start = split_pre->start;
+ split_pre->block_start = em->block_start;
+ split_pre->block_len = split_pre->len;
+ split_pre->orig_block_len = split_pre->block_len;
+ split_pre->ram_bytes = split_pre->len;
+ split_pre->flags = flags;
+ split_pre->compress_type = em->compress_type;
+ split_pre->generation = em->generation;
+
+ replace_extent_mapping(em_tree, em, split_pre, 1);
+
+ /*
+ * Now we only have an extent_map at:
+ * [em->start, em->start + pre] if pre != 0
+ * [em->start, em->start + em->len - post] if pre == 0
+ */
+
+ if (pre) {
+ /* Insert the middle extent_map */
+ split_mid->start = em->start + pre;
+ split_mid->len = em->len - pre - post;
+ split_mid->orig_start = split_mid->start;
+ split_mid->block_start = em->block_start + pre;
+ split_mid->block_len = split_mid->len;
+ split_mid->orig_block_len = split_mid->block_len;
+ split_mid->ram_bytes = split_mid->len;
+ split_mid->flags = flags;
+ split_mid->compress_type = em->compress_type;
+ split_mid->generation = em->generation;
+ add_extent_mapping(em_tree, split_mid, 1);
+ }
+
+ if (post) {
+ split_post->start = em->start + em->len - post;
+ split_post->len = post;
+ split_post->orig_start = split_post->start;
+ split_post->block_start = em->block_start + em->len - post;
+ split_post->block_len = split_post->len;
+ split_post->orig_block_len = split_post->block_len;
+ split_post->ram_bytes = split_post->len;
+ split_post->flags = flags;
+ split_post->compress_type = em->compress_type;
+ split_post->generation = em->generation;
+ add_extent_mapping(em_tree, split_post, 1);
+ }
+
+ /* Once for us */
+ free_extent_map(em);
+ /* Once for the tree */
+ free_extent_map(em);
+
+out_unlock:
+ write_unlock(&em_tree->lock);
+ unlock_extent(&inode->io_tree, start, start + len - 1, NULL);
+out:
+ free_extent_map(split_pre);
+ free_extent_map(split_mid);
+ free_extent_map(split_post);
+
+ return ret;
+}
+
+static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
+ struct bio *bio, loff_t file_offset)
+{
+ struct btrfs_ordered_extent *ordered;
+ u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ u64 file_len;
+ u64 len = bio->bi_iter.bi_size;
+ u64 end = start + len;
+ u64 ordered_end;
+ u64 pre, post;
+ int ret = 0;
+
+ ordered = btrfs_lookup_ordered_extent(inode, file_offset);
+ if (WARN_ON_ONCE(!ordered))
+ return BLK_STS_IOERR;
+
+ /* No need to split */
+ if (ordered->disk_num_bytes == len)
+ goto out;
+
+ /* We cannot split once end_bio'd ordered extent */
+ if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* We cannot split a compressed ordered extent */
+ if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ordered_end = ordered->disk_bytenr + ordered->disk_num_bytes;
+ /* bio must be in one ordered extent */
+ if (WARN_ON_ONCE(start < ordered->disk_bytenr || end > ordered_end)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Checksum list should be empty */
+ if (WARN_ON_ONCE(!list_empty(&ordered->list))) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ file_len = ordered->num_bytes;
+ pre = start - ordered->disk_bytenr;
+ post = ordered_end - end;
+
+ ret = btrfs_split_ordered_extent(ordered, pre, post);
+ if (ret)
+ goto out;
+ ret = split_zoned_em(inode, file_offset, file_len, pre, post);
+
+out:
+ btrfs_put_ordered_extent(ordered);
+
+ return errno_to_blk_status(ret);
+}
+
+void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_inode *bi = BTRFS_I(inode);
+ blk_status_t ret;
+
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ ret = extract_ordered_extent(bi, bio,
+ page_offset(bio_first_bvec_all(bio)->bv_page));
+ if (ret) {
+ btrfs_bio_end_io(btrfs_bio(bio), ret);
+ return;
+ }
+ }
+
+ /*
+ * If we need to checksum, and the I/O is not issued by fsync and
+ * friends, that is ->sync_writers != 0, defer the submission to a
+ * workqueue to parallelize it.
+ *
+ * Csum items for reloc roots have already been cloned at this point,
+ * so they are handled as part of the no-checksum case.
+ */
+ if (!(bi->flags & BTRFS_INODE_NODATASUM) &&
+ !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) &&
+ !btrfs_is_data_reloc_root(bi->root)) {
+ if (!atomic_read(&bi->sync_writers) &&
+ btrfs_wq_submit_bio(inode, bio, mirror_num, 0,
+ btrfs_submit_bio_start))
+ return;
+
+ ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false);
+ if (ret) {
+ btrfs_bio_end_io(btrfs_bio(bio), ret);
+ return;
+ }
+ }
+ btrfs_submit_bio(fs_info, bio, mirror_num);
+}
+
+void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, enum btrfs_compression_type compress_type)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ blk_status_t ret;
+
+ if (compress_type != BTRFS_COMPRESS_NONE) {
+ /*
+ * btrfs_submit_compressed_read will handle completing the bio
+ * if there were any errors, so just return here.
+ */
+ btrfs_submit_compressed_read(inode, bio, mirror_num);
+ return;
+ }
+
+ /* Save the original iter for read repair */
+ btrfs_bio(bio)->iter = bio->bi_iter;
+
+ /*
+ * Lookup bio sums does extra checks around whether we need to csum or
+ * not, which is why we ignore skip_sum here.
+ */
+ ret = btrfs_lookup_bio_sums(inode, bio, NULL);
+ if (ret) {
+ btrfs_bio_end_io(btrfs_bio(bio), ret);
+ return;
+ }
+
+ btrfs_submit_bio(fs_info, bio, mirror_num);
+}
+
+/*
+ * given a list of ordered sums record them in the inode. This happens
+ * at IO completion time based on sums calculated at bio submission time.
+ */
+static int add_pending_csums(struct btrfs_trans_handle *trans,
+ struct list_head *list)
+{
+ struct btrfs_ordered_sum *sum;
+ struct btrfs_root *csum_root = NULL;
+ int ret;
+
+ list_for_each_entry(sum, list, list) {
+ trans->adding_csums = true;
+ if (!csum_root)
+ csum_root = btrfs_csum_root(trans->fs_info,
+ sum->bytenr);
+ ret = btrfs_csum_file_blocks(trans, csum_root, sum);
+ trans->adding_csums = false;
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
+ const u64 start,
+ const u64 len,
+ struct extent_state **cached_state)
+{
+ u64 search_start = start;
+ const u64 end = start + len - 1;
+
+ while (search_start < end) {
+ const u64 search_len = end - search_start + 1;
+ struct extent_map *em;
+ u64 em_len;
+ int ret = 0;
+
+ em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+
+ if (em->block_start != EXTENT_MAP_HOLE)
+ goto next;
+
+ em_len = em->len;
+ if (em->start < search_start)
+ em_len -= search_start - em->start;
+ if (em_len > search_len)
+ em_len = search_len;
+
+ ret = set_extent_bit(&inode->io_tree, search_start,
+ search_start + em_len - 1,
+ EXTENT_DELALLOC_NEW, cached_state,
+ GFP_NOFS);
+next:
+ search_start = extent_map_end(em);
+ free_extent_map(em);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
+ unsigned int extra_bits,
+ struct extent_state **cached_state)
+{
+ WARN_ON(PAGE_ALIGNED(end));
+
+ if (start >= i_size_read(&inode->vfs_inode) &&
+ !(inode->flags & BTRFS_INODE_PREALLOC)) {
+ /*
+ * There can't be any extents following eof in this case so just
+ * set the delalloc new bit for the range directly.
+ */
+ extra_bits |= EXTENT_DELALLOC_NEW;
+ } else {
+ int ret;
+
+ ret = btrfs_find_new_delalloc_bytes(inode, start,
+ end + 1 - start,
+ cached_state);
+ if (ret)
+ return ret;
+ }
+
+ return set_extent_delalloc(&inode->io_tree, start, end, extra_bits,
+ cached_state);
+}
+
+/* see btrfs_writepage_start_hook for details on why this is required */
+struct btrfs_writepage_fixup {
+ struct page *page;
+ struct inode *inode;
+ struct btrfs_work work;
+};
+
+static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
+{
+ struct btrfs_writepage_fixup *fixup;
+ struct btrfs_ordered_extent *ordered;
+ struct extent_state *cached_state = NULL;
+ struct extent_changeset *data_reserved = NULL;
+ struct page *page;
+ struct btrfs_inode *inode;
+ u64 page_start;
+ u64 page_end;
+ int ret = 0;
+ bool free_delalloc_space = true;
+
+ fixup = container_of(work, struct btrfs_writepage_fixup, work);
+ page = fixup->page;
+ inode = BTRFS_I(fixup->inode);
+ page_start = page_offset(page);
+ page_end = page_offset(page) + PAGE_SIZE - 1;
+
+ /*
+ * This is similar to page_mkwrite, we need to reserve the space before
+ * we take the page lock.
+ */
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
+ PAGE_SIZE);
+again:
+ lock_page(page);
+
+ /*
+ * Before we queued this fixup, we took a reference on the page.
+ * page->mapping may go NULL, but it shouldn't be moved to a different
+ * address space.
+ */
+ if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
+ /*
+ * Unfortunately this is a little tricky, either
+ *
+ * 1) We got here and our page had already been dealt with and
+ * we reserved our space, thus ret == 0, so we need to just
+ * drop our space reservation and bail. This can happen the
+ * first time we come into the fixup worker, or could happen
+ * while waiting for the ordered extent.
+ * 2) Our page was already dealt with, but we happened to get an
+ * ENOSPC above from the btrfs_delalloc_reserve_space. In
+ * this case we obviously don't have anything to release, but
+ * because the page was already dealt with we don't want to
+ * mark the page with an error, so make sure we're resetting
+ * ret to 0. This is why we have this check _before_ the ret
+ * check, because we do not want to have a surprise ENOSPC
+ * when the page was already properly dealt with.
+ */
+ if (!ret) {
+ btrfs_delalloc_release_extents(inode, PAGE_SIZE);
+ btrfs_delalloc_release_space(inode, data_reserved,
+ page_start, PAGE_SIZE,
+ true);
+ }
+ ret = 0;
+ goto out_page;
+ }
+
+ /*
+ * We can't mess with the page state unless it is locked, so now that
+ * it is locked bail if we failed to make our space reservation.
+ */
+ if (ret)
+ goto out_page;
+
+ lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
+
+ /* already ordered? We're done */
+ if (PageOrdered(page))
+ goto out_reserved;
+
+ ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
+ if (ordered) {
+ unlock_extent(&inode->io_tree, page_start, page_end,
+ &cached_state);
+ unlock_page(page);
+ btrfs_start_ordered_extent(ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ goto again;
+ }
+
+ ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
+ &cached_state);
+ if (ret)
+ goto out_reserved;
+
+ /*
+ * Everything went as planned, we're now the owner of a dirty page with
+ * delayed allocation bits set and space reserved for our COW
+ * destination.
+ *
+ * The page was dirty when we started, nothing should have cleaned it.
+ */
+ BUG_ON(!PageDirty(page));
+ free_delalloc_space = false;
+out_reserved:
+ btrfs_delalloc_release_extents(inode, PAGE_SIZE);
+ if (free_delalloc_space)
+ btrfs_delalloc_release_space(inode, data_reserved, page_start,
+ PAGE_SIZE, true);
+ unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
+out_page:
+ if (ret) {
+ /*
+ * We hit ENOSPC or other errors. Update the mapping and page
+ * to reflect the errors and clean the page.
+ */
+ mapping_set_error(page->mapping, ret);
+ end_extent_writepage(page, ret, page_start, page_end);
+ clear_page_dirty_for_io(page);
+ SetPageError(page);
+ }
+ btrfs_page_clear_checked(inode->root->fs_info, page, page_start, PAGE_SIZE);
+ unlock_page(page);
+ put_page(page);
+ kfree(fixup);
+ extent_changeset_free(data_reserved);
+ /*
+ * As a precaution, do a delayed iput in case it would be the last iput
+ * that could need flushing space. Recursing back to fixup worker would
+ * deadlock.
+ */
+ btrfs_add_delayed_iput(&inode->vfs_inode);
+}
+
+/*
+ * There are a few paths in the higher layers of the kernel that directly
+ * set the page dirty bit without asking the filesystem if it is a
+ * good idea. This causes problems because we want to make sure COW
+ * properly happens and the data=ordered rules are followed.
+ *
+ * In our case any range that doesn't have the ORDERED bit set
+ * hasn't been properly setup for IO. We kick off an async process
+ * to fix it up. The async helper will wait for ordered extents, set
+ * the delalloc bit and make it safe to write the page.
+ */
+int btrfs_writepage_cow_fixup(struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_writepage_fixup *fixup;
+
+ /* This page has ordered extent covering it already */
+ if (PageOrdered(page))
+ return 0;
+
+ /*
+ * PageChecked is set below when we create a fixup worker for this page,
+ * don't try to create another one if we're already PageChecked()
+ *
+ * The extent_io writepage code will redirty the page if we send back
+ * EAGAIN.
+ */
+ if (PageChecked(page))
+ return -EAGAIN;
+
+ fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
+ if (!fixup)
+ return -EAGAIN;
+
+ /*
+ * We are already holding a reference to this inode from
+ * write_cache_pages. We need to hold it because the space reservation
+ * takes place outside of the page lock, and we can't trust
+ * page->mapping outside of the page lock.
+ */
+ ihold(inode);
+ btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
+ get_page(page);
+ btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
+ fixup->page = page;
+ fixup->inode = inode;
+ btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
+
+ return -EAGAIN;
+}
+
+static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode, u64 file_pos,
+ struct btrfs_file_extent_item *stack_fi,
+ const bool update_inode_bytes,
+ u64 qgroup_reserved)
+{
+ struct btrfs_root *root = inode->root;
+ const u64 sectorsize = root->fs_info->sectorsize;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_key ins;
+ u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
+ u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
+ u64 offset = btrfs_stack_file_extent_offset(stack_fi);
+ u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
+ u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
+ struct btrfs_drop_extents_args drop_args = { 0 };
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /*
+ * we may be replacing one extent in the tree with another.
+ * The new extent is pinned in the extent map, and we don't want
+ * to drop it from the cache until it is completely in the btree.
+ *
+ * So, tell btrfs_drop_extents to leave this extent in the cache.
+ * the caller is expected to unpin it and allow it to be merged
+ * with the others.
+ */
+ drop_args.path = path;
+ drop_args.start = file_pos;
+ drop_args.end = file_pos + num_bytes;
+ drop_args.replace_extent = true;
+ drop_args.extent_item_size = sizeof(*stack_fi);
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
+ if (ret)
+ goto out;
+
+ if (!drop_args.extent_inserted) {
+ ins.objectid = btrfs_ino(inode);
+ ins.offset = file_pos;
+ ins.type = BTRFS_EXTENT_DATA_KEY;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &ins,
+ sizeof(*stack_fi));
+ if (ret)
+ goto out;
+ }
+ leaf = path->nodes[0];
+ btrfs_set_stack_file_extent_generation(stack_fi, trans->transid);
+ write_extent_buffer(leaf, stack_fi,
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
+ sizeof(struct btrfs_file_extent_item));
+
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
+ /*
+ * If we dropped an inline extent here, we know the range where it is
+ * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the
+ * number of bytes only for that range containing the inline extent.
+ * The remaining of the range will be processed when clearning the
+ * EXTENT_DELALLOC_BIT bit through the ordered extent completion.
+ */
+ if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
+ u64 inline_size = round_down(drop_args.bytes_found, sectorsize);
+
+ inline_size = drop_args.bytes_found - inline_size;
+ btrfs_update_inode_bytes(inode, sectorsize, inline_size);
+ drop_args.bytes_found -= inline_size;
+ num_bytes -= sectorsize;
+ }
+
+ if (update_inode_bytes)
+ btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found);
+
+ ins.objectid = disk_bytenr;
+ ins.offset = disk_num_bytes;
+ ins.type = BTRFS_EXTENT_ITEM_KEY;
+
+ ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
+ if (ret)
+ goto out;
+
+ ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
+ file_pos - offset,
+ qgroup_reserved, &ins);
+out:
+ btrfs_free_path(path);
+
+ return ret;
+}
+
+static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
+ u64 start, u64 len)
+{
+ struct btrfs_block_group *cache;
+
+ cache = btrfs_lookup_block_group(fs_info, start);
+ ASSERT(cache);
+
+ spin_lock(&cache->lock);
+ cache->delalloc_bytes -= len;
+ spin_unlock(&cache->lock);
+
+ btrfs_put_block_group(cache);
+}
+
+static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_ordered_extent *oe)
+{
+ struct btrfs_file_extent_item stack_fi;
+ bool update_inode_bytes;
+ u64 num_bytes = oe->num_bytes;
+ u64 ram_bytes = oe->ram_bytes;
+
+ memset(&stack_fi, 0, sizeof(stack_fi));
+ btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
+ btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
+ btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
+ oe->disk_num_bytes);
+ btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset);
+ if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) {
+ num_bytes = oe->truncated_len;
+ ram_bytes = num_bytes;
+ }
+ btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
+ btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
+ btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
+ /* Encryption and other encoding is reserved and all 0 */
+
+ /*
+ * For delalloc, when completing an ordered extent we update the inode's
+ * bytes when clearing the range in the inode's io tree, so pass false
+ * as the argument 'update_inode_bytes' to insert_reserved_file_extent(),
+ * except if the ordered extent was truncated.
+ */
+ update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
+ test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) ||
+ test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
+
+ return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
+ oe->file_offset, &stack_fi,
+ update_inode_bytes, oe->qgroup_rsv);
+}
+
+/*
+ * As ordered data IO finishes, this gets called so we can finish
+ * an ordered extent if the range of bytes in the file it covers are
+ * fully written.
+ */
+int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
+{
+ struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode);
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_trans_handle *trans = NULL;
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct extent_state *cached_state = NULL;
+ u64 start, end;
+ int compress_type = 0;
+ int ret = 0;
+ u64 logical_len = ordered_extent->num_bytes;
+ bool freespace_inode;
+ bool truncated = false;
+ bool clear_reserved_extent = true;
+ unsigned int clear_bits = EXTENT_DEFRAG;
+
+ start = ordered_extent->file_offset;
+ end = start + ordered_extent->num_bytes - 1;
+
+ if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
+ !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
+ !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) &&
+ !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags))
+ clear_bits |= EXTENT_DELALLOC_NEW;
+
+ freespace_inode = btrfs_is_free_space_inode(inode);
+ if (!freespace_inode)
+ btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent);
+
+ if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
+ ret = -EIO;
+ goto out;
+ }
+
+ /* A valid bdev implies a write on a sequential zone */
+ if (ordered_extent->bdev) {
+ btrfs_rewrite_logical_zoned(ordered_extent);
+ btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
+ ordered_extent->disk_num_bytes);
+ } else if (btrfs_is_data_reloc_root(inode->root)) {
+ btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
+ ordered_extent->disk_num_bytes);
+ }
+
+ btrfs_free_io_failure_record(inode, start, end);
+
+ if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
+ truncated = true;
+ logical_len = ordered_extent->truncated_len;
+ /* Truncated the entire extent, don't bother adding */
+ if (!logical_len)
+ goto out;
+ }
+
+ if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
+ BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
+
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
+ if (freespace_inode)
+ trans = btrfs_join_transaction_spacecache(root);
+ else
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto out;
+ }
+ trans->block_rsv = &inode->block_rsv;
+ ret = btrfs_update_inode_fallback(trans, root, inode);
+ if (ret) /* -ENOMEM or corruption */
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ clear_bits |= EXTENT_LOCKED;
+ lock_extent(io_tree, start, end, &cached_state);
+
+ if (freespace_inode)
+ trans = btrfs_join_transaction_spacecache(root);
+ else
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto out;
+ }
+
+ trans->block_rsv = &inode->block_rsv;
+
+ if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
+ compress_type = ordered_extent->compress_type;
+ if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
+ BUG_ON(compress_type);
+ ret = btrfs_mark_extent_written(trans, inode,
+ ordered_extent->file_offset,
+ ordered_extent->file_offset +
+ logical_len);
+ btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr,
+ ordered_extent->disk_num_bytes);
+ } else {
+ BUG_ON(root == fs_info->tree_root);
+ ret = insert_ordered_extent_file_extent(trans, ordered_extent);
+ if (!ret) {
+ clear_reserved_extent = false;
+ btrfs_release_delalloc_bytes(fs_info,
+ ordered_extent->disk_bytenr,
+ ordered_extent->disk_num_bytes);
+ }
+ }
+ unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset,
+ ordered_extent->num_bytes, trans->transid);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ ret = add_pending_csums(trans, &ordered_extent->list);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ /*
+ * If this is a new delalloc range, clear its new delalloc flag to
+ * update the inode's number of bytes. This needs to be done first
+ * before updating the inode item.
+ */
+ if ((clear_bits & EXTENT_DELALLOC_NEW) &&
+ !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
+ clear_extent_bit(&inode->io_tree, start, end,
+ EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
+ &cached_state);
+
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
+ ret = btrfs_update_inode_fallback(trans, root, inode);
+ if (ret) { /* -ENOMEM or corruption */
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ ret = 0;
+out:
+ clear_extent_bit(&inode->io_tree, start, end, clear_bits,
+ &cached_state);
+
+ if (trans)
+ btrfs_end_transaction(trans);
+
+ if (ret || truncated) {
+ u64 unwritten_start = start;
+
+ /*
+ * If we failed to finish this ordered extent for any reason we
+ * need to make sure BTRFS_ORDERED_IOERR is set on the ordered
+ * extent, and mark the inode with the error if it wasn't
+ * already set. Any error during writeback would have already
+ * set the mapping error, so we need to set it if we're the ones
+ * marking this ordered extent as failed.
+ */
+ if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR,
+ &ordered_extent->flags))
+ mapping_set_error(ordered_extent->inode->i_mapping, -EIO);
+
+ if (truncated)
+ unwritten_start += logical_len;
+ clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
+
+ /* Drop extent maps for the part of the extent we didn't write. */
+ btrfs_drop_extent_map_range(inode, unwritten_start, end, false);
+
+ /*
+ * If the ordered extent had an IOERR or something else went
+ * wrong we need to return the space for this ordered extent
+ * back to the allocator. We only free the extent in the
+ * truncated case if we didn't write out the extent at all.
+ *
+ * If we made it past insert_reserved_file_extent before we
+ * errored out then we don't need to do this as the accounting
+ * has already been done.
+ */
+ if ((ret || !logical_len) &&
+ clear_reserved_extent &&
+ !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
+ !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
+ /*
+ * Discard the range before returning it back to the
+ * free space pool
+ */
+ if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC))
+ btrfs_discard_extent(fs_info,
+ ordered_extent->disk_bytenr,
+ ordered_extent->disk_num_bytes,
+ NULL);
+ btrfs_free_reserved_extent(fs_info,
+ ordered_extent->disk_bytenr,
+ ordered_extent->disk_num_bytes, 1);
+ /*
+ * Actually free the qgroup rsv which was released when
+ * the ordered extent was created.
+ */
+ btrfs_qgroup_free_refroot(fs_info, inode->root->root_key.objectid,
+ ordered_extent->qgroup_rsv,
+ BTRFS_QGROUP_RSV_DATA);
+ }
+ }
+
+ /*
+ * This needs to be done to make sure anybody waiting knows we are done
+ * updating everything for this ordered extent.
+ */
+ btrfs_remove_ordered_extent(inode, ordered_extent);
+
+ /* once for us */
+ btrfs_put_ordered_extent(ordered_extent);
+ /* once for the tree */
+ btrfs_put_ordered_extent(ordered_extent);
+
+ return ret;
+}
+
+void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
+ struct page *page, u64 start,
+ u64 end, bool uptodate)
+{
+ trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate);
+
+ btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, uptodate);
+}
+
+/*
+ * Verify the checksum for a single sector without any extra action that depend
+ * on the type of I/O.
+ */
+int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
+ u32 pgoff, u8 *csum, const u8 * const csum_expected)
+{
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
+ char *kaddr;
+
+ ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE);
+
+ shash->tfm = fs_info->csum_shash;
+
+ kaddr = kmap_local_page(page) + pgoff;
+ crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
+ kunmap_local(kaddr);
+
+ if (memcmp(csum, csum_expected, fs_info->csum_size))
+ return -EIO;
+ return 0;
+}
+
+static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 offset)
+{
+ u64 offset_in_sectors = offset >> fs_info->sectorsize_bits;
+
+ return csums + offset_in_sectors * fs_info->csum_size;
+}
+
+/*
+ * check_data_csum - verify checksum of one sector of uncompressed data
+ * @inode: inode
+ * @bbio: btrfs_bio which contains the csum
+ * @bio_offset: offset to the beginning of the bio (in bytes)
+ * @page: page where is the data to be verified
+ * @pgoff: offset inside the page
+ *
+ * The length of such check is always one sector size.
+ *
+ * When csum mismatch is detected, we will also report the error and fill the
+ * corrupted range with zero. (Thus it needs the extra parameters)
+ */
+int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
+ u32 bio_offset, struct page *page, u32 pgoff)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ u32 len = fs_info->sectorsize;
+ u8 *csum_expected;
+ u8 csum[BTRFS_CSUM_SIZE];
+
+ ASSERT(pgoff + len <= PAGE_SIZE);
+
+ csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset);
+
+ if (btrfs_check_sector_csum(fs_info, page, pgoff, csum, csum_expected))
+ goto zeroit;
+ return 0;
+
+zeroit:
+ btrfs_print_data_csum_error(BTRFS_I(inode),
+ bbio->file_offset + bio_offset,
+ csum, csum_expected, bbio->mirror_num);
+ if (bbio->device)
+ btrfs_dev_stat_inc_and_print(bbio->device,
+ BTRFS_DEV_STAT_CORRUPTION_ERRS);
+ memzero_page(page, pgoff, len);
+ return -EIO;
+}
+
+/*
+ * When reads are done, we need to check csums to verify the data is correct.
+ * if there's a match, we allow the bio to finish. If not, the code in
+ * extent_io.c will try to find good copies for us.
+ *
+ * @bio_offset: offset to the beginning of the bio (in bytes)
+ * @start: file offset of the range start
+ * @end: file offset of the range end (inclusive)
+ *
+ * Return a bitmap where bit set means a csum mismatch, and bit not set means
+ * csum match.
+ */
+unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
+ u32 bio_offset, struct page *page,
+ u64 start, u64 end)
+{
+ struct inode *inode = page->mapping->host;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ const u32 sectorsize = root->fs_info->sectorsize;
+ u32 pg_off;
+ unsigned int result = 0;
+
+ /*
+ * This only happens for NODATASUM or compressed read.
+ * Normally this should be covered by above check for compressed read
+ * or the next check for NODATASUM. Just do a quicker exit here.
+ */
+ if (bbio->csum == NULL)
+ return 0;
+
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
+ return 0;
+
+ if (unlikely(test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)))
+ return 0;
+
+ ASSERT(page_offset(page) <= start &&
+ end <= page_offset(page) + PAGE_SIZE - 1);
+ for (pg_off = offset_in_page(start);
+ pg_off < offset_in_page(end);
+ pg_off += sectorsize, bio_offset += sectorsize) {
+ u64 file_offset = pg_off + page_offset(page);
+ int ret;
+
+ if (btrfs_is_data_reloc_root(root) &&
+ test_range_bit(io_tree, file_offset,
+ file_offset + sectorsize - 1,
+ EXTENT_NODATASUM, 1, NULL)) {
+ /* Skip the range without csum for data reloc inode */
+ clear_extent_bits(io_tree, file_offset,
+ file_offset + sectorsize - 1,
+ EXTENT_NODATASUM);
+ continue;
+ }
+ ret = btrfs_check_data_csum(inode, bbio, bio_offset, page, pg_off);
+ if (ret < 0) {
+ const int nr_bit = (pg_off - offset_in_page(start)) >>
+ root->fs_info->sectorsize_bits;
+
+ result |= (1U << nr_bit);
+ }
+ }
+ return result;
+}
+
+/*
+ * btrfs_add_delayed_iput - perform a delayed iput on @inode
+ *
+ * @inode: The inode we want to perform iput on
+ *
+ * This function uses the generic vfs_inode::i_count to track whether we should
+ * just decrement it (in case it's > 1) or if this is the last iput then link
+ * the inode to the delayed iput machinery. Delayed iputs are processed at
+ * transaction commit time/superblock commit/cleaner kthread.
+ */
+void btrfs_add_delayed_iput(struct inode *inode)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_inode *binode = BTRFS_I(inode);
+
+ if (atomic_add_unless(&inode->i_count, -1, 1))
+ return;
+
+ atomic_inc(&fs_info->nr_delayed_iputs);
+ spin_lock(&fs_info->delayed_iput_lock);
+ ASSERT(list_empty(&binode->delayed_iput));
+ list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
+ spin_unlock(&fs_info->delayed_iput_lock);
+ if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
+ wake_up_process(fs_info->cleaner_kthread);
+}
+
+static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
+ struct btrfs_inode *inode)
+{
+ list_del_init(&inode->delayed_iput);
+ spin_unlock(&fs_info->delayed_iput_lock);
+ iput(&inode->vfs_inode);
+ if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
+ wake_up(&fs_info->delayed_iputs_wait);
+ spin_lock(&fs_info->delayed_iput_lock);
+}
+
+static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
+ struct btrfs_inode *inode)
+{
+ if (!list_empty(&inode->delayed_iput)) {
+ spin_lock(&fs_info->delayed_iput_lock);
+ if (!list_empty(&inode->delayed_iput))
+ run_delayed_iput_locked(fs_info, inode);
+ spin_unlock(&fs_info->delayed_iput_lock);
+ }
+}
+
+void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
+{
+
+ spin_lock(&fs_info->delayed_iput_lock);
+ while (!list_empty(&fs_info->delayed_iputs)) {
+ struct btrfs_inode *inode;
+
+ inode = list_first_entry(&fs_info->delayed_iputs,
+ struct btrfs_inode, delayed_iput);
+ run_delayed_iput_locked(fs_info, inode);
+ cond_resched_lock(&fs_info->delayed_iput_lock);
+ }
+ spin_unlock(&fs_info->delayed_iput_lock);
+}
+
+/*
+ * Wait for flushing all delayed iputs
+ *
+ * @fs_info: the filesystem
+ *
+ * This will wait on any delayed iputs that are currently running with KILLABLE
+ * set. Once they are all done running we will return, unless we are killed in
+ * which case we return EINTR. This helps in user operations like fallocate etc
+ * that might get blocked on the iputs.
+ *
+ * Return EINTR if we were killed, 0 if nothing's pending
+ */
+int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
+{
+ int ret = wait_event_killable(fs_info->delayed_iputs_wait,
+ atomic_read(&fs_info->nr_delayed_iputs) == 0);
+ if (ret)
+ return -EINTR;
+ return 0;
+}
+
+/*
+ * This creates an orphan entry for the given inode in case something goes wrong
+ * in the middle of an unlink.
+ */
+int btrfs_orphan_add(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode)
+{
+ int ret;
+
+ ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
+ if (ret && ret != -EEXIST) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * We have done the delete so we can go ahead and remove the orphan item for
+ * this particular inode.
+ */
+static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode)
+{
+ return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
+}
+
+/*
+ * this cleans up any orphans that may be left on the list from the last use
+ * of this root.
+ */
+int btrfs_orphan_cleanup(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_key key, found_key;
+ struct btrfs_trans_handle *trans;
+ struct inode *inode;
+ u64 last_objectid = 0;
+ int ret = 0, nr_unlink = 0;
+
+ if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state))
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ path->reada = READA_BACK;
+
+ key.objectid = BTRFS_ORPHAN_OBJECTID;
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * if ret == 0 means we found what we were searching for, which
+ * is weird, but possible, so only screw with path if we didn't
+ * find the key and see if we have stuff that matches
+ */
+ if (ret > 0) {
+ ret = 0;
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+ }
+
+ /* pull out the item */
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ /* make sure the item matches what we want */
+ if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
+ break;
+ if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
+ break;
+
+ /* release the path since we're done with it */
+ btrfs_release_path(path);
+
+ /*
+ * this is where we are basically btrfs_lookup, without the
+ * crossing root thing. we store the inode number in the
+ * offset of the orphan item.
+ */
+
+ if (found_key.offset == last_objectid) {
+ btrfs_err(fs_info,
+ "Error removing orphan entry, stopping orphan cleanup");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ last_objectid = found_key.offset;
+
+ found_key.objectid = found_key.offset;
+ found_key.type = BTRFS_INODE_ITEM_KEY;
+ found_key.offset = 0;
+ inode = btrfs_iget(fs_info->sb, last_objectid, root);
+ ret = PTR_ERR_OR_ZERO(inode);
+ if (ret && ret != -ENOENT)
+ goto out;
+
+ if (ret == -ENOENT && root == fs_info->tree_root) {
+ struct btrfs_root *dead_root;
+ int is_dead_root = 0;
+
+ /*
+ * This is an orphan in the tree root. Currently these
+ * could come from 2 sources:
+ * a) a root (snapshot/subvolume) deletion in progress
+ * b) a free space cache inode
+ * We need to distinguish those two, as the orphan item
+ * for a root must not get deleted before the deletion
+ * of the snapshot/subvolume's tree completes.
+ *
+ * btrfs_find_orphan_roots() ran before us, which has
+ * found all deleted roots and loaded them into
+ * fs_info->fs_roots_radix. So here we can find if an
+ * orphan item corresponds to a deleted root by looking
+ * up the root from that radix tree.
+ */
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
+ (unsigned long)found_key.objectid);
+ if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
+ is_dead_root = 1;
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+
+ if (is_dead_root) {
+ /* prevent this orphan from being found again */
+ key.offset = found_key.objectid - 1;
+ continue;
+ }
+
+ }
+
+ /*
+ * If we have an inode with links, there are a couple of
+ * possibilities:
+ *
+ * 1. We were halfway through creating fsverity metadata for the
+ * file. In that case, the orphan item represents incomplete
+ * fsverity metadata which must be cleaned up with
+ * btrfs_drop_verity_items and deleting the orphan item.
+
+ * 2. Old kernels (before v3.12) used to create an
+ * orphan item for truncate indicating that there were possibly
+ * extent items past i_size that needed to be deleted. In v3.12,
+ * truncate was changed to update i_size in sync with the extent
+ * items, but the (useless) orphan item was still created. Since
+ * v4.18, we don't create the orphan item for truncate at all.
+ *
+ * So, this item could mean that we need to do a truncate, but
+ * only if this filesystem was last used on a pre-v3.12 kernel
+ * and was not cleanly unmounted. The odds of that are quite
+ * slim, and it's a pain to do the truncate now, so just delete
+ * the orphan item.
+ *
+ * It's also possible that this orphan item was supposed to be
+ * deleted but wasn't. The inode number may have been reused,
+ * but either way, we can delete the orphan item.
+ */
+ if (ret == -ENOENT || inode->i_nlink) {
+ if (!ret) {
+ ret = btrfs_drop_verity_items(BTRFS_I(inode));
+ iput(inode);
+ if (ret)
+ goto out;
+ }
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+ btrfs_debug(fs_info, "auto deleting %Lu",
+ found_key.objectid);
+ ret = btrfs_del_orphan_item(trans, root,
+ found_key.objectid);
+ btrfs_end_transaction(trans);
+ if (ret)
+ goto out;
+ continue;
+ }
+
+ nr_unlink++;
+
+ /* this will do delete_inode and everything for us */
+ iput(inode);
+ }
+ /* release the path since we're done with it */
+ btrfs_release_path(path);
+
+ if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
+ trans = btrfs_join_transaction(root);
+ if (!IS_ERR(trans))
+ btrfs_end_transaction(trans);
+ }
+
+ if (nr_unlink)
+ btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
+
+out:
+ if (ret)
+ btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * very simple check to peek ahead in the leaf looking for xattrs. If we
+ * don't find any xattrs, we know there can't be any acls.
+ *
+ * slot is the slot the inode is in, objectid is the objectid of the inode
+ */
+static noinline int acls_after_inode_item(struct extent_buffer *leaf,
+ int slot, u64 objectid,
+ int *first_xattr_slot)
+{
+ u32 nritems = btrfs_header_nritems(leaf);
+ struct btrfs_key found_key;
+ static u64 xattr_access = 0;
+ static u64 xattr_default = 0;
+ int scanned = 0;
+
+ if (!xattr_access) {
+ xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
+ strlen(XATTR_NAME_POSIX_ACL_ACCESS));
+ xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
+ strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
+ }
+
+ slot++;
+ *first_xattr_slot = -1;
+ while (slot < nritems) {
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+ /* we found a different objectid, there must not be acls */
+ if (found_key.objectid != objectid)
+ return 0;
+
+ /* we found an xattr, assume we've got an acl */
+ if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
+ if (*first_xattr_slot == -1)
+ *first_xattr_slot = slot;
+ if (found_key.offset == xattr_access ||
+ found_key.offset == xattr_default)
+ return 1;
+ }
+
+ /*
+ * we found a key greater than an xattr key, there can't
+ * be any acls later on
+ */
+ if (found_key.type > BTRFS_XATTR_ITEM_KEY)
+ return 0;
+
+ slot++;
+ scanned++;
+
+ /*
+ * it goes inode, inode backrefs, xattrs, extents,
+ * so if there are a ton of hard links to an inode there can
+ * be a lot of backrefs. Don't waste time searching too hard,
+ * this is just an optimization
+ */
+ if (scanned >= 8)
+ break;
+ }
+ /* we hit the end of the leaf before we found an xattr or
+ * something larger than an xattr. We have to assume the inode
+ * has acls
+ */
+ if (*first_xattr_slot == -1)
+ *first_xattr_slot = slot;
+ return 1;
+}
+
+/*
+ * read an inode from the btree into the in-memory inode
+ */
+static int btrfs_read_locked_inode(struct inode *inode,
+ struct btrfs_path *in_path)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_path *path = in_path;
+ struct extent_buffer *leaf;
+ struct btrfs_inode_item *inode_item;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_key location;
+ unsigned long ptr;
+ int maybe_acls;
+ u32 rdev;
+ int ret;
+ bool filled = false;
+ int first_xattr_slot;
+
+ ret = btrfs_fill_inode(inode, &rdev);
+ if (!ret)
+ filled = true;
+
+ if (!path) {
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ }
+
+ memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
+
+ ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
+ if (ret) {
+ if (path != in_path)
+ btrfs_free_path(path);
+ return ret;
+ }
+
+ leaf = path->nodes[0];
+
+ if (filled)
+ goto cache_index;
+
+ inode_item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_inode_item);
+ inode->i_mode = btrfs_inode_mode(leaf, inode_item);
+ set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
+ i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
+ i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
+ btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
+ btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
+ round_up(i_size_read(inode), fs_info->sectorsize));
+
+ inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
+ inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
+
+ inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
+ inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
+
+ inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
+ inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);
+
+ BTRFS_I(inode)->i_otime.tv_sec =
+ btrfs_timespec_sec(leaf, &inode_item->otime);
+ BTRFS_I(inode)->i_otime.tv_nsec =
+ btrfs_timespec_nsec(leaf, &inode_item->otime);
+
+ inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
+ BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
+ BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
+
+ inode_set_iversion_queried(inode,
+ btrfs_inode_sequence(leaf, inode_item));
+ inode->i_generation = BTRFS_I(inode)->generation;
+ inode->i_rdev = 0;
+ rdev = btrfs_inode_rdev(leaf, inode_item);
+
+ BTRFS_I(inode)->index_cnt = (u64)-1;
+ btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
+ &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
+
+cache_index:
+ /*
+ * If we were modified in the current generation and evicted from memory
+ * and then re-read we need to do a full sync since we don't have any
+ * idea about which extents were modified before we were evicted from
+ * cache.
+ *
+ * This is required for both inode re-read from disk and delayed inode
+ * in delayed_nodes_tree.
+ */
+ if (BTRFS_I(inode)->last_trans == fs_info->generation)
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+
+ /*
+ * We don't persist the id of the transaction where an unlink operation
+ * against the inode was last made. So here we assume the inode might
+ * have been evicted, and therefore the exact value of last_unlink_trans
+ * lost, and set it to last_trans to avoid metadata inconsistencies
+ * between the inode and its parent if the inode is fsync'ed and the log
+ * replayed. For example, in the scenario:
+ *
+ * touch mydir/foo
+ * ln mydir/foo mydir/bar
+ * sync
+ * unlink mydir/bar
+ * echo 2 > /proc/sys/vm/drop_caches # evicts inode
+ * xfs_io -c fsync mydir/foo
+ * <power failure>
+ * mount fs, triggers fsync log replay
+ *
+ * We must make sure that when we fsync our inode foo we also log its
+ * parent inode, otherwise after log replay the parent still has the
+ * dentry with the "bar" name but our inode foo has a link count of 1
+ * and doesn't have an inode ref with the name "bar" anymore.
+ *
+ * Setting last_unlink_trans to last_trans is a pessimistic approach,
+ * but it guarantees correctness at the expense of occasional full
+ * transaction commits on fsync if our inode is a directory, or if our
+ * inode is not a directory, logging its parent unnecessarily.
+ */
+ BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
+
+ /*
+ * Same logic as for last_unlink_trans. We don't persist the generation
+ * of the last transaction where this inode was used for a reflink
+ * operation, so after eviction and reloading the inode we must be
+ * pessimistic and assume the last transaction that modified the inode.
+ */
+ BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans;
+
+ path->slots[0]++;
+ if (inode->i_nlink != 1 ||
+ path->slots[0] >= btrfs_header_nritems(leaf))
+ goto cache_acl;
+
+ btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
+ if (location.objectid != btrfs_ino(BTRFS_I(inode)))
+ goto cache_acl;
+
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ if (location.type == BTRFS_INODE_REF_KEY) {
+ struct btrfs_inode_ref *ref;
+
+ ref = (struct btrfs_inode_ref *)ptr;
+ BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
+ } else if (location.type == BTRFS_INODE_EXTREF_KEY) {
+ struct btrfs_inode_extref *extref;
+
+ extref = (struct btrfs_inode_extref *)ptr;
+ BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
+ extref);
+ }
+cache_acl:
+ /*
+ * try to precache a NULL acl entry for files that don't have
+ * any xattrs or acls
+ */
+ maybe_acls = acls_after_inode_item(leaf, path->slots[0],
+ btrfs_ino(BTRFS_I(inode)), &first_xattr_slot);
+ if (first_xattr_slot != -1) {
+ path->slots[0] = first_xattr_slot;
+ ret = btrfs_load_inode_props(inode, path);
+ if (ret)
+ btrfs_err(fs_info,
+ "error loading props for ino %llu (root %llu): %d",
+ btrfs_ino(BTRFS_I(inode)),
+ root->root_key.objectid, ret);
+ }
+ if (path != in_path)
+ btrfs_free_path(path);
+
+ if (!maybe_acls)
+ cache_no_acl(inode);
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFREG:
+ inode->i_mapping->a_ops = &btrfs_aops;
+ inode->i_fop = &btrfs_file_operations;
+ inode->i_op = &btrfs_file_inode_operations;
+ break;
+ case S_IFDIR:
+ inode->i_fop = &btrfs_dir_file_operations;
+ inode->i_op = &btrfs_dir_inode_operations;
+ break;
+ case S_IFLNK:
+ inode->i_op = &btrfs_symlink_inode_operations;
+ inode_nohighmem(inode);
+ inode->i_mapping->a_ops = &btrfs_aops;
+ break;
+ default:
+ inode->i_op = &btrfs_special_inode_operations;
+ init_special_inode(inode, inode->i_mode, rdev);
+ break;
+ }
+
+ btrfs_sync_inode_flags_to_i_flags(inode);
+ return 0;
+}
+
+/*
+ * given a leaf and an inode, copy the inode fields into the leaf
+ */
+static void fill_inode_item(struct btrfs_trans_handle *trans,
+ struct extent_buffer *leaf,
+ struct btrfs_inode_item *item,
+ struct inode *inode)
+{
+ struct btrfs_map_token token;
+ u64 flags;
+
+ btrfs_init_map_token(&token, leaf);
+
+ btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
+ btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
+ btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
+ btrfs_set_token_inode_mode(&token, item, inode->i_mode);
+ btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
+
+ btrfs_set_token_timespec_sec(&token, &item->atime,
+ inode->i_atime.tv_sec);
+ btrfs_set_token_timespec_nsec(&token, &item->atime,
+ inode->i_atime.tv_nsec);
+
+ btrfs_set_token_timespec_sec(&token, &item->mtime,
+ inode->i_mtime.tv_sec);
+ btrfs_set_token_timespec_nsec(&token, &item->mtime,
+ inode->i_mtime.tv_nsec);
+
+ btrfs_set_token_timespec_sec(&token, &item->ctime,
+ inode->i_ctime.tv_sec);
+ btrfs_set_token_timespec_nsec(&token, &item->ctime,
+ inode->i_ctime.tv_nsec);
+
+ btrfs_set_token_timespec_sec(&token, &item->otime,
+ BTRFS_I(inode)->i_otime.tv_sec);
+ btrfs_set_token_timespec_nsec(&token, &item->otime,
+ BTRFS_I(inode)->i_otime.tv_nsec);
+
+ btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
+ btrfs_set_token_inode_generation(&token, item,
+ BTRFS_I(inode)->generation);
+ btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
+ btrfs_set_token_inode_transid(&token, item, trans->transid);
+ btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
+ flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
+ BTRFS_I(inode)->ro_flags);
+ btrfs_set_token_inode_flags(&token, item, flags);
+ btrfs_set_token_inode_block_group(&token, item, 0);
+}
+
+/*
+ * copy everything in the in-memory inode into the btree.
+ */
+static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_inode *inode)
+{
+ struct btrfs_inode_item *inode_item;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1);
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+ goto failed;
+ }
+
+ leaf = path->nodes[0];
+ inode_item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_inode_item);
+
+ fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_set_inode_last_trans(trans, inode);
+ ret = 0;
+failed:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * copy everything in the in-memory inode into the btree.
+ */
+noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_inode *inode)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret;
+
+ /*
+ * If the inode is a free space inode, we can deadlock during commit
+ * if we put it into the delayed code.
+ *
+ * The data relocation inode should also be directly updated
+ * without delay
+ */
+ if (!btrfs_is_free_space_inode(inode)
+ && !btrfs_is_data_reloc_root(root)
+ && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
+ btrfs_update_root_times(trans, root);
+
+ ret = btrfs_delayed_update_inode(trans, root, inode);
+ if (!ret)
+ btrfs_set_inode_last_trans(trans, inode);
+ return ret;
+ }
+
+ return btrfs_update_inode_item(trans, root, inode);
+}
+
+int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_inode *inode)
+{
+ int ret;
+
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret == -ENOSPC)
+ return btrfs_update_inode_item(trans, root, inode);
+ return ret;
+}
+
+/*
+ * unlink helper that gets used here in inode.c and in the tree logging
+ * recovery code. It remove a link in a directory with a given name, and
+ * also drops the back refs in the inode to the directory
+ */
+static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *dir,
+ struct btrfs_inode *inode,
+ const struct fscrypt_str *name,
+ struct btrfs_rename_ctx *rename_ctx)
+{
+ struct btrfs_root *root = dir->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_path *path;
+ int ret = 0;
+ struct btrfs_dir_item *di;
+ u64 index;
+ u64 ino = btrfs_ino(inode);
+ u64 dir_ino = btrfs_ino(dir);
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1);
+ if (IS_ERR_OR_NULL(di)) {
+ ret = di ? PTR_ERR(di) : -ENOENT;
+ goto err;
+ }
+ ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ if (ret)
+ goto err;
+ btrfs_release_path(path);
+
+ /*
+ * If we don't have dir index, we have to get it by looking up
+ * the inode ref, since we get the inode ref, remove it directly,
+ * it is unnecessary to do delayed deletion.
+ *
+ * But if we have dir index, needn't search inode ref to get it.
+ * Since the inode ref is close to the inode item, it is better
+ * that we delay to delete it, and just do this deletion when
+ * we update the inode item.
+ */
+ if (inode->dir_index) {
+ ret = btrfs_delayed_delete_inode_ref(inode);
+ if (!ret) {
+ index = inode->dir_index;
+ goto skip_backref;
+ }
+ }
+
+ ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index);
+ if (ret) {
+ btrfs_info(fs_info,
+ "failed to delete reference to %.*s, inode %llu parent %llu",
+ name->len, name->name, ino, dir_ino);
+ btrfs_abort_transaction(trans, ret);
+ goto err;
+ }
+skip_backref:
+ if (rename_ctx)
+ rename_ctx->index = index;
+
+ ret = btrfs_delete_delayed_dir_index(trans, dir, index);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto err;
+ }
+
+ /*
+ * If we are in a rename context, we don't need to update anything in the
+ * log. That will be done later during the rename by btrfs_log_new_name().
+ * Besides that, doing it here would only cause extra unnecessary btree
+ * operations on the log tree, increasing latency for applications.
+ */
+ if (!rename_ctx) {
+ btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino);
+ btrfs_del_dir_entries_in_log(trans, root, name, dir, index);
+ }
+
+ /*
+ * If we have a pending delayed iput we could end up with the final iput
+ * being run in btrfs-cleaner context. If we have enough of these built
+ * up we can end up burning a lot of time in btrfs-cleaner without any
+ * way to throttle the unlinks. Since we're currently holding a ref on
+ * the inode we can run the delayed iput here without any issues as the
+ * final iput won't be done until after we drop the ref we're currently
+ * holding.
+ */
+ btrfs_run_delayed_iput(fs_info, inode);
+err:
+ btrfs_free_path(path);
+ if (ret)
+ goto out;
+
+ btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);
+ inode_inc_iversion(&inode->vfs_inode);
+ inode_inc_iversion(&dir->vfs_inode);
+ inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
+ dir->vfs_inode.i_mtime = inode->vfs_inode.i_ctime;
+ dir->vfs_inode.i_ctime = inode->vfs_inode.i_ctime;
+ ret = btrfs_update_inode(trans, root, dir);
+out:
+ return ret;
+}
+
+int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *dir, struct btrfs_inode *inode,
+ const struct fscrypt_str *name)
+{
+ int ret;
+
+ ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL);
+ if (!ret) {
+ drop_nlink(&inode->vfs_inode);
+ ret = btrfs_update_inode(trans, inode->root, inode);
+ }
+ return ret;
+}
+
+/*
+ * helper to start transaction for unlink and rmdir.
+ *
+ * unlink and rmdir are special in btrfs, they do not always free space, so
+ * if we cannot make our reservations the normal way try and see if there is
+ * plenty of slack room in the global reserve to migrate, otherwise we cannot
+ * allow the unlink to occur.
+ */
+static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
+{
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+
+ /*
+ * 1 for the possible orphan item
+ * 1 for the dir item
+ * 1 for the dir index
+ * 1 for the inode ref
+ * 1 for the inode
+ * 1 for the parent inode
+ */
+ return btrfs_start_transaction_fallback_global_rsv(root, 6);
+}
+
+static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct btrfs_trans_handle *trans;
+ struct inode *inode = d_inode(dentry);
+ int ret;
+ struct fscrypt_name fname;
+
+ ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
+ if (ret)
+ return ret;
+
+ /* This needs to handle no-key deletions later on */
+
+ trans = __unlink_start_trans(dir);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto fscrypt_free;
+ }
+
+ btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
+ 0);
+
+ ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
+ &fname.disk_name);
+ if (ret)
+ goto end_trans;
+
+ if (inode->i_nlink == 0) {
+ ret = btrfs_orphan_add(trans, BTRFS_I(inode));
+ if (ret)
+ goto end_trans;
+ }
+
+end_trans:
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
+fscrypt_free:
+ fscrypt_free_filename(&fname);
+ return ret;
+}
+
+static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
+ struct inode *dir, struct dentry *dentry)
+{
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_dir_item *di;
+ struct btrfs_key key;
+ u64 index;
+ int ret;
+ u64 objectid;
+ u64 dir_ino = btrfs_ino(BTRFS_I(dir));
+ struct fscrypt_name fname;
+
+ ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
+ if (ret)
+ return ret;
+
+ /* This needs to handle no-key deletions later on */
+
+ if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
+ objectid = inode->root->root_key.objectid;
+ } else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
+ objectid = inode->location.objectid;
+ } else {
+ WARN_ON(1);
+ fscrypt_free_filename(&fname);
+ return -EINVAL;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
+ &fname.disk_name, -1);
+ if (IS_ERR_OR_NULL(di)) {
+ ret = di ? PTR_ERR(di) : -ENOENT;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ btrfs_dir_item_key_to_cpu(leaf, di, &key);
+ WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
+ ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ btrfs_release_path(path);
+
+ /*
+ * This is a placeholder inode for a subvolume we didn't have a
+ * reference to at the time of the snapshot creation. In the meantime
+ * we could have renamed the real subvol link into our snapshot, so
+ * depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
+ * Instead simply lookup the dir_index_item for this entry so we can
+ * remove it. Otherwise we know we have a ref to the root and we can
+ * call btrfs_del_root_ref, and it _shouldn't_ fail.
+ */
+ if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
+ di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name);
+ if (IS_ERR_OR_NULL(di)) {
+ if (!di)
+ ret = -ENOENT;
+ else
+ ret = PTR_ERR(di);
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ index = key.offset;
+ btrfs_release_path(path);
+ } else {
+ ret = btrfs_del_root_ref(trans, objectid,
+ root->root_key.objectid, dir_ino,
+ &index, &fname.disk_name);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ }
+
+ ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ btrfs_i_size_write(BTRFS_I(dir), dir->i_size - fname.disk_name.len * 2);
+ inode_inc_iversion(dir);
+ dir->i_mtime = current_time(dir);
+ dir->i_ctime = dir->i_mtime;
+ ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir));
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+out:
+ btrfs_free_path(path);
+ fscrypt_free_filename(&fname);
+ return ret;
+}
+
+/*
+ * Helper to check if the subvolume references other subvolumes or if it's
+ * default.
+ */
+static noinline int may_destroy_subvol(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_path *path;
+ struct btrfs_dir_item *di;
+ struct btrfs_key key;
+ struct fscrypt_str name = FSTR_INIT("default", 7);
+ u64 dir_id;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /* Make sure this root isn't set as the default subvol */
+ dir_id = btrfs_super_root_dir(fs_info->super_copy);
+ di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
+ dir_id, &name, 0);
+ if (di && !IS_ERR(di)) {
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
+ if (key.objectid == root->root_key.objectid) {
+ ret = -EPERM;
+ btrfs_err(fs_info,
+ "deleting default subvolume %llu is not allowed",
+ key.objectid);
+ goto out;
+ }
+ btrfs_release_path(path);
+ }
+
+ key.objectid = root->root_key.objectid;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ BUG_ON(ret == 0);
+
+ ret = 0;
+ if (path->slots[0] > 0) {
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid == root->root_key.objectid &&
+ key.type == BTRFS_ROOT_REF_KEY)
+ ret = -ENOTEMPTY;
+ }
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/* Delete all dentries for inodes belonging to the root */
+static void btrfs_prune_dentries(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct rb_node *node;
+ struct rb_node *prev;
+ struct btrfs_inode *entry;
+ struct inode *inode;
+ u64 objectid = 0;
+
+ if (!BTRFS_FS_ERROR(fs_info))
+ WARN_ON(btrfs_root_refs(&root->root_item) != 0);
+
+ spin_lock(&root->inode_lock);
+again:
+ node = root->inode_tree.rb_node;
+ prev = NULL;
+ while (node) {
+ prev = node;
+ entry = rb_entry(node, struct btrfs_inode, rb_node);
+
+ if (objectid < btrfs_ino(entry))
+ node = node->rb_left;
+ else if (objectid > btrfs_ino(entry))
+ node = node->rb_right;
+ else
+ break;
+ }
+ if (!node) {
+ while (prev) {
+ entry = rb_entry(prev, struct btrfs_inode, rb_node);
+ if (objectid <= btrfs_ino(entry)) {
+ node = prev;
+ break;
+ }
+ prev = rb_next(prev);
+ }
+ }
+ while (node) {
+ entry = rb_entry(node, struct btrfs_inode, rb_node);
+ objectid = btrfs_ino(entry) + 1;
+ inode = igrab(&entry->vfs_inode);
+ if (inode) {
+ spin_unlock(&root->inode_lock);
+ if (atomic_read(&inode->i_count) > 1)
+ d_prune_aliases(inode);
+ /*
+ * btrfs_drop_inode will have it removed from the inode
+ * cache when its usage count hits zero.
+ */
+ iput(inode);
+ cond_resched();
+ spin_lock(&root->inode_lock);
+ goto again;
+ }
+
+ if (cond_resched_lock(&root->inode_lock))
+ goto again;
+
+ node = rb_next(node);
+ }
+ spin_unlock(&root->inode_lock);
+}
+
+int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct inode *inode = d_inode(dentry);
+ struct btrfs_root *dest = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_block_rsv block_rsv;
+ u64 root_flags;
+ int ret;
+
+ down_write(&fs_info->subvol_sem);
+
+ /*
+ * Don't allow to delete a subvolume with send in progress. This is
+ * inside the inode lock so the error handling that has to drop the bit
+ * again is not run concurrently.
+ */
+ spin_lock(&dest->root_item_lock);
+ if (dest->send_in_progress) {
+ spin_unlock(&dest->root_item_lock);
+ btrfs_warn(fs_info,
+ "attempt to delete subvolume %llu during send",
+ dest->root_key.objectid);
+ ret = -EPERM;
+ goto out_up_write;
+ }
+ if (atomic_read(&dest->nr_swapfiles)) {
+ spin_unlock(&dest->root_item_lock);
+ btrfs_warn(fs_info,
+ "attempt to delete subvolume %llu with active swapfile",
+ root->root_key.objectid);
+ ret = -EPERM;
+ goto out_up_write;
+ }
+ root_flags = btrfs_root_flags(&dest->root_item);
+ btrfs_set_root_flags(&dest->root_item,
+ root_flags | BTRFS_ROOT_SUBVOL_DEAD);
+ spin_unlock(&dest->root_item_lock);
+
+ ret = may_destroy_subvol(dest);
+ if (ret)
+ goto out_undead;
+
+ btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
+ /*
+ * One for dir inode,
+ * two for dir entries,
+ * two for root ref/backref.
+ */
+ ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
+ if (ret)
+ goto out_undead;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_release;
+ }
+ trans->block_rsv = &block_rsv;
+ trans->bytes_reserved = block_rsv.size;
+
+ btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));
+
+ ret = btrfs_unlink_subvol(trans, dir, dentry);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_end_trans;
+ }
+
+ ret = btrfs_record_root_in_trans(trans, dest);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_end_trans;
+ }
+
+ memset(&dest->root_item.drop_progress, 0,
+ sizeof(dest->root_item.drop_progress));
+ btrfs_set_root_drop_level(&dest->root_item, 0);
+ btrfs_set_root_refs(&dest->root_item, 0);
+
+ if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
+ ret = btrfs_insert_orphan_item(trans,
+ fs_info->tree_root,
+ dest->root_key.objectid);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_end_trans;
+ }
+ }
+
+ ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
+ BTRFS_UUID_KEY_SUBVOL,
+ dest->root_key.objectid);
+ if (ret && ret != -ENOENT) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_end_trans;
+ }
+ if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
+ ret = btrfs_uuid_tree_remove(trans,
+ dest->root_item.received_uuid,
+ BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+ dest->root_key.objectid);
+ if (ret && ret != -ENOENT) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_end_trans;
+ }
+ }
+
+ free_anon_bdev(dest->anon_dev);
+ dest->anon_dev = 0;
+out_end_trans:
+ trans->block_rsv = NULL;
+ trans->bytes_reserved = 0;
+ ret = btrfs_end_transaction(trans);
+ inode->i_flags |= S_DEAD;
+out_release:
+ btrfs_subvolume_release_metadata(root, &block_rsv);
+out_undead:
+ if (ret) {
+ spin_lock(&dest->root_item_lock);
+ root_flags = btrfs_root_flags(&dest->root_item);
+ btrfs_set_root_flags(&dest->root_item,
+ root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
+ spin_unlock(&dest->root_item_lock);
+ }
+out_up_write:
+ up_write(&fs_info->subvol_sem);
+ if (!ret) {
+ d_invalidate(dentry);
+ btrfs_prune_dentries(dest);
+ ASSERT(dest->send_in_progress == 0);
+ }
+
+ return ret;
+}
+
+static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ struct inode *inode = d_inode(dentry);
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ int err = 0;
+ struct btrfs_trans_handle *trans;
+ u64 last_unlink_trans;
+ struct fscrypt_name fname;
+
+ if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
+ return -ENOTEMPTY;
+ if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) {
+ if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) {
+ btrfs_err(fs_info,
+ "extent tree v2 doesn't support snapshot deletion yet");
+ return -EOPNOTSUPP;
+ }
+ return btrfs_delete_subvolume(dir, dentry);
+ }
+
+ err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
+ if (err)
+ return err;
+
+ /* This needs to handle no-key deletions later on */
+
+ trans = __unlink_start_trans(dir);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_notrans;
+ }
+
+ if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
+ err = btrfs_unlink_subvol(trans, dir, dentry);
+ goto out;
+ }
+
+ err = btrfs_orphan_add(trans, BTRFS_I(inode));
+ if (err)
+ goto out;
+
+ last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
+
+ /* now the directory is empty */
+ err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
+ &fname.disk_name);
+ if (!err) {
+ btrfs_i_size_write(BTRFS_I(inode), 0);
+ /*
+ * Propagate the last_unlink_trans value of the deleted dir to
+ * its parent directory. This is to prevent an unrecoverable
+ * log tree in the case we do something like this:
+ * 1) create dir foo
+ * 2) create snapshot under dir foo
+ * 3) delete the snapshot
+ * 4) rmdir foo
+ * 5) mkdir foo
+ * 6) fsync foo or some file inside foo
+ */
+ if (last_unlink_trans >= trans->transid)
+ BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
+ }
+out:
+ btrfs_end_transaction(trans);
+out_notrans:
+ btrfs_btree_balance_dirty(fs_info);
+ fscrypt_free_filename(&fname);
+
+ return err;
+}
+
+/*
+ * btrfs_truncate_block - read, zero a chunk and write a block
+ * @inode - inode that we're zeroing
+ * @from - the offset to start zeroing
+ * @len - the length to zero, 0 to zero the entire range respective to the
+ * offset
+ * @front - zero up to the offset instead of from the offset on
+ *
+ * This will find the block for the "from" offset and cow the block and zero the
+ * part we want to zero. This is used with truncate and hole punching.
+ */
+int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
+ int front)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct btrfs_ordered_extent *ordered;
+ struct extent_state *cached_state = NULL;
+ struct extent_changeset *data_reserved = NULL;
+ bool only_release_metadata = false;
+ u32 blocksize = fs_info->sectorsize;
+ pgoff_t index = from >> PAGE_SHIFT;
+ unsigned offset = from & (blocksize - 1);
+ struct page *page;
+ gfp_t mask = btrfs_alloc_write_mask(mapping);
+ size_t write_bytes = blocksize;
+ int ret = 0;
+ u64 block_start;
+ u64 block_end;
+
+ if (IS_ALIGNED(offset, blocksize) &&
+ (!len || IS_ALIGNED(len, blocksize)))
+ goto out;
+
+ block_start = round_down(from, blocksize);
+ block_end = block_start + blocksize - 1;
+
+ ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
+ blocksize, false);
+ if (ret < 0) {
+ if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) {
+ /* For nocow case, no need to reserve data space */
+ only_release_metadata = true;
+ } else {
+ goto out;
+ }
+ }
+ ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false);
+ if (ret < 0) {
+ if (!only_release_metadata)
+ btrfs_free_reserved_data_space(inode, data_reserved,
+ block_start, blocksize);
+ goto out;
+ }
+again:
+ page = find_or_create_page(mapping, index, mask);
+ if (!page) {
+ btrfs_delalloc_release_space(inode, data_reserved, block_start,
+ blocksize, true);
+ btrfs_delalloc_release_extents(inode, blocksize);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (!PageUptodate(page)) {
+ ret = btrfs_read_folio(NULL, page_folio(page));
+ lock_page(page);
+ if (page->mapping != mapping) {
+ unlock_page(page);
+ put_page(page);
+ goto again;
+ }
+ if (!PageUptodate(page)) {
+ ret = -EIO;
+ goto out_unlock;
+ }
+ }
+
+ /*
+ * We unlock the page after the io is completed and then re-lock it
+ * above. release_folio() could have come in between that and cleared
+ * PagePrivate(), but left the page in the mapping. Set the page mapped
+ * here to make sure it's properly set for the subpage stuff.
+ */
+ ret = set_page_extent_mapped(page);
+ if (ret < 0)
+ goto out_unlock;
+
+ wait_on_page_writeback(page);
+
+ lock_extent(io_tree, block_start, block_end, &cached_state);
+
+ ordered = btrfs_lookup_ordered_extent(inode, block_start);
+ if (ordered) {
+ unlock_extent(io_tree, block_start, block_end, &cached_state);
+ unlock_page(page);
+ put_page(page);
+ btrfs_start_ordered_extent(ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ goto again;
+ }
+
+ clear_extent_bit(&inode->io_tree, block_start, block_end,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+ &cached_state);
+
+ ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
+ &cached_state);
+ if (ret) {
+ unlock_extent(io_tree, block_start, block_end, &cached_state);
+ goto out_unlock;
+ }
+
+ if (offset != blocksize) {
+ if (!len)
+ len = blocksize - offset;
+ if (front)
+ memzero_page(page, (block_start - page_offset(page)),
+ offset);
+ else
+ memzero_page(page, (block_start - page_offset(page)) + offset,
+ len);
+ }
+ btrfs_page_clear_checked(fs_info, page, block_start,
+ block_end + 1 - block_start);
+ btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
+ unlock_extent(io_tree, block_start, block_end, &cached_state);
+
+ if (only_release_metadata)
+ set_extent_bit(&inode->io_tree, block_start, block_end,
+ EXTENT_NORESERVE, NULL, GFP_NOFS);
+
+out_unlock:
+ if (ret) {
+ if (only_release_metadata)
+ btrfs_delalloc_release_metadata(inode, blocksize, true);
+ else
+ btrfs_delalloc_release_space(inode, data_reserved,
+ block_start, blocksize, true);
+ }
+ btrfs_delalloc_release_extents(inode, blocksize);
+ unlock_page(page);
+ put_page(page);
+out:
+ if (only_release_metadata)
+ btrfs_check_nocow_unlock(inode);
+ extent_changeset_free(data_reserved);
+ return ret;
+}
+
+static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
+ u64 offset, u64 len)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_drop_extents_args drop_args = { 0 };
+ int ret;
+
+ /*
+ * If NO_HOLES is enabled, we don't need to do anything.
+ * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
+ * or btrfs_update_inode() will be called, which guarantee that the next
+ * fsync will know this inode was changed and needs to be logged.
+ */
+ if (btrfs_fs_incompat(fs_info, NO_HOLES))
+ return 0;
+
+ /*
+ * 1 - for the one we're dropping
+ * 1 - for the one we're adding
+ * 1 - for updating the inode.
+ */
+ trans = btrfs_start_transaction(root, 3);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ drop_args.start = offset;
+ drop_args.end = offset + len;
+ drop_args.drop_cache = true;
+
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ return ret;
+ }
+
+ ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, len);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ } else {
+ btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
+ btrfs_update_inode(trans, root, inode);
+ }
+ btrfs_end_transaction(trans);
+ return ret;
+}
+
+/*
+ * This function puts in dummy file extents for the area we're creating a hole
+ * for. So if we are truncating this file to a larger size we need to insert
+ * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
+ * the range between oldsize and size
+ */
+int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct extent_map *em = NULL;
+ struct extent_state *cached_state = NULL;
+ u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
+ u64 block_end = ALIGN(size, fs_info->sectorsize);
+ u64 last_byte;
+ u64 cur_offset;
+ u64 hole_size;
+ int err = 0;
+
+ /*
+ * If our size started in the middle of a block we need to zero out the
+ * rest of the block before we expand the i_size, otherwise we could
+ * expose stale data.
+ */
+ err = btrfs_truncate_block(inode, oldsize, 0, 0);
+ if (err)
+ return err;
+
+ if (size <= hole_start)
+ return 0;
+
+ btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1,
+ &cached_state);
+ cur_offset = hole_start;
+ while (1) {
+ em = btrfs_get_extent(inode, NULL, 0, cur_offset,
+ block_end - cur_offset);
+ if (IS_ERR(em)) {
+ err = PTR_ERR(em);
+ em = NULL;
+ break;
+ }
+ last_byte = min(extent_map_end(em), block_end);
+ last_byte = ALIGN(last_byte, fs_info->sectorsize);
+ hole_size = last_byte - cur_offset;
+
+ if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ struct extent_map *hole_em;
+
+ err = maybe_insert_hole(root, inode, cur_offset,
+ hole_size);
+ if (err)
+ break;
+
+ err = btrfs_inode_set_file_extent_range(inode,
+ cur_offset, hole_size);
+ if (err)
+ break;
+
+ hole_em = alloc_extent_map();
+ if (!hole_em) {
+ btrfs_drop_extent_map_range(inode, cur_offset,
+ cur_offset + hole_size - 1,
+ false);
+ btrfs_set_inode_full_sync(inode);
+ goto next;
+ }
+ hole_em->start = cur_offset;
+ hole_em->len = hole_size;
+ hole_em->orig_start = cur_offset;
+
+ hole_em->block_start = EXTENT_MAP_HOLE;
+ hole_em->block_len = 0;
+ hole_em->orig_block_len = 0;
+ hole_em->ram_bytes = hole_size;
+ hole_em->compress_type = BTRFS_COMPRESS_NONE;
+ hole_em->generation = fs_info->generation;
+
+ err = btrfs_replace_extent_map_range(inode, hole_em, true);
+ free_extent_map(hole_em);
+ } else {
+ err = btrfs_inode_set_file_extent_range(inode,
+ cur_offset, hole_size);
+ if (err)
+ break;
+ }
+next:
+ free_extent_map(em);
+ em = NULL;
+ cur_offset = last_byte;
+ if (cur_offset >= block_end)
+ break;
+ }
+ free_extent_map(em);
+ unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
+ return err;
+}
+
+static int btrfs_setsize(struct inode *inode, struct iattr *attr)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ loff_t oldsize = i_size_read(inode);
+ loff_t newsize = attr->ia_size;
+ int mask = attr->ia_valid;
+ int ret;
+
+ /*
+ * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
+ * special case where we need to update the times despite not having
+ * these flags set. For all other operations the VFS set these flags
+ * explicitly if it wants a timestamp update.
+ */
+ if (newsize != oldsize) {
+ inode_inc_iversion(inode);
+ if (!(mask & (ATTR_CTIME | ATTR_MTIME))) {
+ inode->i_mtime = current_time(inode);
+ inode->i_ctime = inode->i_mtime;
+ }
+ }
+
+ if (newsize > oldsize) {
+ /*
+ * Don't do an expanding truncate while snapshotting is ongoing.
+ * This is to ensure the snapshot captures a fully consistent
+ * state of this file - if the snapshot captures this expanding
+ * truncation, it must capture all writes that happened before
+ * this truncation.
+ */
+ btrfs_drew_write_lock(&root->snapshot_lock);
+ ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize);
+ if (ret) {
+ btrfs_drew_write_unlock(&root->snapshot_lock);
+ return ret;
+ }
+
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ btrfs_drew_write_unlock(&root->snapshot_lock);
+ return PTR_ERR(trans);
+ }
+
+ i_size_write(inode, newsize);
+ btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
+ pagecache_isize_extended(inode, oldsize, newsize);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ btrfs_drew_write_unlock(&root->snapshot_lock);
+ btrfs_end_transaction(trans);
+ } else {
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+
+ if (btrfs_is_zoned(fs_info)) {
+ ret = btrfs_wait_ordered_range(inode,
+ ALIGN(newsize, fs_info->sectorsize),
+ (u64)-1);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * We're truncating a file that used to have good data down to
+ * zero. Make sure any new writes to the file get on disk
+ * on close.
+ */
+ if (newsize == 0)
+ set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
+ &BTRFS_I(inode)->runtime_flags);
+
+ truncate_setsize(inode, newsize);
+
+ inode_dio_wait(inode);
+
+ ret = btrfs_truncate(inode, newsize == oldsize);
+ if (ret && inode->i_nlink) {
+ int err;
+
+ /*
+ * Truncate failed, so fix up the in-memory size. We
+ * adjusted disk_i_size down as we removed extents, so
+ * wait for disk_i_size to be stable and then update the
+ * in-memory size to match.
+ */
+ err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ if (err)
+ return err;
+ i_size_write(inode, BTRFS_I(inode)->disk_i_size);
+ }
+ }
+
+ return ret;
+}
+
+static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct iattr *attr)
+{
+ struct inode *inode = d_inode(dentry);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int err;
+
+ if (btrfs_root_readonly(root))
+ return -EROFS;
+
+ err = setattr_prepare(mnt_userns, dentry, attr);
+ if (err)
+ return err;
+
+ if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
+ err = btrfs_setsize(inode, attr);
+ if (err)
+ return err;
+ }
+
+ if (attr->ia_valid) {
+ setattr_copy(mnt_userns, inode, attr);
+ inode_inc_iversion(inode);
+ err = btrfs_dirty_inode(inode);
+
+ if (!err && attr->ia_valid & ATTR_MODE)
+ err = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
+ }
+
+ return err;
+}
+
+/*
+ * While truncating the inode pages during eviction, we get the VFS
+ * calling btrfs_invalidate_folio() against each folio of the inode. This
+ * is slow because the calls to btrfs_invalidate_folio() result in a
+ * huge amount of calls to lock_extent() and clear_extent_bit(),
+ * which keep merging and splitting extent_state structures over and over,
+ * wasting lots of time.
+ *
+ * Therefore if the inode is being evicted, let btrfs_invalidate_folio()
+ * skip all those expensive operations on a per folio basis and do only
+ * the ordered io finishing, while we release here the extent_map and
+ * extent_state structures, without the excessive merging and splitting.
+ */
+static void evict_inode_truncate_pages(struct inode *inode)
+{
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct rb_node *node;
+
+ ASSERT(inode->i_state & I_FREEING);
+ truncate_inode_pages_final(&inode->i_data);
+
+ btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
+
+ /*
+ * Keep looping until we have no more ranges in the io tree.
+ * We can have ongoing bios started by readahead that have
+ * their endio callback (extent_io.c:end_bio_extent_readpage)
+ * still in progress (unlocked the pages in the bio but did not yet
+ * unlocked the ranges in the io tree). Therefore this means some
+ * ranges can still be locked and eviction started because before
+ * submitting those bios, which are executed by a separate task (work
+ * queue kthread), inode references (inode->i_count) were not taken
+ * (which would be dropped in the end io callback of each bio).
+ * Therefore here we effectively end up waiting for those bios and
+ * anyone else holding locked ranges without having bumped the inode's
+ * reference count - if we don't do it, when they access the inode's
+ * io_tree to unlock a range it may be too late, leading to an
+ * use-after-free issue.
+ */
+ spin_lock(&io_tree->lock);
+ while (!RB_EMPTY_ROOT(&io_tree->state)) {
+ struct extent_state *state;
+ struct extent_state *cached_state = NULL;
+ u64 start;
+ u64 end;
+ unsigned state_flags;
+
+ node = rb_first(&io_tree->state);
+ state = rb_entry(node, struct extent_state, rb_node);
+ start = state->start;
+ end = state->end;
+ state_flags = state->state;
+ spin_unlock(&io_tree->lock);
+
+ lock_extent(io_tree, start, end, &cached_state);
+
+ /*
+ * If still has DELALLOC flag, the extent didn't reach disk,
+ * and its reserved space won't be freed by delayed_ref.
+ * So we need to free its reserved space here.
+ * (Refer to comment in btrfs_invalidate_folio, case 2)
+ *
+ * Note, end is the bytenr of last byte, so we need + 1 here.
+ */
+ if (state_flags & EXTENT_DELALLOC)
+ btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
+ end - start + 1, NULL);
+
+ clear_extent_bit(io_tree, start, end,
+ EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
+ &cached_state);
+
+ cond_resched();
+ spin_lock(&io_tree->lock);
+ }
+ spin_unlock(&io_tree->lock);
+}
+
+static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_trans_handle *trans;
+ u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1);
+ int ret;
+
+ /*
+ * Eviction should be taking place at some place safe because of our
+ * delayed iputs. However the normal flushing code will run delayed
+ * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock.
+ *
+ * We reserve the delayed_refs_extra here again because we can't use
+ * btrfs_start_transaction(root, 0) for the same deadlocky reason as
+ * above. We reserve our extra bit here because we generate a ton of
+ * delayed refs activity by truncating.
+ *
+ * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can,
+ * if we fail to make this reservation we can re-try without the
+ * delayed_refs_extra so we can make some forward progress.
+ */
+ ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra,
+ BTRFS_RESERVE_FLUSH_EVICT);
+ if (ret) {
+ ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size,
+ BTRFS_RESERVE_FLUSH_EVICT);
+ if (ret) {
+ btrfs_warn(fs_info,
+ "could not allocate space for delete; will truncate on mount");
+ return ERR_PTR(-ENOSPC);
+ }
+ delayed_refs_extra = 0;
+ }
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans))
+ return trans;
+
+ if (delayed_refs_extra) {
+ trans->block_rsv = &fs_info->trans_block_rsv;
+ trans->bytes_reserved = delayed_refs_extra;
+ btrfs_block_rsv_migrate(rsv, trans->block_rsv,
+ delayed_refs_extra, 1);
+ }
+ return trans;
+}
+
+void btrfs_evict_inode(struct inode *inode)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_block_rsv *rsv;
+ int ret;
+
+ trace_btrfs_inode_evict(inode);
+
+ if (!root) {
+ fsverity_cleanup_inode(inode);
+ clear_inode(inode);
+ return;
+ }
+
+ evict_inode_truncate_pages(inode);
+
+ if (inode->i_nlink &&
+ ((btrfs_root_refs(&root->root_item) != 0 &&
+ root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
+ btrfs_is_free_space_inode(BTRFS_I(inode))))
+ goto no_delete;
+
+ if (is_bad_inode(inode))
+ goto no_delete;
+
+ btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1);
+
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
+ goto no_delete;
+
+ if (inode->i_nlink > 0) {
+ BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
+ root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
+ goto no_delete;
+ }
+
+ /*
+ * This makes sure the inode item in tree is uptodate and the space for
+ * the inode update is released.
+ */
+ ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
+ if (ret)
+ goto no_delete;
+
+ /*
+ * This drops any pending insert or delete operations we have for this
+ * inode. We could have a delayed dir index deletion queued up, but
+ * we're removing the inode completely so that'll be taken care of in
+ * the truncate.
+ */
+ btrfs_kill_delayed_inode_items(BTRFS_I(inode));
+
+ rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
+ if (!rsv)
+ goto no_delete;
+ rsv->size = btrfs_calc_metadata_size(fs_info, 1);
+ rsv->failfast = true;
+
+ btrfs_i_size_write(BTRFS_I(inode), 0);
+
+ while (1) {
+ struct btrfs_truncate_control control = {
+ .inode = BTRFS_I(inode),
+ .ino = btrfs_ino(BTRFS_I(inode)),
+ .new_size = 0,
+ .min_type = 0,
+ };
+
+ trans = evict_refill_and_join(root, rsv);
+ if (IS_ERR(trans))
+ goto free_rsv;
+
+ trans->block_rsv = rsv;
+
+ ret = btrfs_truncate_inode_items(trans, root, &control);
+ trans->block_rsv = &fs_info->trans_block_rsv;
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty(fs_info);
+ if (ret && ret != -ENOSPC && ret != -EAGAIN)
+ goto free_rsv;
+ else if (!ret)
+ break;
+ }
+
+ /*
+ * Errors here aren't a big deal, it just means we leave orphan items in
+ * the tree. They will be cleaned up on the next mount. If the inode
+ * number gets reused, cleanup deletes the orphan item without doing
+ * anything, and unlink reuses the existing orphan item.
+ *
+ * If it turns out that we are dropping too many of these, we might want
+ * to add a mechanism for retrying these after a commit.
+ */
+ trans = evict_refill_and_join(root, rsv);
+ if (!IS_ERR(trans)) {
+ trans->block_rsv = rsv;
+ btrfs_orphan_del(trans, BTRFS_I(inode));
+ trans->block_rsv = &fs_info->trans_block_rsv;
+ btrfs_end_transaction(trans);
+ }
+
+free_rsv:
+ btrfs_free_block_rsv(fs_info, rsv);
+no_delete:
+ /*
+ * If we didn't successfully delete, the orphan item will still be in
+ * the tree and we'll retry on the next mount. Again, we might also want
+ * to retry these periodically in the future.
+ */
+ btrfs_remove_delayed_node(BTRFS_I(inode));
+ fsverity_cleanup_inode(inode);
+ clear_inode(inode);
+}
+
+/*
+ * Return the key found in the dir entry in the location pointer, fill @type
+ * with BTRFS_FT_*, and return 0.
+ *
+ * If no dir entries were found, returns -ENOENT.
+ * If found a corrupted location in dir entry, returns -EUCLEAN.
+ */
+static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
+ struct btrfs_key *location, u8 *type)
+{
+ struct btrfs_dir_item *di;
+ struct btrfs_path *path;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ int ret = 0;
+ struct fscrypt_name fname;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
+ if (ret)
+ goto out;
+
+ /* This needs to handle no-key deletions later on */
+
+ di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)),
+ &fname.disk_name, 0);
+ if (IS_ERR_OR_NULL(di)) {
+ ret = di ? PTR_ERR(di) : -ENOENT;
+ goto out;
+ }
+
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
+ if (location->type != BTRFS_INODE_ITEM_KEY &&
+ location->type != BTRFS_ROOT_ITEM_KEY) {
+ ret = -EUCLEAN;
+ btrfs_warn(root->fs_info,
+"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
+ __func__, fname.disk_name.name, btrfs_ino(BTRFS_I(dir)),
+ location->objectid, location->type, location->offset);
+ }
+ if (!ret)
+ *type = btrfs_dir_type(path->nodes[0], di);
+out:
+ fscrypt_free_filename(&fname);
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * when we hit a tree root in a directory, the btrfs part of the inode
+ * needs to be changed to reflect the root directory of the tree root. This
+ * is kind of like crossing a mount point.
+ */
+static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
+ struct inode *dir,
+ struct dentry *dentry,
+ struct btrfs_key *location,
+ struct btrfs_root **sub_root)
+{
+ struct btrfs_path *path;
+ struct btrfs_root *new_root;
+ struct btrfs_root_ref *ref;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ int ret;
+ int err = 0;
+ struct fscrypt_name fname;
+
+ ret = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname);
+ if (ret)
+ return ret;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = -ENOENT;
+ key.objectid = BTRFS_I(dir)->root->root_key.objectid;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = location->objectid;
+
+ ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
+ if (ret) {
+ if (ret < 0)
+ err = ret;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
+ if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) ||
+ btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len)
+ goto out;
+
+ ret = memcmp_extent_buffer(leaf, fname.disk_name.name,
+ (unsigned long)(ref + 1), fname.disk_name.len);
+ if (ret)
+ goto out;
+
+ btrfs_release_path(path);
+
+ new_root = btrfs_get_fs_root(fs_info, location->objectid, true);
+ if (IS_ERR(new_root)) {
+ err = PTR_ERR(new_root);
+ goto out;
+ }
+
+ *sub_root = new_root;
+ location->objectid = btrfs_root_dirid(&new_root->root_item);
+ location->type = BTRFS_INODE_ITEM_KEY;
+ location->offset = 0;
+ err = 0;
+out:
+ btrfs_free_path(path);
+ fscrypt_free_filename(&fname);
+ return err;
+}
+
+static void inode_tree_add(struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_inode *entry;
+ struct rb_node **p;
+ struct rb_node *parent;
+ struct rb_node *new = &BTRFS_I(inode)->rb_node;
+ u64 ino = btrfs_ino(BTRFS_I(inode));
+
+ if (inode_unhashed(inode))
+ return;
+ parent = NULL;
+ spin_lock(&root->inode_lock);
+ p = &root->inode_tree.rb_node;
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct btrfs_inode, rb_node);
+
+ if (ino < btrfs_ino(entry))
+ p = &parent->rb_left;
+ else if (ino > btrfs_ino(entry))
+ p = &parent->rb_right;
+ else {
+ WARN_ON(!(entry->vfs_inode.i_state &
+ (I_WILL_FREE | I_FREEING)));
+ rb_replace_node(parent, new, &root->inode_tree);
+ RB_CLEAR_NODE(parent);
+ spin_unlock(&root->inode_lock);
+ return;
+ }
+ }
+ rb_link_node(new, parent, p);
+ rb_insert_color(new, &root->inode_tree);
+ spin_unlock(&root->inode_lock);
+}
+
+static void inode_tree_del(struct btrfs_inode *inode)
+{
+ struct btrfs_root *root = inode->root;
+ int empty = 0;
+
+ spin_lock(&root->inode_lock);
+ if (!RB_EMPTY_NODE(&inode->rb_node)) {
+ rb_erase(&inode->rb_node, &root->inode_tree);
+ RB_CLEAR_NODE(&inode->rb_node);
+ empty = RB_EMPTY_ROOT(&root->inode_tree);
+ }
+ spin_unlock(&root->inode_lock);
+
+ if (empty && btrfs_root_refs(&root->root_item) == 0) {
+ spin_lock(&root->inode_lock);
+ empty = RB_EMPTY_ROOT(&root->inode_tree);
+ spin_unlock(&root->inode_lock);
+ if (empty)
+ btrfs_add_dead_root(root);
+ }
+}
+
+
+static int btrfs_init_locked_inode(struct inode *inode, void *p)
+{
+ struct btrfs_iget_args *args = p;
+
+ inode->i_ino = args->ino;
+ BTRFS_I(inode)->location.objectid = args->ino;
+ BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+ BTRFS_I(inode)->location.offset = 0;
+ BTRFS_I(inode)->root = btrfs_grab_root(args->root);
+ BUG_ON(args->root && !BTRFS_I(inode)->root);
+
+ if (args->root && args->root == args->root->fs_info->tree_root &&
+ args->ino != BTRFS_BTREE_INODE_OBJECTID)
+ set_bit(BTRFS_INODE_FREE_SPACE_INODE,
+ &BTRFS_I(inode)->runtime_flags);
+ return 0;
+}
+
+static int btrfs_find_actor(struct inode *inode, void *opaque)
+{
+ struct btrfs_iget_args *args = opaque;
+
+ return args->ino == BTRFS_I(inode)->location.objectid &&
+ args->root == BTRFS_I(inode)->root;
+}
+
+static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
+ struct btrfs_root *root)
+{
+ struct inode *inode;
+ struct btrfs_iget_args args;
+ unsigned long hashval = btrfs_inode_hash(ino, root);
+
+ args.ino = ino;
+ args.root = root;
+
+ inode = iget5_locked(s, hashval, btrfs_find_actor,
+ btrfs_init_locked_inode,
+ (void *)&args);
+ return inode;
+}
+
+/*
+ * Get an inode object given its inode number and corresponding root.
+ * Path can be preallocated to prevent recursing back to iget through
+ * allocator. NULL is also valid but may require an additional allocation
+ * later.
+ */
+struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
+ struct btrfs_root *root, struct btrfs_path *path)
+{
+ struct inode *inode;
+
+ inode = btrfs_iget_locked(s, ino, root);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ if (inode->i_state & I_NEW) {
+ int ret;
+
+ ret = btrfs_read_locked_inode(inode, path);
+ if (!ret) {
+ inode_tree_add(inode);
+ unlock_new_inode(inode);
+ } else {
+ iget_failed(inode);
+ /*
+ * ret > 0 can come from btrfs_search_slot called by
+ * btrfs_read_locked_inode, this means the inode item
+ * was not found.
+ */
+ if (ret > 0)
+ ret = -ENOENT;
+ inode = ERR_PTR(ret);
+ }
+ }
+
+ return inode;
+}
+
+struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root)
+{
+ return btrfs_iget_path(s, ino, root, NULL);
+}
+
+static struct inode *new_simple_dir(struct super_block *s,
+ struct btrfs_key *key,
+ struct btrfs_root *root)
+{
+ struct inode *inode = new_inode(s);
+
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ BTRFS_I(inode)->root = btrfs_grab_root(root);
+ memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
+ set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
+
+ inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
+ /*
+ * We only need lookup, the rest is read-only and there's no inode
+ * associated with the dentry
+ */
+ inode->i_op = &simple_dir_inode_operations;
+ inode->i_opflags &= ~IOP_XATTR;
+ inode->i_fop = &simple_dir_operations;
+ inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
+ inode->i_mtime = current_time(inode);
+ inode->i_atime = inode->i_mtime;
+ inode->i_ctime = inode->i_mtime;
+ BTRFS_I(inode)->i_otime = inode->i_mtime;
+
+ return inode;
+}
+
+static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN);
+static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE);
+static_assert(BTRFS_FT_DIR == FT_DIR);
+static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV);
+static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV);
+static_assert(BTRFS_FT_FIFO == FT_FIFO);
+static_assert(BTRFS_FT_SOCK == FT_SOCK);
+static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK);
+
+static inline u8 btrfs_inode_type(struct inode *inode)
+{
+ return fs_umode_to_ftype(inode->i_mode);
+}
+
+struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct inode *inode;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_root *sub_root = root;
+ struct btrfs_key location;
+ u8 di_type = 0;
+ int ret = 0;
+
+ if (dentry->d_name.len > BTRFS_NAME_LEN)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ ret = btrfs_inode_by_name(dir, dentry, &location, &di_type);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ if (location.type == BTRFS_INODE_ITEM_KEY) {
+ inode = btrfs_iget(dir->i_sb, location.objectid, root);
+ if (IS_ERR(inode))
+ return inode;
+
+ /* Do extra check against inode mode with di_type */
+ if (btrfs_inode_type(inode) != di_type) {
+ btrfs_crit(fs_info,
+"inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
+ inode->i_mode, btrfs_inode_type(inode),
+ di_type);
+ iput(inode);
+ return ERR_PTR(-EUCLEAN);
+ }
+ return inode;
+ }
+
+ ret = fixup_tree_root_location(fs_info, dir, dentry,
+ &location, &sub_root);
+ if (ret < 0) {
+ if (ret != -ENOENT)
+ inode = ERR_PTR(ret);
+ else
+ inode = new_simple_dir(dir->i_sb, &location, root);
+ } else {
+ inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
+ btrfs_put_root(sub_root);
+
+ if (IS_ERR(inode))
+ return inode;
+
+ down_read(&fs_info->cleanup_work_sem);
+ if (!sb_rdonly(inode->i_sb))
+ ret = btrfs_orphan_cleanup(sub_root);
+ up_read(&fs_info->cleanup_work_sem);
+ if (ret) {
+ iput(inode);
+ inode = ERR_PTR(ret);
+ }
+ }
+
+ return inode;
+}
+
+static int btrfs_dentry_delete(const struct dentry *dentry)
+{
+ struct btrfs_root *root;
+ struct inode *inode = d_inode(dentry);
+
+ if (!inode && !IS_ROOT(dentry))
+ inode = d_inode(dentry->d_parent);
+
+ if (inode) {
+ root = BTRFS_I(inode)->root;
+ if (btrfs_root_refs(&root->root_item) == 0)
+ return 1;
+
+ if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
+ return 1;
+ }
+ return 0;
+}
+
+static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct inode *inode = btrfs_lookup_dentry(dir, dentry);
+
+ if (inode == ERR_PTR(-ENOENT))
+ inode = NULL;
+ return d_splice_alias(inode, dentry);
+}
+
+/*
+ * Find the highest existing sequence number in a directory and then set the
+ * in-memory index_cnt variable to the first free sequence number.
+ */
+static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_key key, found_key;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ int ret;
+
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_DIR_INDEX_KEY;
+ key.offset = (u64)-1;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ /* FIXME: we should be able to handle this */
+ if (ret == 0)
+ goto out;
+ ret = 0;
+
+ if (path->slots[0] == 0) {
+ inode->index_cnt = BTRFS_DIR_START_INDEX;
+ goto out;
+ }
+
+ path->slots[0]--;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ if (found_key.objectid != btrfs_ino(inode) ||
+ found_key.type != BTRFS_DIR_INDEX_KEY) {
+ inode->index_cnt = BTRFS_DIR_START_INDEX;
+ goto out;
+ }
+
+ inode->index_cnt = found_key.offset + 1;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index)
+{
+ int ret = 0;
+
+ btrfs_inode_lock(&dir->vfs_inode, 0);
+ if (dir->index_cnt == (u64)-1) {
+ ret = btrfs_inode_delayed_dir_index_count(dir);
+ if (ret) {
+ ret = btrfs_set_inode_index_count(dir);
+ if (ret)
+ goto out;
+ }
+ }
+
+ /* index_cnt is the index number of next new entry, so decrement it. */
+ *index = dir->index_cnt - 1;
+out:
+ btrfs_inode_unlock(&dir->vfs_inode, 0);
+
+ return ret;
+}
+
+/*
+ * All this infrastructure exists because dir_emit can fault, and we are holding
+ * the tree lock when doing readdir. For now just allocate a buffer and copy
+ * our information into that, and then dir_emit from the buffer. This is
+ * similar to what NFS does, only we don't keep the buffer around in pagecache
+ * because I'm afraid I'll mess that up. Long term we need to make filldir do
+ * copy_to_user_inatomic so we don't have to worry about page faulting under the
+ * tree lock.
+ */
+static int btrfs_opendir(struct inode *inode, struct file *file)
+{
+ struct btrfs_file_private *private;
+ u64 last_index;
+ int ret;
+
+ ret = btrfs_get_dir_last_index(BTRFS_I(inode), &last_index);
+ if (ret)
+ return ret;
+
+ private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL);
+ if (!private)
+ return -ENOMEM;
+ private->last_index = last_index;
+ private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!private->filldir_buf) {
+ kfree(private);
+ return -ENOMEM;
+ }
+ file->private_data = private;
+ return 0;
+}
+
+static loff_t btrfs_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct btrfs_file_private *private = file->private_data;
+ int ret;
+
+ ret = btrfs_get_dir_last_index(BTRFS_I(file_inode(file)),
+ &private->last_index);
+ if (ret)
+ return ret;
+
+ return generic_file_llseek(file, offset, whence);
+}
+
+struct dir_entry {
+ u64 ino;
+ u64 offset;
+ unsigned type;
+ int name_len;
+};
+
+static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
+{
+ while (entries--) {
+ struct dir_entry *entry = addr;
+ char *name = (char *)(entry + 1);
+
+ ctx->pos = get_unaligned(&entry->offset);
+ if (!dir_emit(ctx, name, get_unaligned(&entry->name_len),
+ get_unaligned(&entry->ino),
+ get_unaligned(&entry->type)))
+ return 1;
+ addr += sizeof(struct dir_entry) +
+ get_unaligned(&entry->name_len);
+ ctx->pos++;
+ }
+ return 0;
+}
+
+static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
+{
+ struct inode *inode = file_inode(file);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_file_private *private = file->private_data;
+ struct btrfs_dir_item *di;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_path *path;
+ void *addr;
+ struct list_head ins_list;
+ struct list_head del_list;
+ int ret;
+ char *name_ptr;
+ int name_len;
+ int entries = 0;
+ int total_len = 0;
+ bool put = false;
+ struct btrfs_key location;
+
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ addr = private->filldir_buf;
+ path->reada = READA_FORWARD;
+
+ INIT_LIST_HEAD(&ins_list);
+ INIT_LIST_HEAD(&del_list);
+ put = btrfs_readdir_get_delayed_items(inode, private->last_index,
+ &ins_list, &del_list);
+
+again:
+ key.type = BTRFS_DIR_INDEX_KEY;
+ key.offset = ctx->pos;
+ key.objectid = btrfs_ino(BTRFS_I(inode));
+
+ btrfs_for_each_slot(root, &key, &found_key, path, ret) {
+ struct dir_entry *entry;
+ struct extent_buffer *leaf = path->nodes[0];
+
+ if (found_key.objectid != key.objectid)
+ break;
+ if (found_key.type != BTRFS_DIR_INDEX_KEY)
+ break;
+ if (found_key.offset < ctx->pos)
+ continue;
+ if (found_key.offset > private->last_index)
+ break;
+ if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
+ continue;
+ di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
+ name_len = btrfs_dir_name_len(leaf, di);
+ if ((total_len + sizeof(struct dir_entry) + name_len) >=
+ PAGE_SIZE) {
+ btrfs_release_path(path);
+ ret = btrfs_filldir(private->filldir_buf, entries, ctx);
+ if (ret)
+ goto nopos;
+ addr = private->filldir_buf;
+ entries = 0;
+ total_len = 0;
+ goto again;
+ }
+
+ entry = addr;
+ put_unaligned(name_len, &entry->name_len);
+ name_ptr = (char *)(entry + 1);
+ read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1),
+ name_len);
+ put_unaligned(fs_ftype_to_dtype(btrfs_dir_type(leaf, di)),
+ &entry->type);
+ btrfs_dir_item_key_to_cpu(leaf, di, &location);
+ put_unaligned(location.objectid, &entry->ino);
+ put_unaligned(found_key.offset, &entry->offset);
+ entries++;
+ addr += sizeof(struct dir_entry) + name_len;
+ total_len += sizeof(struct dir_entry) + name_len;
+ }
+ /* Catch error encountered during iteration */
+ if (ret < 0)
+ goto err;
+
+ btrfs_release_path(path);
+
+ ret = btrfs_filldir(private->filldir_buf, entries, ctx);
+ if (ret)
+ goto nopos;
+
+ ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
+ if (ret)
+ goto nopos;
+
+ /*
+ * Stop new entries from being returned after we return the last
+ * entry.
+ *
+ * New directory entries are assigned a strictly increasing
+ * offset. This means that new entries created during readdir
+ * are *guaranteed* to be seen in the future by that readdir.
+ * This has broken buggy programs which operate on names as
+ * they're returned by readdir. Until we re-use freed offsets
+ * we have this hack to stop new entries from being returned
+ * under the assumption that they'll never reach this huge
+ * offset.
+ *
+ * This is being careful not to overflow 32bit loff_t unless the
+ * last entry requires it because doing so has broken 32bit apps
+ * in the past.
+ */
+ if (ctx->pos >= INT_MAX)
+ ctx->pos = LLONG_MAX;
+ else
+ ctx->pos = INT_MAX;
+nopos:
+ ret = 0;
+err:
+ if (put)
+ btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list);
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * This is somewhat expensive, updating the tree every time the
+ * inode changes. But, it is most likely to find the inode in cache.
+ * FIXME, needs more benchmarking...there are no reasons other than performance
+ * to keep or drop this code.
+ */
+static int btrfs_dirty_inode(struct inode *inode)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
+ return 0;
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ if (ret && (ret == -ENOSPC || ret == -EDQUOT)) {
+ /* whoops, lets try again with the full transaction */
+ btrfs_end_transaction(trans);
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ }
+ btrfs_end_transaction(trans);
+ if (BTRFS_I(inode)->delayed_node)
+ btrfs_balance_delayed_items(fs_info);
+
+ return ret;
+}
+
+/*
+ * This is a copy of file_update_time. We need this so we can return error on
+ * ENOSPC for updating the inode in the case of file write and mmap writes.
+ */
+static int btrfs_update_time(struct inode *inode, struct timespec64 *now,
+ int flags)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ bool dirty = flags & ~S_VERSION;
+
+ if (btrfs_root_readonly(root))
+ return -EROFS;
+
+ if (flags & S_VERSION)
+ dirty |= inode_maybe_inc_iversion(inode, dirty);
+ if (flags & S_CTIME)
+ inode->i_ctime = *now;
+ if (flags & S_MTIME)
+ inode->i_mtime = *now;
+ if (flags & S_ATIME)
+ inode->i_atime = *now;
+ return dirty ? btrfs_dirty_inode(inode) : 0;
+}
+
+/*
+ * helper to find a free sequence number in a given directory. This current
+ * code is very simple, later versions will do smarter things in the btree
+ */
+int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index)
+{
+ int ret = 0;
+
+ if (dir->index_cnt == (u64)-1) {
+ ret = btrfs_inode_delayed_dir_index_count(dir);
+ if (ret) {
+ ret = btrfs_set_inode_index_count(dir);
+ if (ret)
+ return ret;
+ }
+ }
+
+ *index = dir->index_cnt;
+ dir->index_cnt++;
+
+ return ret;
+}
+
+static int btrfs_insert_inode_locked(struct inode *inode)
+{
+ struct btrfs_iget_args args;
+
+ args.ino = BTRFS_I(inode)->location.objectid;
+ args.root = BTRFS_I(inode)->root;
+
+ return insert_inode_locked4(inode,
+ btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
+ btrfs_find_actor, &args);
+}
+
+int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
+ unsigned int *trans_num_items)
+{
+ struct inode *dir = args->dir;
+ struct inode *inode = args->inode;
+ int ret;
+
+ if (!args->orphan) {
+ ret = fscrypt_setup_filename(dir, &args->dentry->d_name, 0,
+ &args->fname);
+ if (ret)
+ return ret;
+ }
+
+ ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl);
+ if (ret) {
+ fscrypt_free_filename(&args->fname);
+ return ret;
+ }
+
+ /* 1 to add inode item */
+ *trans_num_items = 1;
+ /* 1 to add compression property */
+ if (BTRFS_I(dir)->prop_compress)
+ (*trans_num_items)++;
+ /* 1 to add default ACL xattr */
+ if (args->default_acl)
+ (*trans_num_items)++;
+ /* 1 to add access ACL xattr */
+ if (args->acl)
+ (*trans_num_items)++;
+#ifdef CONFIG_SECURITY
+ /* 1 to add LSM xattr */
+ if (dir->i_security)
+ (*trans_num_items)++;
+#endif
+ if (args->orphan) {
+ /* 1 to add orphan item */
+ (*trans_num_items)++;
+ } else {
+ /*
+ * 1 to add dir item
+ * 1 to add dir index
+ * 1 to update parent inode item
+ *
+ * No need for 1 unit for the inode ref item because it is
+ * inserted in a batch together with the inode item at
+ * btrfs_create_new_inode().
+ */
+ *trans_num_items += 3;
+ }
+ return 0;
+}
+
+void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args)
+{
+ posix_acl_release(args->acl);
+ posix_acl_release(args->default_acl);
+ fscrypt_free_filename(&args->fname);
+}
+
+/*
+ * Inherit flags from the parent inode.
+ *
+ * Currently only the compression flags and the cow flags are inherited.
+ */
+static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
+{
+ unsigned int flags;
+
+ flags = BTRFS_I(dir)->flags;
+
+ if (flags & BTRFS_INODE_NOCOMPRESS) {
+ BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
+ } else if (flags & BTRFS_INODE_COMPRESS) {
+ BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
+ BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
+ }
+
+ if (flags & BTRFS_INODE_NODATACOW) {
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
+ if (S_ISREG(inode->i_mode))
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
+ }
+
+ btrfs_sync_inode_flags_to_i_flags(inode);
+}
+
+int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_new_inode_args *args)
+{
+ struct inode *dir = args->dir;
+ struct inode *inode = args->inode;
+ const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name;
+ struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct btrfs_root *root;
+ struct btrfs_inode_item *inode_item;
+ struct btrfs_key *location;
+ struct btrfs_path *path;
+ u64 objectid;
+ struct btrfs_inode_ref *ref;
+ struct btrfs_key key[2];
+ u32 sizes[2];
+ struct btrfs_item_batch batch;
+ unsigned long ptr;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ if (!args->subvol)
+ BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root);
+ root = BTRFS_I(inode)->root;
+
+ ret = btrfs_get_free_objectid(root, &objectid);
+ if (ret)
+ goto out;
+ inode->i_ino = objectid;
+
+ if (args->orphan) {
+ /*
+ * O_TMPFILE, set link count to 0, so that after this point, we
+ * fill in an inode item with the correct link count.
+ */
+ set_nlink(inode, 0);
+ } else {
+ trace_btrfs_inode_request(dir);
+
+ ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index);
+ if (ret)
+ goto out;
+ }
+ /* index_cnt is ignored for everything but a dir. */
+ BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX;
+ BTRFS_I(inode)->generation = trans->transid;
+ inode->i_generation = BTRFS_I(inode)->generation;
+
+ /*
+ * Subvolumes don't inherit flags from their parent directory.
+ * Originally this was probably by accident, but we probably can't
+ * change it now without compatibility issues.
+ */
+ if (!args->subvol)
+ btrfs_inherit_iflags(inode, dir);
+
+ if (S_ISREG(inode->i_mode)) {
+ if (btrfs_test_opt(fs_info, NODATASUM))
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
+ if (btrfs_test_opt(fs_info, NODATACOW))
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_NODATASUM;
+ }
+
+ location = &BTRFS_I(inode)->location;
+ location->objectid = objectid;
+ location->offset = 0;
+ location->type = BTRFS_INODE_ITEM_KEY;
+
+ ret = btrfs_insert_inode_locked(inode);
+ if (ret < 0) {
+ if (!args->orphan)
+ BTRFS_I(dir)->index_cnt--;
+ goto out;
+ }
+
+ /*
+ * We could have gotten an inode number from somebody who was fsynced
+ * and then removed in this same transaction, so let's just set full
+ * sync since it will be a full sync anyway and this will blow away the
+ * old info in the log.
+ */
+ btrfs_set_inode_full_sync(BTRFS_I(inode));
+
+ key[0].objectid = objectid;
+ key[0].type = BTRFS_INODE_ITEM_KEY;
+ key[0].offset = 0;
+
+ sizes[0] = sizeof(struct btrfs_inode_item);
+
+ if (!args->orphan) {
+ /*
+ * Start new inodes with an inode_ref. This is slightly more
+ * efficient for small numbers of hard links since they will
+ * be packed into one item. Extended refs will kick in if we
+ * add more hard links than can fit in the ref item.
+ */
+ key[1].objectid = objectid;
+ key[1].type = BTRFS_INODE_REF_KEY;
+ if (args->subvol) {
+ key[1].offset = objectid;
+ sizes[1] = 2 + sizeof(*ref);
+ } else {
+ key[1].offset = btrfs_ino(BTRFS_I(dir));
+ sizes[1] = name->len + sizeof(*ref);
+ }
+ }
+
+ batch.keys = &key[0];
+ batch.data_sizes = &sizes[0];
+ batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]);
+ batch.nr = args->orphan ? 1 : 2;
+ ret = btrfs_insert_empty_items(trans, root, path, &batch);
+ if (ret != 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto discard;
+ }
+
+ inode->i_mtime = current_time(inode);
+ inode->i_atime = inode->i_mtime;
+ inode->i_ctime = inode->i_mtime;
+ BTRFS_I(inode)->i_otime = inode->i_mtime;
+
+ /*
+ * We're going to fill the inode item now, so at this point the inode
+ * must be fully initialized.
+ */
+
+ inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_item);
+ memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
+ sizeof(*inode_item));
+ fill_inode_item(trans, path->nodes[0], inode_item, inode);
+
+ if (!args->orphan) {
+ ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
+ struct btrfs_inode_ref);
+ ptr = (unsigned long)(ref + 1);
+ if (args->subvol) {
+ btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2);
+ btrfs_set_inode_ref_index(path->nodes[0], ref, 0);
+ write_extent_buffer(path->nodes[0], "..", ptr, 2);
+ } else {
+ btrfs_set_inode_ref_name_len(path->nodes[0], ref,
+ name->len);
+ btrfs_set_inode_ref_index(path->nodes[0], ref,
+ BTRFS_I(inode)->dir_index);
+ write_extent_buffer(path->nodes[0], name->name, ptr,
+ name->len);
+ }
+ }
+
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+ /*
+ * We don't need the path anymore, plus inheriting properties, adding
+ * ACLs, security xattrs, orphan item or adding the link, will result in
+ * allocating yet another path. So just free our path.
+ */
+ btrfs_free_path(path);
+ path = NULL;
+
+ if (args->subvol) {
+ struct inode *parent;
+
+ /*
+ * Subvolumes inherit properties from their parent subvolume,
+ * not the directory they were created in.
+ */
+ parent = btrfs_iget(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID,
+ BTRFS_I(dir)->root);
+ if (IS_ERR(parent)) {
+ ret = PTR_ERR(parent);
+ } else {
+ ret = btrfs_inode_inherit_props(trans, inode, parent);
+ iput(parent);
+ }
+ } else {
+ ret = btrfs_inode_inherit_props(trans, inode, dir);
+ }
+ if (ret) {
+ btrfs_err(fs_info,
+ "error inheriting props for ino %llu (root %llu): %d",
+ btrfs_ino(BTRFS_I(inode)), root->root_key.objectid,
+ ret);
+ }
+
+ /*
+ * Subvolumes don't inherit ACLs or get passed to the LSM. This is
+ * probably a bug.
+ */
+ if (!args->subvol) {
+ ret = btrfs_init_inode_security(trans, args);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto discard;
+ }
+ }
+
+ inode_tree_add(inode);
+
+ trace_btrfs_inode_new(inode);
+ btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
+
+ btrfs_update_root_times(trans, root);
+
+ if (args->orphan) {
+ ret = btrfs_orphan_add(trans, BTRFS_I(inode));
+ } else {
+ ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
+ 0, BTRFS_I(inode)->dir_index);
+ }
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto discard;
+ }
+
+ return 0;
+
+discard:
+ /*
+ * discard_new_inode() calls iput(), but the caller owns the reference
+ * to the inode.
+ */
+ ihold(inode);
+ discard_new_inode(inode);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * utility function to add 'inode' into 'parent_inode' with
+ * a give name and a given sequence number.
+ * if 'add_backref' is true, also insert a backref from the
+ * inode to the parent directory.
+ */
+int btrfs_add_link(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
+ const struct fscrypt_str *name, int add_backref, u64 index)
+{
+ int ret = 0;
+ struct btrfs_key key;
+ struct btrfs_root *root = parent_inode->root;
+ u64 ino = btrfs_ino(inode);
+ u64 parent_ino = btrfs_ino(parent_inode);
+
+ if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ memcpy(&key, &inode->root->root_key, sizeof(key));
+ } else {
+ key.objectid = ino;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+ }
+
+ if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ ret = btrfs_add_root_ref(trans, key.objectid,
+ root->root_key.objectid, parent_ino,
+ index, name);
+ } else if (add_backref) {
+ ret = btrfs_insert_inode_ref(trans, root, name,
+ ino, parent_ino, index);
+ }
+
+ /* Nothing to clean up yet */
+ if (ret)
+ return ret;
+
+ ret = btrfs_insert_dir_item(trans, name, parent_inode, &key,
+ btrfs_inode_type(&inode->vfs_inode), index);
+ if (ret == -EEXIST || ret == -EOVERFLOW)
+ goto fail_dir_item;
+ else if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+
+ btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
+ name->len * 2);
+ inode_inc_iversion(&parent_inode->vfs_inode);
+ /*
+ * If we are replaying a log tree, we do not want to update the mtime
+ * and ctime of the parent directory with the current time, since the
+ * log replay procedure is responsible for setting them to their correct
+ * values (the ones it had when the fsync was done).
+ */
+ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) {
+ struct timespec64 now = current_time(&parent_inode->vfs_inode);
+
+ parent_inode->vfs_inode.i_mtime = now;
+ parent_inode->vfs_inode.i_ctime = now;
+ }
+ ret = btrfs_update_inode(trans, root, parent_inode);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+
+fail_dir_item:
+ if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ u64 local_index;
+ int err;
+ err = btrfs_del_root_ref(trans, key.objectid,
+ root->root_key.objectid, parent_ino,
+ &local_index, name);
+ if (err)
+ btrfs_abort_transaction(trans, err);
+ } else if (add_backref) {
+ u64 local_index;
+ int err;
+
+ err = btrfs_del_inode_ref(trans, root, name, ino, parent_ino,
+ &local_index);
+ if (err)
+ btrfs_abort_transaction(trans, err);
+ }
+
+ /* Return the original error code */
+ return ret;
+}
+
+static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
+ struct inode *inode)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_new_inode_args new_inode_args = {
+ .dir = dir,
+ .dentry = dentry,
+ .inode = inode,
+ };
+ unsigned int trans_num_items;
+ struct btrfs_trans_handle *trans;
+ int err;
+
+ err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
+ if (err)
+ goto out_inode;
+
+ trans = btrfs_start_transaction(root, trans_num_items);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_new_inode_args;
+ }
+
+ err = btrfs_create_new_inode(trans, &new_inode_args);
+ if (!err)
+ d_instantiate_new(dentry, inode);
+
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty(fs_info);
+out_new_inode_args:
+ btrfs_new_inode_args_destroy(&new_inode_args);
+out_inode:
+ if (err)
+ iput(inode);
+ return err;
+}
+
+static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, dev_t rdev)
+{
+ struct inode *inode;
+
+ inode = new_inode(dir->i_sb);
+ if (!inode)
+ return -ENOMEM;
+ inode_init_owner(mnt_userns, inode, dir, mode);
+ inode->i_op = &btrfs_special_inode_operations;
+ init_special_inode(inode, inode->i_mode, rdev);
+ return btrfs_create_common(dir, dentry, inode);
+}
+
+static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, bool excl)
+{
+ struct inode *inode;
+
+ inode = new_inode(dir->i_sb);
+ if (!inode)
+ return -ENOMEM;
+ inode_init_owner(mnt_userns, inode, dir, mode);
+ inode->i_fop = &btrfs_file_operations;
+ inode->i_op = &btrfs_file_inode_operations;
+ inode->i_mapping->a_ops = &btrfs_aops;
+ return btrfs_create_common(dir, dentry, inode);
+}
+
+static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *dentry)
+{
+ struct btrfs_trans_handle *trans = NULL;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct inode *inode = d_inode(old_dentry);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct fscrypt_name fname;
+ u64 index;
+ int err;
+ int drop_inode = 0;
+
+ /* do not allow sys_link's with other subvols of the same device */
+ if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid)
+ return -EXDEV;
+
+ if (inode->i_nlink >= BTRFS_LINK_MAX)
+ return -EMLINK;
+
+ err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname);
+ if (err)
+ goto fail;
+
+ err = btrfs_set_inode_index(BTRFS_I(dir), &index);
+ if (err)
+ goto fail;
+
+ /*
+ * 2 items for inode and inode ref
+ * 2 items for dir items
+ * 1 item for parent inode
+ * 1 item for orphan item deletion if O_TMPFILE
+ */
+ trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ trans = NULL;
+ goto fail;
+ }
+
+ /* There are several dir indexes for this inode, clear the cache. */
+ BTRFS_I(inode)->dir_index = 0ULL;
+ inc_nlink(inode);
+ inode_inc_iversion(inode);
+ inode->i_ctime = current_time(inode);
+ ihold(inode);
+ set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
+
+ err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
+ &fname.disk_name, 1, index);
+
+ if (err) {
+ drop_inode = 1;
+ } else {
+ struct dentry *parent = dentry->d_parent;
+
+ err = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ if (err)
+ goto fail;
+ if (inode->i_nlink == 1) {
+ /*
+ * If new hard link count is 1, it's a file created
+ * with open(2) O_TMPFILE flag.
+ */
+ err = btrfs_orphan_del(trans, BTRFS_I(inode));
+ if (err)
+ goto fail;
+ }
+ d_instantiate(dentry, inode);
+ btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);
+ }
+
+fail:
+ fscrypt_free_filename(&fname);
+ if (trans)
+ btrfs_end_transaction(trans);
+ if (drop_inode) {
+ inode_dec_link_count(inode);
+ iput(inode);
+ }
+ btrfs_btree_balance_dirty(fs_info);
+ return err;
+}
+
+static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
+{
+ struct inode *inode;
+
+ inode = new_inode(dir->i_sb);
+ if (!inode)
+ return -ENOMEM;
+ inode_init_owner(mnt_userns, inode, dir, S_IFDIR | mode);
+ inode->i_op = &btrfs_dir_inode_operations;
+ inode->i_fop = &btrfs_dir_file_operations;
+ return btrfs_create_common(dir, dentry, inode);
+}
+
+static noinline int uncompress_inline(struct btrfs_path *path,
+ struct page *page,
+ size_t pg_offset, u64 extent_offset,
+ struct btrfs_file_extent_item *item)
+{
+ int ret;
+ struct extent_buffer *leaf = path->nodes[0];
+ char *tmp;
+ size_t max_size;
+ unsigned long inline_size;
+ unsigned long ptr;
+ int compress_type;
+
+ WARN_ON(pg_offset != 0);
+ compress_type = btrfs_file_extent_compression(leaf, item);
+ max_size = btrfs_file_extent_ram_bytes(leaf, item);
+ inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
+ tmp = kmalloc(inline_size, GFP_NOFS);
+ if (!tmp)
+ return -ENOMEM;
+ ptr = btrfs_file_extent_inline_start(item);
+
+ read_extent_buffer(leaf, tmp, ptr, inline_size);
+
+ max_size = min_t(unsigned long, PAGE_SIZE, max_size);
+ ret = btrfs_decompress(compress_type, tmp, page,
+ extent_offset, inline_size, max_size);
+
+ /*
+ * decompression code contains a memset to fill in any space between the end
+ * of the uncompressed data and the end of max_size in case the decompressed
+ * data ends up shorter than ram_bytes. That doesn't cover the hole between
+ * the end of an inline extent and the beginning of the next block, so we
+ * cover that region here.
+ */
+
+ if (max_size + pg_offset < PAGE_SIZE)
+ memzero_page(page, pg_offset + max_size,
+ PAGE_SIZE - max_size - pg_offset);
+ kfree(tmp);
+ return ret;
+}
+
+/**
+ * btrfs_get_extent - Lookup the first extent overlapping a range in a file.
+ * @inode: file to search in
+ * @page: page to read extent data into if the extent is inline
+ * @pg_offset: offset into @page to copy to
+ * @start: file offset
+ * @len: length of range starting at @start
+ *
+ * This returns the first &struct extent_map which overlaps with the given
+ * range, reading it from the B-tree and caching it if necessary. Note that
+ * there may be more extents which overlap the given range after the returned
+ * extent_map.
+ *
+ * If @page is not NULL and the extent is inline, this also reads the extent
+ * data directly into the page and marks the extent up to date in the io_tree.
+ *
+ * Return: ERR_PTR on error, non-NULL extent_map on success.
+ */
+struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
+ struct page *page, size_t pg_offset,
+ u64 start, u64 len)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ int ret = 0;
+ u64 extent_start = 0;
+ u64 extent_end = 0;
+ u64 objectid = btrfs_ino(inode);
+ int extent_type = -1;
+ struct btrfs_path *path = NULL;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_file_extent_item *item;
+ struct extent_buffer *leaf;
+ struct btrfs_key found_key;
+ struct extent_map *em = NULL;
+ struct extent_map_tree *em_tree = &inode->extent_tree;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, len);
+ read_unlock(&em_tree->lock);
+
+ if (em) {
+ if (em->start > start || em->start + em->len <= start)
+ free_extent_map(em);
+ else if (em->block_start == EXTENT_MAP_INLINE && page)
+ free_extent_map(em);
+ else
+ goto out;
+ }
+ em = alloc_extent_map();
+ if (!em) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ em->start = EXTENT_MAP_HOLE;
+ em->orig_start = EXTENT_MAP_HOLE;
+ em->len = (u64)-1;
+ em->block_len = (u64)-1;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* Chances are we'll be called again, so go ahead and do readahead */
+ path->reada = READA_FORWARD;
+
+ /*
+ * The same explanation in load_free_space_cache applies here as well,
+ * we only read when we're loading the free space cache, and at that
+ * point the commit_root has everything we need.
+ */
+ if (btrfs_is_free_space_inode(inode)) {
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+ }
+
+ ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ if (path->slots[0] == 0)
+ goto not_found;
+ path->slots[0]--;
+ ret = 0;
+ }
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.objectid != objectid ||
+ found_key.type != BTRFS_EXTENT_DATA_KEY) {
+ /*
+ * If we backup past the first extent we want to move forward
+ * and see if there is an extent in front of us, otherwise we'll
+ * say there is a hole for our whole search range which can
+ * cause problems.
+ */
+ extent_end = start;
+ goto next;
+ }
+
+ extent_type = btrfs_file_extent_type(leaf, item);
+ extent_start = found_key.offset;
+ extent_end = btrfs_file_extent_end(path);
+ if (extent_type == BTRFS_FILE_EXTENT_REG ||
+ extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ /* Only regular file could have regular/prealloc extent */
+ if (!S_ISREG(inode->vfs_inode.i_mode)) {
+ ret = -EUCLEAN;
+ btrfs_crit(fs_info,
+ "regular/prealloc extent found for non-regular inode %llu",
+ btrfs_ino(inode));
+ goto out;
+ }
+ trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
+ extent_start);
+ } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
+ path->slots[0],
+ extent_start);
+ }
+next:
+ if (start >= extent_end) {
+ path->slots[0]++;
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ goto not_found;
+
+ leaf = path->nodes[0];
+ }
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.objectid != objectid ||
+ found_key.type != BTRFS_EXTENT_DATA_KEY)
+ goto not_found;
+ if (start + len <= found_key.offset)
+ goto not_found;
+ if (start > found_key.offset)
+ goto next;
+
+ /* New extent overlaps with existing one */
+ em->start = start;
+ em->orig_start = start;
+ em->len = found_key.offset - start;
+ em->block_start = EXTENT_MAP_HOLE;
+ goto insert;
+ }
+
+ btrfs_extent_item_to_extent_map(inode, path, item, !page, em);
+
+ if (extent_type == BTRFS_FILE_EXTENT_REG ||
+ extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ goto insert;
+ } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ unsigned long ptr;
+ char *map;
+ size_t size;
+ size_t extent_offset;
+ size_t copy_size;
+
+ if (!page)
+ goto out;
+
+ size = btrfs_file_extent_ram_bytes(leaf, item);
+ extent_offset = page_offset(page) + pg_offset - extent_start;
+ copy_size = min_t(u64, PAGE_SIZE - pg_offset,
+ size - extent_offset);
+ em->start = extent_start + extent_offset;
+ em->len = ALIGN(copy_size, fs_info->sectorsize);
+ em->orig_block_len = em->len;
+ em->orig_start = em->start;
+ ptr = btrfs_file_extent_inline_start(item) + extent_offset;
+
+ if (!PageUptodate(page)) {
+ if (btrfs_file_extent_compression(leaf, item) !=
+ BTRFS_COMPRESS_NONE) {
+ ret = uncompress_inline(path, page, pg_offset,
+ extent_offset, item);
+ if (ret)
+ goto out;
+ } else {
+ map = kmap_local_page(page);
+ read_extent_buffer(leaf, map + pg_offset, ptr,
+ copy_size);
+ if (pg_offset + copy_size < PAGE_SIZE) {
+ memset(map + pg_offset + copy_size, 0,
+ PAGE_SIZE - pg_offset -
+ copy_size);
+ }
+ kunmap_local(map);
+ }
+ flush_dcache_page(page);
+ }
+ goto insert;
+ }
+not_found:
+ em->start = start;
+ em->orig_start = start;
+ em->len = len;
+ em->block_start = EXTENT_MAP_HOLE;
+insert:
+ ret = 0;
+ btrfs_release_path(path);
+ if (em->start > start || extent_map_end(em) <= start) {
+ btrfs_err(fs_info,
+ "bad extent! em: [%llu %llu] passed [%llu %llu]",
+ em->start, em->len, start, len);
+ ret = -EIO;
+ goto out;
+ }
+
+ write_lock(&em_tree->lock);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
+ write_unlock(&em_tree->lock);
+out:
+ btrfs_free_path(path);
+
+ trace_btrfs_get_extent(root, inode, em);
+
+ if (ret) {
+ free_extent_map(em);
+ return ERR_PTR(ret);
+ }
+ return em;
+}
+
+static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
+ const u64 start,
+ const u64 len,
+ const u64 orig_start,
+ const u64 block_start,
+ const u64 block_len,
+ const u64 orig_block_len,
+ const u64 ram_bytes,
+ const int type)
+{
+ struct extent_map *em = NULL;
+ int ret;
+
+ if (type != BTRFS_ORDERED_NOCOW) {
+ em = create_io_em(inode, start, len, orig_start, block_start,
+ block_len, orig_block_len, ram_bytes,
+ BTRFS_COMPRESS_NONE, /* compress_type */
+ type);
+ if (IS_ERR(em))
+ goto out;
+ }
+ ret = btrfs_add_ordered_extent(inode, start, len, len, block_start,
+ block_len, 0,
+ (1 << type) |
+ (1 << BTRFS_ORDERED_DIRECT),
+ BTRFS_COMPRESS_NONE);
+ if (ret) {
+ if (em) {
+ free_extent_map(em);
+ btrfs_drop_extent_map_range(inode, start,
+ start + len - 1, false);
+ }
+ em = ERR_PTR(ret);
+ }
+ out:
+
+ return em;
+}
+
+static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
+ u64 start, u64 len)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_map *em;
+ struct btrfs_key ins;
+ u64 alloc_hint;
+ int ret;
+
+ alloc_hint = get_extent_allocation_hint(inode, start, len);
+again:
+ ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
+ 0, alloc_hint, &ins, 1, 1);
+ if (ret == -EAGAIN) {
+ ASSERT(btrfs_is_zoned(fs_info));
+ wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
+ TASK_UNINTERRUPTIBLE);
+ goto again;
+ }
+ if (ret)
+ return ERR_PTR(ret);
+
+ em = btrfs_create_dio_extent(inode, start, ins.offset, start,
+ ins.objectid, ins.offset, ins.offset,
+ ins.offset, BTRFS_ORDERED_REGULAR);
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+ if (IS_ERR(em))
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
+ 1);
+
+ return em;
+}
+
+static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+ struct btrfs_block_group *block_group;
+ bool readonly = false;
+
+ block_group = btrfs_lookup_block_group(fs_info, bytenr);
+ if (!block_group || block_group->ro)
+ readonly = true;
+ if (block_group)
+ btrfs_put_block_group(block_group);
+ return readonly;
+}
+
+/*
+ * Check if we can do nocow write into the range [@offset, @offset + @len)
+ *
+ * @offset: File offset
+ * @len: The length to write, will be updated to the nocow writeable
+ * range
+ * @orig_start: (optional) Return the original file offset of the file extent
+ * @orig_len: (optional) Return the original on-disk length of the file extent
+ * @ram_bytes: (optional) Return the ram_bytes of the file extent
+ * @strict: if true, omit optimizations that might force us into unnecessary
+ * cow. e.g., don't trust generation number.
+ *
+ * Return:
+ * >0 and update @len if we can do nocow write
+ * 0 if we can't do nocow write
+ * <0 if error happened
+ *
+ * NOTE: This only checks the file extents, caller is responsible to wait for
+ * any ordered extents.
+ */
+noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
+ u64 *orig_start, u64 *orig_block_len,
+ u64 *ram_bytes, bool nowait, bool strict)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct can_nocow_file_extent_args nocow_args = { 0 };
+ struct btrfs_path *path;
+ int ret;
+ struct extent_buffer *leaf;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+ int found_type;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->nowait = nowait;
+
+ ret = btrfs_lookup_file_extent(NULL, root, path,
+ btrfs_ino(BTRFS_I(inode)), offset, 0);
+ if (ret < 0)
+ goto out;
+
+ if (ret == 1) {
+ if (path->slots[0] == 0) {
+ /* can't find the item, must cow */
+ ret = 0;
+ goto out;
+ }
+ path->slots[0]--;
+ }
+ ret = 0;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != btrfs_ino(BTRFS_I(inode)) ||
+ key.type != BTRFS_EXTENT_DATA_KEY) {
+ /* not our file or wrong item type, must cow */
+ goto out;
+ }
+
+ if (key.offset > offset) {
+ /* Wrong offset, must cow */
+ goto out;
+ }
+
+ if (btrfs_file_extent_end(path) <= offset)
+ goto out;
+
+ fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+ found_type = btrfs_file_extent_type(leaf, fi);
+ if (ram_bytes)
+ *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+
+ nocow_args.start = offset;
+ nocow_args.end = offset + *len - 1;
+ nocow_args.strict = strict;
+ nocow_args.free_path = true;
+
+ ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args);
+ /* can_nocow_file_extent() has freed the path. */
+ path = NULL;
+
+ if (ret != 1) {
+ /* Treat errors as not being able to NOCOW. */
+ ret = 0;
+ goto out;
+ }
+
+ ret = 0;
+ if (btrfs_extent_readonly(fs_info, nocow_args.disk_bytenr))
+ goto out;
+
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+ found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ u64 range_end;
+
+ range_end = round_up(offset + nocow_args.num_bytes,
+ root->fs_info->sectorsize) - 1;
+ ret = test_range_bit(io_tree, offset, range_end,
+ EXTENT_DELALLOC, 0, NULL);
+ if (ret) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ }
+
+ if (orig_start)
+ *orig_start = key.offset - nocow_args.extent_offset;
+ if (orig_block_len)
+ *orig_block_len = nocow_args.disk_num_bytes;
+
+ *len = nocow_args.num_bytes;
+ ret = 1;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
+ struct extent_state **cached_state,
+ unsigned int iomap_flags)
+{
+ const bool writing = (iomap_flags & IOMAP_WRITE);
+ const bool nowait = (iomap_flags & IOMAP_NOWAIT);
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_ordered_extent *ordered;
+ int ret = 0;
+
+ while (1) {
+ if (nowait) {
+ if (!try_lock_extent(io_tree, lockstart, lockend))
+ return -EAGAIN;
+ } else {
+ lock_extent(io_tree, lockstart, lockend, cached_state);
+ }
+ /*
+ * We're concerned with the entire range that we're going to be
+ * doing DIO to, so we need to make sure there's no ordered
+ * extents in this range.
+ */
+ ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
+ lockend - lockstart + 1);
+
+ /*
+ * We need to make sure there are no buffered pages in this
+ * range either, we could have raced between the invalidate in
+ * generic_file_direct_write and locking the extent. The
+ * invalidate needs to happen so that reads after a write do not
+ * get stale data.
+ */
+ if (!ordered &&
+ (!writing || !filemap_range_has_page(inode->i_mapping,
+ lockstart, lockend)))
+ break;
+
+ unlock_extent(io_tree, lockstart, lockend, cached_state);
+
+ if (ordered) {
+ if (nowait) {
+ btrfs_put_ordered_extent(ordered);
+ ret = -EAGAIN;
+ break;
+ }
+ /*
+ * If we are doing a DIO read and the ordered extent we
+ * found is for a buffered write, we can not wait for it
+ * to complete and retry, because if we do so we can
+ * deadlock with concurrent buffered writes on page
+ * locks. This happens only if our DIO read covers more
+ * than one extent map, if at this point has already
+ * created an ordered extent for a previous extent map
+ * and locked its range in the inode's io tree, and a
+ * concurrent write against that previous extent map's
+ * range and this range started (we unlock the ranges
+ * in the io tree only when the bios complete and
+ * buffered writes always lock pages before attempting
+ * to lock range in the io tree).
+ */
+ if (writing ||
+ test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
+ btrfs_start_ordered_extent(ordered, 1);
+ else
+ ret = nowait ? -EAGAIN : -ENOTBLK;
+ btrfs_put_ordered_extent(ordered);
+ } else {
+ /*
+ * We could trigger writeback for this range (and wait
+ * for it to complete) and then invalidate the pages for
+ * this range (through invalidate_inode_pages2_range()),
+ * but that can lead us to a deadlock with a concurrent
+ * call to readahead (a buffered read or a defrag call
+ * triggered a readahead) on a page lock due to an
+ * ordered dio extent we created before but did not have
+ * yet a corresponding bio submitted (whence it can not
+ * complete), which makes readahead wait for that
+ * ordered extent to complete while holding a lock on
+ * that page.
+ */
+ ret = nowait ? -EAGAIN : -ENOTBLK;
+ }
+
+ if (ret)
+ break;
+
+ cond_resched();
+ }
+
+ return ret;
+}
+
+/* The callers of this must take lock_extent() */
+static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
+ u64 len, u64 orig_start, u64 block_start,
+ u64 block_len, u64 orig_block_len,
+ u64 ram_bytes, int compress_type,
+ int type)
+{
+ struct extent_map *em;
+ int ret;
+
+ ASSERT(type == BTRFS_ORDERED_PREALLOC ||
+ type == BTRFS_ORDERED_COMPRESSED ||
+ type == BTRFS_ORDERED_NOCOW ||
+ type == BTRFS_ORDERED_REGULAR);
+
+ em = alloc_extent_map();
+ if (!em)
+ return ERR_PTR(-ENOMEM);
+
+ em->start = start;
+ em->orig_start = orig_start;
+ em->len = len;
+ em->block_len = block_len;
+ em->block_start = block_start;
+ em->orig_block_len = orig_block_len;
+ em->ram_bytes = ram_bytes;
+ em->generation = -1;
+ set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ if (type == BTRFS_ORDERED_PREALLOC) {
+ set_bit(EXTENT_FLAG_FILLING, &em->flags);
+ } else if (type == BTRFS_ORDERED_COMPRESSED) {
+ set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ em->compress_type = compress_type;
+ }
+
+ ret = btrfs_replace_extent_map_range(inode, em, true);
+ if (ret) {
+ free_extent_map(em);
+ return ERR_PTR(ret);
+ }
+
+ /* em got 2 refs now, callers needs to do free_extent_map once. */
+ return em;
+}
+
+
+static int btrfs_get_blocks_direct_write(struct extent_map **map,
+ struct inode *inode,
+ struct btrfs_dio_data *dio_data,
+ u64 start, u64 *lenp,
+ unsigned int iomap_flags)
+{
+ const bool nowait = (iomap_flags & IOMAP_NOWAIT);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct extent_map *em = *map;
+ int type;
+ u64 block_start, orig_start, orig_block_len, ram_bytes;
+ struct btrfs_block_group *bg;
+ bool can_nocow = false;
+ bool space_reserved = false;
+ u64 len = *lenp;
+ u64 prev_len;
+ int ret = 0;
+
+ /*
+ * We don't allocate a new extent in the following cases
+ *
+ * 1) The inode is marked as NODATACOW. In this case we'll just use the
+ * existing extent.
+ * 2) The extent is marked as PREALLOC. We're good to go here and can
+ * just use the extent.
+ *
+ */
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+ ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+ em->block_start != EXTENT_MAP_HOLE)) {
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ type = BTRFS_ORDERED_PREALLOC;
+ else
+ type = BTRFS_ORDERED_NOCOW;
+ len = min(len, em->len - (start - em->start));
+ block_start = em->block_start + (start - em->start);
+
+ if (can_nocow_extent(inode, start, &len, &orig_start,
+ &orig_block_len, &ram_bytes, false, false) == 1) {
+ bg = btrfs_inc_nocow_writers(fs_info, block_start);
+ if (bg)
+ can_nocow = true;
+ }
+ }
+
+ prev_len = len;
+ if (can_nocow) {
+ struct extent_map *em2;
+
+ /* We can NOCOW, so only need to reserve metadata space. */
+ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
+ nowait);
+ if (ret < 0) {
+ /* Our caller expects us to free the input extent map. */
+ free_extent_map(em);
+ *map = NULL;
+ btrfs_dec_nocow_writers(bg);
+ if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
+ ret = -EAGAIN;
+ goto out;
+ }
+ space_reserved = true;
+
+ em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len,
+ orig_start, block_start,
+ len, orig_block_len,
+ ram_bytes, type);
+ btrfs_dec_nocow_writers(bg);
+ if (type == BTRFS_ORDERED_PREALLOC) {
+ free_extent_map(em);
+ *map = em2;
+ em = em2;
+ }
+
+ if (IS_ERR(em2)) {
+ ret = PTR_ERR(em2);
+ goto out;
+ }
+
+ dio_data->nocow_done = true;
+ } else {
+ /* Our caller expects us to free the input extent map. */
+ free_extent_map(em);
+ *map = NULL;
+
+ if (nowait) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ /*
+ * If we could not allocate data space before locking the file
+ * range and we can't do a NOCOW write, then we have to fail.
+ */
+ if (!dio_data->data_space_reserved) {
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ /*
+ * We have to COW and we have already reserved data space before,
+ * so now we reserve only metadata.
+ */
+ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
+ false);
+ if (ret < 0)
+ goto out;
+ space_reserved = true;
+
+ em = btrfs_new_extent_direct(BTRFS_I(inode), start, len);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out;
+ }
+ *map = em;
+ len = min(len, em->len - (start - em->start));
+ if (len < prev_len)
+ btrfs_delalloc_release_metadata(BTRFS_I(inode),
+ prev_len - len, true);
+ }
+
+ /*
+ * We have created our ordered extent, so we can now release our reservation
+ * for an outstanding extent.
+ */
+ btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);
+
+ /*
+ * Need to update the i_size under the extent lock so buffered
+ * readers will get the updated i_size when we unlock.
+ */
+ if (start + len > i_size_read(inode))
+ i_size_write(inode, start + len);
+out:
+ if (ret && space_reserved) {
+ btrfs_delalloc_release_extents(BTRFS_I(inode), len);
+ btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
+ }
+ *lenp = len;
+ return ret;
+}
+
+static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
+ loff_t length, unsigned int flags, struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct extent_map *em;
+ struct extent_state *cached_state = NULL;
+ struct btrfs_dio_data *dio_data = iter->private;
+ u64 lockstart, lockend;
+ const bool write = !!(flags & IOMAP_WRITE);
+ int ret = 0;
+ u64 len = length;
+ const u64 data_alloc_len = length;
+ bool unlock_extents = false;
+
+ /*
+ * We could potentially fault if we have a buffer > PAGE_SIZE, and if
+ * we're NOWAIT we may submit a bio for a partial range and return
+ * EIOCBQUEUED, which would result in an errant short read.
+ *
+ * The best way to handle this would be to allow for partial completions
+ * of iocb's, so we could submit the partial bio, return and fault in
+ * the rest of the pages, and then submit the io for the rest of the
+ * range. However we don't have that currently, so simply return
+ * -EAGAIN at this point so that the normal path is used.
+ */
+ if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
+ return -EAGAIN;
+
+ /*
+ * Cap the size of reads to that usually seen in buffered I/O as we need
+ * to allocate a contiguous array for the checksums.
+ */
+ if (!write)
+ len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);
+
+ lockstart = start;
+ lockend = start + len - 1;
+
+ /*
+ * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
+ * enough if we've written compressed pages to this area, so we need to
+ * flush the dirty pages again to make absolutely sure that any
+ * outstanding dirty pages are on disk - the first flush only starts
+ * compression on the data, while keeping the pages locked, so by the
+ * time the second flush returns we know bios for the compressed pages
+ * were submitted and finished, and the pages no longer under writeback.
+ *
+ * If we have a NOWAIT request and we have any pages in the range that
+ * are locked, likely due to compression still in progress, we don't want
+ * to block on page locks. We also don't want to block on pages marked as
+ * dirty or under writeback (same as for the non-compression case).
+ * iomap_dio_rw() did the same check, but after that and before we got
+ * here, mmap'ed writes may have happened or buffered reads started
+ * (readpage() and readahead(), which lock pages), as we haven't locked
+ * the file range yet.
+ */
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags)) {
+ if (flags & IOMAP_NOWAIT) {
+ if (filemap_range_needs_writeback(inode->i_mapping,
+ lockstart, lockend))
+ return -EAGAIN;
+ } else {
+ ret = filemap_fdatawrite_range(inode->i_mapping, start,
+ start + length - 1);
+ if (ret)
+ return ret;
+ }
+ }
+
+ memset(dio_data, 0, sizeof(*dio_data));
+
+ /*
+ * We always try to allocate data space and must do it before locking
+ * the file range, to avoid deadlocks with concurrent writes to the same
+ * range if the range has several extents and the writes don't expand the
+ * current i_size (the inode lock is taken in shared mode). If we fail to
+ * allocate data space here we continue and later, after locking the
+ * file range, we fail with ENOSPC only if we figure out we can not do a
+ * NOCOW write.
+ */
+ if (write && !(flags & IOMAP_NOWAIT)) {
+ ret = btrfs_check_data_free_space(BTRFS_I(inode),
+ &dio_data->data_reserved,
+ start, data_alloc_len, false);
+ if (!ret)
+ dio_data->data_space_reserved = true;
+ else if (ret && !(BTRFS_I(inode)->flags &
+ (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
+ goto err;
+ }
+
+ /*
+ * If this errors out it's because we couldn't invalidate pagecache for
+ * this range and we need to fallback to buffered IO, or we are doing a
+ * NOWAIT read/write and we need to block.
+ */
+ ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
+ if (ret < 0)
+ goto err;
+
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto unlock_err;
+ }
+
+ /*
+ * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
+ * io. INLINE is special, and we could probably kludge it in here, but
+ * it's still buffered so for safety lets just fall back to the generic
+ * buffered path.
+ *
+ * For COMPRESSED we _have_ to read the entire extent in so we can
+ * decompress it, so there will be buffering required no matter what we
+ * do, so go ahead and fallback to buffered.
+ *
+ * We return -ENOTBLK because that's what makes DIO go ahead and go back
+ * to buffered IO. Don't blame me, this is the price we pay for using
+ * the generic code.
+ */
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
+ em->block_start == EXTENT_MAP_INLINE) {
+ free_extent_map(em);
+ /*
+ * If we are in a NOWAIT context, return -EAGAIN in order to
+ * fallback to buffered IO. This is not only because we can
+ * block with buffered IO (no support for NOWAIT semantics at
+ * the moment) but also to avoid returning short reads to user
+ * space - this happens if we were able to read some data from
+ * previous non-compressed extents and then when we fallback to
+ * buffered IO, at btrfs_file_read_iter() by calling
+ * filemap_read(), we fail to fault in pages for the read buffer,
+ * in which case filemap_read() returns a short read (the number
+ * of bytes previously read is > 0, so it does not return -EFAULT).
+ */
+ ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
+ goto unlock_err;
+ }
+
+ len = min(len, em->len - (start - em->start));
+
+ /*
+ * If we have a NOWAIT request and the range contains multiple extents
+ * (or a mix of extents and holes), then we return -EAGAIN to make the
+ * caller fallback to a context where it can do a blocking (without
+ * NOWAIT) request. This way we avoid doing partial IO and returning
+ * success to the caller, which is not optimal for writes and for reads
+ * it can result in unexpected behaviour for an application.
+ *
+ * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
+ * iomap_dio_rw(), we can end up returning less data then what the caller
+ * asked for, resulting in an unexpected, and incorrect, short read.
+ * That is, the caller asked to read N bytes and we return less than that,
+ * which is wrong unless we are crossing EOF. This happens if we get a
+ * page fault error when trying to fault in pages for the buffer that is
+ * associated to the struct iov_iter passed to iomap_dio_rw(), and we
+ * have previously submitted bios for other extents in the range, in
+ * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
+ * those bios have completed by the time we get the page fault error,
+ * which we return back to our caller - we should only return EIOCBQUEUED
+ * after we have submitted bios for all the extents in the range.
+ */
+ if ((flags & IOMAP_NOWAIT) && len < length) {
+ free_extent_map(em);
+ ret = -EAGAIN;
+ goto unlock_err;
+ }
+
+ if (write) {
+ ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
+ start, &len, flags);
+ if (ret < 0)
+ goto unlock_err;
+ unlock_extents = true;
+ /* Recalc len in case the new em is smaller than requested */
+ len = min(len, em->len - (start - em->start));
+ if (dio_data->data_space_reserved) {
+ u64 release_offset;
+ u64 release_len = 0;
+
+ if (dio_data->nocow_done) {
+ release_offset = start;
+ release_len = data_alloc_len;
+ } else if (len < data_alloc_len) {
+ release_offset = start + len;
+ release_len = data_alloc_len - len;
+ }
+
+ if (release_len > 0)
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
+ dio_data->data_reserved,
+ release_offset,
+ release_len);
+ }
+ } else {
+ /*
+ * We need to unlock only the end area that we aren't using.
+ * The rest is going to be unlocked by the endio routine.
+ */
+ lockstart = start + len;
+ if (lockstart < lockend)
+ unlock_extents = true;
+ }
+
+ if (unlock_extents)
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state);
+ else
+ free_extent_state(cached_state);
+
+ /*
+ * Translate extent map information to iomap.
+ * We trim the extents (and move the addr) even though iomap code does
+ * that, since we have locked only the parts we are performing I/O in.
+ */
+ if ((em->block_start == EXTENT_MAP_HOLE) ||
+ (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->type = IOMAP_HOLE;
+ } else {
+ iomap->addr = em->block_start + (start - em->start);
+ iomap->type = IOMAP_MAPPED;
+ }
+ iomap->offset = start;
+ iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
+ iomap->length = len;
+
+ if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start))
+ iomap->flags |= IOMAP_F_ZONE_APPEND;
+
+ free_extent_map(em);
+
+ return 0;
+
+unlock_err:
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state);
+err:
+ if (dio_data->data_space_reserved) {
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
+ dio_data->data_reserved,
+ start, data_alloc_len);
+ extent_changeset_free(dio_data->data_reserved);
+ }
+
+ return ret;
+}
+
+static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+ ssize_t written, unsigned int flags, struct iomap *iomap)
+{
+ struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
+ struct btrfs_dio_data *dio_data = iter->private;
+ size_t submitted = dio_data->submitted;
+ const bool write = !!(flags & IOMAP_WRITE);
+ int ret = 0;
+
+ if (!write && (iomap->type == IOMAP_HOLE)) {
+ /* If reading from a hole, unlock and return */
+ unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1,
+ NULL);
+ return 0;
+ }
+
+ if (submitted < length) {
+ pos += submitted;
+ length -= submitted;
+ if (write)
+ btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL,
+ pos, length, false);
+ else
+ unlock_extent(&BTRFS_I(inode)->io_tree, pos,
+ pos + length - 1, NULL);
+ ret = -ENOTBLK;
+ }
+
+ if (write)
+ extent_changeset_free(dio_data->data_reserved);
+ return ret;
+}
+
+static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
+{
+ /*
+ * This implies a barrier so that stores to dio_bio->bi_status before
+ * this and loads of dio_bio->bi_status after this are fully ordered.
+ */
+ if (!refcount_dec_and_test(&dip->refs))
+ return;
+
+ if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) {
+ btrfs_mark_ordered_io_finished(BTRFS_I(dip->inode), NULL,
+ dip->file_offset, dip->bytes,
+ !dip->bio.bi_status);
+ } else {
+ unlock_extent(&BTRFS_I(dip->inode)->io_tree,
+ dip->file_offset,
+ dip->file_offset + dip->bytes - 1, NULL);
+ }
+
+ kfree(dip->csums);
+ bio_endio(&dip->bio);
+}
+
+static void submit_dio_repair_bio(struct inode *inode, struct bio *bio,
+ int mirror_num,
+ enum btrfs_compression_type compress_type)
+{
+ struct btrfs_dio_private *dip = btrfs_bio(bio)->private;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+
+ BUG_ON(bio_op(bio) == REQ_OP_WRITE);
+
+ refcount_inc(&dip->refs);
+ btrfs_submit_bio(fs_info, bio, mirror_num);
+}
+
+static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
+ struct btrfs_bio *bbio,
+ const bool uptodate)
+{
+ struct inode *inode = dip->inode;
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
+ blk_status_t err = BLK_STS_OK;
+ struct bvec_iter iter;
+ struct bio_vec bv;
+ u32 offset;
+
+ btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) {
+ u64 start = bbio->file_offset + offset;
+
+ if (uptodate &&
+ (!csum || !btrfs_check_data_csum(inode, bbio, offset, bv.bv_page,
+ bv.bv_offset))) {
+ btrfs_clean_io_failure(BTRFS_I(inode), start,
+ bv.bv_page, bv.bv_offset);
+ } else {
+ int ret;
+
+ ret = btrfs_repair_one_sector(inode, bbio, offset,
+ bv.bv_page, bv.bv_offset,
+ submit_dio_repair_bio);
+ if (ret)
+ err = errno_to_blk_status(ret);
+ }
+ }
+
+ return err;
+}
+
+static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode,
+ struct bio *bio,
+ u64 dio_file_offset)
+{
+ return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, false);
+}
+
+static void btrfs_end_dio_bio(struct btrfs_bio *bbio)
+{
+ struct btrfs_dio_private *dip = bbio->private;
+ struct bio *bio = &bbio->bio;
+ blk_status_t err = bio->bi_status;
+
+ if (err)
+ btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
+ "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d",
+ btrfs_ino(BTRFS_I(dip->inode)), bio_op(bio),
+ bio->bi_opf, bio->bi_iter.bi_sector,
+ bio->bi_iter.bi_size, err);
+
+ if (bio_op(bio) == REQ_OP_READ)
+ err = btrfs_check_read_dio_bio(dip, bbio, !err);
+
+ if (err)
+ dip->bio.bi_status = err;
+
+ btrfs_record_physical_zoned(dip->inode, bbio->file_offset, bio);
+
+ bio_put(bio);
+ btrfs_dio_private_put(dip);
+}
+
+static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
+ u64 file_offset, int async_submit)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_dio_private *dip = btrfs_bio(bio)->private;
+ blk_status_t ret;
+
+ /* Save the original iter for read repair */
+ if (btrfs_op(bio) == BTRFS_MAP_READ)
+ btrfs_bio(bio)->iter = bio->bi_iter;
+
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
+ goto map;
+
+ if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
+ /* Check btrfs_submit_data_write_bio() for async submit rules */
+ if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers) &&
+ btrfs_wq_submit_bio(inode, bio, 0, file_offset,
+ btrfs_submit_bio_start_direct_io))
+ return;
+
+ /*
+ * If we aren't doing async submit, calculate the csum of the
+ * bio now.
+ */
+ ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false);
+ if (ret) {
+ btrfs_bio_end_io(btrfs_bio(bio), ret);
+ return;
+ }
+ } else {
+ btrfs_bio(bio)->csum = btrfs_csum_ptr(fs_info, dip->csums,
+ file_offset - dip->file_offset);
+ }
+map:
+ btrfs_submit_bio(fs_info, bio, 0);
+}
+
+static void btrfs_submit_direct(const struct iomap_iter *iter,
+ struct bio *dio_bio, loff_t file_offset)
+{
+ struct btrfs_dio_private *dip =
+ container_of(dio_bio, struct btrfs_dio_private, bio);
+ struct inode *inode = iter->inode;
+ const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ const bool raid56 = (btrfs_data_alloc_profile(fs_info) &
+ BTRFS_BLOCK_GROUP_RAID56_MASK);
+ struct bio *bio;
+ u64 start_sector;
+ int async_submit = 0;
+ u64 submit_len;
+ u64 clone_offset = 0;
+ u64 clone_len;
+ u64 logical;
+ int ret;
+ blk_status_t status;
+ struct btrfs_io_geometry geom;
+ struct btrfs_dio_data *dio_data = iter->private;
+ struct extent_map *em = NULL;
+
+ dip->inode = inode;
+ dip->file_offset = file_offset;
+ dip->bytes = dio_bio->bi_iter.bi_size;
+ refcount_set(&dip->refs, 1);
+ dip->csums = NULL;
+
+ if (!write && !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+ unsigned int nr_sectors =
+ (dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits);
+
+ /*
+ * Load the csums up front to reduce csum tree searches and
+ * contention when submitting bios.
+ */
+ status = BLK_STS_RESOURCE;
+ dip->csums = kcalloc(nr_sectors, fs_info->csum_size, GFP_NOFS);
+ if (!dip->csums)
+ goto out_err;
+
+ status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums);
+ if (status != BLK_STS_OK)
+ goto out_err;
+ }
+
+ start_sector = dio_bio->bi_iter.bi_sector;
+ submit_len = dio_bio->bi_iter.bi_size;
+
+ do {
+ logical = start_sector << 9;
+ em = btrfs_get_chunk_map(fs_info, logical, submit_len);
+ if (IS_ERR(em)) {
+ status = errno_to_blk_status(PTR_ERR(em));
+ em = NULL;
+ goto out_err_em;
+ }
+ ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio),
+ logical, &geom);
+ if (ret) {
+ status = errno_to_blk_status(ret);
+ goto out_err_em;
+ }
+
+ clone_len = min(submit_len, geom.len);
+ ASSERT(clone_len <= UINT_MAX);
+
+ /*
+ * This will never fail as it's passing GPF_NOFS and
+ * the allocation is backed by btrfs_bioset.
+ */
+ bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len,
+ btrfs_end_dio_bio, dip);
+ btrfs_bio(bio)->file_offset = file_offset;
+
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ status = extract_ordered_extent(BTRFS_I(inode), bio,
+ file_offset);
+ if (status) {
+ bio_put(bio);
+ goto out_err;
+ }
+ }
+
+ ASSERT(submit_len >= clone_len);
+ submit_len -= clone_len;
+
+ /*
+ * Increase the count before we submit the bio so we know
+ * the end IO handler won't happen before we increase the
+ * count. Otherwise, the dip might get freed before we're
+ * done setting it up.
+ *
+ * We transfer the initial reference to the last bio, so we
+ * don't need to increment the reference count for the last one.
+ */
+ if (submit_len > 0) {
+ refcount_inc(&dip->refs);
+ /*
+ * If we are submitting more than one bio, submit them
+ * all asynchronously. The exception is RAID 5 or 6, as
+ * asynchronous checksums make it difficult to collect
+ * full stripe writes.
+ */
+ if (!raid56)
+ async_submit = 1;
+ }
+
+ btrfs_submit_dio_bio(bio, inode, file_offset, async_submit);
+
+ dio_data->submitted += clone_len;
+ clone_offset += clone_len;
+ start_sector += clone_len >> 9;
+ file_offset += clone_len;
+
+ free_extent_map(em);
+ } while (submit_len > 0);
+ return;
+
+out_err_em:
+ free_extent_map(em);
+out_err:
+ dio_bio->bi_status = status;
+ btrfs_dio_private_put(dip);
+}
+
+static const struct iomap_ops btrfs_dio_iomap_ops = {
+ .iomap_begin = btrfs_dio_iomap_begin,
+ .iomap_end = btrfs_dio_iomap_end,
+};
+
+static const struct iomap_dio_ops btrfs_dio_ops = {
+ .submit_io = btrfs_submit_direct,
+ .bio_set = &btrfs_dio_bioset,
+};
+
+ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before)
+{
+ struct btrfs_dio_data data;
+
+ return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+ IOMAP_DIO_PARTIAL, &data, done_before);
+}
+
+struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
+ size_t done_before)
+{
+ struct btrfs_dio_data data;
+
+ return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+ IOMAP_DIO_PARTIAL, &data, done_before);
+}
+
+static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len)
+{
+ int ret;
+
+ ret = fiemap_prep(inode, fieinfo, start, &len, 0);
+ if (ret)
+ return ret;
+
+ /*
+ * fiemap_prep() called filemap_write_and_wait() for the whole possible
+ * file range (0 to LLONG_MAX), but that is not enough if we have
+ * compression enabled. The first filemap_fdatawrite_range() only kicks
+ * in the compression of data (in an async thread) and will return
+ * before the compression is done and writeback is started. A second
+ * filemap_fdatawrite_range() is needed to wait for the compression to
+ * complete and writeback to start. We also need to wait for ordered
+ * extents to complete, because our fiemap implementation uses mainly
+ * file extent items to list the extents, searching for extent maps
+ * only for file ranges with holes or prealloc extents to figure out
+ * if we have delalloc in those ranges.
+ */
+ if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
+ ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
+ if (ret)
+ return ret;
+ }
+
+ return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
+}
+
+static int btrfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ return extent_writepages(mapping, wbc);
+}
+
+static void btrfs_readahead(struct readahead_control *rac)
+{
+ extent_readahead(rac);
+}
+
+/*
+ * For release_folio() and invalidate_folio() we have a race window where
+ * folio_end_writeback() is called but the subpage spinlock is not yet released.
+ * If we continue to release/invalidate the page, we could cause use-after-free
+ * for subpage spinlock. So this function is to spin and wait for subpage
+ * spinlock.
+ */
+static void wait_subpage_spinlock(struct page *page)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct btrfs_subpage *subpage;
+
+ if (!btrfs_is_subpage(fs_info, page))
+ return;
+
+ ASSERT(PagePrivate(page) && page->private);
+ subpage = (struct btrfs_subpage *)page->private;
+
+ /*
+ * This may look insane as we just acquire the spinlock and release it,
+ * without doing anything. But we just want to make sure no one is
+ * still holding the subpage spinlock.
+ * And since the page is not dirty nor writeback, and we have page
+ * locked, the only possible way to hold a spinlock is from the endio
+ * function to clear page writeback.
+ *
+ * Here we just acquire the spinlock so that all existing callers
+ * should exit and we're safe to release/invalidate the page.
+ */
+ spin_lock_irq(&subpage->lock);
+ spin_unlock_irq(&subpage->lock);
+}
+
+static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
+{
+ int ret = try_release_extent_mapping(&folio->page, gfp_flags);
+
+ if (ret == 1) {
+ wait_subpage_spinlock(&folio->page);
+ clear_page_extent_mapped(&folio->page);
+ }
+ return ret;
+}
+
+static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
+{
+ if (folio_test_writeback(folio) || folio_test_dirty(folio))
+ return false;
+ return __btrfs_release_folio(folio, gfp_flags);
+}
+
+#ifdef CONFIG_MIGRATION
+static int btrfs_migrate_folio(struct address_space *mapping,
+ struct folio *dst, struct folio *src,
+ enum migrate_mode mode)
+{
+ int ret = filemap_migrate_folio(mapping, dst, src, mode);
+
+ if (ret != MIGRATEPAGE_SUCCESS)
+ return ret;
+
+ if (folio_test_ordered(src)) {
+ folio_clear_ordered(src);
+ folio_set_ordered(dst);
+ }
+
+ return MIGRATEPAGE_SUCCESS;
+}
+#else
+#define btrfs_migrate_folio NULL
+#endif
+
+static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
+{
+ struct btrfs_inode *inode = BTRFS_I(folio->mapping->host);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_io_tree *tree = &inode->io_tree;
+ struct extent_state *cached_state = NULL;
+ u64 page_start = folio_pos(folio);
+ u64 page_end = page_start + folio_size(folio) - 1;
+ u64 cur;
+ int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
+
+ /*
+ * We have folio locked so no new ordered extent can be created on this
+ * page, nor bio can be submitted for this folio.
+ *
+ * But already submitted bio can still be finished on this folio.
+ * Furthermore, endio function won't skip folio which has Ordered
+ * (Private2) already cleared, so it's possible for endio and
+ * invalidate_folio to do the same ordered extent accounting twice
+ * on one folio.
+ *
+ * So here we wait for any submitted bios to finish, so that we won't
+ * do double ordered extent accounting on the same folio.
+ */
+ folio_wait_writeback(folio);
+ wait_subpage_spinlock(&folio->page);
+
+ /*
+ * For subpage case, we have call sites like
+ * btrfs_punch_hole_lock_range() which passes range not aligned to
+ * sectorsize.
+ * If the range doesn't cover the full folio, we don't need to and
+ * shouldn't clear page extent mapped, as folio->private can still
+ * record subpage dirty bits for other part of the range.
+ *
+ * For cases that invalidate the full folio even the range doesn't
+ * cover the full folio, like invalidating the last folio, we're
+ * still safe to wait for ordered extent to finish.
+ */
+ if (!(offset == 0 && length == folio_size(folio))) {
+ btrfs_release_folio(folio, GFP_NOFS);
+ return;
+ }
+
+ if (!inode_evicting)
+ lock_extent(tree, page_start, page_end, &cached_state);
+
+ cur = page_start;
+ while (cur < page_end) {
+ struct btrfs_ordered_extent *ordered;
+ u64 range_end;
+ u32 range_len;
+ u32 extra_flags = 0;
+
+ ordered = btrfs_lookup_first_ordered_range(inode, cur,
+ page_end + 1 - cur);
+ if (!ordered) {
+ range_end = page_end;
+ /*
+ * No ordered extent covering this range, we are safe
+ * to delete all extent states in the range.
+ */
+ extra_flags = EXTENT_CLEAR_ALL_BITS;
+ goto next;
+ }
+ if (ordered->file_offset > cur) {
+ /*
+ * There is a range between [cur, oe->file_offset) not
+ * covered by any ordered extent.
+ * We are safe to delete all extent states, and handle
+ * the ordered extent in the next iteration.
+ */
+ range_end = ordered->file_offset - 1;
+ extra_flags = EXTENT_CLEAR_ALL_BITS;
+ goto next;
+ }
+
+ range_end = min(ordered->file_offset + ordered->num_bytes - 1,
+ page_end);
+ ASSERT(range_end + 1 - cur < U32_MAX);
+ range_len = range_end + 1 - cur;
+ if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) {
+ /*
+ * If Ordered (Private2) is cleared, it means endio has
+ * already been executed for the range.
+ * We can't delete the extent states as
+ * btrfs_finish_ordered_io() may still use some of them.
+ */
+ goto next;
+ }
+ btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len);
+
+ /*
+ * IO on this page will never be started, so we need to account
+ * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
+ * here, must leave that up for the ordered extent completion.
+ *
+ * This will also unlock the range for incoming
+ * btrfs_finish_ordered_io().
+ */
+ if (!inode_evicting)
+ clear_extent_bit(tree, cur, range_end,
+ EXTENT_DELALLOC |
+ EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, &cached_state);
+
+ spin_lock_irq(&inode->ordered_tree.lock);
+ set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
+ ordered->truncated_len = min(ordered->truncated_len,
+ cur - ordered->file_offset);
+ spin_unlock_irq(&inode->ordered_tree.lock);
+
+ /*
+ * If the ordered extent has finished, we're safe to delete all
+ * the extent states of the range, otherwise
+ * btrfs_finish_ordered_io() will get executed by endio for
+ * other pages, so we can't delete extent states.
+ */
+ if (btrfs_dec_test_ordered_pending(inode, &ordered,
+ cur, range_end + 1 - cur)) {
+ btrfs_finish_ordered_io(ordered);
+ /*
+ * The ordered extent has finished, now we're again
+ * safe to delete all extent states of the range.
+ */
+ extra_flags = EXTENT_CLEAR_ALL_BITS;
+ }
+next:
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ /*
+ * Qgroup reserved space handler
+ * Sector(s) here will be either:
+ *
+ * 1) Already written to disk or bio already finished
+ * Then its QGROUP_RESERVED bit in io_tree is already cleared.
+ * Qgroup will be handled by its qgroup_record then.
+ * btrfs_qgroup_free_data() call will do nothing here.
+ *
+ * 2) Not written to disk yet
+ * Then btrfs_qgroup_free_data() call will clear the
+ * QGROUP_RESERVED bit of its io_tree, and free the qgroup
+ * reserved data space.
+ * Since the IO will never happen for this page.
+ */
+ btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL);
+ if (!inode_evicting) {
+ clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
+ EXTENT_DELALLOC | EXTENT_UPTODATE |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG |
+ extra_flags, &cached_state);
+ }
+ cur = range_end + 1;
+ }
+ /*
+ * We have iterated through all ordered extents of the page, the page
+ * should not have Ordered (Private2) anymore, or the above iteration
+ * did something wrong.
+ */
+ ASSERT(!folio_test_ordered(folio));
+ btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio));
+ if (!inode_evicting)
+ __btrfs_release_folio(folio, GFP_NOFS);
+ clear_page_extent_mapped(&folio->page);
+}
+
+/*
+ * btrfs_page_mkwrite() is not allowed to change the file size as it gets
+ * called from a page fault handler when a page is first dirtied. Hence we must
+ * be careful to check for EOF conditions here. We set the page up correctly
+ * for a written page which means we get ENOSPC checking when writing into
+ * holes and correct delalloc and unwritten extent mapping on filesystems that
+ * support these features.
+ *
+ * We are not allowed to take the i_mutex here so we have to play games to
+ * protect against truncate races as the page could now be beyond EOF. Because
+ * truncate_setsize() writes the inode size before removing pages, once we have
+ * the page lock we can determine safely if the page is beyond EOF. If it is not
+ * beyond EOF, then the page is guaranteed safe against truncation until we
+ * unlock the page.
+ */
+vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
+{
+ struct page *page = vmf->page;
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_ordered_extent *ordered;
+ struct extent_state *cached_state = NULL;
+ struct extent_changeset *data_reserved = NULL;
+ unsigned long zero_start;
+ loff_t size;
+ vm_fault_t ret;
+ int ret2;
+ int reserved = 0;
+ u64 reserved_space;
+ u64 page_start;
+ u64 page_end;
+ u64 end;
+
+ reserved_space = PAGE_SIZE;
+
+ sb_start_pagefault(inode->i_sb);
+ page_start = page_offset(page);
+ page_end = page_start + PAGE_SIZE - 1;
+ end = page_end;
+
+ /*
+ * Reserving delalloc space after obtaining the page lock can lead to
+ * deadlock. For example, if a dirty page is locked by this function
+ * and the call to btrfs_delalloc_reserve_space() ends up triggering
+ * dirty page write out, then the btrfs_writepages() function could
+ * end up waiting indefinitely to get a lock on the page currently
+ * being processed by btrfs_page_mkwrite() function.
+ */
+ ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
+ page_start, reserved_space);
+ if (!ret2) {
+ ret2 = file_update_time(vmf->vma->vm_file);
+ reserved = 1;
+ }
+ if (ret2) {
+ ret = vmf_error(ret2);
+ if (reserved)
+ goto out;
+ goto out_noreserve;
+ }
+
+ ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
+again:
+ down_read(&BTRFS_I(inode)->i_mmap_lock);
+ lock_page(page);
+ size = i_size_read(inode);
+
+ if ((page->mapping != inode->i_mapping) ||
+ (page_start >= size)) {
+ /* page got truncated out from underneath us */
+ goto out_unlock;
+ }
+ wait_on_page_writeback(page);
+
+ lock_extent(io_tree, page_start, page_end, &cached_state);
+ ret2 = set_page_extent_mapped(page);
+ if (ret2 < 0) {
+ ret = vmf_error(ret2);
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
+ goto out_unlock;
+ }
+
+ /*
+ * we can't set the delalloc bits if there are pending ordered
+ * extents. Drop our locks and wait for them to finish
+ */
+ ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
+ PAGE_SIZE);
+ if (ordered) {
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
+ unlock_page(page);
+ up_read(&BTRFS_I(inode)->i_mmap_lock);
+ btrfs_start_ordered_extent(ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ goto again;
+ }
+
+ if (page->index == ((size - 1) >> PAGE_SHIFT)) {
+ reserved_space = round_up(size - page_start,
+ fs_info->sectorsize);
+ if (reserved_space < PAGE_SIZE) {
+ end = page_start + reserved_space - 1;
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ data_reserved, page_start,
+ PAGE_SIZE - reserved_space, true);
+ }
+ }
+
+ /*
+ * page_mkwrite gets called when the page is firstly dirtied after it's
+ * faulted in, but write(2) could also dirty a page and set delalloc
+ * bits, thus in this case for space account reason, we still need to
+ * clear any delalloc bits within this page range since we have to
+ * reserve data&meta space before lock_page() (see above comments).
+ */
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, &cached_state);
+
+ ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
+ &cached_state);
+ if (ret2) {
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
+ ret = VM_FAULT_SIGBUS;
+ goto out_unlock;
+ }
+
+ /* page is wholly or partially inside EOF */
+ if (page_start + PAGE_SIZE > size)
+ zero_start = offset_in_page(size);
+ else
+ zero_start = PAGE_SIZE;
+
+ if (zero_start != PAGE_SIZE)
+ memzero_page(page, zero_start, PAGE_SIZE - zero_start);
+
+ btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
+ btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
+ btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
+
+ btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
+
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
+ up_read(&BTRFS_I(inode)->i_mmap_lock);
+
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ sb_end_pagefault(inode->i_sb);
+ extent_changeset_free(data_reserved);
+ return VM_FAULT_LOCKED;
+
+out_unlock:
+ unlock_page(page);
+ up_read(&BTRFS_I(inode)->i_mmap_lock);
+out:
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
+ reserved_space, (ret != 0));
+out_noreserve:
+ sb_end_pagefault(inode->i_sb);
+ extent_changeset_free(data_reserved);
+ return ret;
+}
+
+static int btrfs_truncate(struct inode *inode, bool skip_writeback)
+{
+ struct btrfs_truncate_control control = {
+ .inode = BTRFS_I(inode),
+ .ino = btrfs_ino(BTRFS_I(inode)),
+ .min_type = BTRFS_EXTENT_DATA_KEY,
+ .clear_extent_range = true,
+ };
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_block_rsv *rsv;
+ int ret;
+ struct btrfs_trans_handle *trans;
+ u64 mask = fs_info->sectorsize - 1;
+ u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
+
+ if (!skip_writeback) {
+ ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
+ (u64)-1);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Yes ladies and gentlemen, this is indeed ugly. We have a couple of
+ * things going on here:
+ *
+ * 1) We need to reserve space to update our inode.
+ *
+ * 2) We need to have something to cache all the space that is going to
+ * be free'd up by the truncate operation, but also have some slack
+ * space reserved in case it uses space during the truncate (thank you
+ * very much snapshotting).
+ *
+ * And we need these to be separate. The fact is we can use a lot of
+ * space doing the truncate, and we have no earthly idea how much space
+ * we will use, so we need the truncate reservation to be separate so it
+ * doesn't end up using space reserved for updating the inode. We also
+ * need to be able to stop the transaction and start a new one, which
+ * means we need to be able to update the inode several times, and we
+ * have no idea of knowing how many times that will be, so we can't just
+ * reserve 1 item for the entirety of the operation, so that has to be
+ * done separately as well.
+ *
+ * So that leaves us with
+ *
+ * 1) rsv - for the truncate reservation, which we will steal from the
+ * transaction reservation.
+ * 2) fs_info->trans_block_rsv - this will have 1 items worth left for
+ * updating the inode.
+ */
+ rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
+ if (!rsv)
+ return -ENOMEM;
+ rsv->size = min_size;
+ rsv->failfast = true;
+
+ /*
+ * 1 for the truncate slack space
+ * 1 for updating the inode.
+ */
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ /* Migrate the slack space for the truncate to our reserve */
+ ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
+ min_size, false);
+ BUG_ON(ret);
+
+ trans->block_rsv = rsv;
+
+ while (1) {
+ struct extent_state *cached_state = NULL;
+ const u64 new_size = inode->i_size;
+ const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
+
+ control.new_size = new_size;
+ lock_extent(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
+ &cached_state);
+ /*
+ * We want to drop from the next block forward in case this new
+ * size is not block aligned since we will be keeping the last
+ * block of the extent just the way it is.
+ */
+ btrfs_drop_extent_map_range(BTRFS_I(inode),
+ ALIGN(new_size, fs_info->sectorsize),
+ (u64)-1, false);
+
+ ret = btrfs_truncate_inode_items(trans, root, &control);
+
+ inode_sub_bytes(inode, control.sub_bytes);
+ btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), control.last_size);
+
+ unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
+ &cached_state);
+
+ trans->block_rsv = &fs_info->trans_block_rsv;
+ if (ret != -ENOSPC && ret != -EAGAIN)
+ break;
+
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ if (ret)
+ break;
+
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty(fs_info);
+
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ break;
+ }
+
+ btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
+ ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
+ rsv, min_size, false);
+ BUG_ON(ret); /* shouldn't happen */
+ trans->block_rsv = rsv;
+ }
+
+ /*
+ * We can't call btrfs_truncate_block inside a trans handle as we could
+ * deadlock with freeze, if we got BTRFS_NEED_TRUNCATE_BLOCK then we
+ * know we've truncated everything except the last little bit, and can
+ * do btrfs_truncate_block and then update the disk_i_size.
+ */
+ if (ret == BTRFS_NEED_TRUNCATE_BLOCK) {
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty(fs_info);
+
+ ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
+ if (ret)
+ goto out;
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+ btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
+ }
+
+ if (trans) {
+ int ret2;
+
+ trans->block_rsv = &fs_info->trans_block_rsv;
+ ret2 = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ if (ret2 && !ret)
+ ret = ret2;
+
+ ret2 = btrfs_end_transaction(trans);
+ if (ret2 && !ret)
+ ret = ret2;
+ btrfs_btree_balance_dirty(fs_info);
+ }
+out:
+ btrfs_free_block_rsv(fs_info, rsv);
+ /*
+ * So if we truncate and then write and fsync we normally would just
+ * write the extents that changed, which is a problem if we need to
+ * first truncate that entire inode. So set this flag so we write out
+ * all of the extents in the inode to the sync log so we're completely
+ * safe.
+ *
+ * If no extents were dropped or trimmed we don't need to force the next
+ * fsync to truncate all the inode's items from the log and re-log them
+ * all. This means the truncate operation did not change the file size,
+ * or changed it to a smaller size but there was only an implicit hole
+ * between the old i_size and the new i_size, and there were no prealloc
+ * extents beyond i_size to drop.
+ */
+ if (control.extents_found > 0)
+ btrfs_set_inode_full_sync(BTRFS_I(inode));
+
+ return ret;
+}
+
+struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns,
+ struct inode *dir)
+{
+ struct inode *inode;
+
+ inode = new_inode(dir->i_sb);
+ if (inode) {
+ /*
+ * Subvolumes don't inherit the sgid bit or the parent's gid if
+ * the parent's sgid bit is set. This is probably a bug.
+ */
+ inode_init_owner(mnt_userns, inode, NULL,
+ S_IFDIR | (~current_umask() & S_IRWXUGO));
+ inode->i_op = &btrfs_dir_inode_operations;
+ inode->i_fop = &btrfs_dir_file_operations;
+ }
+ return inode;
+}
+
+struct inode *btrfs_alloc_inode(struct super_block *sb)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_inode *ei;
+ struct inode *inode;
+
+ ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
+ if (!ei)
+ return NULL;
+
+ ei->root = NULL;
+ ei->generation = 0;
+ ei->last_trans = 0;
+ ei->last_sub_trans = 0;
+ ei->logged_trans = 0;
+ ei->delalloc_bytes = 0;
+ ei->new_delalloc_bytes = 0;
+ ei->defrag_bytes = 0;
+ ei->disk_i_size = 0;
+ ei->flags = 0;
+ ei->ro_flags = 0;
+ ei->csum_bytes = 0;
+ ei->index_cnt = (u64)-1;
+ ei->dir_index = 0;
+ ei->last_unlink_trans = 0;
+ ei->last_reflink_trans = 0;
+ ei->last_log_commit = 0;
+
+ spin_lock_init(&ei->lock);
+ spin_lock_init(&ei->io_failure_lock);
+ ei->outstanding_extents = 0;
+ if (sb->s_magic != BTRFS_TEST_MAGIC)
+ btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
+ BTRFS_BLOCK_RSV_DELALLOC);
+ ei->runtime_flags = 0;
+ ei->prop_compress = BTRFS_COMPRESS_NONE;
+ ei->defrag_compress = BTRFS_COMPRESS_NONE;
+
+ ei->delayed_node = NULL;
+
+ ei->i_otime.tv_sec = 0;
+ ei->i_otime.tv_nsec = 0;
+
+ inode = &ei->vfs_inode;
+ extent_map_tree_init(&ei->extent_tree);
+ extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode);
+ extent_io_tree_init(fs_info, &ei->file_extent_tree,
+ IO_TREE_INODE_FILE_EXTENT, NULL);
+ ei->io_failure_tree = RB_ROOT;
+ atomic_set(&ei->sync_writers, 0);
+ mutex_init(&ei->log_mutex);
+ btrfs_ordered_inode_tree_init(&ei->ordered_tree);
+ INIT_LIST_HEAD(&ei->delalloc_inodes);
+ INIT_LIST_HEAD(&ei->delayed_iput);
+ RB_CLEAR_NODE(&ei->rb_node);
+ init_rwsem(&ei->i_mmap_lock);
+
+ return inode;
+}
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_destroy_inode(struct inode *inode)
+{
+ btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
+ kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
+}
+#endif
+
+void btrfs_free_inode(struct inode *inode)
+{
+ kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
+}
+
+void btrfs_destroy_inode(struct inode *vfs_inode)
+{
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_inode *inode = BTRFS_I(vfs_inode);
+ struct btrfs_root *root = inode->root;
+ bool freespace_inode;
+
+ WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
+ WARN_ON(vfs_inode->i_data.nrpages);
+ WARN_ON(inode->block_rsv.reserved);
+ WARN_ON(inode->block_rsv.size);
+ WARN_ON(inode->outstanding_extents);
+ if (!S_ISDIR(vfs_inode->i_mode)) {
+ WARN_ON(inode->delalloc_bytes);
+ WARN_ON(inode->new_delalloc_bytes);
+ }
+ WARN_ON(inode->csum_bytes);
+ WARN_ON(inode->defrag_bytes);
+
+ /*
+ * This can happen where we create an inode, but somebody else also
+ * created the same inode and we need to destroy the one we already
+ * created.
+ */
+ if (!root)
+ return;
+
+ /*
+ * If this is a free space inode do not take the ordered extents lockdep
+ * map.
+ */
+ freespace_inode = btrfs_is_free_space_inode(inode);
+
+ while (1) {
+ ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
+ if (!ordered)
+ break;
+ else {
+ btrfs_err(root->fs_info,
+ "found ordered extent %llu %llu on inode cleanup",
+ ordered->file_offset, ordered->num_bytes);
+
+ if (!freespace_inode)
+ btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent);
+
+ btrfs_remove_ordered_extent(inode, ordered);
+ btrfs_put_ordered_extent(ordered);
+ btrfs_put_ordered_extent(ordered);
+ }
+ }
+ btrfs_qgroup_check_reserved_leak(inode);
+ inode_tree_del(inode);
+ btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
+ btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
+ btrfs_put_root(inode->root);
+}
+
+int btrfs_drop_inode(struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ if (root == NULL)
+ return 1;
+
+ /* the snap/subvol tree is on deleting */
+ if (btrfs_root_refs(&root->root_item) == 0)
+ return 1;
+ else
+ return generic_drop_inode(inode);
+}
+
+static void init_once(void *foo)
+{
+ struct btrfs_inode *ei = foo;
+
+ inode_init_once(&ei->vfs_inode);
+}
+
+void __cold btrfs_destroy_cachep(void)
+{
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
+ bioset_exit(&btrfs_dio_bioset);
+ kmem_cache_destroy(btrfs_inode_cachep);
+ kmem_cache_destroy(btrfs_trans_handle_cachep);
+ kmem_cache_destroy(btrfs_path_cachep);
+ kmem_cache_destroy(btrfs_free_space_cachep);
+ kmem_cache_destroy(btrfs_free_space_bitmap_cachep);
+}
+
+int __init btrfs_init_cachep(void)
+{
+ btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
+ sizeof(struct btrfs_inode), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
+ init_once);
+ if (!btrfs_inode_cachep)
+ goto fail;
+
+ btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
+ sizeof(struct btrfs_trans_handle), 0,
+ SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_trans_handle_cachep)
+ goto fail;
+
+ btrfs_path_cachep = kmem_cache_create("btrfs_path",
+ sizeof(struct btrfs_path), 0,
+ SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_path_cachep)
+ goto fail;
+
+ btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
+ sizeof(struct btrfs_free_space), 0,
+ SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_free_space_cachep)
+ goto fail;
+
+ btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
+ PAGE_SIZE, PAGE_SIZE,
+ SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_free_space_bitmap_cachep)
+ goto fail;
+
+ if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
+ offsetof(struct btrfs_dio_private, bio),
+ BIOSET_NEED_BVECS))
+ goto fail;
+
+ return 0;
+fail:
+ btrfs_destroy_cachep();
+ return -ENOMEM;
+}
+
+static int btrfs_getattr(struct user_namespace *mnt_userns,
+ const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int flags)
+{
+ u64 delalloc_bytes;
+ u64 inode_bytes;
+ struct inode *inode = d_inode(path->dentry);
+ u32 blocksize = inode->i_sb->s_blocksize;
+ u32 bi_flags = BTRFS_I(inode)->flags;
+ u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
+
+ stat->result_mask |= STATX_BTIME;
+ stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
+ stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec;
+ if (bi_flags & BTRFS_INODE_APPEND)
+ stat->attributes |= STATX_ATTR_APPEND;
+ if (bi_flags & BTRFS_INODE_COMPRESS)
+ stat->attributes |= STATX_ATTR_COMPRESSED;
+ if (bi_flags & BTRFS_INODE_IMMUTABLE)
+ stat->attributes |= STATX_ATTR_IMMUTABLE;
+ if (bi_flags & BTRFS_INODE_NODUMP)
+ stat->attributes |= STATX_ATTR_NODUMP;
+ if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
+ stat->attributes |= STATX_ATTR_VERITY;
+
+ stat->attributes_mask |= (STATX_ATTR_APPEND |
+ STATX_ATTR_COMPRESSED |
+ STATX_ATTR_IMMUTABLE |
+ STATX_ATTR_NODUMP);
+
+ generic_fillattr(mnt_userns, inode, stat);
+ stat->dev = BTRFS_I(inode)->root->anon_dev;
+
+ spin_lock(&BTRFS_I(inode)->lock);
+ delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
+ inode_bytes = inode_get_bytes(inode);
+ spin_unlock(&BTRFS_I(inode)->lock);
+ stat->blocks = (ALIGN(inode_bytes, blocksize) +
+ ALIGN(delalloc_bytes, blocksize)) >> 9;
+ return 0;
+}
+
+static int btrfs_rename_exchange(struct inode *old_dir,
+ struct dentry *old_dentry,
+ struct inode *new_dir,
+ struct dentry *new_dentry)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
+ struct btrfs_trans_handle *trans;
+ unsigned int trans_num_items;
+ struct btrfs_root *root = BTRFS_I(old_dir)->root;
+ struct btrfs_root *dest = BTRFS_I(new_dir)->root;
+ struct inode *new_inode = new_dentry->d_inode;
+ struct inode *old_inode = old_dentry->d_inode;
+ struct timespec64 ctime = current_time(old_inode);
+ struct btrfs_rename_ctx old_rename_ctx;
+ struct btrfs_rename_ctx new_rename_ctx;
+ u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
+ u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
+ u64 old_idx = 0;
+ u64 new_idx = 0;
+ int ret;
+ int ret2;
+ bool need_abort = false;
+ struct fscrypt_name old_fname, new_fname;
+ struct fscrypt_str *old_name, *new_name;
+
+ /*
+ * For non-subvolumes allow exchange only within one subvolume, in the
+ * same inode namespace. Two subvolumes (represented as directory) can
+ * be exchanged as they're a logical link and have a fixed inode number.
+ */
+ if (root != dest &&
+ (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
+ new_ino != BTRFS_FIRST_FREE_OBJECTID))
+ return -EXDEV;
+
+ ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
+ if (ret)
+ return ret;
+
+ ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
+ if (ret) {
+ fscrypt_free_filename(&old_fname);
+ return ret;
+ }
+
+ old_name = &old_fname.disk_name;
+ new_name = &new_fname.disk_name;
+
+ /* close the race window with snapshot create/destroy ioctl */
+ if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
+ new_ino == BTRFS_FIRST_FREE_OBJECTID)
+ down_read(&fs_info->subvol_sem);
+
+ /*
+ * For each inode:
+ * 1 to remove old dir item
+ * 1 to remove old dir index
+ * 1 to add new dir item
+ * 1 to add new dir index
+ * 1 to update parent inode
+ *
+ * If the parents are the same, we only need to account for one
+ */
+ trans_num_items = (old_dir == new_dir ? 9 : 10);
+ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * 1 to remove old root ref
+ * 1 to remove old root backref
+ * 1 to add new root ref
+ * 1 to add new root backref
+ */
+ trans_num_items += 4;
+ } else {
+ /*
+ * 1 to update inode item
+ * 1 to remove old inode ref
+ * 1 to add new inode ref
+ */
+ trans_num_items += 3;
+ }
+ if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
+ trans_num_items += 4;
+ else
+ trans_num_items += 3;
+ trans = btrfs_start_transaction(root, trans_num_items);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_notrans;
+ }
+
+ if (dest != root) {
+ ret = btrfs_record_root_in_trans(trans, dest);
+ if (ret)
+ goto out_fail;
+ }
+
+ /*
+ * We need to find a free sequence number both in the source and
+ * in the destination directory for the exchange.
+ */
+ ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx);
+ if (ret)
+ goto out_fail;
+ ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx);
+ if (ret)
+ goto out_fail;
+
+ BTRFS_I(old_inode)->dir_index = 0ULL;
+ BTRFS_I(new_inode)->dir_index = 0ULL;
+
+ /* Reference for the source. */
+ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ /* force full log commit if subvolume involved. */
+ btrfs_set_log_full_commit(trans);
+ } else {
+ ret = btrfs_insert_inode_ref(trans, dest, new_name, old_ino,
+ btrfs_ino(BTRFS_I(new_dir)),
+ old_idx);
+ if (ret)
+ goto out_fail;
+ need_abort = true;
+ }
+
+ /* And now for the dest. */
+ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ /* force full log commit if subvolume involved. */
+ btrfs_set_log_full_commit(trans);
+ } else {
+ ret = btrfs_insert_inode_ref(trans, root, old_name, new_ino,
+ btrfs_ino(BTRFS_I(old_dir)),
+ new_idx);
+ if (ret) {
+ if (need_abort)
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
+ }
+
+ /* Update inode version and ctime/mtime. */
+ inode_inc_iversion(old_dir);
+ inode_inc_iversion(new_dir);
+ inode_inc_iversion(old_inode);
+ inode_inc_iversion(new_inode);
+ old_dir->i_mtime = ctime;
+ old_dir->i_ctime = ctime;
+ new_dir->i_mtime = ctime;
+ new_dir->i_ctime = ctime;
+ old_inode->i_ctime = ctime;
+ new_inode->i_ctime = ctime;
+
+ if (old_dentry->d_parent != new_dentry->d_parent) {
+ btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
+ BTRFS_I(old_inode), 1);
+ btrfs_record_unlink_dir(trans, BTRFS_I(new_dir),
+ BTRFS_I(new_inode), 1);
+ }
+
+ /* src is a subvolume */
+ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
+ } else { /* src is an inode */
+ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
+ BTRFS_I(old_dentry->d_inode),
+ old_name, &old_rename_ctx);
+ if (!ret)
+ ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
+ }
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
+
+ /* dest is a subvolume */
+ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
+ } else { /* dest is an inode */
+ ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
+ BTRFS_I(new_dentry->d_inode),
+ new_name, &new_rename_ctx);
+ if (!ret)
+ ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode));
+ }
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
+
+ ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
+ new_name, 0, old_idx);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
+
+ ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
+ old_name, 0, new_idx);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
+
+ if (old_inode->i_nlink == 1)
+ BTRFS_I(old_inode)->dir_index = old_idx;
+ if (new_inode->i_nlink == 1)
+ BTRFS_I(new_inode)->dir_index = new_idx;
+
+ /*
+ * Now pin the logs of the roots. We do it to ensure that no other task
+ * can sync the logs while we are in progress with the rename, because
+ * that could result in an inconsistency in case any of the inodes that
+ * are part of this rename operation were logged before.
+ */
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_pin_log_trans(root);
+ if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_pin_log_trans(dest);
+
+ /* Do the log updates for all inodes. */
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
+ old_rename_ctx.index, new_dentry->d_parent);
+ if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
+ new_rename_ctx.index, old_dentry->d_parent);
+
+ /* Now unpin the logs. */
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_end_log_trans(root);
+ if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_end_log_trans(dest);
+out_fail:
+ ret2 = btrfs_end_transaction(trans);
+ ret = ret ? ret : ret2;
+out_notrans:
+ if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
+ old_ino == BTRFS_FIRST_FREE_OBJECTID)
+ up_read(&fs_info->subvol_sem);
+
+ fscrypt_free_filename(&new_fname);
+ fscrypt_free_filename(&old_fname);
+ return ret;
+}
+
+static struct inode *new_whiteout_inode(struct user_namespace *mnt_userns,
+ struct inode *dir)
+{
+ struct inode *inode;
+
+ inode = new_inode(dir->i_sb);
+ if (inode) {
+ inode_init_owner(mnt_userns, inode, dir,
+ S_IFCHR | WHITEOUT_MODE);
+ inode->i_op = &btrfs_special_inode_operations;
+ init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
+ }
+ return inode;
+}
+
+static int btrfs_rename(struct user_namespace *mnt_userns,
+ struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
+ struct btrfs_new_inode_args whiteout_args = {
+ .dir = old_dir,
+ .dentry = old_dentry,
+ };
+ struct btrfs_trans_handle *trans;
+ unsigned int trans_num_items;
+ struct btrfs_root *root = BTRFS_I(old_dir)->root;
+ struct btrfs_root *dest = BTRFS_I(new_dir)->root;
+ struct inode *new_inode = d_inode(new_dentry);
+ struct inode *old_inode = d_inode(old_dentry);
+ struct btrfs_rename_ctx rename_ctx;
+ u64 index = 0;
+ int ret;
+ int ret2;
+ u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
+ struct fscrypt_name old_fname, new_fname;
+
+ if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
+ return -EPERM;
+
+ /* we only allow rename subvolume link between subvolumes */
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
+ return -EXDEV;
+
+ if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
+ (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID))
+ return -ENOTEMPTY;
+
+ if (S_ISDIR(old_inode->i_mode) && new_inode &&
+ new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
+ return -ENOTEMPTY;
+
+ ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
+ if (ret)
+ return ret;
+
+ ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
+ if (ret) {
+ fscrypt_free_filename(&old_fname);
+ return ret;
+ }
+
+ /* check for collisions, even if the name isn't there */
+ ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_fname.disk_name);
+ if (ret) {
+ if (ret == -EEXIST) {
+ /* we shouldn't get
+ * eexist without a new_inode */
+ if (WARN_ON(!new_inode)) {
+ goto out_fscrypt_names;
+ }
+ } else {
+ /* maybe -EOVERFLOW */
+ goto out_fscrypt_names;
+ }
+ }
+ ret = 0;
+
+ /*
+ * we're using rename to replace one file with another. Start IO on it
+ * now so we don't add too much work to the end of the transaction
+ */
+ if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
+ filemap_flush(old_inode->i_mapping);
+
+ if (flags & RENAME_WHITEOUT) {
+ whiteout_args.inode = new_whiteout_inode(mnt_userns, old_dir);
+ if (!whiteout_args.inode) {
+ ret = -ENOMEM;
+ goto out_fscrypt_names;
+ }
+ ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items);
+ if (ret)
+ goto out_whiteout_inode;
+ } else {
+ /* 1 to update the old parent inode. */
+ trans_num_items = 1;
+ }
+
+ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ /* Close the race window with snapshot create/destroy ioctl */
+ down_read(&fs_info->subvol_sem);
+ /*
+ * 1 to remove old root ref
+ * 1 to remove old root backref
+ * 1 to add new root ref
+ * 1 to add new root backref
+ */
+ trans_num_items += 4;
+ } else {
+ /*
+ * 1 to update inode
+ * 1 to remove old inode ref
+ * 1 to add new inode ref
+ */
+ trans_num_items += 3;
+ }
+ /*
+ * 1 to remove old dir item
+ * 1 to remove old dir index
+ * 1 to add new dir item
+ * 1 to add new dir index
+ */
+ trans_num_items += 4;
+ /* 1 to update new parent inode if it's not the same as the old parent */
+ if (new_dir != old_dir)
+ trans_num_items++;
+ if (new_inode) {
+ /*
+ * 1 to update inode
+ * 1 to remove inode ref
+ * 1 to remove dir item
+ * 1 to remove dir index
+ * 1 to possibly add orphan item
+ */
+ trans_num_items += 5;
+ }
+ trans = btrfs_start_transaction(root, trans_num_items);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_notrans;
+ }
+
+ if (dest != root) {
+ ret = btrfs_record_root_in_trans(trans, dest);
+ if (ret)
+ goto out_fail;
+ }
+
+ ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
+ if (ret)
+ goto out_fail;
+
+ BTRFS_I(old_inode)->dir_index = 0ULL;
+ if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ /* force full log commit if subvolume involved. */
+ btrfs_set_log_full_commit(trans);
+ } else {
+ ret = btrfs_insert_inode_ref(trans, dest, &new_fname.disk_name,
+ old_ino, btrfs_ino(BTRFS_I(new_dir)),
+ index);
+ if (ret)
+ goto out_fail;
+ }
+
+ inode_inc_iversion(old_dir);
+ inode_inc_iversion(new_dir);
+ inode_inc_iversion(old_inode);
+ old_dir->i_mtime = current_time(old_dir);
+ old_dir->i_ctime = old_dir->i_mtime;
+ new_dir->i_mtime = old_dir->i_mtime;
+ new_dir->i_ctime = old_dir->i_mtime;
+ old_inode->i_ctime = old_dir->i_mtime;
+
+ if (old_dentry->d_parent != new_dentry->d_parent)
+ btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
+ BTRFS_I(old_inode), 1);
+
+ if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
+ } else {
+ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
+ BTRFS_I(d_inode(old_dentry)),
+ &old_fname.disk_name, &rename_ctx);
+ if (!ret)
+ ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
+ }
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
+
+ if (new_inode) {
+ inode_inc_iversion(new_inode);
+ new_inode->i_ctime = current_time(new_inode);
+ if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
+ BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
+ ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
+ BUG_ON(new_inode->i_nlink == 0);
+ } else {
+ ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
+ BTRFS_I(d_inode(new_dentry)),
+ &new_fname.disk_name);
+ }
+ if (!ret && new_inode->i_nlink == 0)
+ ret = btrfs_orphan_add(trans,
+ BTRFS_I(d_inode(new_dentry)));
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
+ }
+
+ ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
+ &new_fname.disk_name, 0, index);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
+
+ if (old_inode->i_nlink == 1)
+ BTRFS_I(old_inode)->dir_index = index;
+
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
+ rename_ctx.index, new_dentry->d_parent);
+
+ if (flags & RENAME_WHITEOUT) {
+ ret = btrfs_create_new_inode(trans, &whiteout_args);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ } else {
+ unlock_new_inode(whiteout_args.inode);
+ iput(whiteout_args.inode);
+ whiteout_args.inode = NULL;
+ }
+ }
+out_fail:
+ ret2 = btrfs_end_transaction(trans);
+ ret = ret ? ret : ret2;
+out_notrans:
+ if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
+ up_read(&fs_info->subvol_sem);
+ if (flags & RENAME_WHITEOUT)
+ btrfs_new_inode_args_destroy(&whiteout_args);
+out_whiteout_inode:
+ if (flags & RENAME_WHITEOUT)
+ iput(whiteout_args.inode);
+out_fscrypt_names:
+ fscrypt_free_filename(&old_fname);
+ fscrypt_free_filename(&new_fname);
+ return ret;
+}
+
+static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_dir,
+ struct dentry *old_dentry, struct inode *new_dir,
+ struct dentry *new_dentry, unsigned int flags)
+{
+ int ret;
+
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
+ return -EINVAL;
+
+ if (flags & RENAME_EXCHANGE)
+ ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir,
+ new_dentry);
+ else
+ ret = btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir,
+ new_dentry, flags);
+
+ btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info);
+
+ return ret;
+}
+
+struct btrfs_delalloc_work {
+ struct inode *inode;
+ struct completion completion;
+ struct list_head list;
+ struct btrfs_work work;
+};
+
+static void btrfs_run_delalloc_work(struct btrfs_work *work)
+{
+ struct btrfs_delalloc_work *delalloc_work;
+ struct inode *inode;
+
+ delalloc_work = container_of(work, struct btrfs_delalloc_work,
+ work);
+ inode = delalloc_work->inode;
+ filemap_flush(inode->i_mapping);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_flush(inode->i_mapping);
+
+ iput(inode);
+ complete(&delalloc_work->completion);
+}
+
+static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode)
+{
+ struct btrfs_delalloc_work *work;
+
+ work = kmalloc(sizeof(*work), GFP_NOFS);
+ if (!work)
+ return NULL;
+
+ init_completion(&work->completion);
+ INIT_LIST_HEAD(&work->list);
+ work->inode = inode;
+ btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
+
+ return work;
+}
+
+/*
+ * some fairly slow code that needs optimization. This walks the list
+ * of all the inodes with pending delalloc and forces them to disk.
+ */
+static int start_delalloc_inodes(struct btrfs_root *root,
+ struct writeback_control *wbc, bool snapshot,
+ bool in_reclaim_context)
+{
+ struct btrfs_inode *binode;
+ struct inode *inode;
+ struct btrfs_delalloc_work *work, *next;
+ struct list_head works;
+ struct list_head splice;
+ int ret = 0;
+ bool full_flush = wbc->nr_to_write == LONG_MAX;
+
+ INIT_LIST_HEAD(&works);
+ INIT_LIST_HEAD(&splice);
+
+ mutex_lock(&root->delalloc_mutex);
+ spin_lock(&root->delalloc_lock);
+ list_splice_init(&root->delalloc_inodes, &splice);
+ while (!list_empty(&splice)) {
+ binode = list_entry(splice.next, struct btrfs_inode,
+ delalloc_inodes);
+
+ list_move_tail(&binode->delalloc_inodes,
+ &root->delalloc_inodes);
+
+ if (in_reclaim_context &&
+ test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags))
+ continue;
+
+ inode = igrab(&binode->vfs_inode);
+ if (!inode) {
+ cond_resched_lock(&root->delalloc_lock);
+ continue;
+ }
+ spin_unlock(&root->delalloc_lock);
+
+ if (snapshot)
+ set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
+ &binode->runtime_flags);
+ if (full_flush) {
+ work = btrfs_alloc_delalloc_work(inode);
+ if (!work) {
+ iput(inode);
+ ret = -ENOMEM;
+ goto out;
+ }
+ list_add_tail(&work->list, &works);
+ btrfs_queue_work(root->fs_info->flush_workers,
+ &work->work);
+ } else {
+ ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
+ btrfs_add_delayed_iput(inode);
+ if (ret || wbc->nr_to_write <= 0)
+ goto out;
+ }
+ cond_resched();
+ spin_lock(&root->delalloc_lock);
+ }
+ spin_unlock(&root->delalloc_lock);
+
+out:
+ list_for_each_entry_safe(work, next, &works, list) {
+ list_del_init(&work->list);
+ wait_for_completion(&work->completion);
+ kfree(work);
+ }
+
+ if (!list_empty(&splice)) {
+ spin_lock(&root->delalloc_lock);
+ list_splice_tail(&splice, &root->delalloc_inodes);
+ spin_unlock(&root->delalloc_lock);
+ }
+ mutex_unlock(&root->delalloc_mutex);
+ return ret;
+}
+
+int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
+{
+ struct writeback_control wbc = {
+ .nr_to_write = LONG_MAX,
+ .sync_mode = WB_SYNC_NONE,
+ .range_start = 0,
+ .range_end = LLONG_MAX,
+ };
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ if (BTRFS_FS_ERROR(fs_info))
+ return -EROFS;
+
+ return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
+}
+
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
+ bool in_reclaim_context)
+{
+ struct writeback_control wbc = {
+ .nr_to_write = nr,
+ .sync_mode = WB_SYNC_NONE,
+ .range_start = 0,
+ .range_end = LLONG_MAX,
+ };
+ struct btrfs_root *root;
+ struct list_head splice;
+ int ret;
+
+ if (BTRFS_FS_ERROR(fs_info))
+ return -EROFS;
+
+ INIT_LIST_HEAD(&splice);
+
+ mutex_lock(&fs_info->delalloc_root_mutex);
+ spin_lock(&fs_info->delalloc_root_lock);
+ list_splice_init(&fs_info->delalloc_roots, &splice);
+ while (!list_empty(&splice)) {
+ /*
+ * Reset nr_to_write here so we know that we're doing a full
+ * flush.
+ */
+ if (nr == LONG_MAX)
+ wbc.nr_to_write = LONG_MAX;
+
+ root = list_first_entry(&splice, struct btrfs_root,
+ delalloc_root);
+ root = btrfs_grab_root(root);
+ BUG_ON(!root);
+ list_move_tail(&root->delalloc_root,
+ &fs_info->delalloc_roots);
+ spin_unlock(&fs_info->delalloc_root_lock);
+
+ ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
+ btrfs_put_root(root);
+ if (ret < 0 || wbc.nr_to_write <= 0)
+ goto out;
+ spin_lock(&fs_info->delalloc_root_lock);
+ }
+ spin_unlock(&fs_info->delalloc_root_lock);
+
+ ret = 0;
+out:
+ if (!list_empty(&splice)) {
+ spin_lock(&fs_info->delalloc_root_lock);
+ list_splice_tail(&splice, &fs_info->delalloc_roots);
+ spin_unlock(&fs_info->delalloc_root_lock);
+ }
+ mutex_unlock(&fs_info->delalloc_root_mutex);
+ return ret;
+}
+
+static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, const char *symname)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct inode *inode;
+ struct btrfs_new_inode_args new_inode_args = {
+ .dir = dir,
+ .dentry = dentry,
+ };
+ unsigned int trans_num_items;
+ int err;
+ int name_len;
+ int datasize;
+ unsigned long ptr;
+ struct btrfs_file_extent_item *ei;
+ struct extent_buffer *leaf;
+
+ name_len = strlen(symname);
+ if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
+ return -ENAMETOOLONG;
+
+ inode = new_inode(dir->i_sb);
+ if (!inode)
+ return -ENOMEM;
+ inode_init_owner(mnt_userns, inode, dir, S_IFLNK | S_IRWXUGO);
+ inode->i_op = &btrfs_symlink_inode_operations;
+ inode_nohighmem(inode);
+ inode->i_mapping->a_ops = &btrfs_aops;
+ btrfs_i_size_write(BTRFS_I(inode), name_len);
+ inode_set_bytes(inode, name_len);
+
+ new_inode_args.inode = inode;
+ err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
+ if (err)
+ goto out_inode;
+ /* 1 additional item for the inline extent */
+ trans_num_items++;
+
+ trans = btrfs_start_transaction(root, trans_num_items);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_new_inode_args;
+ }
+
+ err = btrfs_create_new_inode(trans, &new_inode_args);
+ if (err)
+ goto out;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ err = -ENOMEM;
+ btrfs_abort_transaction(trans, err);
+ discard_new_inode(inode);
+ inode = NULL;
+ goto out;
+ }
+ key.objectid = btrfs_ino(BTRFS_I(inode));
+ key.offset = 0;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ datasize = btrfs_file_extent_calc_inline_size(name_len);
+ err = btrfs_insert_empty_item(trans, root, path, &key,
+ datasize);
+ if (err) {
+ btrfs_abort_transaction(trans, err);
+ btrfs_free_path(path);
+ discard_new_inode(inode);
+ inode = NULL;
+ goto out;
+ }
+ leaf = path->nodes[0];
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, ei, trans->transid);
+ btrfs_set_file_extent_type(leaf, ei,
+ BTRFS_FILE_EXTENT_INLINE);
+ btrfs_set_file_extent_encryption(leaf, ei, 0);
+ btrfs_set_file_extent_compression(leaf, ei, 0);
+ btrfs_set_file_extent_other_encoding(leaf, ei, 0);
+ btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
+
+ ptr = btrfs_file_extent_inline_start(ei);
+ write_extent_buffer(leaf, symname, ptr, name_len);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_free_path(path);
+
+ d_instantiate_new(dentry, inode);
+ err = 0;
+out:
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty(fs_info);
+out_new_inode_args:
+ btrfs_new_inode_args_destroy(&new_inode_args);
+out_inode:
+ if (err)
+ iput(inode);
+ return err;
+}
+
+static struct btrfs_trans_handle *insert_prealloc_file_extent(
+ struct btrfs_trans_handle *trans_in,
+ struct btrfs_inode *inode,
+ struct btrfs_key *ins,
+ u64 file_offset)
+{
+ struct btrfs_file_extent_item stack_fi;
+ struct btrfs_replace_extent_info extent_info;
+ struct btrfs_trans_handle *trans = trans_in;
+ struct btrfs_path *path;
+ u64 start = ins->objectid;
+ u64 len = ins->offset;
+ u64 qgroup_released = 0;
+ int ret;
+
+ memset(&stack_fi, 0, sizeof(stack_fi));
+
+ btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC);
+ btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start);
+ btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len);
+ btrfs_set_stack_file_extent_num_bytes(&stack_fi, len);
+ btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len);
+ btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
+ /* Encryption and other encoding is reserved and all 0 */
+
+ ret = btrfs_qgroup_release_data(inode, file_offset, len, &qgroup_released);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ if (trans) {
+ ret = insert_reserved_file_extent(trans, inode,
+ file_offset, &stack_fi,
+ true, qgroup_released);
+ if (ret)
+ goto free_qgroup;
+ return trans;
+ }
+
+ extent_info.disk_offset = start;
+ extent_info.disk_len = len;
+ extent_info.data_offset = 0;
+ extent_info.data_len = len;
+ extent_info.file_offset = file_offset;
+ extent_info.extent_buf = (char *)&stack_fi;
+ extent_info.is_new_extent = true;
+ extent_info.update_times = true;
+ extent_info.qgroup_reserved = qgroup_released;
+ extent_info.insertions = 0;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto free_qgroup;
+ }
+
+ ret = btrfs_replace_file_extents(inode, path, file_offset,
+ file_offset + len - 1, &extent_info,
+ &trans);
+ btrfs_free_path(path);
+ if (ret)
+ goto free_qgroup;
+ return trans;
+
+free_qgroup:
+ /*
+ * We have released qgroup data range at the beginning of the function,
+ * and normally qgroup_released bytes will be freed when committing
+ * transaction.
+ * But if we error out early, we have to free what we have released
+ * or we leak qgroup data reservation.
+ */
+ btrfs_qgroup_free_refroot(inode->root->fs_info,
+ inode->root->root_key.objectid, qgroup_released,
+ BTRFS_QGROUP_RSV_DATA);
+ return ERR_PTR(ret);
+}
+
+static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
+ u64 start, u64 num_bytes, u64 min_size,
+ loff_t actual_len, u64 *alloc_hint,
+ struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct extent_map *em;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_key ins;
+ u64 cur_offset = start;
+ u64 clear_offset = start;
+ u64 i_size;
+ u64 cur_bytes;
+ u64 last_alloc = (u64)-1;
+ int ret = 0;
+ bool own_trans = true;
+ u64 end = start + num_bytes - 1;
+
+ if (trans)
+ own_trans = false;
+ while (num_bytes > 0) {
+ cur_bytes = min_t(u64, num_bytes, SZ_256M);
+ cur_bytes = max(cur_bytes, min_size);
+ /*
+ * If we are severely fragmented we could end up with really
+ * small allocations, so if the allocator is returning small
+ * chunks lets make its job easier by only searching for those
+ * sized chunks.
+ */
+ cur_bytes = min(cur_bytes, last_alloc);
+ ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
+ min_size, 0, *alloc_hint, &ins, 1, 0);
+ if (ret)
+ break;
+
+ /*
+ * We've reserved this space, and thus converted it from
+ * ->bytes_may_use to ->bytes_reserved. Any error that happens
+ * from here on out we will only need to clear our reservation
+ * for the remaining unreserved area, so advance our
+ * clear_offset by our extent size.
+ */
+ clear_offset += ins.offset;
+
+ last_alloc = ins.offset;
+ trans = insert_prealloc_file_extent(trans, BTRFS_I(inode),
+ &ins, cur_offset);
+ /*
+ * Now that we inserted the prealloc extent we can finally
+ * decrement the number of reservations in the block group.
+ * If we did it before, we could race with relocation and have
+ * relocation miss the reserved extent, making it fail later.
+ */
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ btrfs_free_reserved_extent(fs_info, ins.objectid,
+ ins.offset, 0);
+ break;
+ }
+
+ em = alloc_extent_map();
+ if (!em) {
+ btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset,
+ cur_offset + ins.offset - 1, false);
+ btrfs_set_inode_full_sync(BTRFS_I(inode));
+ goto next;
+ }
+
+ em->start = cur_offset;
+ em->orig_start = cur_offset;
+ em->len = ins.offset;
+ em->block_start = ins.objectid;
+ em->block_len = ins.offset;
+ em->orig_block_len = ins.offset;
+ em->ram_bytes = ins.offset;
+ set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+ em->generation = trans->transid;
+
+ ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
+ free_extent_map(em);
+next:
+ num_bytes -= ins.offset;
+ cur_offset += ins.offset;
+ *alloc_hint = ins.objectid + ins.offset;
+
+ inode_inc_iversion(inode);
+ inode->i_ctime = current_time(inode);
+ BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ (actual_len > inode->i_size) &&
+ (cur_offset > inode->i_size)) {
+ if (cur_offset > actual_len)
+ i_size = actual_len;
+ else
+ i_size = cur_offset;
+ i_size_write(inode, i_size);
+ btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
+ }
+
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ if (own_trans)
+ btrfs_end_transaction(trans);
+ break;
+ }
+
+ if (own_trans) {
+ btrfs_end_transaction(trans);
+ trans = NULL;
+ }
+ }
+ if (clear_offset < end)
+ btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
+ end - clear_offset + 1);
+ return ret;
+}
+
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
+ u64 start, u64 num_bytes, u64 min_size,
+ loff_t actual_len, u64 *alloc_hint)
+{
+ return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
+ min_size, actual_len, alloc_hint,
+ NULL);
+}
+
+int btrfs_prealloc_file_range_trans(struct inode *inode,
+ struct btrfs_trans_handle *trans, int mode,
+ u64 start, u64 num_bytes, u64 min_size,
+ loff_t actual_len, u64 *alloc_hint)
+{
+ return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
+ min_size, actual_len, alloc_hint, trans);
+}
+
+static int btrfs_permission(struct user_namespace *mnt_userns,
+ struct inode *inode, int mask)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ umode_t mode = inode->i_mode;
+
+ if (mask & MAY_WRITE &&
+ (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
+ if (btrfs_root_readonly(root))
+ return -EROFS;
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
+ return -EACCES;
+ }
+ return generic_permission(mnt_userns, inode, mask);
+}
+
+static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+ struct file *file, umode_t mode)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct inode *inode;
+ struct btrfs_new_inode_args new_inode_args = {
+ .dir = dir,
+ .dentry = file->f_path.dentry,
+ .orphan = true,
+ };
+ unsigned int trans_num_items;
+ int ret;
+
+ inode = new_inode(dir->i_sb);
+ if (!inode)
+ return -ENOMEM;
+ inode_init_owner(mnt_userns, inode, dir, mode);
+ inode->i_fop = &btrfs_file_operations;
+ inode->i_op = &btrfs_file_inode_operations;
+ inode->i_mapping->a_ops = &btrfs_aops;
+
+ new_inode_args.inode = inode;
+ ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
+ if (ret)
+ goto out_inode;
+
+ trans = btrfs_start_transaction(root, trans_num_items);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_new_inode_args;
+ }
+
+ ret = btrfs_create_new_inode(trans, &new_inode_args);
+
+ /*
+ * We set number of links to 0 in btrfs_create_new_inode(), and here we
+ * set it to 1 because d_tmpfile() will issue a warning if the count is
+ * 0, through:
+ *
+ * d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
+ */
+ set_nlink(inode, 1);
+
+ if (!ret) {
+ d_tmpfile(file, inode);
+ unlock_new_inode(inode);
+ mark_inode_dirty(inode);
+ }
+
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty(fs_info);
+out_new_inode_args:
+ btrfs_new_inode_args_destroy(&new_inode_args);
+out_inode:
+ if (ret)
+ iput(inode);
+ return finish_open_simple(file, ret);
+}
+
+void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ unsigned long index = start >> PAGE_SHIFT;
+ unsigned long end_index = end >> PAGE_SHIFT;
+ struct page *page;
+ u32 len;
+
+ ASSERT(end + 1 - start <= U32_MAX);
+ len = end + 1 - start;
+ while (index <= end_index) {
+ page = find_get_page(inode->vfs_inode.i_mapping, index);
+ ASSERT(page); /* Pages should be in the extent_io_tree */
+
+ btrfs_page_set_writeback(fs_info, page, start, len);
+ put_page(page);
+ index++;
+ }
+}
+
+int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
+ int compress_type)
+{
+ switch (compress_type) {
+ case BTRFS_COMPRESS_NONE:
+ return BTRFS_ENCODED_IO_COMPRESSION_NONE;
+ case BTRFS_COMPRESS_ZLIB:
+ return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
+ case BTRFS_COMPRESS_LZO:
+ /*
+ * The LZO format depends on the sector size. 64K is the maximum
+ * sector size that we support.
+ */
+ if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K)
+ return -EINVAL;
+ return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K +
+ (fs_info->sectorsize_bits - 12);
+ case BTRFS_COMPRESS_ZSTD:
+ return BTRFS_ENCODED_IO_COMPRESSION_ZSTD;
+ default:
+ return -EUCLEAN;
+ }
+}
+
+static ssize_t btrfs_encoded_read_inline(
+ struct kiocb *iocb,
+ struct iov_iter *iter, u64 start,
+ u64 lockend,
+ struct extent_state **cached_state,
+ u64 extent_start, size_t count,
+ struct btrfs_ioctl_encoded_io_args *encoded,
+ bool *unlocked)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *item;
+ u64 ram_bytes;
+ unsigned long ptr;
+ void *tmp;
+ ssize_t ret;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
+ extent_start, 0);
+ if (ret) {
+ if (ret > 0) {
+ /* The extent item disappeared? */
+ ret = -EIO;
+ }
+ goto out;
+ }
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+
+ ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
+ ptr = btrfs_file_extent_inline_start(item);
+
+ encoded->len = min_t(u64, extent_start + ram_bytes,
+ inode->vfs_inode.i_size) - iocb->ki_pos;
+ ret = btrfs_encoded_io_compression_from_extent(fs_info,
+ btrfs_file_extent_compression(leaf, item));
+ if (ret < 0)
+ goto out;
+ encoded->compression = ret;
+ if (encoded->compression) {
+ size_t inline_size;
+
+ inline_size = btrfs_file_extent_inline_item_len(leaf,
+ path->slots[0]);
+ if (inline_size > count) {
+ ret = -ENOBUFS;
+ goto out;
+ }
+ count = inline_size;
+ encoded->unencoded_len = ram_bytes;
+ encoded->unencoded_offset = iocb->ki_pos - extent_start;
+ } else {
+ count = min_t(u64, count, encoded->len);
+ encoded->len = count;
+ encoded->unencoded_len = count;
+ ptr += iocb->ki_pos - extent_start;
+ }
+
+ tmp = kmalloc(count, GFP_NOFS);
+ if (!tmp) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ read_extent_buffer(leaf, tmp, ptr, count);
+ btrfs_release_path(path);
+ unlock_extent(io_tree, start, lockend, cached_state);
+ btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ *unlocked = true;
+
+ ret = copy_to_iter(tmp, count, iter);
+ if (ret != count)
+ ret = -EFAULT;
+ kfree(tmp);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+struct btrfs_encoded_read_private {
+ struct btrfs_inode *inode;
+ u64 file_offset;
+ wait_queue_head_t wait;
+ atomic_t pending;
+ blk_status_t status;
+ bool skip_csum;
+};
+
+static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode,
+ struct bio *bio, int mirror_num)
+{
+ struct btrfs_encoded_read_private *priv = btrfs_bio(bio)->private;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ blk_status_t ret;
+
+ if (!priv->skip_csum) {
+ ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL);
+ if (ret)
+ return ret;
+ }
+
+ atomic_inc(&priv->pending);
+ btrfs_submit_bio(fs_info, bio, mirror_num);
+ return BLK_STS_OK;
+}
+
+static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
+{
+ const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK);
+ struct btrfs_encoded_read_private *priv = bbio->private;
+ struct btrfs_inode *inode = priv->inode;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ u32 sectorsize = fs_info->sectorsize;
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+ u32 bio_offset = 0;
+
+ if (priv->skip_csum || !uptodate)
+ return bbio->bio.bi_status;
+
+ bio_for_each_segment_all(bvec, &bbio->bio, iter_all) {
+ unsigned int i, nr_sectors, pgoff;
+
+ nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
+ pgoff = bvec->bv_offset;
+ for (i = 0; i < nr_sectors; i++) {
+ ASSERT(pgoff < PAGE_SIZE);
+ if (btrfs_check_data_csum(&inode->vfs_inode, bbio, bio_offset,
+ bvec->bv_page, pgoff))
+ return BLK_STS_IOERR;
+ bio_offset += sectorsize;
+ pgoff += sectorsize;
+ }
+ }
+ return BLK_STS_OK;
+}
+
+static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
+{
+ struct btrfs_encoded_read_private *priv = bbio->private;
+ blk_status_t status;
+
+ status = btrfs_encoded_read_verify_csum(bbio);
+ if (status) {
+ /*
+ * The memory barrier implied by the atomic_dec_return() here
+ * pairs with the memory barrier implied by the
+ * atomic_dec_return() or io_wait_event() in
+ * btrfs_encoded_read_regular_fill_pages() to ensure that this
+ * write is observed before the load of status in
+ * btrfs_encoded_read_regular_fill_pages().
+ */
+ WRITE_ONCE(priv->status, status);
+ }
+ if (!atomic_dec_return(&priv->pending))
+ wake_up(&priv->wait);
+ btrfs_bio_free_csum(bbio);
+ bio_put(&bbio->bio);
+}
+
+int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
+ u64 file_offset, u64 disk_bytenr,
+ u64 disk_io_size, struct page **pages)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_encoded_read_private priv = {
+ .inode = inode,
+ .file_offset = file_offset,
+ .pending = ATOMIC_INIT(1),
+ .skip_csum = (inode->flags & BTRFS_INODE_NODATASUM),
+ };
+ unsigned long i = 0;
+ u64 cur = 0;
+ int ret;
+
+ init_waitqueue_head(&priv.wait);
+ /*
+ * Submit bios for the extent, splitting due to bio or stripe limits as
+ * necessary.
+ */
+ while (cur < disk_io_size) {
+ struct extent_map *em;
+ struct btrfs_io_geometry geom;
+ struct bio *bio = NULL;
+ u64 remaining;
+
+ em = btrfs_get_chunk_map(fs_info, disk_bytenr + cur,
+ disk_io_size - cur);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ } else {
+ ret = btrfs_get_io_geometry(fs_info, em, BTRFS_MAP_READ,
+ disk_bytenr + cur, &geom);
+ free_extent_map(em);
+ }
+ if (ret) {
+ WRITE_ONCE(priv.status, errno_to_blk_status(ret));
+ break;
+ }
+ remaining = min(geom.len, disk_io_size - cur);
+ while (bio || remaining) {
+ size_t bytes = min_t(u64, remaining, PAGE_SIZE);
+
+ if (!bio) {
+ bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ,
+ btrfs_encoded_read_endio,
+ &priv);
+ bio->bi_iter.bi_sector =
+ (disk_bytenr + cur) >> SECTOR_SHIFT;
+ }
+
+ if (!bytes ||
+ bio_add_page(bio, pages[i], bytes, 0) < bytes) {
+ blk_status_t status;
+
+ status = submit_encoded_read_bio(inode, bio, 0);
+ if (status) {
+ WRITE_ONCE(priv.status, status);
+ bio_put(bio);
+ goto out;
+ }
+ bio = NULL;
+ continue;
+ }
+
+ i++;
+ cur += bytes;
+ remaining -= bytes;
+ }
+ }
+
+out:
+ if (atomic_dec_return(&priv.pending))
+ io_wait_event(priv.wait, !atomic_read(&priv.pending));
+ /* See btrfs_encoded_read_endio() for ordering. */
+ return blk_status_to_errno(READ_ONCE(priv.status));
+}
+
+static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
+ struct iov_iter *iter,
+ u64 start, u64 lockend,
+ struct extent_state **cached_state,
+ u64 disk_bytenr, u64 disk_io_size,
+ size_t count, bool compressed,
+ bool *unlocked)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct page **pages;
+ unsigned long nr_pages, i;
+ u64 cur;
+ size_t page_offset;
+ ssize_t ret;
+
+ nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+ if (!pages)
+ return -ENOMEM;
+ ret = btrfs_alloc_page_array(nr_pages, pages);
+ if (ret) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
+ disk_io_size, pages);
+ if (ret)
+ goto out;
+
+ unlock_extent(io_tree, start, lockend, cached_state);
+ btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ *unlocked = true;
+
+ if (compressed) {
+ i = 0;
+ page_offset = 0;
+ } else {
+ i = (iocb->ki_pos - start) >> PAGE_SHIFT;
+ page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1);
+ }
+ cur = 0;
+ while (cur < count) {
+ size_t bytes = min_t(size_t, count - cur,
+ PAGE_SIZE - page_offset);
+
+ if (copy_page_to_iter(pages[i], page_offset, bytes,
+ iter) != bytes) {
+ ret = -EFAULT;
+ goto out;
+ }
+ i++;
+ cur += bytes;
+ page_offset = 0;
+ }
+ ret = count;
+out:
+ for (i = 0; i < nr_pages; i++) {
+ if (pages[i])
+ __free_page(pages[i]);
+ }
+ kfree(pages);
+ return ret;
+}
+
+ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
+ struct btrfs_ioctl_encoded_io_args *encoded)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ ssize_t ret;
+ size_t count = iov_iter_count(iter);
+ u64 start, lockend, disk_bytenr, disk_io_size;
+ struct extent_state *cached_state = NULL;
+ struct extent_map *em;
+ bool unlocked = false;
+
+ file_accessed(iocb->ki_filp);
+
+ btrfs_inode_lock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+
+ if (iocb->ki_pos >= inode->vfs_inode.i_size) {
+ btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ return 0;
+ }
+ start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
+ /*
+ * We don't know how long the extent containing iocb->ki_pos is, but if
+ * it's compressed we know that it won't be longer than this.
+ */
+ lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
+
+ for (;;) {
+ struct btrfs_ordered_extent *ordered;
+
+ ret = btrfs_wait_ordered_range(&inode->vfs_inode, start,
+ lockend - start + 1);
+ if (ret)
+ goto out_unlock_inode;
+ lock_extent(io_tree, start, lockend, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, start,
+ lockend - start + 1);
+ if (!ordered)
+ break;
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent(io_tree, start, lockend, &cached_state);
+ cond_resched();
+ }
+
+ em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out_unlock_extent;
+ }
+
+ if (em->block_start == EXTENT_MAP_INLINE) {
+ u64 extent_start = em->start;
+
+ /*
+ * For inline extents we get everything we need out of the
+ * extent item.
+ */
+ free_extent_map(em);
+ em = NULL;
+ ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
+ &cached_state, extent_start,
+ count, encoded, &unlocked);
+ goto out;
+ }
+
+ /*
+ * We only want to return up to EOF even if the extent extends beyond
+ * that.
+ */
+ encoded->len = min_t(u64, extent_map_end(em),
+ inode->vfs_inode.i_size) - iocb->ki_pos;
+ if (em->block_start == EXTENT_MAP_HOLE ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ disk_bytenr = EXTENT_MAP_HOLE;
+ count = min_t(u64, count, encoded->len);
+ encoded->len = count;
+ encoded->unencoded_len = count;
+ } else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ disk_bytenr = em->block_start;
+ /*
+ * Bail if the buffer isn't large enough to return the whole
+ * compressed extent.
+ */
+ if (em->block_len > count) {
+ ret = -ENOBUFS;
+ goto out_em;
+ }
+ disk_io_size = em->block_len;
+ count = em->block_len;
+ encoded->unencoded_len = em->ram_bytes;
+ encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
+ ret = btrfs_encoded_io_compression_from_extent(fs_info,
+ em->compress_type);
+ if (ret < 0)
+ goto out_em;
+ encoded->compression = ret;
+ } else {
+ disk_bytenr = em->block_start + (start - em->start);
+ if (encoded->len > count)
+ encoded->len = count;
+ /*
+ * Don't read beyond what we locked. This also limits the page
+ * allocations that we'll do.
+ */
+ disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
+ count = start + disk_io_size - iocb->ki_pos;
+ encoded->len = count;
+ encoded->unencoded_len = count;
+ disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize);
+ }
+ free_extent_map(em);
+ em = NULL;
+
+ if (disk_bytenr == EXTENT_MAP_HOLE) {
+ unlock_extent(io_tree, start, lockend, &cached_state);
+ btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ unlocked = true;
+ ret = iov_iter_zero(count, iter);
+ if (ret != count)
+ ret = -EFAULT;
+ } else {
+ ret = btrfs_encoded_read_regular(iocb, iter, start, lockend,
+ &cached_state, disk_bytenr,
+ disk_io_size, count,
+ encoded->compression,
+ &unlocked);
+ }
+
+out:
+ if (ret >= 0)
+ iocb->ki_pos += encoded->len;
+out_em:
+ free_extent_map(em);
+out_unlock_extent:
+ if (!unlocked)
+ unlock_extent(io_tree, start, lockend, &cached_state);
+out_unlock_inode:
+ if (!unlocked)
+ btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ return ret;
+}
+
+ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
+ const struct btrfs_ioctl_encoded_io_args *encoded)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct extent_changeset *data_reserved = NULL;
+ struct extent_state *cached_state = NULL;
+ int compression;
+ size_t orig_count;
+ u64 start, end;
+ u64 num_bytes, ram_bytes, disk_num_bytes;
+ unsigned long nr_pages, i;
+ struct page **pages;
+ struct btrfs_key ins;
+ bool extent_reserved = false;
+ struct extent_map *em;
+ ssize_t ret;
+
+ switch (encoded->compression) {
+ case BTRFS_ENCODED_IO_COMPRESSION_ZLIB:
+ compression = BTRFS_COMPRESS_ZLIB;
+ break;
+ case BTRFS_ENCODED_IO_COMPRESSION_ZSTD:
+ compression = BTRFS_COMPRESS_ZSTD;
+ break;
+ case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K:
+ case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K:
+ case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K:
+ case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K:
+ case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K:
+ /* The sector size must match for LZO. */
+ if (encoded->compression -
+ BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 !=
+ fs_info->sectorsize_bits)
+ return -EINVAL;
+ compression = BTRFS_COMPRESS_LZO;
+ break;
+ default:
+ return -EINVAL;
+ }
+ if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
+ return -EINVAL;
+
+ orig_count = iov_iter_count(from);
+
+ /* The extent size must be sane. */
+ if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED ||
+ orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0)
+ return -EINVAL;
+
+ /*
+ * The compressed data must be smaller than the decompressed data.
+ *
+ * It's of course possible for data to compress to larger or the same
+ * size, but the buffered I/O path falls back to no compression for such
+ * data, and we don't want to break any assumptions by creating these
+ * extents.
+ *
+ * Note that this is less strict than the current check we have that the
+ * compressed data must be at least one sector smaller than the
+ * decompressed data. We only want to enforce the weaker requirement
+ * from old kernels that it is at least one byte smaller.
+ */
+ if (orig_count >= encoded->unencoded_len)
+ return -EINVAL;
+
+ /* The extent must start on a sector boundary. */
+ start = iocb->ki_pos;
+ if (!IS_ALIGNED(start, fs_info->sectorsize))
+ return -EINVAL;
+
+ /*
+ * The extent must end on a sector boundary. However, we allow a write
+ * which ends at or extends i_size to have an unaligned length; we round
+ * up the extent size and set i_size to the unaligned end.
+ */
+ if (start + encoded->len < inode->vfs_inode.i_size &&
+ !IS_ALIGNED(start + encoded->len, fs_info->sectorsize))
+ return -EINVAL;
+
+ /* Finally, the offset in the unencoded data must be sector-aligned. */
+ if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize))
+ return -EINVAL;
+
+ num_bytes = ALIGN(encoded->len, fs_info->sectorsize);
+ ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize);
+ end = start + num_bytes - 1;
+
+ /*
+ * If the extent cannot be inline, the compressed data on disk must be
+ * sector-aligned. For convenience, we extend it with zeroes if it
+ * isn't.
+ */
+ disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
+ nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
+ pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
+ if (!pages)
+ return -ENOMEM;
+ for (i = 0; i < nr_pages; i++) {
+ size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
+ char *kaddr;
+
+ pages[i] = alloc_page(GFP_KERNEL_ACCOUNT);
+ if (!pages[i]) {
+ ret = -ENOMEM;
+ goto out_pages;
+ }
+ kaddr = kmap_local_page(pages[i]);
+ if (copy_from_iter(kaddr, bytes, from) != bytes) {
+ kunmap_local(kaddr);
+ ret = -EFAULT;
+ goto out_pages;
+ }
+ if (bytes < PAGE_SIZE)
+ memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
+ kunmap_local(kaddr);
+ }
+
+ for (;;) {
+ struct btrfs_ordered_extent *ordered;
+
+ ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes);
+ if (ret)
+ goto out_pages;
+ ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
+ start >> PAGE_SHIFT,
+ end >> PAGE_SHIFT);
+ if (ret)
+ goto out_pages;
+ lock_extent(io_tree, start, end, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
+ if (!ordered &&
+ !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
+ break;
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent(io_tree, start, end, &cached_state);
+ cond_resched();
+ }
+
+ /*
+ * We don't use the higher-level delalloc space functions because our
+ * num_bytes and disk_num_bytes are different.
+ */
+ ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes);
+ if (ret)
+ goto out_unlock;
+ ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
+ if (ret)
+ goto out_free_data_space;
+ ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes,
+ false);
+ if (ret)
+ goto out_qgroup_free_data;
+
+ /* Try an inline extent first. */
+ if (start == 0 && encoded->unencoded_len == encoded->len &&
+ encoded->unencoded_offset == 0) {
+ ret = cow_file_range_inline(inode, encoded->len, orig_count,
+ compression, pages, true);
+ if (ret <= 0) {
+ if (ret == 0)
+ ret = orig_count;
+ goto out_delalloc_release;
+ }
+ }
+
+ ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes,
+ disk_num_bytes, 0, 0, &ins, 1, 1);
+ if (ret)
+ goto out_delalloc_release;
+ extent_reserved = true;
+
+ em = create_io_em(inode, start, num_bytes,
+ start - encoded->unencoded_offset, ins.objectid,
+ ins.offset, ins.offset, ram_bytes, compression,
+ BTRFS_ORDERED_COMPRESSED);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out_free_reserved;
+ }
+ free_extent_map(em);
+
+ ret = btrfs_add_ordered_extent(inode, start, num_bytes, ram_bytes,
+ ins.objectid, ins.offset,
+ encoded->unencoded_offset,
+ (1 << BTRFS_ORDERED_ENCODED) |
+ (1 << BTRFS_ORDERED_COMPRESSED),
+ compression);
+ if (ret) {
+ btrfs_drop_extent_map_range(inode, start, end, false);
+ goto out_free_reserved;
+ }
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+
+ if (start + encoded->len > inode->vfs_inode.i_size)
+ i_size_write(&inode->vfs_inode, start + encoded->len);
+
+ unlock_extent(io_tree, start, end, &cached_state);
+
+ btrfs_delalloc_release_extents(inode, num_bytes);
+
+ if (btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid,
+ ins.offset, pages, nr_pages, 0, NULL,
+ false)) {
+ btrfs_writepage_endio_finish_ordered(inode, pages[0], start, end, 0);
+ ret = -EIO;
+ goto out_pages;
+ }
+ ret = orig_count;
+ goto out;
+
+out_free_reserved:
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
+out_delalloc_release:
+ btrfs_delalloc_release_extents(inode, num_bytes);
+ btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
+out_qgroup_free_data:
+ if (ret < 0)
+ btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes, NULL);
+out_free_data_space:
+ /*
+ * If btrfs_reserve_extent() succeeded, then we already decremented
+ * bytes_may_use.
+ */
+ if (!extent_reserved)
+ btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
+out_unlock:
+ unlock_extent(io_tree, start, end, &cached_state);
+out_pages:
+ for (i = 0; i < nr_pages; i++) {
+ if (pages[i])
+ __free_page(pages[i]);
+ }
+ kvfree(pages);
+out:
+ if (ret >= 0)
+ iocb->ki_pos += encoded->len;
+ return ret;
+}
+
+#ifdef CONFIG_SWAP
+/*
+ * Add an entry indicating a block group or device which is pinned by a
+ * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a
+ * negative errno on failure.
+ */
+static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
+ bool is_block_group)
+{
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct btrfs_swapfile_pin *sp, *entry;
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+
+ sp = kmalloc(sizeof(*sp), GFP_NOFS);
+ if (!sp)
+ return -ENOMEM;
+ sp->ptr = ptr;
+ sp->inode = inode;
+ sp->is_block_group = is_block_group;
+ sp->bg_extent_count = 1;
+
+ spin_lock(&fs_info->swapfile_pins_lock);
+ p = &fs_info->swapfile_pins.rb_node;
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct btrfs_swapfile_pin, node);
+ if (sp->ptr < entry->ptr ||
+ (sp->ptr == entry->ptr && sp->inode < entry->inode)) {
+ p = &(*p)->rb_left;
+ } else if (sp->ptr > entry->ptr ||
+ (sp->ptr == entry->ptr && sp->inode > entry->inode)) {
+ p = &(*p)->rb_right;
+ } else {
+ if (is_block_group)
+ entry->bg_extent_count++;
+ spin_unlock(&fs_info->swapfile_pins_lock);
+ kfree(sp);
+ return 1;
+ }
+ }
+ rb_link_node(&sp->node, parent, p);
+ rb_insert_color(&sp->node, &fs_info->swapfile_pins);
+ spin_unlock(&fs_info->swapfile_pins_lock);
+ return 0;
+}
+
+/* Free all of the entries pinned by this swapfile. */
+static void btrfs_free_swapfile_pins(struct inode *inode)
+{
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct btrfs_swapfile_pin *sp;
+ struct rb_node *node, *next;
+
+ spin_lock(&fs_info->swapfile_pins_lock);
+ node = rb_first(&fs_info->swapfile_pins);
+ while (node) {
+ next = rb_next(node);
+ sp = rb_entry(node, struct btrfs_swapfile_pin, node);
+ if (sp->inode == inode) {
+ rb_erase(&sp->node, &fs_info->swapfile_pins);
+ if (sp->is_block_group) {
+ btrfs_dec_block_group_swap_extents(sp->ptr,
+ sp->bg_extent_count);
+ btrfs_put_block_group(sp->ptr);
+ }
+ kfree(sp);
+ }
+ node = next;
+ }
+ spin_unlock(&fs_info->swapfile_pins_lock);
+}
+
+struct btrfs_swap_info {
+ u64 start;
+ u64 block_start;
+ u64 block_len;
+ u64 lowest_ppage;
+ u64 highest_ppage;
+ unsigned long nr_pages;
+ int nr_extents;
+};
+
+static int btrfs_add_swap_extent(struct swap_info_struct *sis,
+ struct btrfs_swap_info *bsi)
+{
+ unsigned long nr_pages;
+ unsigned long max_pages;
+ u64 first_ppage, first_ppage_reported, next_ppage;
+ int ret;
+
+ /*
+ * Our swapfile may have had its size extended after the swap header was
+ * written. In that case activating the swapfile should not go beyond
+ * the max size set in the swap header.
+ */
+ if (bsi->nr_pages >= sis->max)
+ return 0;
+
+ max_pages = sis->max - bsi->nr_pages;
+ first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT;
+ next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len,
+ PAGE_SIZE) >> PAGE_SHIFT;
+
+ if (first_ppage >= next_ppage)
+ return 0;
+ nr_pages = next_ppage - first_ppage;
+ nr_pages = min(nr_pages, max_pages);
+
+ first_ppage_reported = first_ppage;
+ if (bsi->start == 0)
+ first_ppage_reported++;
+ if (bsi->lowest_ppage > first_ppage_reported)
+ bsi->lowest_ppage = first_ppage_reported;
+ if (bsi->highest_ppage < (next_ppage - 1))
+ bsi->highest_ppage = next_ppage - 1;
+
+ ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage);
+ if (ret < 0)
+ return ret;
+ bsi->nr_extents += ret;
+ bsi->nr_pages += nr_pages;
+ return 0;
+}
+
+static void btrfs_swap_deactivate(struct file *file)
+{
+ struct inode *inode = file_inode(file);
+
+ btrfs_free_swapfile_pins(inode);
+ atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles);
+}
+
+static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
+ sector_t *span)
+{
+ struct inode *inode = file_inode(file);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_state *cached_state = NULL;
+ struct extent_map *em = NULL;
+ struct btrfs_device *device = NULL;
+ struct btrfs_swap_info bsi = {
+ .lowest_ppage = (sector_t)-1ULL,
+ };
+ int ret = 0;
+ u64 isize;
+ u64 start;
+
+ /*
+ * If the swap file was just created, make sure delalloc is done. If the
+ * file changes again after this, the user is doing something stupid and
+ * we don't really care.
+ */
+ ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ if (ret)
+ return ret;
+
+ /*
+ * The inode is locked, so these flags won't change after we check them.
+ */
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
+ btrfs_warn(fs_info, "swapfile must not be compressed");
+ return -EINVAL;
+ }
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
+ btrfs_warn(fs_info, "swapfile must not be copy-on-write");
+ return -EINVAL;
+ }
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+ btrfs_warn(fs_info, "swapfile must not be checksummed");
+ return -EINVAL;
+ }
+
+ /*
+ * Balance or device remove/replace/resize can move stuff around from
+ * under us. The exclop protection makes sure they aren't running/won't
+ * run concurrently while we are mapping the swap extents, and
+ * fs_info->swapfile_pins prevents them from running while the swap
+ * file is active and moving the extents. Note that this also prevents
+ * a concurrent device add which isn't actually necessary, but it's not
+ * really worth the trouble to allow it.
+ */
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
+ btrfs_warn(fs_info,
+ "cannot activate swapfile while exclusive operation is running");
+ return -EBUSY;
+ }
+
+ /*
+ * Prevent snapshot creation while we are activating the swap file.
+ * We do not want to race with snapshot creation. If snapshot creation
+ * already started before we bumped nr_swapfiles from 0 to 1 and
+ * completes before the first write into the swap file after it is
+ * activated, than that write would fallback to COW.
+ */
+ if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) {
+ btrfs_exclop_finish(fs_info);
+ btrfs_warn(fs_info,
+ "cannot activate swapfile because snapshot creation is in progress");
+ return -EINVAL;
+ }
+ /*
+ * Snapshots can create extents which require COW even if NODATACOW is
+ * set. We use this counter to prevent snapshots. We must increment it
+ * before walking the extents because we don't want a concurrent
+ * snapshot to run after we've already checked the extents.
+ *
+ * It is possible that subvolume is marked for deletion but still not
+ * removed yet. To prevent this race, we check the root status before
+ * activating the swapfile.
+ */
+ spin_lock(&root->root_item_lock);
+ if (btrfs_root_dead(root)) {
+ spin_unlock(&root->root_item_lock);
+
+ btrfs_exclop_finish(fs_info);
+ btrfs_warn(fs_info,
+ "cannot activate swapfile because subvolume %llu is being deleted",
+ root->root_key.objectid);
+ return -EPERM;
+ }
+ atomic_inc(&root->nr_swapfiles);
+ spin_unlock(&root->root_item_lock);
+
+ isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
+
+ lock_extent(io_tree, 0, isize - 1, &cached_state);
+ start = 0;
+ while (start < isize) {
+ u64 logical_block_start, physical_block_start;
+ struct btrfs_block_group *bg;
+ u64 len = isize - start;
+
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out;
+ }
+
+ if (em->block_start == EXTENT_MAP_HOLE) {
+ btrfs_warn(fs_info, "swapfile must not have holes");
+ ret = -EINVAL;
+ goto out;
+ }
+ if (em->block_start == EXTENT_MAP_INLINE) {
+ /*
+ * It's unlikely we'll ever actually find ourselves
+ * here, as a file small enough to fit inline won't be
+ * big enough to store more than the swap header, but in
+ * case something changes in the future, let's catch it
+ * here rather than later.
+ */
+ btrfs_warn(fs_info, "swapfile must not be inline");
+ ret = -EINVAL;
+ goto out;
+ }
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ btrfs_warn(fs_info, "swapfile must not be compressed");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ logical_block_start = em->block_start + (start - em->start);
+ len = min(len, em->len - (start - em->start));
+ free_extent_map(em);
+ em = NULL;
+
+ ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, false, true);
+ if (ret < 0) {
+ goto out;
+ } else if (ret) {
+ ret = 0;
+ } else {
+ btrfs_warn(fs_info,
+ "swapfile must not be copy-on-write");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ em = btrfs_get_chunk_map(fs_info, logical_block_start, len);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out;
+ }
+
+ if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ btrfs_warn(fs_info,
+ "swapfile must have single data profile");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (device == NULL) {
+ device = em->map_lookup->stripes[0].dev;
+ ret = btrfs_add_swapfile_pin(inode, device, false);
+ if (ret == 1)
+ ret = 0;
+ else if (ret)
+ goto out;
+ } else if (device != em->map_lookup->stripes[0].dev) {
+ btrfs_warn(fs_info, "swapfile must be on one device");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ physical_block_start = (em->map_lookup->stripes[0].physical +
+ (logical_block_start - em->start));
+ len = min(len, em->len - (logical_block_start - em->start));
+ free_extent_map(em);
+ em = NULL;
+
+ bg = btrfs_lookup_block_group(fs_info, logical_block_start);
+ if (!bg) {
+ btrfs_warn(fs_info,
+ "could not find block group containing swapfile");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (!btrfs_inc_block_group_swap_extents(bg)) {
+ btrfs_warn(fs_info,
+ "block group for swapfile at %llu is read-only%s",
+ bg->start,
+ atomic_read(&fs_info->scrubs_running) ?
+ " (scrub running)" : "");
+ btrfs_put_block_group(bg);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = btrfs_add_swapfile_pin(inode, bg, true);
+ if (ret) {
+ btrfs_put_block_group(bg);
+ if (ret == 1)
+ ret = 0;
+ else
+ goto out;
+ }
+
+ if (bsi.block_len &&
+ bsi.block_start + bsi.block_len == physical_block_start) {
+ bsi.block_len += len;
+ } else {
+ if (bsi.block_len) {
+ ret = btrfs_add_swap_extent(sis, &bsi);
+ if (ret)
+ goto out;
+ }
+ bsi.start = start;
+ bsi.block_start = physical_block_start;
+ bsi.block_len = len;
+ }
+
+ start += len;
+ }
+
+ if (bsi.block_len)
+ ret = btrfs_add_swap_extent(sis, &bsi);
+
+out:
+ if (!IS_ERR_OR_NULL(em))
+ free_extent_map(em);
+
+ unlock_extent(io_tree, 0, isize - 1, &cached_state);
+
+ if (ret)
+ btrfs_swap_deactivate(file);
+
+ btrfs_drew_write_unlock(&root->snapshot_lock);
+
+ btrfs_exclop_finish(fs_info);
+
+ if (ret)
+ return ret;
+
+ if (device)
+ sis->bdev = device->bdev;
+ *span = bsi.highest_ppage - bsi.lowest_ppage + 1;
+ sis->max = bsi.nr_pages;
+ sis->pages = bsi.nr_pages - 1;
+ sis->highest_bit = bsi.nr_pages - 1;
+ return bsi.nr_extents;
+}
+#else
+static void btrfs_swap_deactivate(struct file *file)
+{
+}
+
+static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
+ sector_t *span)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
+/*
+ * Update the number of bytes used in the VFS' inode. When we replace extents in
+ * a range (clone, dedupe, fallocate's zero range), we must update the number of
+ * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls
+ * always get a correct value.
+ */
+void btrfs_update_inode_bytes(struct btrfs_inode *inode,
+ const u64 add_bytes,
+ const u64 del_bytes)
+{
+ if (add_bytes == del_bytes)
+ return;
+
+ spin_lock(&inode->lock);
+ if (del_bytes > 0)
+ inode_sub_bytes(&inode->vfs_inode, del_bytes);
+ if (add_bytes > 0)
+ inode_add_bytes(&inode->vfs_inode, add_bytes);
+ spin_unlock(&inode->lock);
+}
+
+/**
+ * Verify that there are no ordered extents for a given file range.
+ *
+ * @inode: The target inode.
+ * @start: Start offset of the file range, should be sector size aligned.
+ * @end: End offset (inclusive) of the file range, its value +1 should be
+ * sector size aligned.
+ *
+ * This should typically be used for cases where we locked an inode's VFS lock in
+ * exclusive mode, we have also locked the inode's i_mmap_lock in exclusive mode,
+ * we have flushed all delalloc in the range, we have waited for all ordered
+ * extents in the range to complete and finally we have locked the file range in
+ * the inode's io_tree.
+ */
+void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_ordered_extent *ordered;
+
+ if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
+ return;
+
+ ordered = btrfs_lookup_first_ordered_range(inode, start, end + 1 - start);
+ if (ordered) {
+ btrfs_err(root->fs_info,
+"found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])",
+ start, end, btrfs_ino(inode), root->root_key.objectid,
+ ordered->file_offset,
+ ordered->file_offset + ordered->num_bytes - 1);
+ btrfs_put_ordered_extent(ordered);
+ }
+
+ ASSERT(ordered == NULL);
+}
+
+static const struct inode_operations btrfs_dir_inode_operations = {
+ .getattr = btrfs_getattr,
+ .lookup = btrfs_lookup,
+ .create = btrfs_create,
+ .unlink = btrfs_unlink,
+ .link = btrfs_link,
+ .mkdir = btrfs_mkdir,
+ .rmdir = btrfs_rmdir,
+ .rename = btrfs_rename2,
+ .symlink = btrfs_symlink,
+ .setattr = btrfs_setattr,
+ .mknod = btrfs_mknod,
+ .listxattr = btrfs_listxattr,
+ .permission = btrfs_permission,
+ .get_acl = btrfs_get_acl,
+ .set_acl = btrfs_set_acl,
+ .update_time = btrfs_update_time,
+ .tmpfile = btrfs_tmpfile,
+ .fileattr_get = btrfs_fileattr_get,
+ .fileattr_set = btrfs_fileattr_set,
+};
+
+static const struct file_operations btrfs_dir_file_operations = {
+ .llseek = btrfs_dir_llseek,
+ .read = generic_read_dir,
+ .iterate_shared = btrfs_real_readdir,
+ .open = btrfs_opendir,
+ .unlocked_ioctl = btrfs_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = btrfs_compat_ioctl,
+#endif
+ .release = btrfs_release_file,
+ .fsync = btrfs_sync_file,
+};
+
+/*
+ * btrfs doesn't support the bmap operation because swapfiles
+ * use bmap to make a mapping of extents in the file. They assume
+ * these extents won't change over the life of the file and they
+ * use the bmap result to do IO directly to the drive.
+ *
+ * the btrfs bmap call would return logical addresses that aren't
+ * suitable for IO and they also will change frequently as COW
+ * operations happen. So, swapfile + btrfs == corruption.
+ *
+ * For now we're avoiding this by dropping bmap.
+ */
+static const struct address_space_operations btrfs_aops = {
+ .read_folio = btrfs_read_folio,
+ .writepages = btrfs_writepages,
+ .readahead = btrfs_readahead,
+ .direct_IO = noop_direct_IO,
+ .invalidate_folio = btrfs_invalidate_folio,
+ .release_folio = btrfs_release_folio,
+ .migrate_folio = btrfs_migrate_folio,
+ .dirty_folio = filemap_dirty_folio,
+ .error_remove_page = generic_error_remove_page,
+ .swap_activate = btrfs_swap_activate,
+ .swap_deactivate = btrfs_swap_deactivate,
+};
+
+static const struct inode_operations btrfs_file_inode_operations = {
+ .getattr = btrfs_getattr,
+ .setattr = btrfs_setattr,
+ .listxattr = btrfs_listxattr,
+ .permission = btrfs_permission,
+ .fiemap = btrfs_fiemap,
+ .get_acl = btrfs_get_acl,
+ .set_acl = btrfs_set_acl,
+ .update_time = btrfs_update_time,
+ .fileattr_get = btrfs_fileattr_get,
+ .fileattr_set = btrfs_fileattr_set,
+};
+static const struct inode_operations btrfs_special_inode_operations = {
+ .getattr = btrfs_getattr,
+ .setattr = btrfs_setattr,
+ .permission = btrfs_permission,
+ .listxattr = btrfs_listxattr,
+ .get_acl = btrfs_get_acl,
+ .set_acl = btrfs_set_acl,
+ .update_time = btrfs_update_time,
+};
+static const struct inode_operations btrfs_symlink_inode_operations = {
+ .get_link = page_get_link,
+ .getattr = btrfs_getattr,
+ .setattr = btrfs_setattr,
+ .permission = btrfs_permission,
+ .listxattr = btrfs_listxattr,
+ .update_time = btrfs_update_time,
+};
+
+const struct dentry_operations btrfs_dentry_operations = {
+ .d_delete = btrfs_dentry_delete,
+};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
new file mode 100644
index 000000000..8516c70b5
--- /dev/null
+++ b/fs/btrfs/ioctl.c
@@ -0,0 +1,5626 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ */
+
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/fsnotify.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/string.h>
+#include <linux/backing-dev.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/writeback.h>
+#include <linux/compat.h>
+#include <linux/security.h>
+#include <linux/xattr.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/uuid.h>
+#include <linux/btrfs.h>
+#include <linux/uaccess.h>
+#include <linux/iversion.h>
+#include <linux/fileattr.h>
+#include <linux/fsverity.h>
+#include <linux/sched/xacct.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "export.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "locking.h"
+#include "backref.h"
+#include "rcu-string.h"
+#include "send.h"
+#include "dev-replace.h"
+#include "props.h"
+#include "sysfs.h"
+#include "qgroup.h"
+#include "tree-log.h"
+#include "compression.h"
+#include "space-info.h"
+#include "delalloc-space.h"
+#include "block-group.h"
+#include "subpage.h"
+
+#ifdef CONFIG_64BIT
+/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
+ * structures are incorrect, as the timespec structure from userspace
+ * is 4 bytes too small. We define these alternatives here to teach
+ * the kernel about the 32-bit struct packing.
+ */
+struct btrfs_ioctl_timespec_32 {
+ __u64 sec;
+ __u32 nsec;
+} __attribute__ ((__packed__));
+
+struct btrfs_ioctl_received_subvol_args_32 {
+ char uuid[BTRFS_UUID_SIZE]; /* in */
+ __u64 stransid; /* in */
+ __u64 rtransid; /* out */
+ struct btrfs_ioctl_timespec_32 stime; /* in */
+ struct btrfs_ioctl_timespec_32 rtime; /* out */
+ __u64 flags; /* in */
+ __u64 reserved[16]; /* in */
+} __attribute__ ((__packed__));
+
+#define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \
+ struct btrfs_ioctl_received_subvol_args_32)
+#endif
+
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+struct btrfs_ioctl_send_args_32 {
+ __s64 send_fd; /* in */
+ __u64 clone_sources_count; /* in */
+ compat_uptr_t clone_sources; /* in */
+ __u64 parent_root; /* in */
+ __u64 flags; /* in */
+ __u32 version; /* in */
+ __u8 reserved[28]; /* in */
+} __attribute__ ((__packed__));
+
+#define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \
+ struct btrfs_ioctl_send_args_32)
+
+struct btrfs_ioctl_encoded_io_args_32 {
+ compat_uptr_t iov;
+ compat_ulong_t iovcnt;
+ __s64 offset;
+ __u64 flags;
+ __u64 len;
+ __u64 unencoded_len;
+ __u64 unencoded_offset;
+ __u32 compression;
+ __u32 encryption;
+ __u8 reserved[64];
+};
+
+#define BTRFS_IOC_ENCODED_READ_32 _IOR(BTRFS_IOCTL_MAGIC, 64, \
+ struct btrfs_ioctl_encoded_io_args_32)
+#define BTRFS_IOC_ENCODED_WRITE_32 _IOW(BTRFS_IOCTL_MAGIC, 64, \
+ struct btrfs_ioctl_encoded_io_args_32)
+#endif
+
+/* Mask out flags that are inappropriate for the given type of inode. */
+static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode,
+ unsigned int flags)
+{
+ if (S_ISDIR(inode->i_mode))
+ return flags;
+ else if (S_ISREG(inode->i_mode))
+ return flags & ~FS_DIRSYNC_FL;
+ else
+ return flags & (FS_NODUMP_FL | FS_NOATIME_FL);
+}
+
+/*
+ * Export internal inode flags to the format expected by the FS_IOC_GETFLAGS
+ * ioctl.
+ */
+static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode)
+{
+ unsigned int iflags = 0;
+ u32 flags = binode->flags;
+ u32 ro_flags = binode->ro_flags;
+
+ if (flags & BTRFS_INODE_SYNC)
+ iflags |= FS_SYNC_FL;
+ if (flags & BTRFS_INODE_IMMUTABLE)
+ iflags |= FS_IMMUTABLE_FL;
+ if (flags & BTRFS_INODE_APPEND)
+ iflags |= FS_APPEND_FL;
+ if (flags & BTRFS_INODE_NODUMP)
+ iflags |= FS_NODUMP_FL;
+ if (flags & BTRFS_INODE_NOATIME)
+ iflags |= FS_NOATIME_FL;
+ if (flags & BTRFS_INODE_DIRSYNC)
+ iflags |= FS_DIRSYNC_FL;
+ if (flags & BTRFS_INODE_NODATACOW)
+ iflags |= FS_NOCOW_FL;
+ if (ro_flags & BTRFS_INODE_RO_VERITY)
+ iflags |= FS_VERITY_FL;
+
+ if (flags & BTRFS_INODE_NOCOMPRESS)
+ iflags |= FS_NOCOMP_FL;
+ else if (flags & BTRFS_INODE_COMPRESS)
+ iflags |= FS_COMPR_FL;
+
+ return iflags;
+}
+
+/*
+ * Update inode->i_flags based on the btrfs internal flags.
+ */
+void btrfs_sync_inode_flags_to_i_flags(struct inode *inode)
+{
+ struct btrfs_inode *binode = BTRFS_I(inode);
+ unsigned int new_fl = 0;
+
+ if (binode->flags & BTRFS_INODE_SYNC)
+ new_fl |= S_SYNC;
+ if (binode->flags & BTRFS_INODE_IMMUTABLE)
+ new_fl |= S_IMMUTABLE;
+ if (binode->flags & BTRFS_INODE_APPEND)
+ new_fl |= S_APPEND;
+ if (binode->flags & BTRFS_INODE_NOATIME)
+ new_fl |= S_NOATIME;
+ if (binode->flags & BTRFS_INODE_DIRSYNC)
+ new_fl |= S_DIRSYNC;
+ if (binode->ro_flags & BTRFS_INODE_RO_VERITY)
+ new_fl |= S_VERITY;
+
+ set_mask_bits(&inode->i_flags,
+ S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC |
+ S_VERITY, new_fl);
+}
+
+/*
+ * Check if @flags are a supported and valid set of FS_*_FL flags and that
+ * the old and new flags are not conflicting
+ */
+static int check_fsflags(unsigned int old_flags, unsigned int flags)
+{
+ if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
+ FS_NOATIME_FL | FS_NODUMP_FL | \
+ FS_SYNC_FL | FS_DIRSYNC_FL | \
+ FS_NOCOMP_FL | FS_COMPR_FL |
+ FS_NOCOW_FL))
+ return -EOPNOTSUPP;
+
+ /* COMPR and NOCOMP on new/old are valid */
+ if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL))
+ return -EINVAL;
+
+ if ((flags & FS_COMPR_FL) && (flags & FS_NOCOW_FL))
+ return -EINVAL;
+
+ /* NOCOW and compression options are mutually exclusive */
+ if ((old_flags & FS_NOCOW_FL) && (flags & (FS_COMPR_FL | FS_NOCOMP_FL)))
+ return -EINVAL;
+ if ((flags & FS_NOCOW_FL) && (old_flags & (FS_COMPR_FL | FS_NOCOMP_FL)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int check_fsflags_compatible(struct btrfs_fs_info *fs_info,
+ unsigned int flags)
+{
+ if (btrfs_is_zoned(fs_info) && (flags & FS_NOCOW_FL))
+ return -EPERM;
+
+ return 0;
+}
+
+/*
+ * Set flags/xflags from the internal inode flags. The remaining items of
+ * fsxattr are zeroed.
+ */
+int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+{
+ struct btrfs_inode *binode = BTRFS_I(d_inode(dentry));
+
+ fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode));
+ return 0;
+}
+
+int btrfs_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_inode *binode = BTRFS_I(inode);
+ struct btrfs_root *root = binode->root;
+ struct btrfs_trans_handle *trans;
+ unsigned int fsflags, old_fsflags;
+ int ret;
+ const char *comp = NULL;
+ u32 binode_flags;
+
+ if (btrfs_root_readonly(root))
+ return -EROFS;
+
+ if (fileattr_has_fsx(fa))
+ return -EOPNOTSUPP;
+
+ fsflags = btrfs_mask_fsflags_for_type(inode, fa->flags);
+ old_fsflags = btrfs_inode_flags_to_fsflags(binode);
+ ret = check_fsflags(old_fsflags, fsflags);
+ if (ret)
+ return ret;
+
+ ret = check_fsflags_compatible(fs_info, fsflags);
+ if (ret)
+ return ret;
+
+ binode_flags = binode->flags;
+ if (fsflags & FS_SYNC_FL)
+ binode_flags |= BTRFS_INODE_SYNC;
+ else
+ binode_flags &= ~BTRFS_INODE_SYNC;
+ if (fsflags & FS_IMMUTABLE_FL)
+ binode_flags |= BTRFS_INODE_IMMUTABLE;
+ else
+ binode_flags &= ~BTRFS_INODE_IMMUTABLE;
+ if (fsflags & FS_APPEND_FL)
+ binode_flags |= BTRFS_INODE_APPEND;
+ else
+ binode_flags &= ~BTRFS_INODE_APPEND;
+ if (fsflags & FS_NODUMP_FL)
+ binode_flags |= BTRFS_INODE_NODUMP;
+ else
+ binode_flags &= ~BTRFS_INODE_NODUMP;
+ if (fsflags & FS_NOATIME_FL)
+ binode_flags |= BTRFS_INODE_NOATIME;
+ else
+ binode_flags &= ~BTRFS_INODE_NOATIME;
+
+ /* If coming from FS_IOC_FSSETXATTR then skip unconverted flags */
+ if (!fa->flags_valid) {
+ /* 1 item for the inode */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ goto update_flags;
+ }
+
+ if (fsflags & FS_DIRSYNC_FL)
+ binode_flags |= BTRFS_INODE_DIRSYNC;
+ else
+ binode_flags &= ~BTRFS_INODE_DIRSYNC;
+ if (fsflags & FS_NOCOW_FL) {
+ if (S_ISREG(inode->i_mode)) {
+ /*
+ * It's safe to turn csums off here, no extents exist.
+ * Otherwise we want the flag to reflect the real COW
+ * status of the file and will not set it.
+ */
+ if (inode->i_size == 0)
+ binode_flags |= BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_NODATASUM;
+ } else {
+ binode_flags |= BTRFS_INODE_NODATACOW;
+ }
+ } else {
+ /*
+ * Revert back under same assumptions as above
+ */
+ if (S_ISREG(inode->i_mode)) {
+ if (inode->i_size == 0)
+ binode_flags &= ~(BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_NODATASUM);
+ } else {
+ binode_flags &= ~BTRFS_INODE_NODATACOW;
+ }
+ }
+
+ /*
+ * The COMPRESS flag can only be changed by users, while the NOCOMPRESS
+ * flag may be changed automatically if compression code won't make
+ * things smaller.
+ */
+ if (fsflags & FS_NOCOMP_FL) {
+ binode_flags &= ~BTRFS_INODE_COMPRESS;
+ binode_flags |= BTRFS_INODE_NOCOMPRESS;
+ } else if (fsflags & FS_COMPR_FL) {
+
+ if (IS_SWAPFILE(inode))
+ return -ETXTBSY;
+
+ binode_flags |= BTRFS_INODE_COMPRESS;
+ binode_flags &= ~BTRFS_INODE_NOCOMPRESS;
+
+ comp = btrfs_compress_type2str(fs_info->compress_type);
+ if (!comp || comp[0] == 0)
+ comp = btrfs_compress_type2str(BTRFS_COMPRESS_ZLIB);
+ } else {
+ binode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
+ }
+
+ /*
+ * 1 for inode item
+ * 2 for properties
+ */
+ trans = btrfs_start_transaction(root, 3);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ if (comp) {
+ ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp,
+ strlen(comp), 0);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_end_trans;
+ }
+ } else {
+ ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL,
+ 0, 0);
+ if (ret && ret != -ENODATA) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_end_trans;
+ }
+ }
+
+update_flags:
+ binode->flags = binode_flags;
+ btrfs_sync_inode_flags_to_i_flags(inode);
+ inode_inc_iversion(inode);
+ inode->i_ctime = current_time(inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+
+ out_end_trans:
+ btrfs_end_transaction(trans);
+ return ret;
+}
+
+/*
+ * Start exclusive operation @type, return true on success
+ */
+bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type)
+{
+ bool ret = false;
+
+ spin_lock(&fs_info->super_lock);
+ if (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE) {
+ fs_info->exclusive_operation = type;
+ ret = true;
+ }
+ spin_unlock(&fs_info->super_lock);
+
+ return ret;
+}
+
+/*
+ * Conditionally allow to enter the exclusive operation in case it's compatible
+ * with the running one. This must be paired with btrfs_exclop_start_unlock and
+ * btrfs_exclop_finish.
+ *
+ * Compatibility:
+ * - the same type is already running
+ * - when trying to add a device and balance has been paused
+ * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller
+ * must check the condition first that would allow none -> @type
+ */
+bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type)
+{
+ spin_lock(&fs_info->super_lock);
+ if (fs_info->exclusive_operation == type ||
+ (fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED &&
+ type == BTRFS_EXCLOP_DEV_ADD))
+ return true;
+
+ spin_unlock(&fs_info->super_lock);
+ return false;
+}
+
+void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info)
+{
+ spin_unlock(&fs_info->super_lock);
+}
+
+void btrfs_exclop_finish(struct btrfs_fs_info *fs_info)
+{
+ spin_lock(&fs_info->super_lock);
+ WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE);
+ spin_unlock(&fs_info->super_lock);
+ sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation");
+}
+
+void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation op)
+{
+ switch (op) {
+ case BTRFS_EXCLOP_BALANCE_PAUSED:
+ spin_lock(&fs_info->super_lock);
+ ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE ||
+ fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD ||
+ fs_info->exclusive_operation == BTRFS_EXCLOP_NONE ||
+ fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
+ fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED;
+ spin_unlock(&fs_info->super_lock);
+ break;
+ case BTRFS_EXCLOP_BALANCE:
+ spin_lock(&fs_info->super_lock);
+ ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
+ fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
+ spin_unlock(&fs_info->super_lock);
+ break;
+ default:
+ btrfs_warn(fs_info,
+ "invalid exclop balance operation %d requested", op);
+ }
+}
+
+static int btrfs_ioctl_getversion(struct inode *inode, int __user *arg)
+{
+ return put_user(inode->i_generation, arg);
+}
+
+static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
+ void __user *arg)
+{
+ struct btrfs_device *device;
+ struct fstrim_range range;
+ u64 minlen = ULLONG_MAX;
+ u64 num_devices = 0;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /*
+ * btrfs_trim_block_group() depends on space cache, which is not
+ * available in zoned filesystem. So, disallow fitrim on a zoned
+ * filesystem for now.
+ */
+ if (btrfs_is_zoned(fs_info))
+ return -EOPNOTSUPP;
+
+ /*
+ * If the fs is mounted with nologreplay, which requires it to be
+ * mounted in RO mode as well, we can not allow discard on free space
+ * inside block groups, because log trees refer to extents that are not
+ * pinned in a block group's free space cache (pinning the extents is
+ * precisely the first phase of replaying a log tree).
+ */
+ if (btrfs_test_opt(fs_info, NOLOGREPLAY))
+ return -EROFS;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(device, &fs_info->fs_devices->devices,
+ dev_list) {
+ if (!device->bdev || !bdev_max_discard_sectors(device->bdev))
+ continue;
+ num_devices++;
+ minlen = min_t(u64, bdev_discard_granularity(device->bdev),
+ minlen);
+ }
+ rcu_read_unlock();
+
+ if (!num_devices)
+ return -EOPNOTSUPP;
+ if (copy_from_user(&range, arg, sizeof(range)))
+ return -EFAULT;
+
+ /*
+ * NOTE: Don't truncate the range using super->total_bytes. Bytenr of
+ * block group is in the logical address space, which can be any
+ * sectorsize aligned bytenr in the range [0, U64_MAX].
+ */
+ if (range.len < fs_info->sb->s_blocksize)
+ return -EINVAL;
+
+ range.minlen = max(range.minlen, minlen);
+ ret = btrfs_trim_fs(fs_info, &range);
+ if (ret < 0)
+ return ret;
+
+ if (copy_to_user(arg, &range, sizeof(range)))
+ return -EFAULT;
+
+ return 0;
+}
+
+int __pure btrfs_is_empty_uuid(u8 *uuid)
+{
+ int i;
+
+ for (i = 0; i < BTRFS_UUID_SIZE; i++) {
+ if (uuid[i])
+ return 0;
+ }
+ return 1;
+}
+
+/*
+ * Calculate the number of transaction items to reserve for creating a subvolume
+ * or snapshot, not including the inode, directory entries, or parent directory.
+ */
+static unsigned int create_subvol_num_items(struct btrfs_qgroup_inherit *inherit)
+{
+ /*
+ * 1 to add root block
+ * 1 to add root item
+ * 1 to add root ref
+ * 1 to add root backref
+ * 1 to add UUID item
+ * 1 to add qgroup info
+ * 1 to add qgroup limit
+ *
+ * Ideally the last two would only be accounted if qgroups are enabled,
+ * but that can change between now and the time we would insert them.
+ */
+ unsigned int num_items = 7;
+
+ if (inherit) {
+ /* 2 to add qgroup relations for each inherited qgroup */
+ num_items += 2 * inherit->num_qgroups;
+ }
+ return num_items;
+}
+
+static noinline int create_subvol(struct user_namespace *mnt_userns,
+ struct inode *dir, struct dentry *dentry,
+ struct btrfs_qgroup_inherit *inherit)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct btrfs_trans_handle *trans;
+ struct btrfs_key key;
+ struct btrfs_root_item *root_item;
+ struct btrfs_inode_item *inode_item;
+ struct extent_buffer *leaf;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_root *new_root;
+ struct btrfs_block_rsv block_rsv;
+ struct timespec64 cur_time = current_time(dir);
+ struct btrfs_new_inode_args new_inode_args = {
+ .dir = dir,
+ .dentry = dentry,
+ .subvol = true,
+ };
+ unsigned int trans_num_items;
+ int ret;
+ dev_t anon_dev;
+ u64 objectid;
+
+ root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
+ if (!root_item)
+ return -ENOMEM;
+
+ ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid);
+ if (ret)
+ goto out_root_item;
+
+ /*
+ * Don't create subvolume whose level is not zero. Or qgroup will be
+ * screwed up since it assumes subvolume qgroup's level to be 0.
+ */
+ if (btrfs_qgroup_level(objectid)) {
+ ret = -ENOSPC;
+ goto out_root_item;
+ }
+
+ ret = get_anon_bdev(&anon_dev);
+ if (ret < 0)
+ goto out_root_item;
+
+ new_inode_args.inode = btrfs_new_subvol_inode(mnt_userns, dir);
+ if (!new_inode_args.inode) {
+ ret = -ENOMEM;
+ goto out_anon_dev;
+ }
+ ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
+ if (ret)
+ goto out_inode;
+ trans_num_items += create_subvol_num_items(inherit);
+
+ btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
+ ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
+ trans_num_items, false);
+ if (ret)
+ goto out_new_inode_args;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ btrfs_subvolume_release_metadata(root, &block_rsv);
+ goto out_new_inode_args;
+ }
+ trans->block_rsv = &block_rsv;
+ trans->bytes_reserved = block_rsv.size;
+
+ ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit);
+ if (ret)
+ goto out;
+
+ leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
+ BTRFS_NESTING_NORMAL);
+ if (IS_ERR(leaf)) {
+ ret = PTR_ERR(leaf);
+ goto out;
+ }
+
+ btrfs_mark_buffer_dirty(leaf);
+
+ inode_item = &root_item->inode;
+ btrfs_set_stack_inode_generation(inode_item, 1);
+ btrfs_set_stack_inode_size(inode_item, 3);
+ btrfs_set_stack_inode_nlink(inode_item, 1);
+ btrfs_set_stack_inode_nbytes(inode_item,
+ fs_info->nodesize);
+ btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
+
+ btrfs_set_root_flags(root_item, 0);
+ btrfs_set_root_limit(root_item, 0);
+ btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT);
+
+ btrfs_set_root_bytenr(root_item, leaf->start);
+ btrfs_set_root_generation(root_item, trans->transid);
+ btrfs_set_root_level(root_item, 0);
+ btrfs_set_root_refs(root_item, 1);
+ btrfs_set_root_used(root_item, leaf->len);
+ btrfs_set_root_last_snapshot(root_item, 0);
+
+ btrfs_set_root_generation_v2(root_item,
+ btrfs_root_generation(root_item));
+ generate_random_guid(root_item->uuid);
+ btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec);
+ btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec);
+ root_item->ctime = root_item->otime;
+ btrfs_set_root_ctransid(root_item, trans->transid);
+ btrfs_set_root_otransid(root_item, trans->transid);
+
+ btrfs_tree_unlock(leaf);
+
+ btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID);
+
+ key.objectid = objectid;
+ key.offset = 0;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ ret = btrfs_insert_root(trans, fs_info->tree_root, &key,
+ root_item);
+ if (ret) {
+ /*
+ * Since we don't abort the transaction in this case, free the
+ * tree block so that we don't leak space and leave the
+ * filesystem in an inconsistent state (an extent item in the
+ * extent tree with a backreference for a root that does not
+ * exists).
+ */
+ btrfs_tree_lock(leaf);
+ btrfs_clean_tree_block(leaf);
+ btrfs_tree_unlock(leaf);
+ btrfs_free_tree_block(trans, objectid, leaf, 0, 1);
+ free_extent_buffer(leaf);
+ goto out;
+ }
+
+ free_extent_buffer(leaf);
+ leaf = NULL;
+
+ new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev);
+ if (IS_ERR(new_root)) {
+ ret = PTR_ERR(new_root);
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ /* anon_dev is owned by new_root now. */
+ anon_dev = 0;
+ BTRFS_I(new_inode_args.inode)->root = new_root;
+ /* ... and new_root is owned by new_inode_args.inode now. */
+
+ ret = btrfs_record_root_in_trans(trans, new_root);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ ret = btrfs_uuid_tree_add(trans, root_item->uuid,
+ BTRFS_UUID_KEY_SUBVOL, objectid);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ ret = btrfs_create_new_inode(trans, &new_inode_args);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ d_instantiate_new(dentry, new_inode_args.inode);
+ new_inode_args.inode = NULL;
+
+out:
+ trans->block_rsv = NULL;
+ trans->bytes_reserved = 0;
+ btrfs_subvolume_release_metadata(root, &block_rsv);
+
+ if (ret)
+ btrfs_end_transaction(trans);
+ else
+ ret = btrfs_commit_transaction(trans);
+out_new_inode_args:
+ btrfs_new_inode_args_destroy(&new_inode_args);
+out_inode:
+ iput(new_inode_args.inode);
+out_anon_dev:
+ if (anon_dev)
+ free_anon_bdev(anon_dev);
+out_root_item:
+ kfree(root_item);
+ return ret;
+}
+
+static int create_snapshot(struct btrfs_root *root, struct inode *dir,
+ struct dentry *dentry, bool readonly,
+ struct btrfs_qgroup_inherit *inherit)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct inode *inode;
+ struct btrfs_pending_snapshot *pending_snapshot;
+ unsigned int trans_num_items;
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ /* We do not support snapshotting right now. */
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_warn(fs_info,
+ "extent tree v2 doesn't support snapshotting yet");
+ return -EOPNOTSUPP;
+ }
+
+ if (btrfs_root_refs(&root->root_item) == 0)
+ return -ENOENT;
+
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
+ return -EINVAL;
+
+ if (atomic_read(&root->nr_swapfiles)) {
+ btrfs_warn(fs_info,
+ "cannot snapshot subvolume with active swapfile");
+ return -ETXTBSY;
+ }
+
+ pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_KERNEL);
+ if (!pending_snapshot)
+ return -ENOMEM;
+
+ ret = get_anon_bdev(&pending_snapshot->anon_dev);
+ if (ret < 0)
+ goto free_pending;
+ pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item),
+ GFP_KERNEL);
+ pending_snapshot->path = btrfs_alloc_path();
+ if (!pending_snapshot->root_item || !pending_snapshot->path) {
+ ret = -ENOMEM;
+ goto free_pending;
+ }
+
+ btrfs_init_block_rsv(&pending_snapshot->block_rsv,
+ BTRFS_BLOCK_RSV_TEMP);
+ /*
+ * 1 to add dir item
+ * 1 to add dir index
+ * 1 to update parent inode item
+ */
+ trans_num_items = create_subvol_num_items(inherit) + 3;
+ ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
+ &pending_snapshot->block_rsv,
+ trans_num_items, false);
+ if (ret)
+ goto free_pending;
+
+ pending_snapshot->dentry = dentry;
+ pending_snapshot->root = root;
+ pending_snapshot->readonly = readonly;
+ pending_snapshot->dir = dir;
+ pending_snapshot->inherit = inherit;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto fail;
+ }
+
+ trans->pending_snapshot = pending_snapshot;
+
+ ret = btrfs_commit_transaction(trans);
+ if (ret)
+ goto fail;
+
+ ret = pending_snapshot->error;
+ if (ret)
+ goto fail;
+
+ ret = btrfs_orphan_cleanup(pending_snapshot->snap);
+ if (ret)
+ goto fail;
+
+ inode = btrfs_lookup_dentry(d_inode(dentry->d_parent), dentry);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ goto fail;
+ }
+
+ d_instantiate(dentry, inode);
+ ret = 0;
+ pending_snapshot->anon_dev = 0;
+fail:
+ /* Prevent double freeing of anon_dev */
+ if (ret && pending_snapshot->snap)
+ pending_snapshot->snap->anon_dev = 0;
+ btrfs_put_root(pending_snapshot->snap);
+ btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv);
+free_pending:
+ if (pending_snapshot->anon_dev)
+ free_anon_bdev(pending_snapshot->anon_dev);
+ kfree(pending_snapshot->root_item);
+ btrfs_free_path(pending_snapshot->path);
+ kfree(pending_snapshot);
+
+ return ret;
+}
+
+/* copy of may_delete in fs/namei.c()
+ * Check whether we can remove a link victim from directory dir, check
+ * whether the type of victim is right.
+ * 1. We can't do it if dir is read-only (done in permission())
+ * 2. We should have write and exec permissions on dir
+ * 3. We can't remove anything from append-only dir
+ * 4. We can't do anything with immutable dir (done in permission())
+ * 5. If the sticky bit on dir is set we should either
+ * a. be owner of dir, or
+ * b. be owner of victim, or
+ * c. have CAP_FOWNER capability
+ * 6. If the victim is append-only or immutable we can't do anything with
+ * links pointing to it.
+ * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
+ * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
+ * 9. We can't remove a root or mountpoint.
+ * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
+ * nfs_async_unlink().
+ */
+
+static int btrfs_may_delete(struct user_namespace *mnt_userns,
+ struct inode *dir, struct dentry *victim, int isdir)
+{
+ int error;
+
+ if (d_really_is_negative(victim))
+ return -ENOENT;
+
+ BUG_ON(d_inode(victim->d_parent) != dir);
+ audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
+
+ error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
+ if (error)
+ return error;
+ if (IS_APPEND(dir))
+ return -EPERM;
+ if (check_sticky(mnt_userns, dir, d_inode(victim)) ||
+ IS_APPEND(d_inode(victim)) || IS_IMMUTABLE(d_inode(victim)) ||
+ IS_SWAPFILE(d_inode(victim)))
+ return -EPERM;
+ if (isdir) {
+ if (!d_is_dir(victim))
+ return -ENOTDIR;
+ if (IS_ROOT(victim))
+ return -EBUSY;
+ } else if (d_is_dir(victim))
+ return -EISDIR;
+ if (IS_DEADDIR(dir))
+ return -ENOENT;
+ if (victim->d_flags & DCACHE_NFSFS_RENAMED)
+ return -EBUSY;
+ return 0;
+}
+
+/* copy of may_create in fs/namei.c() */
+static inline int btrfs_may_create(struct user_namespace *mnt_userns,
+ struct inode *dir, struct dentry *child)
+{
+ if (d_really_is_positive(child))
+ return -EEXIST;
+ if (IS_DEADDIR(dir))
+ return -ENOENT;
+ if (!fsuidgid_has_mapping(dir->i_sb, mnt_userns))
+ return -EOVERFLOW;
+ return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
+}
+
+/*
+ * Create a new subvolume below @parent. This is largely modeled after
+ * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
+ * inside this filesystem so it's quite a bit simpler.
+ */
+static noinline int btrfs_mksubvol(const struct path *parent,
+ struct user_namespace *mnt_userns,
+ const char *name, int namelen,
+ struct btrfs_root *snap_src,
+ bool readonly,
+ struct btrfs_qgroup_inherit *inherit)
+{
+ struct inode *dir = d_inode(parent->dentry);
+ struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct dentry *dentry;
+ struct fscrypt_str name_str = FSTR_INIT((char *)name, namelen);
+ int error;
+
+ error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
+ if (error == -EINTR)
+ return error;
+
+ dentry = lookup_one(mnt_userns, name, parent->dentry, namelen);
+ error = PTR_ERR(dentry);
+ if (IS_ERR(dentry))
+ goto out_unlock;
+
+ error = btrfs_may_create(mnt_userns, dir, dentry);
+ if (error)
+ goto out_dput;
+
+ /*
+ * even if this name doesn't exist, we may get hash collisions.
+ * check for them now when we can safely fail
+ */
+ error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
+ dir->i_ino, &name_str);
+ if (error)
+ goto out_dput;
+
+ down_read(&fs_info->subvol_sem);
+
+ if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
+ goto out_up_read;
+
+ if (snap_src)
+ error = create_snapshot(snap_src, dir, dentry, readonly, inherit);
+ else
+ error = create_subvol(mnt_userns, dir, dentry, inherit);
+
+ if (!error)
+ fsnotify_mkdir(dir, dentry);
+out_up_read:
+ up_read(&fs_info->subvol_sem);
+out_dput:
+ dput(dentry);
+out_unlock:
+ btrfs_inode_unlock(dir, 0);
+ return error;
+}
+
+static noinline int btrfs_mksnapshot(const struct path *parent,
+ struct user_namespace *mnt_userns,
+ const char *name, int namelen,
+ struct btrfs_root *root,
+ bool readonly,
+ struct btrfs_qgroup_inherit *inherit)
+{
+ int ret;
+ bool snapshot_force_cow = false;
+
+ /*
+ * Force new buffered writes to reserve space even when NOCOW is
+ * possible. This is to avoid later writeback (running dealloc) to
+ * fallback to COW mode and unexpectedly fail with ENOSPC.
+ */
+ btrfs_drew_read_lock(&root->snapshot_lock);
+
+ ret = btrfs_start_delalloc_snapshot(root, false);
+ if (ret)
+ goto out;
+
+ /*
+ * All previous writes have started writeback in NOCOW mode, so now
+ * we force future writes to fallback to COW mode during snapshot
+ * creation.
+ */
+ atomic_inc(&root->snapshot_force_cow);
+ snapshot_force_cow = true;
+
+ btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
+
+ ret = btrfs_mksubvol(parent, mnt_userns, name, namelen,
+ root, readonly, inherit);
+out:
+ if (snapshot_force_cow)
+ atomic_dec(&root->snapshot_force_cow);
+ btrfs_drew_read_unlock(&root->snapshot_lock);
+ return ret;
+}
+
+/*
+ * Defrag specific helper to get an extent map.
+ *
+ * Differences between this and btrfs_get_extent() are:
+ *
+ * - No extent_map will be added to inode->extent_tree
+ * To reduce memory usage in the long run.
+ *
+ * - Extra optimization to skip file extents older than @newer_than
+ * By using btrfs_search_forward() we can skip entire file ranges that
+ * have extents created in past transactions, because btrfs_search_forward()
+ * will not visit leaves and nodes with a generation smaller than given
+ * minimal generation threshold (@newer_than).
+ *
+ * Return valid em if we find a file extent matching the requirement.
+ * Return NULL if we can not find a file extent matching the requirement.
+ *
+ * Return ERR_PTR() for error.
+ */
+static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
+ u64 start, u64 newer_than)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_path path = { 0 };
+ struct extent_map *em;
+ struct btrfs_key key;
+ u64 ino = btrfs_ino(inode);
+ int ret;
+
+ em = alloc_extent_map();
+ if (!em) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = start;
+
+ if (newer_than) {
+ ret = btrfs_search_forward(root, &key, &path, newer_than);
+ if (ret < 0)
+ goto err;
+ /* Can't find anything newer */
+ if (ret > 0)
+ goto not_found;
+ } else {
+ ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
+ if (ret < 0)
+ goto err;
+ }
+ if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
+ /*
+ * If btrfs_search_slot() makes path to point beyond nritems,
+ * we should not have an empty leaf, as this inode must at
+ * least have its INODE_ITEM.
+ */
+ ASSERT(btrfs_header_nritems(path.nodes[0]));
+ path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
+ }
+ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+ /* Perfect match, no need to go one slot back */
+ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
+ key.offset == start)
+ goto iterate;
+
+ /* We didn't find a perfect match, needs to go one slot back */
+ if (path.slots[0] > 0) {
+ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
+ path.slots[0]--;
+ }
+
+iterate:
+ /* Iterate through the path to find a file extent covering @start */
+ while (true) {
+ u64 extent_end;
+
+ if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
+ goto next;
+
+ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+
+ /*
+ * We may go one slot back to INODE_REF/XATTR item, then
+ * need to go forward until we reach an EXTENT_DATA.
+ * But we should still has the correct ino as key.objectid.
+ */
+ if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
+ goto next;
+
+ /* It's beyond our target range, definitely not extent found */
+ if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
+ goto not_found;
+
+ /*
+ * | |<- File extent ->|
+ * \- start
+ *
+ * This means there is a hole between start and key.offset.
+ */
+ if (key.offset > start) {
+ em->start = start;
+ em->orig_start = start;
+ em->block_start = EXTENT_MAP_HOLE;
+ em->len = key.offset - start;
+ break;
+ }
+
+ fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
+ struct btrfs_file_extent_item);
+ extent_end = btrfs_file_extent_end(&path);
+
+ /*
+ * |<- file extent ->| |
+ * \- start
+ *
+ * We haven't reached start, search next slot.
+ */
+ if (extent_end <= start)
+ goto next;
+
+ /* Now this extent covers @start, convert it to em */
+ btrfs_extent_item_to_extent_map(inode, &path, fi, false, em);
+ break;
+next:
+ ret = btrfs_next_item(root, &path);
+ if (ret < 0)
+ goto err;
+ if (ret > 0)
+ goto not_found;
+ }
+ btrfs_release_path(&path);
+ return em;
+
+not_found:
+ btrfs_release_path(&path);
+ free_extent_map(em);
+ return NULL;
+
+err:
+ btrfs_release_path(&path);
+ free_extent_map(em);
+ return ERR_PTR(ret);
+}
+
+static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
+ u64 newer_than, bool locked)
+{
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_map *em;
+ const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;
+
+ /*
+ * hopefully we have this extent in the tree already, try without
+ * the full extent lock
+ */
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, sectorsize);
+ read_unlock(&em_tree->lock);
+
+ /*
+ * We can get a merged extent, in that case, we need to re-search
+ * tree to get the original em for defrag.
+ *
+ * If @newer_than is 0 or em::generation < newer_than, we can trust
+ * this em, as either we don't care about the generation, or the
+ * merged extent map will be rejected anyway.
+ */
+ if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
+ newer_than && em->generation >= newer_than) {
+ free_extent_map(em);
+ em = NULL;
+ }
+
+ if (!em) {
+ struct extent_state *cached = NULL;
+ u64 end = start + sectorsize - 1;
+
+ /* get the big lock and read metadata off disk */
+ if (!locked)
+ lock_extent(io_tree, start, end, &cached);
+ em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
+ if (!locked)
+ unlock_extent(io_tree, start, end, &cached);
+
+ if (IS_ERR(em))
+ return NULL;
+ }
+
+ return em;
+}
+
+static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
+ const struct extent_map *em)
+{
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+ return BTRFS_MAX_COMPRESSED;
+ return fs_info->max_extent_size;
+}
+
+static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
+ u32 extent_thresh, u64 newer_than, bool locked)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct extent_map *next;
+ bool ret = false;
+
+ /* this is the last extent */
+ if (em->start + em->len >= i_size_read(inode))
+ return false;
+
+ /*
+ * Here we need to pass @newer_then when checking the next extent, or
+ * we will hit a case we mark current extent for defrag, but the next
+ * one will not be a target.
+ * This will just cause extra IO without really reducing the fragments.
+ */
+ next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
+ /* No more em or hole */
+ if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
+ goto out;
+ if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
+ goto out;
+ /*
+ * If the next extent is at its max capacity, defragging current extent
+ * makes no sense, as the total number of extents won't change.
+ */
+ if (next->len >= get_extent_max_capacity(fs_info, em))
+ goto out;
+ /* Skip older extent */
+ if (next->generation < newer_than)
+ goto out;
+ /* Also check extent size */
+ if (next->len >= extent_thresh)
+ goto out;
+
+ ret = true;
+out:
+ free_extent_map(next);
+ return ret;
+}
+
+/*
+ * Prepare one page to be defragged.
+ *
+ * This will ensure:
+ *
+ * - Returned page is locked and has been set up properly.
+ * - No ordered extent exists in the page.
+ * - The page is uptodate.
+ *
+ * NOTE: Caller should also wait for page writeback after the cluster is
+ * prepared, here we don't do writeback wait for each page.
+ */
+static struct page *defrag_prepare_one_page(struct btrfs_inode *inode,
+ pgoff_t index)
+{
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ gfp_t mask = btrfs_alloc_write_mask(mapping);
+ u64 page_start = (u64)index << PAGE_SHIFT;
+ u64 page_end = page_start + PAGE_SIZE - 1;
+ struct extent_state *cached_state = NULL;
+ struct page *page;
+ int ret;
+
+again:
+ page = find_or_create_page(mapping, index, mask);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Since we can defragment files opened read-only, we can encounter
+ * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
+ * can't do I/O using huge pages yet, so return an error for now.
+ * Filesystem transparent huge pages are typically only used for
+ * executables that explicitly enable them, so this isn't very
+ * restrictive.
+ */
+ if (PageCompound(page)) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(-ETXTBSY);
+ }
+
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(ret);
+ }
+
+ /* Wait for any existing ordered extent in the range */
+ while (1) {
+ struct btrfs_ordered_extent *ordered;
+
+ lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
+ unlock_extent(&inode->io_tree, page_start, page_end,
+ &cached_state);
+ if (!ordered)
+ break;
+
+ unlock_page(page);
+ btrfs_start_ordered_extent(ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ lock_page(page);
+ /*
+ * We unlocked the page above, so we need check if it was
+ * released or not.
+ */
+ if (page->mapping != mapping || !PagePrivate(page)) {
+ unlock_page(page);
+ put_page(page);
+ goto again;
+ }
+ }
+
+ /*
+ * Now the page range has no ordered extent any more. Read the page to
+ * make it uptodate.
+ */
+ if (!PageUptodate(page)) {
+ btrfs_read_folio(NULL, page_folio(page));
+ lock_page(page);
+ if (page->mapping != mapping || !PagePrivate(page)) {
+ unlock_page(page);
+ put_page(page);
+ goto again;
+ }
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(-EIO);
+ }
+ }
+ return page;
+}
+
+struct defrag_target_range {
+ struct list_head list;
+ u64 start;
+ u64 len;
+};
+
+/*
+ * Collect all valid target extents.
+ *
+ * @start: file offset to lookup
+ * @len: length to lookup
+ * @extent_thresh: file extent size threshold, any extent size >= this value
+ * will be ignored
+ * @newer_than: only defrag extents newer than this value
+ * @do_compress: whether the defrag is doing compression
+ * if true, @extent_thresh will be ignored and all regular
+ * file extents meeting @newer_than will be targets.
+ * @locked: if the range has already held extent lock
+ * @target_list: list of targets file extents
+ */
+static int defrag_collect_targets(struct btrfs_inode *inode,
+ u64 start, u64 len, u32 extent_thresh,
+ u64 newer_than, bool do_compress,
+ bool locked, struct list_head *target_list,
+ u64 *last_scanned_ret)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ bool last_is_target = false;
+ u64 cur = start;
+ int ret = 0;
+
+ while (cur < start + len) {
+ struct extent_map *em;
+ struct defrag_target_range *new;
+ bool next_mergeable = true;
+ u64 range_len;
+
+ last_is_target = false;
+ em = defrag_lookup_extent(&inode->vfs_inode, cur,
+ newer_than, locked);
+ if (!em)
+ break;
+
+ /*
+ * If the file extent is an inlined one, we may still want to
+ * defrag it (fallthrough) if it will cause a regular extent.
+ * This is for users who want to convert inline extents to
+ * regular ones through max_inline= mount option.
+ */
+ if (em->block_start == EXTENT_MAP_INLINE &&
+ em->len <= inode->root->fs_info->max_inline)
+ goto next;
+
+ /* Skip hole/delalloc/preallocated extents */
+ if (em->block_start == EXTENT_MAP_HOLE ||
+ em->block_start == EXTENT_MAP_DELALLOC ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ goto next;
+
+ /* Skip older extent */
+ if (em->generation < newer_than)
+ goto next;
+
+ /* This em is under writeback, no need to defrag */
+ if (em->generation == (u64)-1)
+ goto next;
+
+ /*
+ * Our start offset might be in the middle of an existing extent
+ * map, so take that into account.
+ */
+ range_len = em->len - (cur - em->start);
+ /*
+ * If this range of the extent map is already flagged for delalloc,
+ * skip it, because:
+ *
+ * 1) We could deadlock later, when trying to reserve space for
+ * delalloc, because in case we can't immediately reserve space
+ * the flusher can start delalloc and wait for the respective
+ * ordered extents to complete. The deadlock would happen
+ * because we do the space reservation while holding the range
+ * locked, and starting writeback, or finishing an ordered
+ * extent, requires locking the range;
+ *
+ * 2) If there's delalloc there, it means there's dirty pages for
+ * which writeback has not started yet (we clean the delalloc
+ * flag when starting writeback and after creating an ordered
+ * extent). If we mark pages in an adjacent range for defrag,
+ * then we will have a larger contiguous range for delalloc,
+ * very likely resulting in a larger extent after writeback is
+ * triggered (except in a case of free space fragmentation).
+ */
+ if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
+ EXTENT_DELALLOC, 0, NULL))
+ goto next;
+
+ /*
+ * For do_compress case, we want to compress all valid file
+ * extents, thus no @extent_thresh or mergeable check.
+ */
+ if (do_compress)
+ goto add;
+
+ /* Skip too large extent */
+ if (range_len >= extent_thresh)
+ goto next;
+
+ /*
+ * Skip extents already at its max capacity, this is mostly for
+ * compressed extents, which max cap is only 128K.
+ */
+ if (em->len >= get_extent_max_capacity(fs_info, em))
+ goto next;
+
+ /*
+ * Normally there are no more extents after an inline one, thus
+ * @next_mergeable will normally be false and not defragged.
+ * So if an inline extent passed all above checks, just add it
+ * for defrag, and be converted to regular extents.
+ */
+ if (em->block_start == EXTENT_MAP_INLINE)
+ goto add;
+
+ next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
+ extent_thresh, newer_than, locked);
+ if (!next_mergeable) {
+ struct defrag_target_range *last;
+
+ /* Empty target list, no way to merge with last entry */
+ if (list_empty(target_list))
+ goto next;
+ last = list_entry(target_list->prev,
+ struct defrag_target_range, list);
+ /* Not mergeable with last entry */
+ if (last->start + last->len != cur)
+ goto next;
+
+ /* Mergeable, fall through to add it to @target_list. */
+ }
+
+add:
+ last_is_target = true;
+ range_len = min(extent_map_end(em), start + len) - cur;
+ /*
+ * This one is a good target, check if it can be merged into
+ * last range of the target list.
+ */
+ if (!list_empty(target_list)) {
+ struct defrag_target_range *last;
+
+ last = list_entry(target_list->prev,
+ struct defrag_target_range, list);
+ ASSERT(last->start + last->len <= cur);
+ if (last->start + last->len == cur) {
+ /* Mergeable, enlarge the last entry */
+ last->len += range_len;
+ goto next;
+ }
+ /* Fall through to allocate a new entry */
+ }
+
+ /* Allocate new defrag_target_range */
+ new = kmalloc(sizeof(*new), GFP_NOFS);
+ if (!new) {
+ free_extent_map(em);
+ ret = -ENOMEM;
+ break;
+ }
+ new->start = cur;
+ new->len = range_len;
+ list_add_tail(&new->list, target_list);
+
+next:
+ cur = extent_map_end(em);
+ free_extent_map(em);
+ }
+ if (ret < 0) {
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
+
+ list_for_each_entry_safe(entry, tmp, target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
+ }
+ }
+ if (!ret && last_scanned_ret) {
+ /*
+ * If the last extent is not a target, the caller can skip to
+ * the end of that extent.
+ * Otherwise, we can only go the end of the specified range.
+ */
+ if (!last_is_target)
+ *last_scanned_ret = max(cur, *last_scanned_ret);
+ else
+ *last_scanned_ret = max(start + len, *last_scanned_ret);
+ }
+ return ret;
+}
+
+#define CLUSTER_SIZE (SZ_256K)
+static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
+
+/*
+ * Defrag one contiguous target range.
+ *
+ * @inode: target inode
+ * @target: target range to defrag
+ * @pages: locked pages covering the defrag range
+ * @nr_pages: number of locked pages
+ *
+ * Caller should ensure:
+ *
+ * - Pages are prepared
+ * Pages should be locked, no ordered extent in the pages range,
+ * no writeback.
+ *
+ * - Extent bits are locked
+ */
+static int defrag_one_locked_target(struct btrfs_inode *inode,
+ struct defrag_target_range *target,
+ struct page **pages, int nr_pages,
+ struct extent_state **cached_state)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_changeset *data_reserved = NULL;
+ const u64 start = target->start;
+ const u64 len = target->len;
+ unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
+ unsigned long start_index = start >> PAGE_SHIFT;
+ unsigned long first_index = page_index(pages[0]);
+ int ret = 0;
+ int i;
+
+ ASSERT(last_index - first_index + 1 <= nr_pages);
+
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
+ if (ret < 0)
+ return ret;
+ clear_extent_bit(&inode->io_tree, start, start + len - 1,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, cached_state);
+ set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state);
+
+ /* Update the page status */
+ for (i = start_index - first_index; i <= last_index - first_index; i++) {
+ ClearPageChecked(pages[i]);
+ btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
+ }
+ btrfs_delalloc_release_extents(inode, len);
+ extent_changeset_free(data_reserved);
+
+ return ret;
+}
+
+static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
+ u32 extent_thresh, u64 newer_than, bool do_compress,
+ u64 *last_scanned_ret)
+{
+ struct extent_state *cached_state = NULL;
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
+ LIST_HEAD(target_list);
+ struct page **pages;
+ const u32 sectorsize = inode->root->fs_info->sectorsize;
+ u64 last_index = (start + len - 1) >> PAGE_SHIFT;
+ u64 start_index = start >> PAGE_SHIFT;
+ unsigned int nr_pages = last_index - start_index + 1;
+ int ret = 0;
+ int i;
+
+ ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
+ ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));
+
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+ if (!pages)
+ return -ENOMEM;
+
+ /* Prepare all pages */
+ for (i = 0; i < nr_pages; i++) {
+ pages[i] = defrag_prepare_one_page(inode, start_index + i);
+ if (IS_ERR(pages[i])) {
+ ret = PTR_ERR(pages[i]);
+ pages[i] = NULL;
+ goto free_pages;
+ }
+ }
+ for (i = 0; i < nr_pages; i++)
+ wait_on_page_writeback(pages[i]);
+
+ /* Lock the pages range */
+ lock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
+ (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+ &cached_state);
+ /*
+ * Now we have a consistent view about the extent map, re-check
+ * which range really needs to be defragged.
+ *
+ * And this time we have extent locked already, pass @locked = true
+ * so that we won't relock the extent range and cause deadlock.
+ */
+ ret = defrag_collect_targets(inode, start, len, extent_thresh,
+ newer_than, do_compress, true,
+ &target_list, last_scanned_ret);
+ if (ret < 0)
+ goto unlock_extent;
+
+ list_for_each_entry(entry, &target_list, list) {
+ ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
+ &cached_state);
+ if (ret < 0)
+ break;
+ }
+
+ list_for_each_entry_safe(entry, tmp, &target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
+ }
+unlock_extent:
+ unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
+ (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+ &cached_state);
+free_pages:
+ for (i = 0; i < nr_pages; i++) {
+ if (pages[i]) {
+ unlock_page(pages[i]);
+ put_page(pages[i]);
+ }
+ }
+ kfree(pages);
+ return ret;
+}
+
+static int defrag_one_cluster(struct btrfs_inode *inode,
+ struct file_ra_state *ra,
+ u64 start, u32 len, u32 extent_thresh,
+ u64 newer_than, bool do_compress,
+ unsigned long *sectors_defragged,
+ unsigned long max_sectors,
+ u64 *last_scanned_ret)
+{
+ const u32 sectorsize = inode->root->fs_info->sectorsize;
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
+ LIST_HEAD(target_list);
+ int ret;
+
+ ret = defrag_collect_targets(inode, start, len, extent_thresh,
+ newer_than, do_compress, false,
+ &target_list, NULL);
+ if (ret < 0)
+ goto out;
+
+ list_for_each_entry(entry, &target_list, list) {
+ u32 range_len = entry->len;
+
+ /* Reached or beyond the limit */
+ if (max_sectors && *sectors_defragged >= max_sectors) {
+ ret = 1;
+ break;
+ }
+
+ if (max_sectors)
+ range_len = min_t(u32, range_len,
+ (max_sectors - *sectors_defragged) * sectorsize);
+
+ /*
+ * If defrag_one_range() has updated last_scanned_ret,
+ * our range may already be invalid (e.g. hole punched).
+ * Skip if our range is before last_scanned_ret, as there is
+ * no need to defrag the range anymore.
+ */
+ if (entry->start + range_len <= *last_scanned_ret)
+ continue;
+
+ if (ra)
+ page_cache_sync_readahead(inode->vfs_inode.i_mapping,
+ ra, NULL, entry->start >> PAGE_SHIFT,
+ ((entry->start + range_len - 1) >> PAGE_SHIFT) -
+ (entry->start >> PAGE_SHIFT) + 1);
+ /*
+ * Here we may not defrag any range if holes are punched before
+ * we locked the pages.
+ * But that's fine, it only affects the @sectors_defragged
+ * accounting.
+ */
+ ret = defrag_one_range(inode, entry->start, range_len,
+ extent_thresh, newer_than, do_compress,
+ last_scanned_ret);
+ if (ret < 0)
+ break;
+ *sectors_defragged += range_len >>
+ inode->root->fs_info->sectorsize_bits;
+ }
+out:
+ list_for_each_entry_safe(entry, tmp, &target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
+ }
+ if (ret >= 0)
+ *last_scanned_ret = max(*last_scanned_ret, start + len);
+ return ret;
+}
+
+/*
+ * Entry point to file defragmentation.
+ *
+ * @inode: inode to be defragged
+ * @ra: readahead state (can be NUL)
+ * @range: defrag options including range and flags
+ * @newer_than: minimum transid to defrag
+ * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
+ * will be defragged.
+ *
+ * Return <0 for error.
+ * Return >=0 for the number of sectors defragged, and range->start will be updated
+ * to indicate the file offset where next defrag should be started at.
+ * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without
+ * defragging all the range).
+ */
+int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
+ struct btrfs_ioctl_defrag_range_args *range,
+ u64 newer_than, unsigned long max_to_defrag)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ unsigned long sectors_defragged = 0;
+ u64 isize = i_size_read(inode);
+ u64 cur;
+ u64 last_byte;
+ bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS;
+ bool ra_allocated = false;
+ int compress_type = BTRFS_COMPRESS_ZLIB;
+ int ret = 0;
+ u32 extent_thresh = range->extent_thresh;
+ pgoff_t start_index;
+
+ if (isize == 0)
+ return 0;
+
+ if (range->start >= isize)
+ return -EINVAL;
+
+ if (do_compress) {
+ if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
+ return -EINVAL;
+ if (range->compress_type)
+ compress_type = range->compress_type;
+ }
+
+ if (extent_thresh == 0)
+ extent_thresh = SZ_256K;
+
+ if (range->start + range->len > range->start) {
+ /* Got a specific range */
+ last_byte = min(isize, range->start + range->len);
+ } else {
+ /* Defrag until file end */
+ last_byte = isize;
+ }
+
+ /* Align the range */
+ cur = round_down(range->start, fs_info->sectorsize);
+ last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
+
+ /*
+ * If we were not given a ra, allocate a readahead context. As
+ * readahead is just an optimization, defrag will work without it so
+ * we don't error out.
+ */
+ if (!ra) {
+ ra_allocated = true;
+ ra = kzalloc(sizeof(*ra), GFP_KERNEL);
+ if (ra)
+ file_ra_state_init(ra, inode->i_mapping);
+ }
+
+ /*
+ * Make writeback start from the beginning of the range, so that the
+ * defrag range can be written sequentially.
+ */
+ start_index = cur >> PAGE_SHIFT;
+ if (start_index < inode->i_mapping->writeback_index)
+ inode->i_mapping->writeback_index = start_index;
+
+ while (cur < last_byte) {
+ const unsigned long prev_sectors_defragged = sectors_defragged;
+ u64 last_scanned = cur;
+ u64 cluster_end;
+
+ if (btrfs_defrag_cancelled(fs_info)) {
+ ret = -EAGAIN;
+ break;
+ }
+
+ /* We want the cluster end at page boundary when possible */
+ cluster_end = (((cur >> PAGE_SHIFT) +
+ (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
+ cluster_end = min(cluster_end, last_byte);
+
+ btrfs_inode_lock(inode, 0);
+ if (IS_SWAPFILE(inode)) {
+ ret = -ETXTBSY;
+ btrfs_inode_unlock(inode, 0);
+ break;
+ }
+ if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
+ btrfs_inode_unlock(inode, 0);
+ break;
+ }
+ if (do_compress)
+ BTRFS_I(inode)->defrag_compress = compress_type;
+ ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
+ cluster_end + 1 - cur, extent_thresh,
+ newer_than, do_compress, &sectors_defragged,
+ max_to_defrag, &last_scanned);
+
+ if (sectors_defragged > prev_sectors_defragged)
+ balance_dirty_pages_ratelimited(inode->i_mapping);
+
+ btrfs_inode_unlock(inode, 0);
+ if (ret < 0)
+ break;
+ cur = max(cluster_end + 1, last_scanned);
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ cond_resched();
+ }
+
+ if (ra_allocated)
+ kfree(ra);
+ /*
+ * Update range.start for autodefrag, this will indicate where to start
+ * in next run.
+ */
+ range->start = cur;
+ if (sectors_defragged) {
+ /*
+ * We have defragged some sectors, for compression case they
+ * need to be written back immediately.
+ */
+ if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
+ filemap_flush(inode->i_mapping);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_flush(inode->i_mapping);
+ }
+ if (range->compress_type == BTRFS_COMPRESS_LZO)
+ btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
+ else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
+ btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
+ ret = sectors_defragged;
+ }
+ if (do_compress) {
+ btrfs_inode_lock(inode, 0);
+ BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
+ btrfs_inode_unlock(inode, 0);
+ }
+ return ret;
+}
+
+/*
+ * Try to start exclusive operation @type or cancel it if it's running.
+ *
+ * Return:
+ * 0 - normal mode, newly claimed op started
+ * >0 - normal mode, something else is running,
+ * return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS to user space
+ * ECANCELED - cancel mode, successful cancel
+ * ENOTCONN - cancel mode, operation not running anymore
+ */
+static int exclop_start_or_cancel_reloc(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type, bool cancel)
+{
+ if (!cancel) {
+ /* Start normal op */
+ if (!btrfs_exclop_start(fs_info, type))
+ return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ /* Exclusive operation is now claimed */
+ return 0;
+ }
+
+ /* Cancel running op */
+ if (btrfs_exclop_start_try_lock(fs_info, type)) {
+ /*
+ * This blocks any exclop finish from setting it to NONE, so we
+ * request cancellation. Either it runs and we will wait for it,
+ * or it has finished and no waiting will happen.
+ */
+ atomic_inc(&fs_info->reloc_cancel_req);
+ btrfs_exclop_start_unlock(fs_info);
+
+ if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags))
+ wait_on_bit(&fs_info->flags, BTRFS_FS_RELOC_RUNNING,
+ TASK_INTERRUPTIBLE);
+
+ return -ECANCELED;
+ }
+
+ /* Something else is running or none */
+ return -ENOTCONN;
+}
+
+static noinline int btrfs_ioctl_resize(struct file *file,
+ void __user *arg)
+{
+ BTRFS_DEV_LOOKUP_ARGS(args);
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ u64 new_size;
+ u64 old_size;
+ u64 devid = 1;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_ioctl_vol_args *vol_args;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_device *device = NULL;
+ char *sizestr;
+ char *retptr;
+ char *devstr = NULL;
+ int ret = 0;
+ int mod = 0;
+ bool cancel;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ /*
+ * Read the arguments before checking exclusivity to be able to
+ * distinguish regular resize and cancel
+ */
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args)) {
+ ret = PTR_ERR(vol_args);
+ goto out_drop;
+ }
+ vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ sizestr = vol_args->name;
+ cancel = (strcmp("cancel", sizestr) == 0);
+ ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_RESIZE, cancel);
+ if (ret)
+ goto out_free;
+ /* Exclusive operation is now claimed */
+
+ devstr = strchr(sizestr, ':');
+ if (devstr) {
+ sizestr = devstr + 1;
+ *devstr = '\0';
+ devstr = vol_args->name;
+ ret = kstrtoull(devstr, 10, &devid);
+ if (ret)
+ goto out_finish;
+ if (!devid) {
+ ret = -EINVAL;
+ goto out_finish;
+ }
+ btrfs_info(fs_info, "resizing devid %llu", devid);
+ }
+
+ args.devid = devid;
+ device = btrfs_find_device(fs_info->fs_devices, &args);
+ if (!device) {
+ btrfs_info(fs_info, "resizer unable to find device %llu",
+ devid);
+ ret = -ENODEV;
+ goto out_finish;
+ }
+
+ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
+ btrfs_info(fs_info,
+ "resizer unable to apply on readonly device %llu",
+ devid);
+ ret = -EPERM;
+ goto out_finish;
+ }
+
+ if (!strcmp(sizestr, "max"))
+ new_size = bdev_nr_bytes(device->bdev);
+ else {
+ if (sizestr[0] == '-') {
+ mod = -1;
+ sizestr++;
+ } else if (sizestr[0] == '+') {
+ mod = 1;
+ sizestr++;
+ }
+ new_size = memparse(sizestr, &retptr);
+ if (*retptr != '\0' || new_size == 0) {
+ ret = -EINVAL;
+ goto out_finish;
+ }
+ }
+
+ if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
+ ret = -EPERM;
+ goto out_finish;
+ }
+
+ old_size = btrfs_device_get_total_bytes(device);
+
+ if (mod < 0) {
+ if (new_size > old_size) {
+ ret = -EINVAL;
+ goto out_finish;
+ }
+ new_size = old_size - new_size;
+ } else if (mod > 0) {
+ if (new_size > ULLONG_MAX - old_size) {
+ ret = -ERANGE;
+ goto out_finish;
+ }
+ new_size = old_size + new_size;
+ }
+
+ if (new_size < SZ_256M) {
+ ret = -EINVAL;
+ goto out_finish;
+ }
+ if (new_size > bdev_nr_bytes(device->bdev)) {
+ ret = -EFBIG;
+ goto out_finish;
+ }
+
+ new_size = round_down(new_size, fs_info->sectorsize);
+
+ if (new_size > old_size) {
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_finish;
+ }
+ ret = btrfs_grow_device(trans, device, new_size);
+ btrfs_commit_transaction(trans);
+ } else if (new_size < old_size) {
+ ret = btrfs_shrink_device(device, new_size);
+ } /* equal, nothing need to do */
+
+ if (ret == 0 && new_size != old_size)
+ btrfs_info_in_rcu(fs_info,
+ "resize device %s (devid %llu) from %llu to %llu",
+ rcu_str_deref(device->name), device->devid,
+ old_size, new_size);
+out_finish:
+ btrfs_exclop_finish(fs_info);
+out_free:
+ kfree(vol_args);
+out_drop:
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+static noinline int __btrfs_ioctl_snap_create(struct file *file,
+ struct user_namespace *mnt_userns,
+ const char *name, unsigned long fd, int subvol,
+ bool readonly,
+ struct btrfs_qgroup_inherit *inherit)
+{
+ int namelen;
+ int ret = 0;
+
+ if (!S_ISDIR(file_inode(file)->i_mode))
+ return -ENOTDIR;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ goto out;
+
+ namelen = strlen(name);
+ if (strchr(name, '/')) {
+ ret = -EINVAL;
+ goto out_drop_write;
+ }
+
+ if (name[0] == '.' &&
+ (namelen == 1 || (name[1] == '.' && namelen == 2))) {
+ ret = -EEXIST;
+ goto out_drop_write;
+ }
+
+ if (subvol) {
+ ret = btrfs_mksubvol(&file->f_path, mnt_userns, name,
+ namelen, NULL, readonly, inherit);
+ } else {
+ struct fd src = fdget(fd);
+ struct inode *src_inode;
+ if (!src.file) {
+ ret = -EINVAL;
+ goto out_drop_write;
+ }
+
+ src_inode = file_inode(src.file);
+ if (src_inode->i_sb != file_inode(file)->i_sb) {
+ btrfs_info(BTRFS_I(file_inode(file))->root->fs_info,
+ "Snapshot src from another FS");
+ ret = -EXDEV;
+ } else if (!inode_owner_or_capable(mnt_userns, src_inode)) {
+ /*
+ * Subvolume creation is not restricted, but snapshots
+ * are limited to own subvolumes only
+ */
+ ret = -EPERM;
+ } else if (btrfs_ino(BTRFS_I(src_inode)) != BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * Snapshots must be made with the src_inode referring
+ * to the subvolume inode, otherwise the permission
+ * checking above is useless because we may have
+ * permission on a lower directory but not the subvol
+ * itself.
+ */
+ ret = -EINVAL;
+ } else {
+ ret = btrfs_mksnapshot(&file->f_path, mnt_userns,
+ name, namelen,
+ BTRFS_I(src_inode)->root,
+ readonly, inherit);
+ }
+ fdput(src);
+ }
+out_drop_write:
+ mnt_drop_write_file(file);
+out:
+ return ret;
+}
+
+static noinline int btrfs_ioctl_snap_create(struct file *file,
+ void __user *arg, int subvol)
+{
+ struct btrfs_ioctl_vol_args *vol_args;
+ int ret;
+
+ if (!S_ISDIR(file_inode(file)->i_mode))
+ return -ENOTDIR;
+
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
+ vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+
+ ret = __btrfs_ioctl_snap_create(file, file_mnt_user_ns(file),
+ vol_args->name, vol_args->fd, subvol,
+ false, NULL);
+
+ kfree(vol_args);
+ return ret;
+}
+
+static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
+ void __user *arg, int subvol)
+{
+ struct btrfs_ioctl_vol_args_v2 *vol_args;
+ int ret;
+ bool readonly = false;
+ struct btrfs_qgroup_inherit *inherit = NULL;
+
+ if (!S_ISDIR(file_inode(file)->i_mode))
+ return -ENOTDIR;
+
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
+ vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
+
+ if (vol_args->flags & ~BTRFS_SUBVOL_CREATE_ARGS_MASK) {
+ ret = -EOPNOTSUPP;
+ goto free_args;
+ }
+
+ if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
+ readonly = true;
+ if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
+ u64 nums;
+
+ if (vol_args->size < sizeof(*inherit) ||
+ vol_args->size > PAGE_SIZE) {
+ ret = -EINVAL;
+ goto free_args;
+ }
+ inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
+ if (IS_ERR(inherit)) {
+ ret = PTR_ERR(inherit);
+ goto free_args;
+ }
+
+ if (inherit->num_qgroups > PAGE_SIZE ||
+ inherit->num_ref_copies > PAGE_SIZE ||
+ inherit->num_excl_copies > PAGE_SIZE) {
+ ret = -EINVAL;
+ goto free_inherit;
+ }
+
+ nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
+ 2 * inherit->num_excl_copies;
+ if (vol_args->size != struct_size(inherit, qgroups, nums)) {
+ ret = -EINVAL;
+ goto free_inherit;
+ }
+ }
+
+ ret = __btrfs_ioctl_snap_create(file, file_mnt_user_ns(file),
+ vol_args->name, vol_args->fd, subvol,
+ readonly, inherit);
+ if (ret)
+ goto free_inherit;
+free_inherit:
+ kfree(inherit);
+free_args:
+ kfree(vol_args);
+ return ret;
+}
+
+static noinline int btrfs_ioctl_subvol_getflags(struct inode *inode,
+ void __user *arg)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret = 0;
+ u64 flags = 0;
+
+ if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID)
+ return -EINVAL;
+
+ down_read(&fs_info->subvol_sem);
+ if (btrfs_root_readonly(root))
+ flags |= BTRFS_SUBVOL_RDONLY;
+ up_read(&fs_info->subvol_sem);
+
+ if (copy_to_user(arg, &flags, sizeof(flags)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
+ void __user *arg)
+{
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ u64 root_flags;
+ u64 flags;
+ int ret = 0;
+
+ if (!inode_owner_or_capable(file_mnt_user_ns(file), inode))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ goto out;
+
+ if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
+ ret = -EINVAL;
+ goto out_drop_write;
+ }
+
+ if (copy_from_user(&flags, arg, sizeof(flags))) {
+ ret = -EFAULT;
+ goto out_drop_write;
+ }
+
+ if (flags & ~BTRFS_SUBVOL_RDONLY) {
+ ret = -EOPNOTSUPP;
+ goto out_drop_write;
+ }
+
+ down_write(&fs_info->subvol_sem);
+
+ /* nothing to do */
+ if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
+ goto out_drop_sem;
+
+ root_flags = btrfs_root_flags(&root->root_item);
+ if (flags & BTRFS_SUBVOL_RDONLY) {
+ btrfs_set_root_flags(&root->root_item,
+ root_flags | BTRFS_ROOT_SUBVOL_RDONLY);
+ } else {
+ /*
+ * Block RO -> RW transition if this subvolume is involved in
+ * send
+ */
+ spin_lock(&root->root_item_lock);
+ if (root->send_in_progress == 0) {
+ btrfs_set_root_flags(&root->root_item,
+ root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
+ spin_unlock(&root->root_item_lock);
+ } else {
+ spin_unlock(&root->root_item_lock);
+ btrfs_warn(fs_info,
+ "Attempt to set subvolume %llu read-write during send",
+ root->root_key.objectid);
+ ret = -EPERM;
+ goto out_drop_sem;
+ }
+ }
+
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_reset;
+ }
+
+ ret = btrfs_update_root(trans, fs_info->tree_root,
+ &root->root_key, &root->root_item);
+ if (ret < 0) {
+ btrfs_end_transaction(trans);
+ goto out_reset;
+ }
+
+ ret = btrfs_commit_transaction(trans);
+
+out_reset:
+ if (ret)
+ btrfs_set_root_flags(&root->root_item, root_flags);
+out_drop_sem:
+ up_write(&fs_info->subvol_sem);
+out_drop_write:
+ mnt_drop_write_file(file);
+out:
+ return ret;
+}
+
+static noinline int key_in_sk(struct btrfs_key *key,
+ struct btrfs_ioctl_search_key *sk)
+{
+ struct btrfs_key test;
+ int ret;
+
+ test.objectid = sk->min_objectid;
+ test.type = sk->min_type;
+ test.offset = sk->min_offset;
+
+ ret = btrfs_comp_cpu_keys(key, &test);
+ if (ret < 0)
+ return 0;
+
+ test.objectid = sk->max_objectid;
+ test.type = sk->max_type;
+ test.offset = sk->max_offset;
+
+ ret = btrfs_comp_cpu_keys(key, &test);
+ if (ret > 0)
+ return 0;
+ return 1;
+}
+
+static noinline int copy_to_sk(struct btrfs_path *path,
+ struct btrfs_key *key,
+ struct btrfs_ioctl_search_key *sk,
+ u64 *buf_size,
+ char __user *ubuf,
+ unsigned long *sk_offset,
+ int *num_found)
+{
+ u64 found_transid;
+ struct extent_buffer *leaf;
+ struct btrfs_ioctl_search_header sh;
+ struct btrfs_key test;
+ unsigned long item_off;
+ unsigned long item_len;
+ int nritems;
+ int i;
+ int slot;
+ int ret = 0;
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ nritems = btrfs_header_nritems(leaf);
+
+ if (btrfs_header_generation(leaf) > sk->max_transid) {
+ i = nritems;
+ goto advance_key;
+ }
+ found_transid = btrfs_header_generation(leaf);
+
+ for (i = slot; i < nritems; i++) {
+ item_off = btrfs_item_ptr_offset(leaf, i);
+ item_len = btrfs_item_size(leaf, i);
+
+ btrfs_item_key_to_cpu(leaf, key, i);
+ if (!key_in_sk(key, sk))
+ continue;
+
+ if (sizeof(sh) + item_len > *buf_size) {
+ if (*num_found) {
+ ret = 1;
+ goto out;
+ }
+
+ /*
+ * return one empty item back for v1, which does not
+ * handle -EOVERFLOW
+ */
+
+ *buf_size = sizeof(sh) + item_len;
+ item_len = 0;
+ ret = -EOVERFLOW;
+ }
+
+ if (sizeof(sh) + item_len + *sk_offset > *buf_size) {
+ ret = 1;
+ goto out;
+ }
+
+ sh.objectid = key->objectid;
+ sh.offset = key->offset;
+ sh.type = key->type;
+ sh.len = item_len;
+ sh.transid = found_transid;
+
+ /*
+ * Copy search result header. If we fault then loop again so we
+ * can fault in the pages and -EFAULT there if there's a
+ * problem. Otherwise we'll fault and then copy the buffer in
+ * properly this next time through
+ */
+ if (copy_to_user_nofault(ubuf + *sk_offset, &sh, sizeof(sh))) {
+ ret = 0;
+ goto out;
+ }
+
+ *sk_offset += sizeof(sh);
+
+ if (item_len) {
+ char __user *up = ubuf + *sk_offset;
+ /*
+ * Copy the item, same behavior as above, but reset the
+ * * sk_offset so we copy the full thing again.
+ */
+ if (read_extent_buffer_to_user_nofault(leaf, up,
+ item_off, item_len)) {
+ ret = 0;
+ *sk_offset -= sizeof(sh);
+ goto out;
+ }
+
+ *sk_offset += item_len;
+ }
+ (*num_found)++;
+
+ if (ret) /* -EOVERFLOW from above */
+ goto out;
+
+ if (*num_found >= sk->nr_items) {
+ ret = 1;
+ goto out;
+ }
+ }
+advance_key:
+ ret = 0;
+ test.objectid = sk->max_objectid;
+ test.type = sk->max_type;
+ test.offset = sk->max_offset;
+ if (btrfs_comp_cpu_keys(key, &test) >= 0)
+ ret = 1;
+ else if (key->offset < (u64)-1)
+ key->offset++;
+ else if (key->type < (u8)-1) {
+ key->offset = 0;
+ key->type++;
+ } else if (key->objectid < (u64)-1) {
+ key->offset = 0;
+ key->type = 0;
+ key->objectid++;
+ } else
+ ret = 1;
+out:
+ /*
+ * 0: all items from this leaf copied, continue with next
+ * 1: * more items can be copied, but unused buffer is too small
+ * * all items were found
+ * Either way, it will stops the loop which iterates to the next
+ * leaf
+ * -EOVERFLOW: item was to large for buffer
+ * -EFAULT: could not copy extent buffer back to userspace
+ */
+ return ret;
+}
+
+static noinline int search_ioctl(struct inode *inode,
+ struct btrfs_ioctl_search_key *sk,
+ u64 *buf_size,
+ char __user *ubuf)
+{
+ struct btrfs_fs_info *info = btrfs_sb(inode->i_sb);
+ struct btrfs_root *root;
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ int ret;
+ int num_found = 0;
+ unsigned long sk_offset = 0;
+
+ if (*buf_size < sizeof(struct btrfs_ioctl_search_header)) {
+ *buf_size = sizeof(struct btrfs_ioctl_search_header);
+ return -EOVERFLOW;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ if (sk->tree_id == 0) {
+ /* search the root of the inode that was passed */
+ root = btrfs_grab_root(BTRFS_I(inode)->root);
+ } else {
+ root = btrfs_get_fs_root(info, sk->tree_id, true);
+ if (IS_ERR(root)) {
+ btrfs_free_path(path);
+ return PTR_ERR(root);
+ }
+ }
+
+ key.objectid = sk->min_objectid;
+ key.type = sk->min_type;
+ key.offset = sk->min_offset;
+
+ while (1) {
+ ret = -EFAULT;
+ /*
+ * Ensure that the whole user buffer is faulted in at sub-page
+ * granularity, otherwise the loop may live-lock.
+ */
+ if (fault_in_subpage_writeable(ubuf + sk_offset,
+ *buf_size - sk_offset))
+ break;
+
+ ret = btrfs_search_forward(root, &key, path, sk->min_transid);
+ if (ret != 0) {
+ if (ret > 0)
+ ret = 0;
+ goto err;
+ }
+ ret = copy_to_sk(path, &key, sk, buf_size, ubuf,
+ &sk_offset, &num_found);
+ btrfs_release_path(path);
+ if (ret)
+ break;
+
+ }
+ if (ret > 0)
+ ret = 0;
+err:
+ sk->nr_items = num_found;
+ btrfs_put_root(root);
+ btrfs_free_path(path);
+ return ret;
+}
+
+static noinline int btrfs_ioctl_tree_search(struct inode *inode,
+ void __user *argp)
+{
+ struct btrfs_ioctl_search_args __user *uargs = argp;
+ struct btrfs_ioctl_search_key sk;
+ int ret;
+ u64 buf_size;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&sk, &uargs->key, sizeof(sk)))
+ return -EFAULT;
+
+ buf_size = sizeof(uargs->buf);
+
+ ret = search_ioctl(inode, &sk, &buf_size, uargs->buf);
+
+ /*
+ * In the origin implementation an overflow is handled by returning a
+ * search header with a len of zero, so reset ret.
+ */
+ if (ret == -EOVERFLOW)
+ ret = 0;
+
+ if (ret == 0 && copy_to_user(&uargs->key, &sk, sizeof(sk)))
+ ret = -EFAULT;
+ return ret;
+}
+
+static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode,
+ void __user *argp)
+{
+ struct btrfs_ioctl_search_args_v2 __user *uarg = argp;
+ struct btrfs_ioctl_search_args_v2 args;
+ int ret;
+ u64 buf_size;
+ const u64 buf_limit = SZ_16M;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /* copy search header and buffer size */
+ if (copy_from_user(&args, uarg, sizeof(args)))
+ return -EFAULT;
+
+ buf_size = args.buf_size;
+
+ /* limit result size to 16MB */
+ if (buf_size > buf_limit)
+ buf_size = buf_limit;
+
+ ret = search_ioctl(inode, &args.key, &buf_size,
+ (char __user *)(&uarg->buf[0]));
+ if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key)))
+ ret = -EFAULT;
+ else if (ret == -EOVERFLOW &&
+ copy_to_user(&uarg->buf_size, &buf_size, sizeof(buf_size)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+/*
+ * Search INODE_REFs to identify path name of 'dirid' directory
+ * in a 'tree_id' tree. and sets path name to 'name'.
+ */
+static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
+ u64 tree_id, u64 dirid, char *name)
+{
+ struct btrfs_root *root;
+ struct btrfs_key key;
+ char *ptr;
+ int ret = -1;
+ int slot;
+ int len;
+ int total_len = 0;
+ struct btrfs_inode_ref *iref;
+ struct extent_buffer *l;
+ struct btrfs_path *path;
+
+ if (dirid == BTRFS_FIRST_FREE_OBJECTID) {
+ name[0]='\0';
+ return 0;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX - 1];
+
+ root = btrfs_get_fs_root(info, tree_id, true);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ root = NULL;
+ goto out;
+ }
+
+ key.objectid = dirid;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = (u64)-1;
+
+ while (1) {
+ ret = btrfs_search_backwards(root, &key, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+
+ iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
+ len = btrfs_inode_ref_name_len(l, iref);
+ ptr -= len + 1;
+ total_len += len + 1;
+ if (ptr < name) {
+ ret = -ENAMETOOLONG;
+ goto out;
+ }
+
+ *(ptr + len) = '/';
+ read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len);
+
+ if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
+ break;
+
+ btrfs_release_path(path);
+ key.objectid = key.offset;
+ key.offset = (u64)-1;
+ dirid = key.objectid;
+ }
+ memmove(name, ptr, total_len);
+ name[total_len] = '\0';
+ ret = 0;
+out:
+ btrfs_put_root(root);
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int btrfs_search_path_in_tree_user(struct user_namespace *mnt_userns,
+ struct inode *inode,
+ struct btrfs_ioctl_ino_lookup_user_args *args)
+{
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct super_block *sb = inode->i_sb;
+ struct btrfs_key upper_limit = BTRFS_I(inode)->location;
+ u64 treeid = BTRFS_I(inode)->root->root_key.objectid;
+ u64 dirid = args->dirid;
+ unsigned long item_off;
+ unsigned long item_len;
+ struct btrfs_inode_ref *iref;
+ struct btrfs_root_ref *rref;
+ struct btrfs_root *root = NULL;
+ struct btrfs_path *path;
+ struct btrfs_key key, key2;
+ struct extent_buffer *leaf;
+ struct inode *temp_inode;
+ char *ptr;
+ int slot;
+ int len;
+ int total_len = 0;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /*
+ * If the bottom subvolume does not exist directly under upper_limit,
+ * construct the path in from the bottom up.
+ */
+ if (dirid != upper_limit.objectid) {
+ ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1];
+
+ root = btrfs_get_fs_root(fs_info, treeid, true);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+
+ key.objectid = dirid;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = (u64)-1;
+ while (1) {
+ ret = btrfs_search_backwards(root, &key, path);
+ if (ret < 0)
+ goto out_put;
+ else if (ret > 0) {
+ ret = -ENOENT;
+ goto out_put;
+ }
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ iref = btrfs_item_ptr(leaf, slot, struct btrfs_inode_ref);
+ len = btrfs_inode_ref_name_len(leaf, iref);
+ ptr -= len + 1;
+ total_len += len + 1;
+ if (ptr < args->path) {
+ ret = -ENAMETOOLONG;
+ goto out_put;
+ }
+
+ *(ptr + len) = '/';
+ read_extent_buffer(leaf, ptr,
+ (unsigned long)(iref + 1), len);
+
+ /* Check the read+exec permission of this directory */
+ ret = btrfs_previous_item(root, path, dirid,
+ BTRFS_INODE_ITEM_KEY);
+ if (ret < 0) {
+ goto out_put;
+ } else if (ret > 0) {
+ ret = -ENOENT;
+ goto out_put;
+ }
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &key2, slot);
+ if (key2.objectid != dirid) {
+ ret = -ENOENT;
+ goto out_put;
+ }
+
+ /*
+ * We don't need the path anymore, so release it and
+ * avoid deadlocks and lockdep warnings in case
+ * btrfs_iget() needs to lookup the inode from its root
+ * btree and lock the same leaf.
+ */
+ btrfs_release_path(path);
+ temp_inode = btrfs_iget(sb, key2.objectid, root);
+ if (IS_ERR(temp_inode)) {
+ ret = PTR_ERR(temp_inode);
+ goto out_put;
+ }
+ ret = inode_permission(mnt_userns, temp_inode,
+ MAY_READ | MAY_EXEC);
+ iput(temp_inode);
+ if (ret) {
+ ret = -EACCES;
+ goto out_put;
+ }
+
+ if (key.offset == upper_limit.objectid)
+ break;
+ if (key.objectid == BTRFS_FIRST_FREE_OBJECTID) {
+ ret = -EACCES;
+ goto out_put;
+ }
+
+ key.objectid = key.offset;
+ key.offset = (u64)-1;
+ dirid = key.objectid;
+ }
+
+ memmove(args->path, ptr, total_len);
+ args->path[total_len] = '\0';
+ btrfs_put_root(root);
+ root = NULL;
+ btrfs_release_path(path);
+ }
+
+ /* Get the bottom subvolume's name from ROOT_REF */
+ key.objectid = treeid;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = args->treeid;
+ ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ item_off = btrfs_item_ptr_offset(leaf, slot);
+ item_len = btrfs_item_size(leaf, slot);
+ /* Check if dirid in ROOT_REF corresponds to passed dirid */
+ rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
+ if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Copy subvolume's name */
+ item_off += sizeof(struct btrfs_root_ref);
+ item_len -= sizeof(struct btrfs_root_ref);
+ read_extent_buffer(leaf, args->name, item_off, item_len);
+ args->name[item_len] = 0;
+
+out_put:
+ btrfs_put_root(root);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static noinline int btrfs_ioctl_ino_lookup(struct btrfs_root *root,
+ void __user *argp)
+{
+ struct btrfs_ioctl_ino_lookup_args *args;
+ int ret = 0;
+
+ args = memdup_user(argp, sizeof(*args));
+ if (IS_ERR(args))
+ return PTR_ERR(args);
+
+ /*
+ * Unprivileged query to obtain the containing subvolume root id. The
+ * path is reset so it's consistent with btrfs_search_path_in_tree.
+ */
+ if (args->treeid == 0)
+ args->treeid = root->root_key.objectid;
+
+ if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) {
+ args->name[0] = 0;
+ goto out;
+ }
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ ret = btrfs_search_path_in_tree(root->fs_info,
+ args->treeid, args->objectid,
+ args->name);
+
+out:
+ if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
+ ret = -EFAULT;
+
+ kfree(args);
+ return ret;
+}
+
+/*
+ * Version of ino_lookup ioctl (unprivileged)
+ *
+ * The main differences from ino_lookup ioctl are:
+ *
+ * 1. Read + Exec permission will be checked using inode_permission() during
+ * path construction. -EACCES will be returned in case of failure.
+ * 2. Path construction will be stopped at the inode number which corresponds
+ * to the fd with which this ioctl is called. If constructed path does not
+ * exist under fd's inode, -EACCES will be returned.
+ * 3. The name of bottom subvolume is also searched and filled.
+ */
+static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp)
+{
+ struct btrfs_ioctl_ino_lookup_user_args *args;
+ struct inode *inode;
+ int ret;
+
+ args = memdup_user(argp, sizeof(*args));
+ if (IS_ERR(args))
+ return PTR_ERR(args);
+
+ inode = file_inode(file);
+
+ if (args->dirid == BTRFS_FIRST_FREE_OBJECTID &&
+ BTRFS_I(inode)->location.objectid != BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * The subvolume does not exist under fd with which this is
+ * called
+ */
+ kfree(args);
+ return -EACCES;
+ }
+
+ ret = btrfs_search_path_in_tree_user(file_mnt_user_ns(file), inode, args);
+
+ if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
+ ret = -EFAULT;
+
+ kfree(args);
+ return ret;
+}
+
+/* Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF */
+static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp)
+{
+ struct btrfs_ioctl_get_subvol_info_args *subvol_info;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_root *root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_root_item *root_item;
+ struct btrfs_root_ref *rref;
+ struct extent_buffer *leaf;
+ unsigned long item_off;
+ unsigned long item_len;
+ int slot;
+ int ret = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ subvol_info = kzalloc(sizeof(*subvol_info), GFP_KERNEL);
+ if (!subvol_info) {
+ btrfs_free_path(path);
+ return -ENOMEM;
+ }
+
+ fs_info = BTRFS_I(inode)->root->fs_info;
+
+ /* Get root_item of inode's subvolume */
+ key.objectid = BTRFS_I(inode)->root->root_key.objectid;
+ root = btrfs_get_fs_root(fs_info, key.objectid, true);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ goto out_free;
+ }
+ root_item = &root->root_item;
+
+ subvol_info->treeid = key.objectid;
+
+ subvol_info->generation = btrfs_root_generation(root_item);
+ subvol_info->flags = btrfs_root_flags(root_item);
+
+ memcpy(subvol_info->uuid, root_item->uuid, BTRFS_UUID_SIZE);
+ memcpy(subvol_info->parent_uuid, root_item->parent_uuid,
+ BTRFS_UUID_SIZE);
+ memcpy(subvol_info->received_uuid, root_item->received_uuid,
+ BTRFS_UUID_SIZE);
+
+ subvol_info->ctransid = btrfs_root_ctransid(root_item);
+ subvol_info->ctime.sec = btrfs_stack_timespec_sec(&root_item->ctime);
+ subvol_info->ctime.nsec = btrfs_stack_timespec_nsec(&root_item->ctime);
+
+ subvol_info->otransid = btrfs_root_otransid(root_item);
+ subvol_info->otime.sec = btrfs_stack_timespec_sec(&root_item->otime);
+ subvol_info->otime.nsec = btrfs_stack_timespec_nsec(&root_item->otime);
+
+ subvol_info->stransid = btrfs_root_stransid(root_item);
+ subvol_info->stime.sec = btrfs_stack_timespec_sec(&root_item->stime);
+ subvol_info->stime.nsec = btrfs_stack_timespec_nsec(&root_item->stime);
+
+ subvol_info->rtransid = btrfs_root_rtransid(root_item);
+ subvol_info->rtime.sec = btrfs_stack_timespec_sec(&root_item->rtime);
+ subvol_info->rtime.nsec = btrfs_stack_timespec_nsec(&root_item->rtime);
+
+ if (key.objectid != BTRFS_FS_TREE_OBJECTID) {
+ /* Search root tree for ROOT_BACKREF of this subvolume */
+ key.type = BTRFS_ROOT_BACKREF_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto out;
+ } else if (path->slots[0] >=
+ btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(fs_info->tree_root, path);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid == subvol_info->treeid &&
+ key.type == BTRFS_ROOT_BACKREF_KEY) {
+ subvol_info->parent_id = key.offset;
+
+ rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
+ subvol_info->dirid = btrfs_root_ref_dirid(leaf, rref);
+
+ item_off = btrfs_item_ptr_offset(leaf, slot)
+ + sizeof(struct btrfs_root_ref);
+ item_len = btrfs_item_size(leaf, slot)
+ - sizeof(struct btrfs_root_ref);
+ read_extent_buffer(leaf, subvol_info->name,
+ item_off, item_len);
+ } else {
+ ret = -ENOENT;
+ goto out;
+ }
+ }
+
+ btrfs_free_path(path);
+ path = NULL;
+ if (copy_to_user(argp, subvol_info, sizeof(*subvol_info)))
+ ret = -EFAULT;
+
+out:
+ btrfs_put_root(root);
+out_free:
+ btrfs_free_path(path);
+ kfree(subvol_info);
+ return ret;
+}
+
+/*
+ * Return ROOT_REF information of the subvolume containing this inode
+ * except the subvolume name.
+ */
+static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root,
+ void __user *argp)
+{
+ struct btrfs_ioctl_get_subvol_rootref_args *rootrefs;
+ struct btrfs_root_ref *rref;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ u64 objectid;
+ int slot;
+ int ret;
+ u8 found;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ rootrefs = memdup_user(argp, sizeof(*rootrefs));
+ if (IS_ERR(rootrefs)) {
+ btrfs_free_path(path);
+ return PTR_ERR(rootrefs);
+ }
+
+ objectid = root->root_key.objectid;
+ key.objectid = objectid;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = rootrefs->min_treeid;
+ found = 0;
+
+ root = root->fs_info->tree_root;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto out;
+ } else if (path->slots[0] >=
+ btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+ while (1) {
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != objectid || key.type != BTRFS_ROOT_REF_KEY) {
+ ret = 0;
+ goto out;
+ }
+
+ if (found == BTRFS_MAX_ROOTREF_BUFFER_NUM) {
+ ret = -EOVERFLOW;
+ goto out;
+ }
+
+ rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
+ rootrefs->rootref[found].treeid = key.offset;
+ rootrefs->rootref[found].dirid =
+ btrfs_root_ref_dirid(leaf, rref);
+ found++;
+
+ ret = btrfs_next_item(root, path);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+
+out:
+ btrfs_free_path(path);
+
+ if (!ret || ret == -EOVERFLOW) {
+ rootrefs->num_items = found;
+ /* update min_treeid for next search */
+ if (found)
+ rootrefs->min_treeid =
+ rootrefs->rootref[found - 1].treeid + 1;
+ if (copy_to_user(argp, rootrefs, sizeof(*rootrefs)))
+ ret = -EFAULT;
+ }
+
+ kfree(rootrefs);
+
+ return ret;
+}
+
+static noinline int btrfs_ioctl_snap_destroy(struct file *file,
+ void __user *arg,
+ bool destroy_v2)
+{
+ struct dentry *parent = file->f_path.dentry;
+ struct btrfs_fs_info *fs_info = btrfs_sb(parent->d_sb);
+ struct dentry *dentry;
+ struct inode *dir = d_inode(parent);
+ struct inode *inode;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_root *dest = NULL;
+ struct btrfs_ioctl_vol_args *vol_args = NULL;
+ struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL;
+ struct user_namespace *mnt_userns = file_mnt_user_ns(file);
+ char *subvol_name, *subvol_name_ptr = NULL;
+ int subvol_namelen;
+ int err = 0;
+ bool destroy_parent = false;
+
+ /* We don't support snapshots with extent tree v2 yet. */
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info,
+ "extent tree v2 doesn't support snapshot deletion yet");
+ return -EOPNOTSUPP;
+ }
+
+ if (destroy_v2) {
+ vol_args2 = memdup_user(arg, sizeof(*vol_args2));
+ if (IS_ERR(vol_args2))
+ return PTR_ERR(vol_args2);
+
+ if (vol_args2->flags & ~BTRFS_SUBVOL_DELETE_ARGS_MASK) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ /*
+ * If SPEC_BY_ID is not set, we are looking for the subvolume by
+ * name, same as v1 currently does.
+ */
+ if (!(vol_args2->flags & BTRFS_SUBVOL_SPEC_BY_ID)) {
+ vol_args2->name[BTRFS_SUBVOL_NAME_MAX] = 0;
+ subvol_name = vol_args2->name;
+
+ err = mnt_want_write_file(file);
+ if (err)
+ goto out;
+ } else {
+ struct inode *old_dir;
+
+ if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = mnt_want_write_file(file);
+ if (err)
+ goto out;
+
+ dentry = btrfs_get_dentry(fs_info->sb,
+ BTRFS_FIRST_FREE_OBJECTID,
+ vol_args2->subvolid, 0, 0);
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
+ goto out_drop_write;
+ }
+
+ /*
+ * Change the default parent since the subvolume being
+ * deleted can be outside of the current mount point.
+ */
+ parent = btrfs_get_parent(dentry);
+
+ /*
+ * At this point dentry->d_name can point to '/' if the
+ * subvolume we want to destroy is outsite of the
+ * current mount point, so we need to release the
+ * current dentry and execute the lookup to return a new
+ * one with ->d_name pointing to the
+ * <mount point>/subvol_name.
+ */
+ dput(dentry);
+ if (IS_ERR(parent)) {
+ err = PTR_ERR(parent);
+ goto out_drop_write;
+ }
+ old_dir = dir;
+ dir = d_inode(parent);
+
+ /*
+ * If v2 was used with SPEC_BY_ID, a new parent was
+ * allocated since the subvolume can be outside of the
+ * current mount point. Later on we need to release this
+ * new parent dentry.
+ */
+ destroy_parent = true;
+
+ /*
+ * On idmapped mounts, deletion via subvolid is
+ * restricted to subvolumes that are immediate
+ * ancestors of the inode referenced by the file
+ * descriptor in the ioctl. Otherwise the idmapping
+ * could potentially be abused to delete subvolumes
+ * anywhere in the filesystem the user wouldn't be able
+ * to delete without an idmapped mount.
+ */
+ if (old_dir != dir && mnt_userns != &init_user_ns) {
+ err = -EOPNOTSUPP;
+ goto free_parent;
+ }
+
+ subvol_name_ptr = btrfs_get_subvol_name_from_objectid(
+ fs_info, vol_args2->subvolid);
+ if (IS_ERR(subvol_name_ptr)) {
+ err = PTR_ERR(subvol_name_ptr);
+ goto free_parent;
+ }
+ /* subvol_name_ptr is already nul terminated */
+ subvol_name = (char *)kbasename(subvol_name_ptr);
+ }
+ } else {
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
+
+ vol_args->name[BTRFS_PATH_NAME_MAX] = 0;
+ subvol_name = vol_args->name;
+
+ err = mnt_want_write_file(file);
+ if (err)
+ goto out;
+ }
+
+ subvol_namelen = strlen(subvol_name);
+
+ if (strchr(subvol_name, '/') ||
+ strncmp(subvol_name, "..", subvol_namelen) == 0) {
+ err = -EINVAL;
+ goto free_subvol_name;
+ }
+
+ if (!S_ISDIR(dir->i_mode)) {
+ err = -ENOTDIR;
+ goto free_subvol_name;
+ }
+
+ err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
+ if (err == -EINTR)
+ goto free_subvol_name;
+ dentry = lookup_one(mnt_userns, subvol_name, parent, subvol_namelen);
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
+ goto out_unlock_dir;
+ }
+
+ if (d_really_is_negative(dentry)) {
+ err = -ENOENT;
+ goto out_dput;
+ }
+
+ inode = d_inode(dentry);
+ dest = BTRFS_I(inode)->root;
+ if (!capable(CAP_SYS_ADMIN)) {
+ /*
+ * Regular user. Only allow this with a special mount
+ * option, when the user has write+exec access to the
+ * subvol root, and when rmdir(2) would have been
+ * allowed.
+ *
+ * Note that this is _not_ check that the subvol is
+ * empty or doesn't contain data that we wouldn't
+ * otherwise be able to delete.
+ *
+ * Users who want to delete empty subvols should try
+ * rmdir(2).
+ */
+ err = -EPERM;
+ if (!btrfs_test_opt(fs_info, USER_SUBVOL_RM_ALLOWED))
+ goto out_dput;
+
+ /*
+ * Do not allow deletion if the parent dir is the same
+ * as the dir to be deleted. That means the ioctl
+ * must be called on the dentry referencing the root
+ * of the subvol, not a random directory contained
+ * within it.
+ */
+ err = -EINVAL;
+ if (root == dest)
+ goto out_dput;
+
+ err = inode_permission(mnt_userns, inode, MAY_WRITE | MAY_EXEC);
+ if (err)
+ goto out_dput;
+ }
+
+ /* check if subvolume may be deleted by a user */
+ err = btrfs_may_delete(mnt_userns, dir, dentry, 1);
+ if (err)
+ goto out_dput;
+
+ if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
+ err = -EINVAL;
+ goto out_dput;
+ }
+
+ btrfs_inode_lock(inode, 0);
+ err = btrfs_delete_subvolume(dir, dentry);
+ btrfs_inode_unlock(inode, 0);
+ if (!err)
+ d_delete_notify(dir, dentry);
+
+out_dput:
+ dput(dentry);
+out_unlock_dir:
+ btrfs_inode_unlock(dir, 0);
+free_subvol_name:
+ kfree(subvol_name_ptr);
+free_parent:
+ if (destroy_parent)
+ dput(parent);
+out_drop_write:
+ mnt_drop_write_file(file);
+out:
+ kfree(vol_args2);
+ kfree(vol_args);
+ return err;
+}
+
+static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
+{
+ struct inode *inode = file_inode(file);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_ioctl_defrag_range_args range = {0};
+ int ret;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ if (btrfs_root_readonly(root)) {
+ ret = -EROFS;
+ goto out;
+ }
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFDIR:
+ if (!capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto out;
+ }
+ ret = btrfs_defrag_root(root);
+ break;
+ case S_IFREG:
+ /*
+ * Note that this does not check the file descriptor for write
+ * access. This prevents defragmenting executables that are
+ * running and allows defrag on files open in read-only mode.
+ */
+ if (!capable(CAP_SYS_ADMIN) &&
+ inode_permission(&init_user_ns, inode, MAY_WRITE)) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ if (argp) {
+ if (copy_from_user(&range, argp, sizeof(range))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ if (range.flags & ~BTRFS_DEFRAG_RANGE_FLAGS_SUPP) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+ /* compression requires us to start the IO */
+ if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
+ range.flags |= BTRFS_DEFRAG_RANGE_START_IO;
+ range.extent_thresh = (u32)-1;
+ }
+ } else {
+ /* the rest are all set to zero by kzalloc */
+ range.len = (u64)-1;
+ }
+ ret = btrfs_defrag_file(file_inode(file), &file->f_ra,
+ &range, BTRFS_OLDEST_GENERATION, 0);
+ if (ret > 0)
+ ret = 0;
+ break;
+ default:
+ ret = -EINVAL;
+ }
+out:
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
+{
+ struct btrfs_ioctl_vol_args *vol_args;
+ bool restore_op = false;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info, "device add not supported on extent tree v2 yet");
+ return -EINVAL;
+ }
+
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) {
+ if (!btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD))
+ return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+
+ /*
+ * We can do the device add because we have a paused balanced,
+ * change the exclusive op type and remember we should bring
+ * back the paused balance
+ */
+ fs_info->exclusive_operation = BTRFS_EXCLOP_DEV_ADD;
+ btrfs_exclop_start_unlock(fs_info);
+ restore_op = true;
+ }
+
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args)) {
+ ret = PTR_ERR(vol_args);
+ goto out;
+ }
+
+ vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ ret = btrfs_init_new_device(fs_info, vol_args->name);
+
+ if (!ret)
+ btrfs_info(fs_info, "disk added %s", vol_args->name);
+
+ kfree(vol_args);
+out:
+ if (restore_op)
+ btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED);
+ else
+ btrfs_exclop_finish(fs_info);
+ return ret;
+}
+
+static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
+{
+ BTRFS_DEV_LOOKUP_ARGS(args);
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_ioctl_vol_args_v2 *vol_args;
+ struct block_device *bdev = NULL;
+ fmode_t mode;
+ int ret;
+ bool cancel = false;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
+
+ if (vol_args->flags & ~BTRFS_DEVICE_REMOVE_ARGS_MASK) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
+ if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
+ args.devid = vol_args->devid;
+ } else if (!strcmp("cancel", vol_args->name)) {
+ cancel = true;
+ } else {
+ ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name);
+ if (ret)
+ goto out;
+ }
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ goto out;
+
+ ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
+ cancel);
+ if (ret)
+ goto err_drop;
+
+ /* Exclusive operation is now claimed */
+ ret = btrfs_rm_device(fs_info, &args, &bdev, &mode);
+
+ btrfs_exclop_finish(fs_info);
+
+ if (!ret) {
+ if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
+ btrfs_info(fs_info, "device deleted: id %llu",
+ vol_args->devid);
+ else
+ btrfs_info(fs_info, "device deleted: %s",
+ vol_args->name);
+ }
+err_drop:
+ mnt_drop_write_file(file);
+ if (bdev)
+ blkdev_put(bdev, mode);
+out:
+ btrfs_put_dev_args_from_path(&args);
+ kfree(vol_args);
+ return ret;
+}
+
+static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
+{
+ BTRFS_DEV_LOOKUP_ARGS(args);
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_ioctl_vol_args *vol_args;
+ struct block_device *bdev = NULL;
+ fmode_t mode;
+ int ret;
+ bool cancel = false;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
+
+ vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ if (!strcmp("cancel", vol_args->name)) {
+ cancel = true;
+ } else {
+ ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name);
+ if (ret)
+ goto out;
+ }
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ goto out;
+
+ ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
+ cancel);
+ if (ret == 0) {
+ ret = btrfs_rm_device(fs_info, &args, &bdev, &mode);
+ if (!ret)
+ btrfs_info(fs_info, "disk deleted %s", vol_args->name);
+ btrfs_exclop_finish(fs_info);
+ }
+
+ mnt_drop_write_file(file);
+ if (bdev)
+ blkdev_put(bdev, mode);
+out:
+ btrfs_put_dev_args_from_path(&args);
+ kfree(vol_args);
+ return ret;
+}
+
+static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
+ void __user *arg)
+{
+ struct btrfs_ioctl_fs_info_args *fi_args;
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ u64 flags_in;
+ int ret = 0;
+
+ fi_args = memdup_user(arg, sizeof(*fi_args));
+ if (IS_ERR(fi_args))
+ return PTR_ERR(fi_args);
+
+ flags_in = fi_args->flags;
+ memset(fi_args, 0, sizeof(*fi_args));
+
+ rcu_read_lock();
+ fi_args->num_devices = fs_devices->num_devices;
+
+ list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
+ if (device->devid > fi_args->max_id)
+ fi_args->max_id = device->devid;
+ }
+ rcu_read_unlock();
+
+ memcpy(&fi_args->fsid, fs_devices->fsid, sizeof(fi_args->fsid));
+ fi_args->nodesize = fs_info->nodesize;
+ fi_args->sectorsize = fs_info->sectorsize;
+ fi_args->clone_alignment = fs_info->sectorsize;
+
+ if (flags_in & BTRFS_FS_INFO_FLAG_CSUM_INFO) {
+ fi_args->csum_type = btrfs_super_csum_type(fs_info->super_copy);
+ fi_args->csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ fi_args->flags |= BTRFS_FS_INFO_FLAG_CSUM_INFO;
+ }
+
+ if (flags_in & BTRFS_FS_INFO_FLAG_GENERATION) {
+ fi_args->generation = fs_info->generation;
+ fi_args->flags |= BTRFS_FS_INFO_FLAG_GENERATION;
+ }
+
+ if (flags_in & BTRFS_FS_INFO_FLAG_METADATA_UUID) {
+ memcpy(&fi_args->metadata_uuid, fs_devices->metadata_uuid,
+ sizeof(fi_args->metadata_uuid));
+ fi_args->flags |= BTRFS_FS_INFO_FLAG_METADATA_UUID;
+ }
+
+ if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
+ ret = -EFAULT;
+
+ kfree(fi_args);
+ return ret;
+}
+
+static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
+ void __user *arg)
+{
+ BTRFS_DEV_LOOKUP_ARGS(args);
+ struct btrfs_ioctl_dev_info_args *di_args;
+ struct btrfs_device *dev;
+ int ret = 0;
+
+ di_args = memdup_user(arg, sizeof(*di_args));
+ if (IS_ERR(di_args))
+ return PTR_ERR(di_args);
+
+ args.devid = di_args->devid;
+ if (!btrfs_is_empty_uuid(di_args->uuid))
+ args.uuid = di_args->uuid;
+
+ rcu_read_lock();
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
+ if (!dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ di_args->devid = dev->devid;
+ di_args->bytes_used = btrfs_device_get_bytes_used(dev);
+ di_args->total_bytes = btrfs_device_get_total_bytes(dev);
+ memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
+ if (dev->name)
+ strscpy(di_args->path, rcu_str_deref(dev->name), sizeof(di_args->path));
+ else
+ di_args->path[0] = '\0';
+
+out:
+ rcu_read_unlock();
+ if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args)))
+ ret = -EFAULT;
+
+ kfree(di_args);
+ return ret;
+}
+
+static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
+{
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *new_root;
+ struct btrfs_dir_item *di;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_path *path = NULL;
+ struct btrfs_disk_key disk_key;
+ struct fscrypt_str name = FSTR_INIT("default", 7);
+ u64 objectid = 0;
+ u64 dir_id;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ if (copy_from_user(&objectid, argp, sizeof(objectid))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (!objectid)
+ objectid = BTRFS_FS_TREE_OBJECTID;
+
+ new_root = btrfs_get_fs_root(fs_info, objectid, true);
+ if (IS_ERR(new_root)) {
+ ret = PTR_ERR(new_root);
+ goto out;
+ }
+ if (!is_fstree(new_root->root_key.objectid)) {
+ ret = -ENOENT;
+ goto out_free;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
+
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_free;
+ }
+
+ dir_id = btrfs_super_root_dir(fs_info->super_copy);
+ di = btrfs_lookup_dir_item(trans, fs_info->tree_root, path,
+ dir_id, &name, 1);
+ if (IS_ERR_OR_NULL(di)) {
+ btrfs_release_path(path);
+ btrfs_end_transaction(trans);
+ btrfs_err(fs_info,
+ "Umm, you don't have the default diritem, this isn't going to work");
+ ret = -ENOENT;
+ goto out_free;
+ }
+
+ btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
+ btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_release_path(path);
+
+ btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL);
+ btrfs_end_transaction(trans);
+out_free:
+ btrfs_put_root(new_root);
+ btrfs_free_path(path);
+out:
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+static void get_block_group_info(struct list_head *groups_list,
+ struct btrfs_ioctl_space_info *space)
+{
+ struct btrfs_block_group *block_group;
+
+ space->total_bytes = 0;
+ space->used_bytes = 0;
+ space->flags = 0;
+ list_for_each_entry(block_group, groups_list, list) {
+ space->flags = block_group->flags;
+ space->total_bytes += block_group->length;
+ space->used_bytes += block_group->used;
+ }
+}
+
+static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
+ void __user *arg)
+{
+ struct btrfs_ioctl_space_args space_args = { 0 };
+ struct btrfs_ioctl_space_info space;
+ struct btrfs_ioctl_space_info *dest;
+ struct btrfs_ioctl_space_info *dest_orig;
+ struct btrfs_ioctl_space_info __user *user_dest;
+ struct btrfs_space_info *info;
+ static const u64 types[] = {
+ BTRFS_BLOCK_GROUP_DATA,
+ BTRFS_BLOCK_GROUP_SYSTEM,
+ BTRFS_BLOCK_GROUP_METADATA,
+ BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA
+ };
+ int num_types = 4;
+ int alloc_size;
+ int ret = 0;
+ u64 slot_count = 0;
+ int i, c;
+
+ if (copy_from_user(&space_args,
+ (struct btrfs_ioctl_space_args __user *)arg,
+ sizeof(space_args)))
+ return -EFAULT;
+
+ for (i = 0; i < num_types; i++) {
+ struct btrfs_space_info *tmp;
+
+ info = NULL;
+ list_for_each_entry(tmp, &fs_info->space_info, list) {
+ if (tmp->flags == types[i]) {
+ info = tmp;
+ break;
+ }
+ }
+
+ if (!info)
+ continue;
+
+ down_read(&info->groups_sem);
+ for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
+ if (!list_empty(&info->block_groups[c]))
+ slot_count++;
+ }
+ up_read(&info->groups_sem);
+ }
+
+ /*
+ * Global block reserve, exported as a space_info
+ */
+ slot_count++;
+
+ /* space_slots == 0 means they are asking for a count */
+ if (space_args.space_slots == 0) {
+ space_args.total_spaces = slot_count;
+ goto out;
+ }
+
+ slot_count = min_t(u64, space_args.space_slots, slot_count);
+
+ alloc_size = sizeof(*dest) * slot_count;
+
+ /* we generally have at most 6 or so space infos, one for each raid
+ * level. So, a whole page should be more than enough for everyone
+ */
+ if (alloc_size > PAGE_SIZE)
+ return -ENOMEM;
+
+ space_args.total_spaces = 0;
+ dest = kmalloc(alloc_size, GFP_KERNEL);
+ if (!dest)
+ return -ENOMEM;
+ dest_orig = dest;
+
+ /* now we have a buffer to copy into */
+ for (i = 0; i < num_types; i++) {
+ struct btrfs_space_info *tmp;
+
+ if (!slot_count)
+ break;
+
+ info = NULL;
+ list_for_each_entry(tmp, &fs_info->space_info, list) {
+ if (tmp->flags == types[i]) {
+ info = tmp;
+ break;
+ }
+ }
+
+ if (!info)
+ continue;
+ down_read(&info->groups_sem);
+ for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
+ if (!list_empty(&info->block_groups[c])) {
+ get_block_group_info(&info->block_groups[c],
+ &space);
+ memcpy(dest, &space, sizeof(space));
+ dest++;
+ space_args.total_spaces++;
+ slot_count--;
+ }
+ if (!slot_count)
+ break;
+ }
+ up_read(&info->groups_sem);
+ }
+
+ /*
+ * Add global block reserve
+ */
+ if (slot_count) {
+ struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
+
+ spin_lock(&block_rsv->lock);
+ space.total_bytes = block_rsv->size;
+ space.used_bytes = block_rsv->size - block_rsv->reserved;
+ spin_unlock(&block_rsv->lock);
+ space.flags = BTRFS_SPACE_INFO_GLOBAL_RSV;
+ memcpy(dest, &space, sizeof(space));
+ space_args.total_spaces++;
+ }
+
+ user_dest = (struct btrfs_ioctl_space_info __user *)
+ (arg + sizeof(struct btrfs_ioctl_space_args));
+
+ if (copy_to_user(user_dest, dest_orig, alloc_size))
+ ret = -EFAULT;
+
+ kfree(dest_orig);
+out:
+ if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
+ void __user *argp)
+{
+ struct btrfs_trans_handle *trans;
+ u64 transid;
+
+ trans = btrfs_attach_transaction_barrier(root);
+ if (IS_ERR(trans)) {
+ if (PTR_ERR(trans) != -ENOENT)
+ return PTR_ERR(trans);
+
+ /* No running transaction, don't bother */
+ transid = root->fs_info->last_trans_committed;
+ goto out;
+ }
+ transid = trans->transid;
+ btrfs_commit_transaction_async(trans);
+out:
+ if (argp)
+ if (copy_to_user(argp, &transid, sizeof(transid)))
+ return -EFAULT;
+ return 0;
+}
+
+static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info,
+ void __user *argp)
+{
+ u64 transid;
+
+ if (argp) {
+ if (copy_from_user(&transid, argp, sizeof(transid)))
+ return -EFAULT;
+ } else {
+ transid = 0; /* current trans */
+ }
+ return btrfs_wait_for_commit(fs_info, transid);
+}
+
+static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb);
+ struct btrfs_ioctl_scrub_args *sa;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info, "scrub is not supported on extent tree v2 yet");
+ return -EINVAL;
+ }
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa))
+ return PTR_ERR(sa);
+
+ if (sa->flags & ~BTRFS_SCRUB_SUPPORTED_FLAGS) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
+ ret = mnt_want_write_file(file);
+ if (ret)
+ goto out;
+ }
+
+ ret = btrfs_scrub_dev(fs_info, sa->devid, sa->start, sa->end,
+ &sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
+ 0);
+
+ /*
+ * Copy scrub args to user space even if btrfs_scrub_dev() returned an
+ * error. This is important as it allows user space to know how much
+ * progress scrub has done. For example, if scrub is canceled we get
+ * -ECANCELED from btrfs_scrub_dev() and return that error back to user
+ * space. Later user space can inspect the progress from the structure
+ * btrfs_ioctl_scrub_args and resume scrub from where it left off
+ * previously (btrfs-progs does this).
+ * If we fail to copy the btrfs_ioctl_scrub_args structure to user space
+ * then return -EFAULT to signal the structure was not copied or it may
+ * be corrupt and unreliable due to a partial copy.
+ */
+ if (copy_to_user(arg, sa, sizeof(*sa)))
+ ret = -EFAULT;
+
+ if (!(sa->flags & BTRFS_SCRUB_READONLY))
+ mnt_drop_write_file(file);
+out:
+ kfree(sa);
+ return ret;
+}
+
+static long btrfs_ioctl_scrub_cancel(struct btrfs_fs_info *fs_info)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return btrfs_scrub_cancel(fs_info);
+}
+
+static long btrfs_ioctl_scrub_progress(struct btrfs_fs_info *fs_info,
+ void __user *arg)
+{
+ struct btrfs_ioctl_scrub_args *sa;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa))
+ return PTR_ERR(sa);
+
+ ret = btrfs_scrub_progress(fs_info, sa->devid, &sa->progress);
+
+ if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
+ ret = -EFAULT;
+
+ kfree(sa);
+ return ret;
+}
+
+static long btrfs_ioctl_get_dev_stats(struct btrfs_fs_info *fs_info,
+ void __user *arg)
+{
+ struct btrfs_ioctl_get_dev_stats *sa;
+ int ret;
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa))
+ return PTR_ERR(sa);
+
+ if ((sa->flags & BTRFS_DEV_STATS_RESET) && !capable(CAP_SYS_ADMIN)) {
+ kfree(sa);
+ return -EPERM;
+ }
+
+ ret = btrfs_get_dev_stats(fs_info, sa);
+
+ if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
+ ret = -EFAULT;
+
+ kfree(sa);
+ return ret;
+}
+
+static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
+ void __user *arg)
+{
+ struct btrfs_ioctl_dev_replace_args *p;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info, "device replace not supported on extent tree v2 yet");
+ return -EINVAL;
+ }
+
+ p = memdup_user(arg, sizeof(*p));
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ switch (p->cmd) {
+ case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
+ if (sb_rdonly(fs_info->sb)) {
+ ret = -EROFS;
+ goto out;
+ }
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
+ ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ } else {
+ ret = btrfs_dev_replace_by_ioctl(fs_info, p);
+ btrfs_exclop_finish(fs_info);
+ }
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
+ btrfs_dev_replace_status(fs_info, p);
+ ret = 0;
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
+ p->result = btrfs_dev_replace_cancel(fs_info);
+ ret = 0;
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ if ((ret == 0 || ret == -ECANCELED) && copy_to_user(arg, p, sizeof(*p)))
+ ret = -EFAULT;
+out:
+ kfree(p);
+ return ret;
+}
+
+static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
+{
+ int ret = 0;
+ int i;
+ u64 rel_ptr;
+ int size;
+ struct btrfs_ioctl_ino_path_args *ipa = NULL;
+ struct inode_fs_paths *ipath = NULL;
+ struct btrfs_path *path;
+
+ if (!capable(CAP_DAC_READ_SEARCH))
+ return -EPERM;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ipa = memdup_user(arg, sizeof(*ipa));
+ if (IS_ERR(ipa)) {
+ ret = PTR_ERR(ipa);
+ ipa = NULL;
+ goto out;
+ }
+
+ size = min_t(u32, ipa->size, 4096);
+ ipath = init_ipath(size, root, path);
+ if (IS_ERR(ipath)) {
+ ret = PTR_ERR(ipath);
+ ipath = NULL;
+ goto out;
+ }
+
+ ret = paths_from_inode(ipa->inum, ipath);
+ if (ret < 0)
+ goto out;
+
+ for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
+ rel_ptr = ipath->fspath->val[i] -
+ (u64)(unsigned long)ipath->fspath->val;
+ ipath->fspath->val[i] = rel_ptr;
+ }
+
+ btrfs_free_path(path);
+ path = NULL;
+ ret = copy_to_user((void __user *)(unsigned long)ipa->fspath,
+ ipath->fspath, size);
+ if (ret) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+out:
+ btrfs_free_path(path);
+ free_ipath(ipath);
+ kfree(ipa);
+
+ return ret;
+}
+
+static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
+ void __user *arg, int version)
+{
+ int ret = 0;
+ int size;
+ struct btrfs_ioctl_logical_ino_args *loi;
+ struct btrfs_data_container *inodes = NULL;
+ struct btrfs_path *path = NULL;
+ bool ignore_offset;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ loi = memdup_user(arg, sizeof(*loi));
+ if (IS_ERR(loi))
+ return PTR_ERR(loi);
+
+ if (version == 1) {
+ ignore_offset = false;
+ size = min_t(u32, loi->size, SZ_64K);
+ } else {
+ /* All reserved bits must be 0 for now */
+ if (memchr_inv(loi->reserved, 0, sizeof(loi->reserved))) {
+ ret = -EINVAL;
+ goto out_loi;
+ }
+ /* Only accept flags we have defined so far */
+ if (loi->flags & ~(BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET)) {
+ ret = -EINVAL;
+ goto out_loi;
+ }
+ ignore_offset = loi->flags & BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET;
+ size = min_t(u32, loi->size, SZ_16M);
+ }
+
+ inodes = init_data_container(size);
+ if (IS_ERR(inodes)) {
+ ret = PTR_ERR(inodes);
+ goto out_loi;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
+ inodes, ignore_offset);
+ btrfs_free_path(path);
+ if (ret == -EINVAL)
+ ret = -ENOENT;
+ if (ret < 0)
+ goto out;
+
+ ret = copy_to_user((void __user *)(unsigned long)loi->inodes, inodes,
+ size);
+ if (ret)
+ ret = -EFAULT;
+
+out:
+ kvfree(inodes);
+out_loi:
+ kfree(loi);
+
+ return ret;
+}
+
+void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_balance_args *bargs)
+{
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+
+ bargs->flags = bctl->flags;
+
+ if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags))
+ bargs->state |= BTRFS_BALANCE_STATE_RUNNING;
+ if (atomic_read(&fs_info->balance_pause_req))
+ bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ;
+ if (atomic_read(&fs_info->balance_cancel_req))
+ bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ;
+
+ memcpy(&bargs->data, &bctl->data, sizeof(bargs->data));
+ memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta));
+ memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys));
+
+ spin_lock(&fs_info->balance_lock);
+ memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
+ spin_unlock(&fs_info->balance_lock);
+}
+
+/**
+ * Try to acquire fs_info::balance_mutex as well as set BTRFS_EXLCOP_BALANCE as
+ * required.
+ *
+ * @fs_info: the filesystem
+ * @excl_acquired: ptr to boolean value which is set to false in case balance
+ * is being resumed
+ *
+ * Return 0 on success in which case both fs_info::balance is acquired as well
+ * as exclusive ops are blocked. In case of failure return an error code.
+ */
+static int btrfs_try_lock_balance(struct btrfs_fs_info *fs_info, bool *excl_acquired)
+{
+ int ret;
+
+ /*
+ * Exclusive operation is locked. Three possibilities:
+ * (1) some other op is running
+ * (2) balance is running
+ * (3) balance is paused -- special case (think resume)
+ */
+ while (1) {
+ if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
+ *excl_acquired = true;
+ mutex_lock(&fs_info->balance_mutex);
+ return 0;
+ }
+
+ mutex_lock(&fs_info->balance_mutex);
+ if (fs_info->balance_ctl) {
+ /* This is either (2) or (3) */
+ if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
+ /* This is (2) */
+ ret = -EINPROGRESS;
+ goto out_failure;
+
+ } else {
+ mutex_unlock(&fs_info->balance_mutex);
+ /*
+ * Lock released to allow other waiters to
+ * continue, we'll reexamine the status again.
+ */
+ mutex_lock(&fs_info->balance_mutex);
+
+ if (fs_info->balance_ctl &&
+ !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
+ /* This is (3) */
+ *excl_acquired = false;
+ return 0;
+ }
+ }
+ } else {
+ /* This is (1) */
+ ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ goto out_failure;
+ }
+
+ mutex_unlock(&fs_info->balance_mutex);
+ }
+
+out_failure:
+ mutex_unlock(&fs_info->balance_mutex);
+ *excl_acquired = false;
+ return ret;
+}
+
+static long btrfs_ioctl_balance(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_ioctl_balance_args *bargs;
+ struct btrfs_balance_control *bctl;
+ bool need_unlock = true;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ bargs = memdup_user(arg, sizeof(*bargs));
+ if (IS_ERR(bargs)) {
+ ret = PTR_ERR(bargs);
+ bargs = NULL;
+ goto out;
+ }
+
+ ret = btrfs_try_lock_balance(fs_info, &need_unlock);
+ if (ret)
+ goto out;
+
+ lockdep_assert_held(&fs_info->balance_mutex);
+
+ if (bargs->flags & BTRFS_BALANCE_RESUME) {
+ if (!fs_info->balance_ctl) {
+ ret = -ENOTCONN;
+ goto out_unlock;
+ }
+
+ bctl = fs_info->balance_ctl;
+ spin_lock(&fs_info->balance_lock);
+ bctl->flags |= BTRFS_BALANCE_RESUME;
+ spin_unlock(&fs_info->balance_lock);
+ btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE);
+
+ goto do_balance;
+ }
+
+ if (bargs->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ if (fs_info->balance_ctl) {
+ ret = -EINPROGRESS;
+ goto out_unlock;
+ }
+
+ bctl = kzalloc(sizeof(*bctl), GFP_KERNEL);
+ if (!bctl) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
+ memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
+ memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
+
+ bctl->flags = bargs->flags;
+do_balance:
+ /*
+ * Ownership of bctl and exclusive operation goes to btrfs_balance.
+ * bctl is freed in reset_balance_state, or, if restriper was paused
+ * all the way until unmount, in free_fs_info. The flag should be
+ * cleared after reset_balance_state.
+ */
+ need_unlock = false;
+
+ ret = btrfs_balance(fs_info, bctl, bargs);
+ bctl = NULL;
+
+ if (ret == 0 || ret == -ECANCELED) {
+ if (copy_to_user(arg, bargs, sizeof(*bargs)))
+ ret = -EFAULT;
+ }
+
+ kfree(bctl);
+out_unlock:
+ mutex_unlock(&fs_info->balance_mutex);
+ if (need_unlock)
+ btrfs_exclop_finish(fs_info);
+out:
+ mnt_drop_write_file(file);
+ kfree(bargs);
+ return ret;
+}
+
+static long btrfs_ioctl_balance_ctl(struct btrfs_fs_info *fs_info, int cmd)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case BTRFS_BALANCE_CTL_PAUSE:
+ return btrfs_pause_balance(fs_info);
+ case BTRFS_BALANCE_CTL_CANCEL:
+ return btrfs_cancel_balance(fs_info);
+ }
+
+ return -EINVAL;
+}
+
+static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info,
+ void __user *arg)
+{
+ struct btrfs_ioctl_balance_args *bargs;
+ int ret = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ mutex_lock(&fs_info->balance_mutex);
+ if (!fs_info->balance_ctl) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+ bargs = kzalloc(sizeof(*bargs), GFP_KERNEL);
+ if (!bargs) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ btrfs_update_ioctl_balance_args(fs_info, bargs);
+
+ if (copy_to_user(arg, bargs, sizeof(*bargs)))
+ ret = -EFAULT;
+
+ kfree(bargs);
+out:
+ mutex_unlock(&fs_info->balance_mutex);
+ return ret;
+}
+
+static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
+{
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_ioctl_quota_ctl_args *sa;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa)) {
+ ret = PTR_ERR(sa);
+ goto drop_write;
+ }
+
+ down_write(&fs_info->subvol_sem);
+
+ switch (sa->cmd) {
+ case BTRFS_QUOTA_CTL_ENABLE:
+ ret = btrfs_quota_enable(fs_info);
+ break;
+ case BTRFS_QUOTA_CTL_DISABLE:
+ ret = btrfs_quota_disable(fs_info);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ kfree(sa);
+ up_write(&fs_info->subvol_sem);
+drop_write:
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
+{
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_ioctl_qgroup_assign_args *sa;
+ struct btrfs_trans_handle *trans;
+ int ret;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa)) {
+ ret = PTR_ERR(sa);
+ goto drop_write;
+ }
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ if (sa->assign) {
+ ret = btrfs_add_qgroup_relation(trans, sa->src, sa->dst);
+ } else {
+ ret = btrfs_del_qgroup_relation(trans, sa->src, sa->dst);
+ }
+
+ /* update qgroup status and info */
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ err = btrfs_run_qgroups(trans);
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ if (err < 0)
+ btrfs_handle_fs_error(fs_info, err,
+ "failed to update qgroup status and info");
+ err = btrfs_end_transaction(trans);
+ if (err && !ret)
+ ret = err;
+
+out:
+ kfree(sa);
+drop_write:
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
+{
+ struct inode *inode = file_inode(file);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_ioctl_qgroup_create_args *sa;
+ struct btrfs_trans_handle *trans;
+ int ret;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa)) {
+ ret = PTR_ERR(sa);
+ goto drop_write;
+ }
+
+ if (!sa->qgroupid) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ if (sa->create) {
+ ret = btrfs_create_qgroup(trans, sa->qgroupid);
+ } else {
+ ret = btrfs_remove_qgroup(trans, sa->qgroupid);
+ }
+
+ err = btrfs_end_transaction(trans);
+ if (err && !ret)
+ ret = err;
+
+out:
+ kfree(sa);
+drop_write:
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
+{
+ struct inode *inode = file_inode(file);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_ioctl_qgroup_limit_args *sa;
+ struct btrfs_trans_handle *trans;
+ int ret;
+ int err;
+ u64 qgroupid;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa)) {
+ ret = PTR_ERR(sa);
+ goto drop_write;
+ }
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ qgroupid = sa->qgroupid;
+ if (!qgroupid) {
+ /* take the current subvol as qgroup */
+ qgroupid = root->root_key.objectid;
+ }
+
+ ret = btrfs_limit_qgroup(trans, qgroupid, &sa->lim);
+
+ err = btrfs_end_transaction(trans);
+ if (err && !ret)
+ ret = err;
+
+out:
+ kfree(sa);
+drop_write:
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
+{
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_ioctl_quota_rescan_args *qsa;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ qsa = memdup_user(arg, sizeof(*qsa));
+ if (IS_ERR(qsa)) {
+ ret = PTR_ERR(qsa);
+ goto drop_write;
+ }
+
+ if (qsa->flags) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = btrfs_qgroup_rescan(fs_info);
+
+out:
+ kfree(qsa);
+drop_write:
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,
+ void __user *arg)
+{
+ struct btrfs_ioctl_quota_rescan_args qsa = {0};
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+ qsa.flags = 1;
+ qsa.progress = fs_info->qgroup_rescan_progress.objectid;
+ }
+
+ if (copy_to_user(arg, &qsa, sizeof(qsa)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info,
+ void __user *arg)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return btrfs_qgroup_wait_for_completion(fs_info, true);
+}
+
+static long _btrfs_ioctl_set_received_subvol(struct file *file,
+ struct user_namespace *mnt_userns,
+ struct btrfs_ioctl_received_subvol_args *sa)
+{
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root_item *root_item = &root->root_item;
+ struct btrfs_trans_handle *trans;
+ struct timespec64 ct = current_time(inode);
+ int ret = 0;
+ int received_uuid_changed;
+
+ if (!inode_owner_or_capable(mnt_userns, inode))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret < 0)
+ return ret;
+
+ down_write(&fs_info->subvol_sem);
+
+ if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (btrfs_root_readonly(root)) {
+ ret = -EROFS;
+ goto out;
+ }
+
+ /*
+ * 1 - root item
+ * 2 - uuid items (received uuid + subvol uuid)
+ */
+ trans = btrfs_start_transaction(root, 3);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto out;
+ }
+
+ sa->rtransid = trans->transid;
+ sa->rtime.sec = ct.tv_sec;
+ sa->rtime.nsec = ct.tv_nsec;
+
+ received_uuid_changed = memcmp(root_item->received_uuid, sa->uuid,
+ BTRFS_UUID_SIZE);
+ if (received_uuid_changed &&
+ !btrfs_is_empty_uuid(root_item->received_uuid)) {
+ ret = btrfs_uuid_tree_remove(trans, root_item->received_uuid,
+ BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+ root->root_key.objectid);
+ if (ret && ret != -ENOENT) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ goto out;
+ }
+ }
+ memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE);
+ btrfs_set_root_stransid(root_item, sa->stransid);
+ btrfs_set_root_rtransid(root_item, sa->rtransid);
+ btrfs_set_stack_timespec_sec(&root_item->stime, sa->stime.sec);
+ btrfs_set_stack_timespec_nsec(&root_item->stime, sa->stime.nsec);
+ btrfs_set_stack_timespec_sec(&root_item->rtime, sa->rtime.sec);
+ btrfs_set_stack_timespec_nsec(&root_item->rtime, sa->rtime.nsec);
+
+ ret = btrfs_update_root(trans, fs_info->tree_root,
+ &root->root_key, &root->root_item);
+ if (ret < 0) {
+ btrfs_end_transaction(trans);
+ goto out;
+ }
+ if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) {
+ ret = btrfs_uuid_tree_add(trans, sa->uuid,
+ BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+ root->root_key.objectid);
+ if (ret < 0 && ret != -EEXIST) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ goto out;
+ }
+ }
+ ret = btrfs_commit_transaction(trans);
+out:
+ up_write(&fs_info->subvol_sem);
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+#ifdef CONFIG_64BIT
+static long btrfs_ioctl_set_received_subvol_32(struct file *file,
+ void __user *arg)
+{
+ struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL;
+ struct btrfs_ioctl_received_subvol_args *args64 = NULL;
+ int ret = 0;
+
+ args32 = memdup_user(arg, sizeof(*args32));
+ if (IS_ERR(args32))
+ return PTR_ERR(args32);
+
+ args64 = kmalloc(sizeof(*args64), GFP_KERNEL);
+ if (!args64) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE);
+ args64->stransid = args32->stransid;
+ args64->rtransid = args32->rtransid;
+ args64->stime.sec = args32->stime.sec;
+ args64->stime.nsec = args32->stime.nsec;
+ args64->rtime.sec = args32->rtime.sec;
+ args64->rtime.nsec = args32->rtime.nsec;
+ args64->flags = args32->flags;
+
+ ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_user_ns(file), args64);
+ if (ret)
+ goto out;
+
+ memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE);
+ args32->stransid = args64->stransid;
+ args32->rtransid = args64->rtransid;
+ args32->stime.sec = args64->stime.sec;
+ args32->stime.nsec = args64->stime.nsec;
+ args32->rtime.sec = args64->rtime.sec;
+ args32->rtime.nsec = args64->rtime.nsec;
+ args32->flags = args64->flags;
+
+ ret = copy_to_user(arg, args32, sizeof(*args32));
+ if (ret)
+ ret = -EFAULT;
+
+out:
+ kfree(args32);
+ kfree(args64);
+ return ret;
+}
+#endif
+
+static long btrfs_ioctl_set_received_subvol(struct file *file,
+ void __user *arg)
+{
+ struct btrfs_ioctl_received_subvol_args *sa = NULL;
+ int ret = 0;
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa))
+ return PTR_ERR(sa);
+
+ ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_user_ns(file), sa);
+
+ if (ret)
+ goto out;
+
+ ret = copy_to_user(arg, sa, sizeof(*sa));
+ if (ret)
+ ret = -EFAULT;
+
+out:
+ kfree(sa);
+ return ret;
+}
+
+static int btrfs_ioctl_get_fslabel(struct btrfs_fs_info *fs_info,
+ void __user *arg)
+{
+ size_t len;
+ int ret;
+ char label[BTRFS_LABEL_SIZE];
+
+ spin_lock(&fs_info->super_lock);
+ memcpy(label, fs_info->super_copy->label, BTRFS_LABEL_SIZE);
+ spin_unlock(&fs_info->super_lock);
+
+ len = strnlen(label, BTRFS_LABEL_SIZE);
+
+ if (len == BTRFS_LABEL_SIZE) {
+ btrfs_warn(fs_info,
+ "label is too long, return the first %zu bytes",
+ --len);
+ }
+
+ ret = copy_to_user(arg, label, len);
+
+ return ret ? -EFAULT : 0;
+}
+
+static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
+{
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_super_block *super_block = fs_info->super_copy;
+ struct btrfs_trans_handle *trans;
+ char label[BTRFS_LABEL_SIZE];
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(label, arg, sizeof(label)))
+ return -EFAULT;
+
+ if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) {
+ btrfs_err(fs_info,
+ "unable to set label with more than %d bytes",
+ BTRFS_LABEL_SIZE - 1);
+ return -EINVAL;
+ }
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_unlock;
+ }
+
+ spin_lock(&fs_info->super_lock);
+ strcpy(super_block->label, label);
+ spin_unlock(&fs_info->super_lock);
+ ret = btrfs_commit_transaction(trans);
+
+out_unlock:
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+#define INIT_FEATURE_FLAGS(suffix) \
+ { .compat_flags = BTRFS_FEATURE_COMPAT_##suffix, \
+ .compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \
+ .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix }
+
+int btrfs_ioctl_get_supported_features(void __user *arg)
+{
+ static const struct btrfs_ioctl_feature_flags features[3] = {
+ INIT_FEATURE_FLAGS(SUPP),
+ INIT_FEATURE_FLAGS(SAFE_SET),
+ INIT_FEATURE_FLAGS(SAFE_CLEAR)
+ };
+
+ if (copy_to_user(arg, &features, sizeof(features)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int btrfs_ioctl_get_features(struct btrfs_fs_info *fs_info,
+ void __user *arg)
+{
+ struct btrfs_super_block *super_block = fs_info->super_copy;
+ struct btrfs_ioctl_feature_flags features;
+
+ features.compat_flags = btrfs_super_compat_flags(super_block);
+ features.compat_ro_flags = btrfs_super_compat_ro_flags(super_block);
+ features.incompat_flags = btrfs_super_incompat_flags(super_block);
+
+ if (copy_to_user(arg, &features, sizeof(features)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int check_feature_bits(struct btrfs_fs_info *fs_info,
+ enum btrfs_feature_set set,
+ u64 change_mask, u64 flags, u64 supported_flags,
+ u64 safe_set, u64 safe_clear)
+{
+ const char *type = btrfs_feature_set_name(set);
+ char *names;
+ u64 disallowed, unsupported;
+ u64 set_mask = flags & change_mask;
+ u64 clear_mask = ~flags & change_mask;
+
+ unsupported = set_mask & ~supported_flags;
+ if (unsupported) {
+ names = btrfs_printable_features(set, unsupported);
+ if (names) {
+ btrfs_warn(fs_info,
+ "this kernel does not support the %s feature bit%s",
+ names, strchr(names, ',') ? "s" : "");
+ kfree(names);
+ } else
+ btrfs_warn(fs_info,
+ "this kernel does not support %s bits 0x%llx",
+ type, unsupported);
+ return -EOPNOTSUPP;
+ }
+
+ disallowed = set_mask & ~safe_set;
+ if (disallowed) {
+ names = btrfs_printable_features(set, disallowed);
+ if (names) {
+ btrfs_warn(fs_info,
+ "can't set the %s feature bit%s while mounted",
+ names, strchr(names, ',') ? "s" : "");
+ kfree(names);
+ } else
+ btrfs_warn(fs_info,
+ "can't set %s bits 0x%llx while mounted",
+ type, disallowed);
+ return -EPERM;
+ }
+
+ disallowed = clear_mask & ~safe_clear;
+ if (disallowed) {
+ names = btrfs_printable_features(set, disallowed);
+ if (names) {
+ btrfs_warn(fs_info,
+ "can't clear the %s feature bit%s while mounted",
+ names, strchr(names, ',') ? "s" : "");
+ kfree(names);
+ } else
+ btrfs_warn(fs_info,
+ "can't clear %s bits 0x%llx while mounted",
+ type, disallowed);
+ return -EPERM;
+ }
+
+ return 0;
+}
+
+#define check_feature(fs_info, change_mask, flags, mask_base) \
+check_feature_bits(fs_info, FEAT_##mask_base, change_mask, flags, \
+ BTRFS_FEATURE_ ## mask_base ## _SUPP, \
+ BTRFS_FEATURE_ ## mask_base ## _SAFE_SET, \
+ BTRFS_FEATURE_ ## mask_base ## _SAFE_CLEAR)
+
+static int btrfs_ioctl_set_features(struct file *file, void __user *arg)
+{
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_super_block *super_block = fs_info->super_copy;
+ struct btrfs_ioctl_feature_flags flags[2];
+ struct btrfs_trans_handle *trans;
+ u64 newflags;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(flags, arg, sizeof(flags)))
+ return -EFAULT;
+
+ /* Nothing to do */
+ if (!flags[0].compat_flags && !flags[0].compat_ro_flags &&
+ !flags[0].incompat_flags)
+ return 0;
+
+ ret = check_feature(fs_info, flags[0].compat_flags,
+ flags[1].compat_flags, COMPAT);
+ if (ret)
+ return ret;
+
+ ret = check_feature(fs_info, flags[0].compat_ro_flags,
+ flags[1].compat_ro_flags, COMPAT_RO);
+ if (ret)
+ return ret;
+
+ ret = check_feature(fs_info, flags[0].incompat_flags,
+ flags[1].incompat_flags, INCOMPAT);
+ if (ret)
+ return ret;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_drop_write;
+ }
+
+ spin_lock(&fs_info->super_lock);
+ newflags = btrfs_super_compat_flags(super_block);
+ newflags |= flags[0].compat_flags & flags[1].compat_flags;
+ newflags &= ~(flags[0].compat_flags & ~flags[1].compat_flags);
+ btrfs_set_super_compat_flags(super_block, newflags);
+
+ newflags = btrfs_super_compat_ro_flags(super_block);
+ newflags |= flags[0].compat_ro_flags & flags[1].compat_ro_flags;
+ newflags &= ~(flags[0].compat_ro_flags & ~flags[1].compat_ro_flags);
+ btrfs_set_super_compat_ro_flags(super_block, newflags);
+
+ newflags = btrfs_super_incompat_flags(super_block);
+ newflags |= flags[0].incompat_flags & flags[1].incompat_flags;
+ newflags &= ~(flags[0].incompat_flags & ~flags[1].incompat_flags);
+ btrfs_set_super_incompat_flags(super_block, newflags);
+ spin_unlock(&fs_info->super_lock);
+
+ ret = btrfs_commit_transaction(trans);
+out_drop_write:
+ mnt_drop_write_file(file);
+
+ return ret;
+}
+
+static int _btrfs_ioctl_send(struct inode *inode, void __user *argp, bool compat)
+{
+ struct btrfs_ioctl_send_args *arg;
+ int ret;
+
+ if (compat) {
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ struct btrfs_ioctl_send_args_32 args32 = { 0 };
+
+ ret = copy_from_user(&args32, argp, sizeof(args32));
+ if (ret)
+ return -EFAULT;
+ arg = kzalloc(sizeof(*arg), GFP_KERNEL);
+ if (!arg)
+ return -ENOMEM;
+ arg->send_fd = args32.send_fd;
+ arg->clone_sources_count = args32.clone_sources_count;
+ arg->clone_sources = compat_ptr(args32.clone_sources);
+ arg->parent_root = args32.parent_root;
+ arg->flags = args32.flags;
+ arg->version = args32.version;
+ memcpy(arg->reserved, args32.reserved,
+ sizeof(args32.reserved));
+#else
+ return -ENOTTY;
+#endif
+ } else {
+ arg = memdup_user(argp, sizeof(*arg));
+ if (IS_ERR(arg))
+ return PTR_ERR(arg);
+ }
+ ret = btrfs_ioctl_send(inode, arg);
+ kfree(arg);
+ return ret;
+}
+
+static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
+ bool compat)
+{
+ struct btrfs_ioctl_encoded_io_args args = { 0 };
+ size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
+ flags);
+ size_t copy_end;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov = iovstack;
+ struct iov_iter iter;
+ loff_t pos;
+ struct kiocb kiocb;
+ ssize_t ret;
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto out_acct;
+ }
+
+ if (compat) {
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ struct btrfs_ioctl_encoded_io_args_32 args32;
+
+ copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32,
+ flags);
+ if (copy_from_user(&args32, argp, copy_end)) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+ args.iov = compat_ptr(args32.iov);
+ args.iovcnt = args32.iovcnt;
+ args.offset = args32.offset;
+ args.flags = args32.flags;
+#else
+ return -ENOTTY;
+#endif
+ } else {
+ copy_end = copy_end_kernel;
+ if (copy_from_user(&args, argp, copy_end)) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+ }
+ if (args.flags != 0) {
+ ret = -EINVAL;
+ goto out_acct;
+ }
+
+ ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
+ &iov, &iter);
+ if (ret < 0)
+ goto out_acct;
+
+ if (iov_iter_count(&iter) == 0) {
+ ret = 0;
+ goto out_iov;
+ }
+ pos = args.offset;
+ ret = rw_verify_area(READ, file, &pos, args.len);
+ if (ret < 0)
+ goto out_iov;
+
+ init_sync_kiocb(&kiocb, file);
+ kiocb.ki_pos = pos;
+
+ ret = btrfs_encoded_read(&kiocb, &iter, &args);
+ if (ret >= 0) {
+ fsnotify_access(file);
+ if (copy_to_user(argp + copy_end,
+ (char *)&args + copy_end_kernel,
+ sizeof(args) - copy_end_kernel))
+ ret = -EFAULT;
+ }
+
+out_iov:
+ kfree(iov);
+out_acct:
+ if (ret > 0)
+ add_rchar(current, ret);
+ inc_syscr(current);
+ return ret;
+}
+
+static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool compat)
+{
+ struct btrfs_ioctl_encoded_io_args args;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov = iovstack;
+ struct iov_iter iter;
+ loff_t pos;
+ struct kiocb kiocb;
+ ssize_t ret;
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto out_acct;
+ }
+
+ if (!(file->f_mode & FMODE_WRITE)) {
+ ret = -EBADF;
+ goto out_acct;
+ }
+
+ if (compat) {
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ struct btrfs_ioctl_encoded_io_args_32 args32;
+
+ if (copy_from_user(&args32, argp, sizeof(args32))) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+ args.iov = compat_ptr(args32.iov);
+ args.iovcnt = args32.iovcnt;
+ args.offset = args32.offset;
+ args.flags = args32.flags;
+ args.len = args32.len;
+ args.unencoded_len = args32.unencoded_len;
+ args.unencoded_offset = args32.unencoded_offset;
+ args.compression = args32.compression;
+ args.encryption = args32.encryption;
+ memcpy(args.reserved, args32.reserved, sizeof(args.reserved));
+#else
+ return -ENOTTY;
+#endif
+ } else {
+ if (copy_from_user(&args, argp, sizeof(args))) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+ }
+
+ ret = -EINVAL;
+ if (args.flags != 0)
+ goto out_acct;
+ if (memchr_inv(args.reserved, 0, sizeof(args.reserved)))
+ goto out_acct;
+ if (args.compression == BTRFS_ENCODED_IO_COMPRESSION_NONE &&
+ args.encryption == BTRFS_ENCODED_IO_ENCRYPTION_NONE)
+ goto out_acct;
+ if (args.compression >= BTRFS_ENCODED_IO_COMPRESSION_TYPES ||
+ args.encryption >= BTRFS_ENCODED_IO_ENCRYPTION_TYPES)
+ goto out_acct;
+ if (args.unencoded_offset > args.unencoded_len)
+ goto out_acct;
+ if (args.len > args.unencoded_len - args.unencoded_offset)
+ goto out_acct;
+
+ ret = import_iovec(ITER_SOURCE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
+ &iov, &iter);
+ if (ret < 0)
+ goto out_acct;
+
+ file_start_write(file);
+
+ if (iov_iter_count(&iter) == 0) {
+ ret = 0;
+ goto out_end_write;
+ }
+ pos = args.offset;
+ ret = rw_verify_area(WRITE, file, &pos, args.len);
+ if (ret < 0)
+ goto out_end_write;
+
+ init_sync_kiocb(&kiocb, file);
+ ret = kiocb_set_rw_flags(&kiocb, 0);
+ if (ret)
+ goto out_end_write;
+ kiocb.ki_pos = pos;
+
+ ret = btrfs_do_write_iter(&kiocb, &iter, &args);
+ if (ret > 0)
+ fsnotify_modify(file);
+
+out_end_write:
+ file_end_write(file);
+ kfree(iov);
+out_acct:
+ if (ret > 0)
+ add_wchar(current, ret);
+ inc_syscw(current);
+ return ret;
+}
+
+long btrfs_ioctl(struct file *file, unsigned int
+ cmd, unsigned long arg)
+{
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ void __user *argp = (void __user *)arg;
+
+ switch (cmd) {
+ case FS_IOC_GETVERSION:
+ return btrfs_ioctl_getversion(inode, argp);
+ case FS_IOC_GETFSLABEL:
+ return btrfs_ioctl_get_fslabel(fs_info, argp);
+ case FS_IOC_SETFSLABEL:
+ return btrfs_ioctl_set_fslabel(file, argp);
+ case FITRIM:
+ return btrfs_ioctl_fitrim(fs_info, argp);
+ case BTRFS_IOC_SNAP_CREATE:
+ return btrfs_ioctl_snap_create(file, argp, 0);
+ case BTRFS_IOC_SNAP_CREATE_V2:
+ return btrfs_ioctl_snap_create_v2(file, argp, 0);
+ case BTRFS_IOC_SUBVOL_CREATE:
+ return btrfs_ioctl_snap_create(file, argp, 1);
+ case BTRFS_IOC_SUBVOL_CREATE_V2:
+ return btrfs_ioctl_snap_create_v2(file, argp, 1);
+ case BTRFS_IOC_SNAP_DESTROY:
+ return btrfs_ioctl_snap_destroy(file, argp, false);
+ case BTRFS_IOC_SNAP_DESTROY_V2:
+ return btrfs_ioctl_snap_destroy(file, argp, true);
+ case BTRFS_IOC_SUBVOL_GETFLAGS:
+ return btrfs_ioctl_subvol_getflags(inode, argp);
+ case BTRFS_IOC_SUBVOL_SETFLAGS:
+ return btrfs_ioctl_subvol_setflags(file, argp);
+ case BTRFS_IOC_DEFAULT_SUBVOL:
+ return btrfs_ioctl_default_subvol(file, argp);
+ case BTRFS_IOC_DEFRAG:
+ return btrfs_ioctl_defrag(file, NULL);
+ case BTRFS_IOC_DEFRAG_RANGE:
+ return btrfs_ioctl_defrag(file, argp);
+ case BTRFS_IOC_RESIZE:
+ return btrfs_ioctl_resize(file, argp);
+ case BTRFS_IOC_ADD_DEV:
+ return btrfs_ioctl_add_dev(fs_info, argp);
+ case BTRFS_IOC_RM_DEV:
+ return btrfs_ioctl_rm_dev(file, argp);
+ case BTRFS_IOC_RM_DEV_V2:
+ return btrfs_ioctl_rm_dev_v2(file, argp);
+ case BTRFS_IOC_FS_INFO:
+ return btrfs_ioctl_fs_info(fs_info, argp);
+ case BTRFS_IOC_DEV_INFO:
+ return btrfs_ioctl_dev_info(fs_info, argp);
+ case BTRFS_IOC_TREE_SEARCH:
+ return btrfs_ioctl_tree_search(inode, argp);
+ case BTRFS_IOC_TREE_SEARCH_V2:
+ return btrfs_ioctl_tree_search_v2(inode, argp);
+ case BTRFS_IOC_INO_LOOKUP:
+ return btrfs_ioctl_ino_lookup(root, argp);
+ case BTRFS_IOC_INO_PATHS:
+ return btrfs_ioctl_ino_to_path(root, argp);
+ case BTRFS_IOC_LOGICAL_INO:
+ return btrfs_ioctl_logical_to_ino(fs_info, argp, 1);
+ case BTRFS_IOC_LOGICAL_INO_V2:
+ return btrfs_ioctl_logical_to_ino(fs_info, argp, 2);
+ case BTRFS_IOC_SPACE_INFO:
+ return btrfs_ioctl_space_info(fs_info, argp);
+ case BTRFS_IOC_SYNC: {
+ int ret;
+
+ ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
+ if (ret)
+ return ret;
+ ret = btrfs_sync_fs(inode->i_sb, 1);
+ /*
+ * The transaction thread may want to do more work,
+ * namely it pokes the cleaner kthread that will start
+ * processing uncleaned subvols.
+ */
+ wake_up_process(fs_info->transaction_kthread);
+ return ret;
+ }
+ case BTRFS_IOC_START_SYNC:
+ return btrfs_ioctl_start_sync(root, argp);
+ case BTRFS_IOC_WAIT_SYNC:
+ return btrfs_ioctl_wait_sync(fs_info, argp);
+ case BTRFS_IOC_SCRUB:
+ return btrfs_ioctl_scrub(file, argp);
+ case BTRFS_IOC_SCRUB_CANCEL:
+ return btrfs_ioctl_scrub_cancel(fs_info);
+ case BTRFS_IOC_SCRUB_PROGRESS:
+ return btrfs_ioctl_scrub_progress(fs_info, argp);
+ case BTRFS_IOC_BALANCE_V2:
+ return btrfs_ioctl_balance(file, argp);
+ case BTRFS_IOC_BALANCE_CTL:
+ return btrfs_ioctl_balance_ctl(fs_info, arg);
+ case BTRFS_IOC_BALANCE_PROGRESS:
+ return btrfs_ioctl_balance_progress(fs_info, argp);
+ case BTRFS_IOC_SET_RECEIVED_SUBVOL:
+ return btrfs_ioctl_set_received_subvol(file, argp);
+#ifdef CONFIG_64BIT
+ case BTRFS_IOC_SET_RECEIVED_SUBVOL_32:
+ return btrfs_ioctl_set_received_subvol_32(file, argp);
+#endif
+ case BTRFS_IOC_SEND:
+ return _btrfs_ioctl_send(inode, argp, false);
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ case BTRFS_IOC_SEND_32:
+ return _btrfs_ioctl_send(inode, argp, true);
+#endif
+ case BTRFS_IOC_GET_DEV_STATS:
+ return btrfs_ioctl_get_dev_stats(fs_info, argp);
+ case BTRFS_IOC_QUOTA_CTL:
+ return btrfs_ioctl_quota_ctl(file, argp);
+ case BTRFS_IOC_QGROUP_ASSIGN:
+ return btrfs_ioctl_qgroup_assign(file, argp);
+ case BTRFS_IOC_QGROUP_CREATE:
+ return btrfs_ioctl_qgroup_create(file, argp);
+ case BTRFS_IOC_QGROUP_LIMIT:
+ return btrfs_ioctl_qgroup_limit(file, argp);
+ case BTRFS_IOC_QUOTA_RESCAN:
+ return btrfs_ioctl_quota_rescan(file, argp);
+ case BTRFS_IOC_QUOTA_RESCAN_STATUS:
+ return btrfs_ioctl_quota_rescan_status(fs_info, argp);
+ case BTRFS_IOC_QUOTA_RESCAN_WAIT:
+ return btrfs_ioctl_quota_rescan_wait(fs_info, argp);
+ case BTRFS_IOC_DEV_REPLACE:
+ return btrfs_ioctl_dev_replace(fs_info, argp);
+ case BTRFS_IOC_GET_SUPPORTED_FEATURES:
+ return btrfs_ioctl_get_supported_features(argp);
+ case BTRFS_IOC_GET_FEATURES:
+ return btrfs_ioctl_get_features(fs_info, argp);
+ case BTRFS_IOC_SET_FEATURES:
+ return btrfs_ioctl_set_features(file, argp);
+ case BTRFS_IOC_GET_SUBVOL_INFO:
+ return btrfs_ioctl_get_subvol_info(inode, argp);
+ case BTRFS_IOC_GET_SUBVOL_ROOTREF:
+ return btrfs_ioctl_get_subvol_rootref(root, argp);
+ case BTRFS_IOC_INO_LOOKUP_USER:
+ return btrfs_ioctl_ino_lookup_user(file, argp);
+ case FS_IOC_ENABLE_VERITY:
+ return fsverity_ioctl_enable(file, (const void __user *)argp);
+ case FS_IOC_MEASURE_VERITY:
+ return fsverity_ioctl_measure(file, argp);
+ case BTRFS_IOC_ENCODED_READ:
+ return btrfs_ioctl_encoded_read(file, argp, false);
+ case BTRFS_IOC_ENCODED_WRITE:
+ return btrfs_ioctl_encoded_write(file, argp, false);
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ case BTRFS_IOC_ENCODED_READ_32:
+ return btrfs_ioctl_encoded_read(file, argp, true);
+ case BTRFS_IOC_ENCODED_WRITE_32:
+ return btrfs_ioctl_encoded_write(file, argp, true);
+#endif
+ }
+
+ return -ENOTTY;
+}
+
+#ifdef CONFIG_COMPAT
+long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ /*
+ * These all access 32-bit values anyway so no further
+ * handling is necessary.
+ */
+ switch (cmd) {
+ case FS_IOC32_GETVERSION:
+ cmd = FS_IOC_GETVERSION;
+ break;
+ }
+
+ return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
+}
+#endif
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
new file mode 100644
index 000000000..df98e64d7
--- /dev/null
+++ b/fs/btrfs/locking.c
@@ -0,0 +1,405 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ */
+
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/spinlock.h>
+#include <linux/page-flags.h>
+#include <asm/bug.h>
+#include "misc.h"
+#include "ctree.h"
+#include "extent_io.h"
+#include "locking.h"
+
+/*
+ * Lockdep class keys for extent_buffer->lock's in this root. For a given
+ * eb, the lockdep key is determined by the btrfs_root it belongs to and
+ * the level the eb occupies in the tree.
+ *
+ * Different roots are used for different purposes and may nest inside each
+ * other and they require separate keysets. As lockdep keys should be
+ * static, assign keysets according to the purpose of the root as indicated
+ * by btrfs_root->root_key.objectid. This ensures that all special purpose
+ * roots have separate keysets.
+ *
+ * Lock-nesting across peer nodes is always done with the immediate parent
+ * node locked thus preventing deadlock. As lockdep doesn't know this, use
+ * subclass to avoid triggering lockdep warning in such cases.
+ *
+ * The key is set by the readpage_end_io_hook after the buffer has passed
+ * csum validation but before the pages are unlocked. It is also set by
+ * btrfs_init_new_buffer on freshly allocated blocks.
+ *
+ * We also add a check to make sure the highest level of the tree is the
+ * same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code
+ * needs update as well.
+ */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#if BTRFS_MAX_LEVEL != 8
+#error
+#endif
+
+#define DEFINE_LEVEL(stem, level) \
+ .names[level] = "btrfs-" stem "-0" #level,
+
+#define DEFINE_NAME(stem) \
+ DEFINE_LEVEL(stem, 0) \
+ DEFINE_LEVEL(stem, 1) \
+ DEFINE_LEVEL(stem, 2) \
+ DEFINE_LEVEL(stem, 3) \
+ DEFINE_LEVEL(stem, 4) \
+ DEFINE_LEVEL(stem, 5) \
+ DEFINE_LEVEL(stem, 6) \
+ DEFINE_LEVEL(stem, 7)
+
+static struct btrfs_lockdep_keyset {
+ u64 id; /* root objectid */
+ /* Longest entry: btrfs-block-group-00 */
+ char names[BTRFS_MAX_LEVEL][24];
+ struct lock_class_key keys[BTRFS_MAX_LEVEL];
+} btrfs_lockdep_keysets[] = {
+ { .id = BTRFS_ROOT_TREE_OBJECTID, DEFINE_NAME("root") },
+ { .id = BTRFS_EXTENT_TREE_OBJECTID, DEFINE_NAME("extent") },
+ { .id = BTRFS_CHUNK_TREE_OBJECTID, DEFINE_NAME("chunk") },
+ { .id = BTRFS_DEV_TREE_OBJECTID, DEFINE_NAME("dev") },
+ { .id = BTRFS_CSUM_TREE_OBJECTID, DEFINE_NAME("csum") },
+ { .id = BTRFS_QUOTA_TREE_OBJECTID, DEFINE_NAME("quota") },
+ { .id = BTRFS_TREE_LOG_OBJECTID, DEFINE_NAME("log") },
+ { .id = BTRFS_TREE_RELOC_OBJECTID, DEFINE_NAME("treloc") },
+ { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, DEFINE_NAME("dreloc") },
+ { .id = BTRFS_UUID_TREE_OBJECTID, DEFINE_NAME("uuid") },
+ { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") },
+ { .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") },
+ { .id = 0, DEFINE_NAME("tree") },
+};
+
+#undef DEFINE_LEVEL
+#undef DEFINE_NAME
+
+void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level)
+{
+ struct btrfs_lockdep_keyset *ks;
+
+ BUG_ON(level >= ARRAY_SIZE(ks->keys));
+
+ /* Find the matching keyset, id 0 is the default entry */
+ for (ks = btrfs_lockdep_keysets; ks->id; ks++)
+ if (ks->id == objectid)
+ break;
+
+ lockdep_set_class_and_name(&eb->lock, &ks->keys[level], ks->names[level]);
+}
+
+void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buffer *eb)
+{
+ if (test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
+ btrfs_set_buffer_lockdep_class(root->root_key.objectid,
+ eb, btrfs_header_level(eb));
+}
+
+#endif
+
+/*
+ * Extent buffer locking
+ * =====================
+ *
+ * We use a rw_semaphore for tree locking, and the semantics are exactly the
+ * same:
+ *
+ * - reader/writer exclusion
+ * - writer/writer exclusion
+ * - reader/reader sharing
+ * - try-lock semantics for readers and writers
+ *
+ * The rwsem implementation does opportunistic spinning which reduces number of
+ * times the locking task needs to sleep.
+ */
+
+/*
+ * __btrfs_tree_read_lock - lock extent buffer for read
+ * @eb: the eb to be locked
+ * @nest: the nesting level to be used for lockdep
+ *
+ * This takes the read lock on the extent buffer, using the specified nesting
+ * level for lockdep purposes.
+ */
+void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
+{
+ u64 start_ns = 0;
+
+ if (trace_btrfs_tree_read_lock_enabled())
+ start_ns = ktime_get_ns();
+
+ down_read_nested(&eb->lock, nest);
+ trace_btrfs_tree_read_lock(eb, start_ns);
+}
+
+void btrfs_tree_read_lock(struct extent_buffer *eb)
+{
+ __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL);
+}
+
+/*
+ * Try-lock for read.
+ *
+ * Return 1 if the rwlock has been taken, 0 otherwise
+ */
+int btrfs_try_tree_read_lock(struct extent_buffer *eb)
+{
+ if (down_read_trylock(&eb->lock)) {
+ trace_btrfs_try_tree_read_lock(eb);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Try-lock for write.
+ *
+ * Return 1 if the rwlock has been taken, 0 otherwise
+ */
+int btrfs_try_tree_write_lock(struct extent_buffer *eb)
+{
+ if (down_write_trylock(&eb->lock)) {
+ eb->lock_owner = current->pid;
+ trace_btrfs_try_tree_write_lock(eb);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Release read lock.
+ */
+void btrfs_tree_read_unlock(struct extent_buffer *eb)
+{
+ trace_btrfs_tree_read_unlock(eb);
+ up_read(&eb->lock);
+}
+
+/*
+ * __btrfs_tree_lock - lock eb for write
+ * @eb: the eb to lock
+ * @nest: the nesting to use for the lock
+ *
+ * Returns with the eb->lock write locked.
+ */
+void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
+ __acquires(&eb->lock)
+{
+ u64 start_ns = 0;
+
+ if (trace_btrfs_tree_lock_enabled())
+ start_ns = ktime_get_ns();
+
+ down_write_nested(&eb->lock, nest);
+ eb->lock_owner = current->pid;
+ trace_btrfs_tree_lock(eb, start_ns);
+}
+
+void btrfs_tree_lock(struct extent_buffer *eb)
+{
+ __btrfs_tree_lock(eb, BTRFS_NESTING_NORMAL);
+}
+
+/*
+ * Release the write lock.
+ */
+void btrfs_tree_unlock(struct extent_buffer *eb)
+{
+ trace_btrfs_tree_unlock(eb);
+ eb->lock_owner = 0;
+ up_write(&eb->lock);
+}
+
+/*
+ * This releases any locks held in the path starting at level and going all the
+ * way up to the root.
+ *
+ * btrfs_search_slot will keep the lock held on higher nodes in a few corner
+ * cases, such as COW of the block at slot zero in the node. This ignores
+ * those rules, and it should only be called when there are no more updates to
+ * be done higher up in the tree.
+ */
+void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
+{
+ int i;
+
+ if (path->keep_locks)
+ return;
+
+ for (i = level; i < BTRFS_MAX_LEVEL; i++) {
+ if (!path->nodes[i])
+ continue;
+ if (!path->locks[i])
+ continue;
+ btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
+ path->locks[i] = 0;
+ }
+}
+
+/*
+ * Loop around taking references on and locking the root node of the tree until
+ * we end up with a lock on the root node.
+ *
+ * Return: root extent buffer with write lock held
+ */
+struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
+{
+ struct extent_buffer *eb;
+
+ while (1) {
+ eb = btrfs_root_node(root);
+
+ btrfs_maybe_reset_lockdep_class(root, eb);
+ btrfs_tree_lock(eb);
+ if (eb == root->node)
+ break;
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+ }
+ return eb;
+}
+
+/*
+ * Loop around taking references on and locking the root node of the tree until
+ * we end up with a lock on the root node.
+ *
+ * Return: root extent buffer with read lock held
+ */
+struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
+{
+ struct extent_buffer *eb;
+
+ while (1) {
+ eb = btrfs_root_node(root);
+
+ btrfs_maybe_reset_lockdep_class(root, eb);
+ btrfs_tree_read_lock(eb);
+ if (eb == root->node)
+ break;
+ btrfs_tree_read_unlock(eb);
+ free_extent_buffer(eb);
+ }
+ return eb;
+}
+
+/*
+ * Loop around taking references on and locking the root node of the tree in
+ * nowait mode until we end up with a lock on the root node or returning to
+ * avoid blocking.
+ *
+ * Return: root extent buffer with read lock held or -EAGAIN.
+ */
+struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root)
+{
+ struct extent_buffer *eb;
+
+ while (1) {
+ eb = btrfs_root_node(root);
+ if (!btrfs_try_tree_read_lock(eb)) {
+ free_extent_buffer(eb);
+ return ERR_PTR(-EAGAIN);
+ }
+ if (eb == root->node)
+ break;
+ btrfs_tree_read_unlock(eb);
+ free_extent_buffer(eb);
+ }
+ return eb;
+}
+
+/*
+ * DREW locks
+ * ==========
+ *
+ * DREW stands for double-reader-writer-exclusion lock. It's used in situation
+ * where you want to provide A-B exclusion but not AA or BB.
+ *
+ * Currently implementation gives more priority to reader. If a reader and a
+ * writer both race to acquire their respective sides of the lock the writer
+ * would yield its lock as soon as it detects a concurrent reader. Additionally
+ * if there are pending readers no new writers would be allowed to come in and
+ * acquire the lock.
+ */
+
+int btrfs_drew_lock_init(struct btrfs_drew_lock *lock)
+{
+ int ret;
+
+ ret = percpu_counter_init(&lock->writers, 0, GFP_KERNEL);
+ if (ret)
+ return ret;
+
+ atomic_set(&lock->readers, 0);
+ init_waitqueue_head(&lock->pending_readers);
+ init_waitqueue_head(&lock->pending_writers);
+
+ return 0;
+}
+
+void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock)
+{
+ percpu_counter_destroy(&lock->writers);
+}
+
+/* Return true if acquisition is successful, false otherwise */
+bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock)
+{
+ if (atomic_read(&lock->readers))
+ return false;
+
+ percpu_counter_inc(&lock->writers);
+
+ /* Ensure writers count is updated before we check for pending readers */
+ smp_mb();
+ if (atomic_read(&lock->readers)) {
+ btrfs_drew_write_unlock(lock);
+ return false;
+ }
+
+ return true;
+}
+
+void btrfs_drew_write_lock(struct btrfs_drew_lock *lock)
+{
+ while (true) {
+ if (btrfs_drew_try_write_lock(lock))
+ return;
+ wait_event(lock->pending_writers, !atomic_read(&lock->readers));
+ }
+}
+
+void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock)
+{
+ percpu_counter_dec(&lock->writers);
+ cond_wake_up(&lock->pending_readers);
+}
+
+void btrfs_drew_read_lock(struct btrfs_drew_lock *lock)
+{
+ atomic_inc(&lock->readers);
+
+ /*
+ * Ensure the pending reader count is perceieved BEFORE this reader
+ * goes to sleep in case of active writers. This guarantees new writers
+ * won't be allowed and that the current reader will be woken up when
+ * the last active writer finishes its jobs.
+ */
+ smp_mb__after_atomic();
+
+ wait_event(lock->pending_readers,
+ percpu_counter_sum(&lock->writers) == 0);
+}
+
+void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock)
+{
+ /*
+ * atomic_dec_and_test implies a full barrier, so woken up writers
+ * are guaranteed to see the decrement
+ */
+ if (atomic_dec_and_test(&lock->readers))
+ wake_up(&lock->pending_writers);
+}
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
new file mode 100644
index 000000000..490c7a79e
--- /dev/null
+++ b/fs/btrfs/locking.h
@@ -0,0 +1,149 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ */
+
+#ifndef BTRFS_LOCKING_H
+#define BTRFS_LOCKING_H
+
+#include <linux/atomic.h>
+#include <linux/wait.h>
+#include <linux/percpu_counter.h>
+#include "extent_io.h"
+
+#define BTRFS_WRITE_LOCK 1
+#define BTRFS_READ_LOCK 2
+
+/*
+ * We are limited in number of subclasses by MAX_LOCKDEP_SUBCLASSES, which at
+ * the time of this patch is 8, which is how many we use. Keep this in mind if
+ * you decide you want to add another subclass.
+ */
+enum btrfs_lock_nesting {
+ BTRFS_NESTING_NORMAL,
+
+ /*
+ * When we COW a block we are holding the lock on the original block,
+ * and since our lockdep maps are rootid+level, this confuses lockdep
+ * when we lock the newly allocated COW'd block. Handle this by having
+ * a subclass for COW'ed blocks so that lockdep doesn't complain.
+ */
+ BTRFS_NESTING_COW,
+
+ /*
+ * Oftentimes we need to lock adjacent nodes on the same level while
+ * still holding the lock on the original node we searched to, such as
+ * for searching forward or for split/balance.
+ *
+ * Because of this we need to indicate to lockdep that this is
+ * acceptable by having a different subclass for each of these
+ * operations.
+ */
+ BTRFS_NESTING_LEFT,
+ BTRFS_NESTING_RIGHT,
+
+ /*
+ * When splitting we will be holding a lock on the left/right node when
+ * we need to cow that node, thus we need a new set of subclasses for
+ * these two operations.
+ */
+ BTRFS_NESTING_LEFT_COW,
+ BTRFS_NESTING_RIGHT_COW,
+
+ /*
+ * When splitting we may push nodes to the left or right, but still use
+ * the subsequent nodes in our path, keeping our locks on those adjacent
+ * blocks. Thus when we go to allocate a new split block we've already
+ * used up all of our available subclasses, so this subclass exists to
+ * handle this case where we need to allocate a new split block.
+ */
+ BTRFS_NESTING_SPLIT,
+
+ /*
+ * When promoting a new block to a root we need to have a special
+ * subclass so we don't confuse lockdep, as it will appear that we are
+ * locking a higher level node before a lower level one. Copying also
+ * has this problem as it appears we're locking the same block again
+ * when we make a snapshot of an existing root.
+ */
+ BTRFS_NESTING_NEW_ROOT,
+
+ /*
+ * We are limited to MAX_LOCKDEP_SUBLCLASSES number of subclasses, so
+ * add this in here and add a static_assert to keep us from going over
+ * the limit. As of this writing we're limited to 8, and we're
+ * definitely using 8, hence this check to keep us from messing up in
+ * the future.
+ */
+ BTRFS_NESTING_MAX,
+};
+
+static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES,
+ "too many lock subclasses defined");
+
+struct btrfs_path;
+
+void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest);
+void btrfs_tree_lock(struct extent_buffer *eb);
+void btrfs_tree_unlock(struct extent_buffer *eb);
+
+void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest);
+void btrfs_tree_read_lock(struct extent_buffer *eb);
+void btrfs_tree_read_unlock(struct extent_buffer *eb);
+int btrfs_try_tree_read_lock(struct extent_buffer *eb);
+int btrfs_try_tree_write_lock(struct extent_buffer *eb);
+struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
+struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);
+struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root);
+
+#ifdef CONFIG_BTRFS_DEBUG
+static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb)
+{
+ lockdep_assert_held_write(&eb->lock);
+}
+#else
+static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) { }
+#endif
+
+void btrfs_unlock_up_safe(struct btrfs_path *path, int level);
+
+static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
+{
+ if (rw == BTRFS_WRITE_LOCK)
+ btrfs_tree_unlock(eb);
+ else if (rw == BTRFS_READ_LOCK)
+ btrfs_tree_read_unlock(eb);
+ else
+ BUG();
+}
+
+struct btrfs_drew_lock {
+ atomic_t readers;
+ struct percpu_counter writers;
+ wait_queue_head_t pending_writers;
+ wait_queue_head_t pending_readers;
+};
+
+int btrfs_drew_lock_init(struct btrfs_drew_lock *lock);
+void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock);
+void btrfs_drew_write_lock(struct btrfs_drew_lock *lock);
+bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock);
+void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock);
+void btrfs_drew_read_lock(struct btrfs_drew_lock *lock);
+void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level);
+void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buffer *eb);
+#else
+static inline void btrfs_set_buffer_lockdep_class(u64 objectid,
+ struct extent_buffer *eb, int level)
+{
+}
+static inline void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root,
+ struct extent_buffer *eb)
+{
+}
+#endif
+
+#endif
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
new file mode 100644
index 000000000..89bc5f825
--- /dev/null
+++ b/fs/btrfs/lzo.c
@@ -0,0 +1,494 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+#include <linux/lzo.h>
+#include <linux/refcount.h>
+#include "compression.h"
+#include "ctree.h"
+
+#define LZO_LEN 4
+
+/*
+ * Btrfs LZO compression format
+ *
+ * Regular and inlined LZO compressed data extents consist of:
+ *
+ * 1. Header
+ * Fixed size. LZO_LEN (4) bytes long, LE32.
+ * Records the total size (including the header) of compressed data.
+ *
+ * 2. Segment(s)
+ * Variable size. Each segment includes one segment header, followed by data
+ * payload.
+ * One regular LZO compressed extent can have one or more segments.
+ * For inlined LZO compressed extent, only one segment is allowed.
+ * One segment represents at most one sector of uncompressed data.
+ *
+ * 2.1 Segment header
+ * Fixed size. LZO_LEN (4) bytes long, LE32.
+ * Records the total size of the segment (not including the header).
+ * Segment header never crosses sector boundary, thus it's possible to
+ * have at most 3 padding zeros at the end of the sector.
+ *
+ * 2.2 Data Payload
+ * Variable size. Size up limit should be lzo1x_worst_compress(sectorsize)
+ * which is 4419 for a 4KiB sectorsize.
+ *
+ * Example with 4K sectorsize:
+ * Page 1:
+ * 0 0x2 0x4 0x6 0x8 0xa 0xc 0xe 0x10
+ * 0x0000 | Header | SegHdr 01 | Data payload 01 ... |
+ * ...
+ * 0x0ff0 | SegHdr N | Data payload N ... |00|
+ * ^^ padding zeros
+ * Page 2:
+ * 0x1000 | SegHdr N+1| Data payload N+1 ... |
+ */
+
+#define WORKSPACE_BUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE))
+#define WORKSPACE_CBUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE))
+
+struct workspace {
+ void *mem;
+ void *buf; /* where decompressed data goes */
+ void *cbuf; /* where compressed data goes */
+ struct list_head list;
+};
+
+static struct workspace_manager wsm;
+
+void lzo_free_workspace(struct list_head *ws)
+{
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
+
+ kvfree(workspace->buf);
+ kvfree(workspace->cbuf);
+ kvfree(workspace->mem);
+ kfree(workspace);
+}
+
+struct list_head *lzo_alloc_workspace(unsigned int level)
+{
+ struct workspace *workspace;
+
+ workspace = kzalloc(sizeof(*workspace), GFP_KERNEL);
+ if (!workspace)
+ return ERR_PTR(-ENOMEM);
+
+ workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
+ workspace->buf = kvmalloc(WORKSPACE_BUF_LENGTH, GFP_KERNEL);
+ workspace->cbuf = kvmalloc(WORKSPACE_CBUF_LENGTH, GFP_KERNEL);
+ if (!workspace->mem || !workspace->buf || !workspace->cbuf)
+ goto fail;
+
+ INIT_LIST_HEAD(&workspace->list);
+
+ return &workspace->list;
+fail:
+ lzo_free_workspace(&workspace->list);
+ return ERR_PTR(-ENOMEM);
+}
+
+static inline void write_compress_length(char *buf, size_t len)
+{
+ __le32 dlen;
+
+ dlen = cpu_to_le32(len);
+ memcpy(buf, &dlen, LZO_LEN);
+}
+
+static inline size_t read_compress_length(const char *buf)
+{
+ __le32 dlen;
+
+ memcpy(&dlen, buf, LZO_LEN);
+ return le32_to_cpu(dlen);
+}
+
+/*
+ * Will do:
+ *
+ * - Write a segment header into the destination
+ * - Copy the compressed buffer into the destination
+ * - Make sure we have enough space in the last sector to fit a segment header
+ * If not, we will pad at most (LZO_LEN (4)) - 1 bytes of zeros.
+ *
+ * Will allocate new pages when needed.
+ */
+static int copy_compressed_data_to_page(char *compressed_data,
+ size_t compressed_size,
+ struct page **out_pages,
+ unsigned long max_nr_page,
+ u32 *cur_out,
+ const u32 sectorsize)
+{
+ u32 sector_bytes_left;
+ u32 orig_out;
+ struct page *cur_page;
+ char *kaddr;
+
+ if ((*cur_out / PAGE_SIZE) >= max_nr_page)
+ return -E2BIG;
+
+ /*
+ * We never allow a segment header crossing sector boundary, previous
+ * run should ensure we have enough space left inside the sector.
+ */
+ ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize);
+
+ cur_page = out_pages[*cur_out / PAGE_SIZE];
+ /* Allocate a new page */
+ if (!cur_page) {
+ cur_page = alloc_page(GFP_NOFS);
+ if (!cur_page)
+ return -ENOMEM;
+ out_pages[*cur_out / PAGE_SIZE] = cur_page;
+ }
+
+ kaddr = kmap_local_page(cur_page);
+ write_compress_length(kaddr + offset_in_page(*cur_out),
+ compressed_size);
+ *cur_out += LZO_LEN;
+
+ orig_out = *cur_out;
+
+ /* Copy compressed data */
+ while (*cur_out - orig_out < compressed_size) {
+ u32 copy_len = min_t(u32, sectorsize - *cur_out % sectorsize,
+ orig_out + compressed_size - *cur_out);
+
+ kunmap_local(kaddr);
+
+ if ((*cur_out / PAGE_SIZE) >= max_nr_page)
+ return -E2BIG;
+
+ cur_page = out_pages[*cur_out / PAGE_SIZE];
+ /* Allocate a new page */
+ if (!cur_page) {
+ cur_page = alloc_page(GFP_NOFS);
+ if (!cur_page)
+ return -ENOMEM;
+ out_pages[*cur_out / PAGE_SIZE] = cur_page;
+ }
+ kaddr = kmap_local_page(cur_page);
+
+ memcpy(kaddr + offset_in_page(*cur_out),
+ compressed_data + *cur_out - orig_out, copy_len);
+
+ *cur_out += copy_len;
+ }
+
+ /*
+ * Check if we can fit the next segment header into the remaining space
+ * of the sector.
+ */
+ sector_bytes_left = round_up(*cur_out, sectorsize) - *cur_out;
+ if (sector_bytes_left >= LZO_LEN || sector_bytes_left == 0)
+ goto out;
+
+ /* The remaining size is not enough, pad it with zeros */
+ memset(kaddr + offset_in_page(*cur_out), 0,
+ sector_bytes_left);
+ *cur_out += sector_bytes_left;
+
+out:
+ kunmap_local(kaddr);
+ return 0;
+}
+
+int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out)
+{
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
+ const u32 sectorsize = btrfs_sb(mapping->host->i_sb)->sectorsize;
+ struct page *page_in = NULL;
+ char *sizes_ptr;
+ const unsigned long max_nr_page = *out_pages;
+ int ret = 0;
+ /* Points to the file offset of input data */
+ u64 cur_in = start;
+ /* Points to the current output byte */
+ u32 cur_out = 0;
+ u32 len = *total_out;
+
+ ASSERT(max_nr_page > 0);
+ *out_pages = 0;
+ *total_out = 0;
+ *total_in = 0;
+
+ /*
+ * Skip the header for now, we will later come back and write the total
+ * compressed size
+ */
+ cur_out += LZO_LEN;
+ while (cur_in < start + len) {
+ char *data_in;
+ const u32 sectorsize_mask = sectorsize - 1;
+ u32 sector_off = (cur_in - start) & sectorsize_mask;
+ u32 in_len;
+ size_t out_len;
+
+ /* Get the input page first */
+ if (!page_in) {
+ page_in = find_get_page(mapping, cur_in >> PAGE_SHIFT);
+ ASSERT(page_in);
+ }
+
+ /* Compress at most one sector of data each time */
+ in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off);
+ ASSERT(in_len);
+ data_in = kmap_local_page(page_in);
+ ret = lzo1x_1_compress(data_in +
+ offset_in_page(cur_in), in_len,
+ workspace->cbuf, &out_len,
+ workspace->mem);
+ kunmap_local(data_in);
+ if (ret < 0) {
+ pr_debug("BTRFS: lzo in loop returned %d\n", ret);
+ ret = -EIO;
+ goto out;
+ }
+
+ ret = copy_compressed_data_to_page(workspace->cbuf, out_len,
+ pages, max_nr_page,
+ &cur_out, sectorsize);
+ if (ret < 0)
+ goto out;
+
+ cur_in += in_len;
+
+ /*
+ * Check if we're making it bigger after two sectors. And if
+ * it is so, give up.
+ */
+ if (cur_in - start > sectorsize * 2 && cur_in - start < cur_out) {
+ ret = -E2BIG;
+ goto out;
+ }
+
+ /* Check if we have reached page boundary */
+ if (IS_ALIGNED(cur_in, PAGE_SIZE)) {
+ put_page(page_in);
+ page_in = NULL;
+ }
+ }
+
+ /* Store the size of all chunks of compressed data */
+ sizes_ptr = kmap_local_page(pages[0]);
+ write_compress_length(sizes_ptr, cur_out);
+ kunmap_local(sizes_ptr);
+
+ ret = 0;
+ *total_out = cur_out;
+ *total_in = cur_in - start;
+out:
+ if (page_in)
+ put_page(page_in);
+ *out_pages = DIV_ROUND_UP(cur_out, PAGE_SIZE);
+ return ret;
+}
+
+/*
+ * Copy the compressed segment payload into @dest.
+ *
+ * For the payload there will be no padding, just need to do page switching.
+ */
+static void copy_compressed_segment(struct compressed_bio *cb,
+ char *dest, u32 len, u32 *cur_in)
+{
+ u32 orig_in = *cur_in;
+
+ while (*cur_in < orig_in + len) {
+ struct page *cur_page;
+ u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in),
+ orig_in + len - *cur_in);
+
+ ASSERT(copy_len);
+ cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE];
+
+ memcpy_from_page(dest + *cur_in - orig_in, cur_page,
+ offset_in_page(*cur_in), copy_len);
+
+ *cur_in += copy_len;
+ }
+}
+
+int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
+{
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
+ const struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
+ const u32 sectorsize = fs_info->sectorsize;
+ char *kaddr;
+ int ret;
+ /* Compressed data length, can be unaligned */
+ u32 len_in;
+ /* Offset inside the compressed data */
+ u32 cur_in = 0;
+ /* Bytes decompressed so far */
+ u32 cur_out = 0;
+
+ kaddr = kmap_local_page(cb->compressed_pages[0]);
+ len_in = read_compress_length(kaddr);
+ kunmap_local(kaddr);
+ cur_in += LZO_LEN;
+
+ /*
+ * LZO header length check
+ *
+ * The total length should not exceed the maximum extent length,
+ * and all sectors should be used.
+ * If this happens, it means the compressed extent is corrupted.
+ */
+ if (len_in > min_t(size_t, BTRFS_MAX_COMPRESSED, cb->compressed_len) ||
+ round_up(len_in, sectorsize) < cb->compressed_len) {
+ btrfs_err(fs_info,
+ "invalid lzo header, lzo len %u compressed len %u",
+ len_in, cb->compressed_len);
+ return -EUCLEAN;
+ }
+
+ /* Go through each lzo segment */
+ while (cur_in < len_in) {
+ struct page *cur_page;
+ /* Length of the compressed segment */
+ u32 seg_len;
+ u32 sector_bytes_left;
+ size_t out_len = lzo1x_worst_compress(sectorsize);
+
+ /*
+ * We should always have enough space for one segment header
+ * inside current sector.
+ */
+ ASSERT(cur_in / sectorsize ==
+ (cur_in + LZO_LEN - 1) / sectorsize);
+ cur_page = cb->compressed_pages[cur_in / PAGE_SIZE];
+ ASSERT(cur_page);
+ kaddr = kmap_local_page(cur_page);
+ seg_len = read_compress_length(kaddr + offset_in_page(cur_in));
+ kunmap_local(kaddr);
+ cur_in += LZO_LEN;
+
+ if (seg_len > WORKSPACE_CBUF_LENGTH) {
+ /*
+ * seg_len shouldn't be larger than we have allocated
+ * for workspace->cbuf
+ */
+ btrfs_err(fs_info, "unexpectedly large lzo segment len %u",
+ seg_len);
+ ret = -EIO;
+ goto out;
+ }
+
+ /* Copy the compressed segment payload into workspace */
+ copy_compressed_segment(cb, workspace->cbuf, seg_len, &cur_in);
+
+ /* Decompress the data */
+ ret = lzo1x_decompress_safe(workspace->cbuf, seg_len,
+ workspace->buf, &out_len);
+ if (ret != LZO_E_OK) {
+ btrfs_err(fs_info, "failed to decompress");
+ ret = -EIO;
+ goto out;
+ }
+
+ /* Copy the data into inode pages */
+ ret = btrfs_decompress_buf2page(workspace->buf, out_len, cb, cur_out);
+ cur_out += out_len;
+
+ /* All data read, exit */
+ if (ret == 0)
+ goto out;
+ ret = 0;
+
+ /* Check if the sector has enough space for a segment header */
+ sector_bytes_left = sectorsize - (cur_in % sectorsize);
+ if (sector_bytes_left >= LZO_LEN)
+ continue;
+
+ /* Skip the padding zeros */
+ cur_in += sector_bytes_left;
+ }
+out:
+ if (!ret)
+ zero_fill_bio(cb->orig_bio);
+ return ret;
+}
+
+int lzo_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen)
+{
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
+ size_t in_len;
+ size_t out_len;
+ size_t max_segment_len = WORKSPACE_BUF_LENGTH;
+ int ret = 0;
+ char *kaddr;
+ unsigned long bytes;
+
+ if (srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2)
+ return -EUCLEAN;
+
+ in_len = read_compress_length(data_in);
+ if (in_len != srclen)
+ return -EUCLEAN;
+ data_in += LZO_LEN;
+
+ in_len = read_compress_length(data_in);
+ if (in_len != srclen - LZO_LEN * 2) {
+ ret = -EUCLEAN;
+ goto out;
+ }
+ data_in += LZO_LEN;
+
+ out_len = PAGE_SIZE;
+ ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
+ if (ret != LZO_E_OK) {
+ pr_warn("BTRFS: decompress failed!\n");
+ ret = -EIO;
+ goto out;
+ }
+
+ if (out_len < start_byte) {
+ ret = -EIO;
+ goto out;
+ }
+
+ /*
+ * the caller is already checking against PAGE_SIZE, but lets
+ * move this check closer to the memcpy/memset
+ */
+ destlen = min_t(unsigned long, destlen, PAGE_SIZE);
+ bytes = min_t(unsigned long, destlen, out_len - start_byte);
+
+ kaddr = kmap_local_page(dest_page);
+ memcpy(kaddr, workspace->buf + start_byte, bytes);
+
+ /*
+ * btrfs_getblock is doing a zero on the tail of the page too,
+ * but this will cover anything missing from the decompressed
+ * data.
+ */
+ if (bytes < destlen)
+ memset(kaddr+bytes, 0, destlen-bytes);
+ kunmap_local(kaddr);
+out:
+ return ret;
+}
+
+const struct btrfs_compress_op btrfs_lzo_compress = {
+ .workspace_manager = &wsm,
+ .max_level = 1,
+ .default_level = 1,
+};
diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h
new file mode 100644
index 000000000..f9850edfd
--- /dev/null
+++ b/fs/btrfs/misc.h
@@ -0,0 +1,150 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_MISC_H
+#define BTRFS_MISC_H
+
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/math64.h>
+#include <linux/rbtree.h>
+
+#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
+
+static inline void cond_wake_up(struct wait_queue_head *wq)
+{
+ /*
+ * This implies a full smp_mb barrier, see comments for
+ * waitqueue_active why.
+ */
+ if (wq_has_sleeper(wq))
+ wake_up(wq);
+}
+
+static inline void cond_wake_up_nomb(struct wait_queue_head *wq)
+{
+ /*
+ * Special case for conditional wakeup where the barrier required for
+ * waitqueue_active is implied by some of the preceding code. Eg. one
+ * of such atomic operations (atomic_dec_and_return, ...), or a
+ * unlock/lock sequence, etc.
+ */
+ if (waitqueue_active(wq))
+ wake_up(wq);
+}
+
+static inline u64 div_factor(u64 num, int factor)
+{
+ if (factor == 10)
+ return num;
+ num *= factor;
+ return div_u64(num, 10);
+}
+
+static inline u64 div_factor_fine(u64 num, int factor)
+{
+ if (factor == 100)
+ return num;
+ num *= factor;
+ return div_u64(num, 100);
+}
+
+/* Copy of is_power_of_two that is 64bit safe */
+static inline bool is_power_of_two_u64(u64 n)
+{
+ return n != 0 && (n & (n - 1)) == 0;
+}
+
+static inline bool has_single_bit_set(u64 n)
+{
+ return is_power_of_two_u64(n);
+}
+
+/*
+ * Simple bytenr based rb_tree relate structures
+ *
+ * Any structure wants to use bytenr as single search index should have their
+ * structure start with these members.
+ */
+struct rb_simple_node {
+ struct rb_node rb_node;
+ u64 bytenr;
+};
+
+static inline struct rb_node *rb_simple_search(struct rb_root *root, u64 bytenr)
+{
+ struct rb_node *node = root->rb_node;
+ struct rb_simple_node *entry;
+
+ while (node) {
+ entry = rb_entry(node, struct rb_simple_node, rb_node);
+
+ if (bytenr < entry->bytenr)
+ node = node->rb_left;
+ else if (bytenr > entry->bytenr)
+ node = node->rb_right;
+ else
+ return node;
+ }
+ return NULL;
+}
+
+/*
+ * Search @root from an entry that starts or comes after @bytenr.
+ *
+ * @root: the root to search.
+ * @bytenr: bytenr to search from.
+ *
+ * Return the rb_node that start at or after @bytenr. If there is no entry at
+ * or after @bytner return NULL.
+ */
+static inline struct rb_node *rb_simple_search_first(struct rb_root *root,
+ u64 bytenr)
+{
+ struct rb_node *node = root->rb_node, *ret = NULL;
+ struct rb_simple_node *entry, *ret_entry = NULL;
+
+ while (node) {
+ entry = rb_entry(node, struct rb_simple_node, rb_node);
+
+ if (bytenr < entry->bytenr) {
+ if (!ret || entry->bytenr < ret_entry->bytenr) {
+ ret = node;
+ ret_entry = entry;
+ }
+
+ node = node->rb_left;
+ } else if (bytenr > entry->bytenr) {
+ node = node->rb_right;
+ } else {
+ return node;
+ }
+ }
+
+ return ret;
+}
+
+static inline struct rb_node *rb_simple_insert(struct rb_root *root, u64 bytenr,
+ struct rb_node *node)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct rb_simple_node *entry;
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct rb_simple_node, rb_node);
+
+ if (bytenr < entry->bytenr)
+ p = &(*p)->rb_left;
+ else if (bytenr > entry->bytenr)
+ p = &(*p)->rb_right;
+ else
+ return parent;
+ }
+
+ rb_link_node(node, parent, p);
+ rb_insert_color(node, root);
+ return NULL;
+}
+
+#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
new file mode 100644
index 000000000..1b2af4785
--- /dev/null
+++ b/fs/btrfs/ordered-data.c
@@ -0,0 +1,1172 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ */
+
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/writeback.h>
+#include <linux/sched/mm.h>
+#include "misc.h"
+#include "ctree.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "extent_io.h"
+#include "disk-io.h"
+#include "compression.h"
+#include "delalloc-space.h"
+#include "qgroup.h"
+#include "subpage.h"
+
+static struct kmem_cache *btrfs_ordered_extent_cache;
+
+static u64 entry_end(struct btrfs_ordered_extent *entry)
+{
+ if (entry->file_offset + entry->num_bytes < entry->file_offset)
+ return (u64)-1;
+ return entry->file_offset + entry->num_bytes;
+}
+
+/* returns NULL if the insertion worked, or it returns the node it did find
+ * in the tree
+ */
+static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
+ struct rb_node *node)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct btrfs_ordered_extent *entry;
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
+
+ if (file_offset < entry->file_offset)
+ p = &(*p)->rb_left;
+ else if (file_offset >= entry_end(entry))
+ p = &(*p)->rb_right;
+ else
+ return parent;
+ }
+
+ rb_link_node(node, parent, p);
+ rb_insert_color(node, root);
+ return NULL;
+}
+
+/*
+ * look for a given offset in the tree, and if it can't be found return the
+ * first lesser offset
+ */
+static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
+ struct rb_node **prev_ret)
+{
+ struct rb_node *n = root->rb_node;
+ struct rb_node *prev = NULL;
+ struct rb_node *test;
+ struct btrfs_ordered_extent *entry;
+ struct btrfs_ordered_extent *prev_entry = NULL;
+
+ while (n) {
+ entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
+ prev = n;
+ prev_entry = entry;
+
+ if (file_offset < entry->file_offset)
+ n = n->rb_left;
+ else if (file_offset >= entry_end(entry))
+ n = n->rb_right;
+ else
+ return n;
+ }
+ if (!prev_ret)
+ return NULL;
+
+ while (prev && file_offset >= entry_end(prev_entry)) {
+ test = rb_next(prev);
+ if (!test)
+ break;
+ prev_entry = rb_entry(test, struct btrfs_ordered_extent,
+ rb_node);
+ if (file_offset < entry_end(prev_entry))
+ break;
+
+ prev = test;
+ }
+ if (prev)
+ prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
+ rb_node);
+ while (prev && file_offset < entry_end(prev_entry)) {
+ test = rb_prev(prev);
+ if (!test)
+ break;
+ prev_entry = rb_entry(test, struct btrfs_ordered_extent,
+ rb_node);
+ prev = test;
+ }
+ *prev_ret = prev;
+ return NULL;
+}
+
+static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
+ u64 len)
+{
+ if (file_offset + len <= entry->file_offset ||
+ entry->file_offset + entry->num_bytes <= file_offset)
+ return 0;
+ return 1;
+}
+
+/*
+ * look find the first ordered struct that has this offset, otherwise
+ * the first one less than this offset
+ */
+static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
+ u64 file_offset)
+{
+ struct rb_root *root = &tree->tree;
+ struct rb_node *prev = NULL;
+ struct rb_node *ret;
+ struct btrfs_ordered_extent *entry;
+
+ if (tree->last) {
+ entry = rb_entry(tree->last, struct btrfs_ordered_extent,
+ rb_node);
+ if (in_range(file_offset, entry->file_offset, entry->num_bytes))
+ return tree->last;
+ }
+ ret = __tree_search(root, file_offset, &prev);
+ if (!ret)
+ ret = prev;
+ if (ret)
+ tree->last = ret;
+ return ret;
+}
+
+/**
+ * Add an ordered extent to the per-inode tree.
+ *
+ * @inode: Inode that this extent is for.
+ * @file_offset: Logical offset in file where the extent starts.
+ * @num_bytes: Logical length of extent in file.
+ * @ram_bytes: Full length of unencoded data.
+ * @disk_bytenr: Offset of extent on disk.
+ * @disk_num_bytes: Size of extent on disk.
+ * @offset: Offset into unencoded data where file data starts.
+ * @flags: Flags specifying type of extent (1 << BTRFS_ORDERED_*).
+ * @compress_type: Compression algorithm used for data.
+ *
+ * Most of these parameters correspond to &struct btrfs_file_extent_item. The
+ * tree is given a single reference on the ordered extent that was inserted.
+ *
+ * Return: 0 or -ENOMEM.
+ */
+int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
+ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
+ u64 disk_num_bytes, u64 offset, unsigned flags,
+ int compress_type)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
+ struct rb_node *node;
+ struct btrfs_ordered_extent *entry;
+ int ret;
+ u64 qgroup_rsv = 0;
+
+ if (flags &
+ ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) {
+ /* For nocow write, we can release the qgroup rsv right now */
+ ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv);
+ if (ret < 0)
+ return ret;
+ ret = 0;
+ } else {
+ /*
+ * The ordered extent has reserved qgroup space, release now
+ * and pass the reserved number for qgroup_record to free.
+ */
+ ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes, &qgroup_rsv);
+ if (ret < 0)
+ return ret;
+ }
+ entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
+ if (!entry)
+ return -ENOMEM;
+
+ entry->file_offset = file_offset;
+ entry->num_bytes = num_bytes;
+ entry->ram_bytes = ram_bytes;
+ entry->disk_bytenr = disk_bytenr;
+ entry->disk_num_bytes = disk_num_bytes;
+ entry->offset = offset;
+ entry->bytes_left = num_bytes;
+ entry->inode = igrab(&inode->vfs_inode);
+ entry->compress_type = compress_type;
+ entry->truncated_len = (u64)-1;
+ entry->qgroup_rsv = qgroup_rsv;
+ entry->physical = (u64)-1;
+
+ ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0);
+ entry->flags = flags;
+
+ percpu_counter_add_batch(&fs_info->ordered_bytes, num_bytes,
+ fs_info->delalloc_batch);
+
+ /* one ref for the tree */
+ refcount_set(&entry->refs, 1);
+ init_waitqueue_head(&entry->wait);
+ INIT_LIST_HEAD(&entry->list);
+ INIT_LIST_HEAD(&entry->log_list);
+ INIT_LIST_HEAD(&entry->root_extent_list);
+ INIT_LIST_HEAD(&entry->work_list);
+ init_completion(&entry->completion);
+
+ trace_btrfs_ordered_extent_add(inode, entry);
+
+ spin_lock_irq(&tree->lock);
+ node = tree_insert(&tree->tree, file_offset,
+ &entry->rb_node);
+ if (node)
+ btrfs_panic(fs_info, -EEXIST,
+ "inconsistency in ordered tree at offset %llu",
+ file_offset);
+ spin_unlock_irq(&tree->lock);
+
+ spin_lock(&root->ordered_extent_lock);
+ list_add_tail(&entry->root_extent_list,
+ &root->ordered_extents);
+ root->nr_ordered_extents++;
+ if (root->nr_ordered_extents == 1) {
+ spin_lock(&fs_info->ordered_root_lock);
+ BUG_ON(!list_empty(&root->ordered_root));
+ list_add_tail(&root->ordered_root, &fs_info->ordered_roots);
+ spin_unlock(&fs_info->ordered_root_lock);
+ }
+ spin_unlock(&root->ordered_extent_lock);
+
+ /*
+ * We don't need the count_max_extents here, we can assume that all of
+ * that work has been done at higher layers, so this is truly the
+ * smallest the extent is going to get.
+ */
+ spin_lock(&inode->lock);
+ btrfs_mod_outstanding_extents(inode, 1);
+ spin_unlock(&inode->lock);
+
+ return 0;
+}
+
+/*
+ * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
+ * when an ordered extent is finished. If the list covers more than one
+ * ordered extent, it is split across multiples.
+ */
+void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
+ struct btrfs_ordered_sum *sum)
+{
+ struct btrfs_ordered_inode_tree *tree;
+
+ tree = &BTRFS_I(entry->inode)->ordered_tree;
+ spin_lock_irq(&tree->lock);
+ list_add_tail(&sum->list, &entry->list);
+ spin_unlock_irq(&tree->lock);
+}
+
+static void finish_ordered_fn(struct btrfs_work *work)
+{
+ struct btrfs_ordered_extent *ordered_extent;
+
+ ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
+ btrfs_finish_ordered_io(ordered_extent);
+}
+
+/*
+ * Mark all ordered extents io inside the specified range finished.
+ *
+ * @page: The involved page for the operation.
+ * For uncompressed buffered IO, the page status also needs to be
+ * updated to indicate whether the pending ordered io is finished.
+ * Can be NULL for direct IO and compressed write.
+ * For these cases, callers are ensured they won't execute the
+ * endio function twice.
+ *
+ * This function is called for endio, thus the range must have ordered
+ * extent(s) covering it.
+ */
+void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
+ struct page *page, u64 file_offset,
+ u64 num_bytes, bool uptodate)
+{
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_workqueue *wq;
+ struct rb_node *node;
+ struct btrfs_ordered_extent *entry = NULL;
+ unsigned long flags;
+ u64 cur = file_offset;
+
+ if (btrfs_is_free_space_inode(inode))
+ wq = fs_info->endio_freespace_worker;
+ else
+ wq = fs_info->endio_write_workers;
+
+ if (page)
+ ASSERT(page->mapping && page_offset(page) <= file_offset &&
+ file_offset + num_bytes <= page_offset(page) + PAGE_SIZE);
+
+ spin_lock_irqsave(&tree->lock, flags);
+ while (cur < file_offset + num_bytes) {
+ u64 entry_end;
+ u64 end;
+ u32 len;
+
+ node = tree_search(tree, cur);
+ /* No ordered extents at all */
+ if (!node)
+ break;
+
+ entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+ entry_end = entry->file_offset + entry->num_bytes;
+ /*
+ * |<-- OE --->| |
+ * cur
+ * Go to next OE.
+ */
+ if (cur >= entry_end) {
+ node = rb_next(node);
+ /* No more ordered extents, exit */
+ if (!node)
+ break;
+ entry = rb_entry(node, struct btrfs_ordered_extent,
+ rb_node);
+
+ /* Go to next ordered extent and continue */
+ cur = entry->file_offset;
+ continue;
+ }
+ /*
+ * | |<--- OE --->|
+ * cur
+ * Go to the start of OE.
+ */
+ if (cur < entry->file_offset) {
+ cur = entry->file_offset;
+ continue;
+ }
+
+ /*
+ * Now we are definitely inside one ordered extent.
+ *
+ * |<--- OE --->|
+ * |
+ * cur
+ */
+ end = min(entry->file_offset + entry->num_bytes,
+ file_offset + num_bytes) - 1;
+ ASSERT(end + 1 - cur < U32_MAX);
+ len = end + 1 - cur;
+
+ if (page) {
+ /*
+ * Ordered (Private2) bit indicates whether we still
+ * have pending io unfinished for the ordered extent.
+ *
+ * If there's no such bit, we need to skip to next range.
+ */
+ if (!btrfs_page_test_ordered(fs_info, page, cur, len)) {
+ cur += len;
+ continue;
+ }
+ btrfs_page_clear_ordered(fs_info, page, cur, len);
+ }
+
+ /* Now we're fine to update the accounting */
+ if (unlikely(len > entry->bytes_left)) {
+ WARN_ON(1);
+ btrfs_crit(fs_info,
+"bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%u left=%llu",
+ inode->root->root_key.objectid,
+ btrfs_ino(inode),
+ entry->file_offset,
+ entry->num_bytes,
+ len, entry->bytes_left);
+ entry->bytes_left = 0;
+ } else {
+ entry->bytes_left -= len;
+ }
+
+ if (!uptodate)
+ set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
+
+ /*
+ * All the IO of the ordered extent is finished, we need to queue
+ * the finish_func to be executed.
+ */
+ if (entry->bytes_left == 0) {
+ set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+ cond_wake_up(&entry->wait);
+ refcount_inc(&entry->refs);
+ trace_btrfs_ordered_extent_mark_finished(inode, entry);
+ spin_unlock_irqrestore(&tree->lock, flags);
+ btrfs_init_work(&entry->work, finish_ordered_fn, NULL, NULL);
+ btrfs_queue_work(wq, &entry->work);
+ spin_lock_irqsave(&tree->lock, flags);
+ }
+ cur += len;
+ }
+ spin_unlock_irqrestore(&tree->lock, flags);
+}
+
+/*
+ * Finish IO for one ordered extent across a given range. The range can only
+ * contain one ordered extent.
+ *
+ * @cached: The cached ordered extent. If not NULL, we can skip the tree
+ * search and use the ordered extent directly.
+ * Will be also used to store the finished ordered extent.
+ * @file_offset: File offset for the finished IO
+ * @io_size: Length of the finish IO range
+ *
+ * Return true if the ordered extent is finished in the range, and update
+ * @cached.
+ * Return false otherwise.
+ *
+ * NOTE: The range can NOT cross multiple ordered extents.
+ * Thus caller should ensure the range doesn't cross ordered extents.
+ */
+bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
+ struct btrfs_ordered_extent **cached,
+ u64 file_offset, u64 io_size)
+{
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
+ struct rb_node *node;
+ struct btrfs_ordered_extent *entry = NULL;
+ unsigned long flags;
+ bool finished = false;
+
+ spin_lock_irqsave(&tree->lock, flags);
+ if (cached && *cached) {
+ entry = *cached;
+ goto have_entry;
+ }
+
+ node = tree_search(tree, file_offset);
+ if (!node)
+ goto out;
+
+ entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+have_entry:
+ if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
+ goto out;
+
+ if (io_size > entry->bytes_left)
+ btrfs_crit(inode->root->fs_info,
+ "bad ordered accounting left %llu size %llu",
+ entry->bytes_left, io_size);
+
+ entry->bytes_left -= io_size;
+
+ if (entry->bytes_left == 0) {
+ /*
+ * Ensure only one caller can set the flag and finished_ret
+ * accordingly
+ */
+ finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+ /* test_and_set_bit implies a barrier */
+ cond_wake_up_nomb(&entry->wait);
+ }
+out:
+ if (finished && cached && entry) {
+ *cached = entry;
+ refcount_inc(&entry->refs);
+ trace_btrfs_ordered_extent_dec_test_pending(inode, entry);
+ }
+ spin_unlock_irqrestore(&tree->lock, flags);
+ return finished;
+}
+
+/*
+ * used to drop a reference on an ordered extent. This will free
+ * the extent if the last reference is dropped
+ */
+void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
+{
+ struct list_head *cur;
+ struct btrfs_ordered_sum *sum;
+
+ trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry);
+
+ if (refcount_dec_and_test(&entry->refs)) {
+ ASSERT(list_empty(&entry->root_extent_list));
+ ASSERT(list_empty(&entry->log_list));
+ ASSERT(RB_EMPTY_NODE(&entry->rb_node));
+ if (entry->inode)
+ btrfs_add_delayed_iput(entry->inode);
+ while (!list_empty(&entry->list)) {
+ cur = entry->list.next;
+ sum = list_entry(cur, struct btrfs_ordered_sum, list);
+ list_del(&sum->list);
+ kvfree(sum);
+ }
+ kmem_cache_free(btrfs_ordered_extent_cache, entry);
+ }
+}
+
+/*
+ * remove an ordered extent from the tree. No references are dropped
+ * and waiters are woken up.
+ */
+void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
+ struct btrfs_ordered_extent *entry)
+{
+ struct btrfs_ordered_inode_tree *tree;
+ struct btrfs_root *root = btrfs_inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct rb_node *node;
+ bool pending;
+ bool freespace_inode;
+
+ /*
+ * If this is a free space inode the thread has not acquired the ordered
+ * extents lockdep map.
+ */
+ freespace_inode = btrfs_is_free_space_inode(btrfs_inode);
+
+ btrfs_lockdep_acquire(fs_info, btrfs_trans_pending_ordered);
+ /* This is paired with btrfs_add_ordered_extent. */
+ spin_lock(&btrfs_inode->lock);
+ btrfs_mod_outstanding_extents(btrfs_inode, -1);
+ spin_unlock(&btrfs_inode->lock);
+ if (root != fs_info->tree_root) {
+ u64 release;
+
+ if (test_bit(BTRFS_ORDERED_ENCODED, &entry->flags))
+ release = entry->disk_num_bytes;
+ else
+ release = entry->num_bytes;
+ btrfs_delalloc_release_metadata(btrfs_inode, release,
+ test_bit(BTRFS_ORDERED_IOERR,
+ &entry->flags));
+ }
+
+ percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
+ fs_info->delalloc_batch);
+
+ tree = &btrfs_inode->ordered_tree;
+ spin_lock_irq(&tree->lock);
+ node = &entry->rb_node;
+ rb_erase(node, &tree->tree);
+ RB_CLEAR_NODE(node);
+ if (tree->last == node)
+ tree->last = NULL;
+ set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+ pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
+ spin_unlock_irq(&tree->lock);
+
+ /*
+ * The current running transaction is waiting on us, we need to let it
+ * know that we're complete and wake it up.
+ */
+ if (pending) {
+ struct btrfs_transaction *trans;
+
+ /*
+ * The checks for trans are just a formality, it should be set,
+ * but if it isn't we don't want to deref/assert under the spin
+ * lock, so be nice and check if trans is set, but ASSERT() so
+ * if it isn't set a developer will notice.
+ */
+ spin_lock(&fs_info->trans_lock);
+ trans = fs_info->running_transaction;
+ if (trans)
+ refcount_inc(&trans->use_count);
+ spin_unlock(&fs_info->trans_lock);
+
+ ASSERT(trans || BTRFS_FS_ERROR(fs_info));
+ if (trans) {
+ if (atomic_dec_and_test(&trans->pending_ordered))
+ wake_up(&trans->pending_wait);
+ btrfs_put_transaction(trans);
+ }
+ }
+
+ btrfs_lockdep_release(fs_info, btrfs_trans_pending_ordered);
+
+ spin_lock(&root->ordered_extent_lock);
+ list_del_init(&entry->root_extent_list);
+ root->nr_ordered_extents--;
+
+ trace_btrfs_ordered_extent_remove(btrfs_inode, entry);
+
+ if (!root->nr_ordered_extents) {
+ spin_lock(&fs_info->ordered_root_lock);
+ BUG_ON(list_empty(&root->ordered_root));
+ list_del_init(&root->ordered_root);
+ spin_unlock(&fs_info->ordered_root_lock);
+ }
+ spin_unlock(&root->ordered_extent_lock);
+ wake_up(&entry->wait);
+ if (!freespace_inode)
+ btrfs_lockdep_release(fs_info, btrfs_ordered_extent);
+}
+
+static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
+{
+ struct btrfs_ordered_extent *ordered;
+
+ ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
+ btrfs_start_ordered_extent(ordered, 1);
+ complete(&ordered->completion);
+}
+
+/*
+ * wait for all the ordered extents in a root. This is done when balancing
+ * space between drives.
+ */
+u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
+ const u64 range_start, const u64 range_len)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ LIST_HEAD(splice);
+ LIST_HEAD(skipped);
+ LIST_HEAD(works);
+ struct btrfs_ordered_extent *ordered, *next;
+ u64 count = 0;
+ const u64 range_end = range_start + range_len;
+
+ mutex_lock(&root->ordered_extent_mutex);
+ spin_lock(&root->ordered_extent_lock);
+ list_splice_init(&root->ordered_extents, &splice);
+ while (!list_empty(&splice) && nr) {
+ ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
+ root_extent_list);
+
+ if (range_end <= ordered->disk_bytenr ||
+ ordered->disk_bytenr + ordered->disk_num_bytes <= range_start) {
+ list_move_tail(&ordered->root_extent_list, &skipped);
+ cond_resched_lock(&root->ordered_extent_lock);
+ continue;
+ }
+
+ list_move_tail(&ordered->root_extent_list,
+ &root->ordered_extents);
+ refcount_inc(&ordered->refs);
+ spin_unlock(&root->ordered_extent_lock);
+
+ btrfs_init_work(&ordered->flush_work,
+ btrfs_run_ordered_extent_work, NULL, NULL);
+ list_add_tail(&ordered->work_list, &works);
+ btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);
+
+ cond_resched();
+ spin_lock(&root->ordered_extent_lock);
+ if (nr != U64_MAX)
+ nr--;
+ count++;
+ }
+ list_splice_tail(&skipped, &root->ordered_extents);
+ list_splice_tail(&splice, &root->ordered_extents);
+ spin_unlock(&root->ordered_extent_lock);
+
+ list_for_each_entry_safe(ordered, next, &works, work_list) {
+ list_del_init(&ordered->work_list);
+ wait_for_completion(&ordered->completion);
+ btrfs_put_ordered_extent(ordered);
+ cond_resched();
+ }
+ mutex_unlock(&root->ordered_extent_mutex);
+
+ return count;
+}
+
+void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
+ const u64 range_start, const u64 range_len)
+{
+ struct btrfs_root *root;
+ struct list_head splice;
+ u64 done;
+
+ INIT_LIST_HEAD(&splice);
+
+ mutex_lock(&fs_info->ordered_operations_mutex);
+ spin_lock(&fs_info->ordered_root_lock);
+ list_splice_init(&fs_info->ordered_roots, &splice);
+ while (!list_empty(&splice) && nr) {
+ root = list_first_entry(&splice, struct btrfs_root,
+ ordered_root);
+ root = btrfs_grab_root(root);
+ BUG_ON(!root);
+ list_move_tail(&root->ordered_root,
+ &fs_info->ordered_roots);
+ spin_unlock(&fs_info->ordered_root_lock);
+
+ done = btrfs_wait_ordered_extents(root, nr,
+ range_start, range_len);
+ btrfs_put_root(root);
+
+ spin_lock(&fs_info->ordered_root_lock);
+ if (nr != U64_MAX) {
+ nr -= done;
+ }
+ }
+ list_splice_tail(&splice, &fs_info->ordered_roots);
+ spin_unlock(&fs_info->ordered_root_lock);
+ mutex_unlock(&fs_info->ordered_operations_mutex);
+}
+
+/*
+ * Used to start IO or wait for a given ordered extent to finish.
+ *
+ * If wait is one, this effectively waits on page writeback for all the pages
+ * in the extent, and it waits on the io completion code to insert
+ * metadata into the btree corresponding to the extent
+ */
+void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait)
+{
+ u64 start = entry->file_offset;
+ u64 end = start + entry->num_bytes - 1;
+ struct btrfs_inode *inode = BTRFS_I(entry->inode);
+ bool freespace_inode;
+
+ trace_btrfs_ordered_extent_start(inode, entry);
+
+ /*
+ * If this is a free space inode do not take the ordered extents lockdep
+ * map.
+ */
+ freespace_inode = btrfs_is_free_space_inode(inode);
+
+ /*
+ * pages in the range can be dirty, clean or writeback. We
+ * start IO on any dirty ones so the wait doesn't stall waiting
+ * for the flusher thread to find them
+ */
+ if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
+ filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
+ if (wait) {
+ if (!freespace_inode)
+ btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent);
+ wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
+ &entry->flags));
+ }
+}
+
+/*
+ * Used to wait on ordered extents across a large range of bytes.
+ */
+int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
+{
+ int ret = 0;
+ int ret_wb = 0;
+ u64 end;
+ u64 orig_end;
+ struct btrfs_ordered_extent *ordered;
+
+ if (start + len < start) {
+ orig_end = INT_LIMIT(loff_t);
+ } else {
+ orig_end = start + len - 1;
+ if (orig_end > INT_LIMIT(loff_t))
+ orig_end = INT_LIMIT(loff_t);
+ }
+
+ /* start IO across the range first to instantiate any delalloc
+ * extents
+ */
+ ret = btrfs_fdatawrite_range(inode, start, orig_end);
+ if (ret)
+ return ret;
+
+ /*
+ * If we have a writeback error don't return immediately. Wait first
+ * for any ordered extents that haven't completed yet. This is to make
+ * sure no one can dirty the same page ranges and call writepages()
+ * before the ordered extents complete - to avoid failures (-EEXIST)
+ * when adding the new ordered extents to the ordered tree.
+ */
+ ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
+
+ end = orig_end;
+ while (1) {
+ ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end);
+ if (!ordered)
+ break;
+ if (ordered->file_offset > orig_end) {
+ btrfs_put_ordered_extent(ordered);
+ break;
+ }
+ if (ordered->file_offset + ordered->num_bytes <= start) {
+ btrfs_put_ordered_extent(ordered);
+ break;
+ }
+ btrfs_start_ordered_extent(ordered, 1);
+ end = ordered->file_offset;
+ /*
+ * If the ordered extent had an error save the error but don't
+ * exit without waiting first for all other ordered extents in
+ * the range to complete.
+ */
+ if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
+ ret = -EIO;
+ btrfs_put_ordered_extent(ordered);
+ if (end == 0 || end == start)
+ break;
+ end--;
+ }
+ return ret_wb ? ret_wb : ret;
+}
+
+/*
+ * find an ordered extent corresponding to file_offset. return NULL if
+ * nothing is found, otherwise take a reference on the extent and return it
+ */
+struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
+ u64 file_offset)
+{
+ struct btrfs_ordered_inode_tree *tree;
+ struct rb_node *node;
+ struct btrfs_ordered_extent *entry = NULL;
+ unsigned long flags;
+
+ tree = &inode->ordered_tree;
+ spin_lock_irqsave(&tree->lock, flags);
+ node = tree_search(tree, file_offset);
+ if (!node)
+ goto out;
+
+ entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+ if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
+ entry = NULL;
+ if (entry) {
+ refcount_inc(&entry->refs);
+ trace_btrfs_ordered_extent_lookup(inode, entry);
+ }
+out:
+ spin_unlock_irqrestore(&tree->lock, flags);
+ return entry;
+}
+
+/* Since the DIO code tries to lock a wide area we need to look for any ordered
+ * extents that exist in the range, rather than just the start of the range.
+ */
+struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
+ struct btrfs_inode *inode, u64 file_offset, u64 len)
+{
+ struct btrfs_ordered_inode_tree *tree;
+ struct rb_node *node;
+ struct btrfs_ordered_extent *entry = NULL;
+
+ tree = &inode->ordered_tree;
+ spin_lock_irq(&tree->lock);
+ node = tree_search(tree, file_offset);
+ if (!node) {
+ node = tree_search(tree, file_offset + len);
+ if (!node)
+ goto out;
+ }
+
+ while (1) {
+ entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+ if (range_overlaps(entry, file_offset, len))
+ break;
+
+ if (entry->file_offset >= file_offset + len) {
+ entry = NULL;
+ break;
+ }
+ entry = NULL;
+ node = rb_next(node);
+ if (!node)
+ break;
+ }
+out:
+ if (entry) {
+ refcount_inc(&entry->refs);
+ trace_btrfs_ordered_extent_lookup_range(inode, entry);
+ }
+ spin_unlock_irq(&tree->lock);
+ return entry;
+}
+
+/*
+ * Adds all ordered extents to the given list. The list ends up sorted by the
+ * file_offset of the ordered extents.
+ */
+void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
+ struct list_head *list)
+{
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
+ struct rb_node *n;
+
+ ASSERT(inode_is_locked(&inode->vfs_inode));
+
+ spin_lock_irq(&tree->lock);
+ for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
+ struct btrfs_ordered_extent *ordered;
+
+ ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
+
+ if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
+ continue;
+
+ ASSERT(list_empty(&ordered->log_list));
+ list_add_tail(&ordered->log_list, list);
+ refcount_inc(&ordered->refs);
+ trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered);
+ }
+ spin_unlock_irq(&tree->lock);
+}
+
+/*
+ * lookup and return any extent before 'file_offset'. NULL is returned
+ * if none is found
+ */
+struct btrfs_ordered_extent *
+btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
+{
+ struct btrfs_ordered_inode_tree *tree;
+ struct rb_node *node;
+ struct btrfs_ordered_extent *entry = NULL;
+
+ tree = &inode->ordered_tree;
+ spin_lock_irq(&tree->lock);
+ node = tree_search(tree, file_offset);
+ if (!node)
+ goto out;
+
+ entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+ refcount_inc(&entry->refs);
+ trace_btrfs_ordered_extent_lookup_first(inode, entry);
+out:
+ spin_unlock_irq(&tree->lock);
+ return entry;
+}
+
+/*
+ * Lookup the first ordered extent that overlaps the range
+ * [@file_offset, @file_offset + @len).
+ *
+ * The difference between this and btrfs_lookup_first_ordered_extent() is
+ * that this one won't return any ordered extent that does not overlap the range.
+ * And the difference against btrfs_lookup_ordered_extent() is, this function
+ * ensures the first ordered extent gets returned.
+ */
+struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
+ struct btrfs_inode *inode, u64 file_offset, u64 len)
+{
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
+ struct rb_node *node;
+ struct rb_node *cur;
+ struct rb_node *prev;
+ struct rb_node *next;
+ struct btrfs_ordered_extent *entry = NULL;
+
+ spin_lock_irq(&tree->lock);
+ node = tree->tree.rb_node;
+ /*
+ * Here we don't want to use tree_search() which will use tree->last
+ * and screw up the search order.
+ * And __tree_search() can't return the adjacent ordered extents
+ * either, thus here we do our own search.
+ */
+ while (node) {
+ entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+
+ if (file_offset < entry->file_offset) {
+ node = node->rb_left;
+ } else if (file_offset >= entry_end(entry)) {
+ node = node->rb_right;
+ } else {
+ /*
+ * Direct hit, got an ordered extent that starts at
+ * @file_offset
+ */
+ goto out;
+ }
+ }
+ if (!entry) {
+ /* Empty tree */
+ goto out;
+ }
+
+ cur = &entry->rb_node;
+ /* We got an entry around @file_offset, check adjacent entries */
+ if (entry->file_offset < file_offset) {
+ prev = cur;
+ next = rb_next(cur);
+ } else {
+ prev = rb_prev(cur);
+ next = cur;
+ }
+ if (prev) {
+ entry = rb_entry(prev, struct btrfs_ordered_extent, rb_node);
+ if (range_overlaps(entry, file_offset, len))
+ goto out;
+ }
+ if (next) {
+ entry = rb_entry(next, struct btrfs_ordered_extent, rb_node);
+ if (range_overlaps(entry, file_offset, len))
+ goto out;
+ }
+ /* No ordered extent in the range */
+ entry = NULL;
+out:
+ if (entry) {
+ refcount_inc(&entry->refs);
+ trace_btrfs_ordered_extent_lookup_first_range(inode, entry);
+ }
+
+ spin_unlock_irq(&tree->lock);
+ return entry;
+}
+
+/*
+ * btrfs_flush_ordered_range - Lock the passed range and ensures all pending
+ * ordered extents in it are run to completion.
+ *
+ * @inode: Inode whose ordered tree is to be searched
+ * @start: Beginning of range to flush
+ * @end: Last byte of range to lock
+ * @cached_state: If passed, will return the extent state responsible for the
+ * locked range. It's the caller's responsibility to free the cached state.
+ *
+ * This function always returns with the given range locked, ensuring after it's
+ * called no order extent can be pending.
+ */
+void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
+ u64 end,
+ struct extent_state **cached_state)
+{
+ struct btrfs_ordered_extent *ordered;
+ struct extent_state *cache = NULL;
+ struct extent_state **cachedp = &cache;
+
+ if (cached_state)
+ cachedp = cached_state;
+
+ while (1) {
+ lock_extent(&inode->io_tree, start, end, cachedp);
+ ordered = btrfs_lookup_ordered_range(inode, start,
+ end - start + 1);
+ if (!ordered) {
+ /*
+ * If no external cached_state has been passed then
+ * decrement the extra ref taken for cachedp since we
+ * aren't exposing it outside of this function
+ */
+ if (!cached_state)
+ refcount_dec(&cache->refs);
+ break;
+ }
+ unlock_extent(&inode->io_tree, start, end, cachedp);
+ btrfs_start_ordered_extent(ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ }
+}
+
+/*
+ * Lock the passed range and ensure all pending ordered extents in it are run
+ * to completion in nowait mode.
+ *
+ * Return true if btrfs_lock_ordered_range does not return any extents,
+ * otherwise false.
+ */
+bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end)
+{
+ struct btrfs_ordered_extent *ordered;
+
+ if (!try_lock_extent(&inode->io_tree, start, end))
+ return false;
+
+ ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1);
+ if (!ordered)
+ return true;
+
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent(&inode->io_tree, start, end, NULL);
+
+ return false;
+}
+
+
+static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos,
+ u64 len)
+{
+ struct inode *inode = ordered->inode;
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ u64 file_offset = ordered->file_offset + pos;
+ u64 disk_bytenr = ordered->disk_bytenr + pos;
+ unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS;
+
+ /*
+ * The splitting extent is already counted and will be added again in
+ * btrfs_add_ordered_extent_*(). Subtract len to avoid double counting.
+ */
+ percpu_counter_add_batch(&fs_info->ordered_bytes, -len,
+ fs_info->delalloc_batch);
+ WARN_ON_ONCE(flags & (1 << BTRFS_ORDERED_COMPRESSED));
+ return btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, len, len,
+ disk_bytenr, len, 0, flags,
+ ordered->compress_type);
+}
+
+int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
+ u64 post)
+{
+ struct inode *inode = ordered->inode;
+ struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
+ struct rb_node *node;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ int ret = 0;
+
+ trace_btrfs_ordered_extent_split(BTRFS_I(inode), ordered);
+
+ spin_lock_irq(&tree->lock);
+ /* Remove from tree once */
+ node = &ordered->rb_node;
+ rb_erase(node, &tree->tree);
+ RB_CLEAR_NODE(node);
+ if (tree->last == node)
+ tree->last = NULL;
+
+ ordered->file_offset += pre;
+ ordered->disk_bytenr += pre;
+ ordered->num_bytes -= (pre + post);
+ ordered->disk_num_bytes -= (pre + post);
+ ordered->bytes_left -= (pre + post);
+
+ /* Re-insert the node */
+ node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node);
+ if (node)
+ btrfs_panic(fs_info, -EEXIST,
+ "zoned: inconsistency in ordered tree at offset %llu",
+ ordered->file_offset);
+
+ spin_unlock_irq(&tree->lock);
+
+ if (pre)
+ ret = clone_ordered_extent(ordered, 0, pre);
+ if (ret == 0 && post)
+ ret = clone_ordered_extent(ordered, pre + ordered->disk_num_bytes,
+ post);
+
+ return ret;
+}
+
+int __init ordered_data_init(void)
+{
+ btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
+ sizeof(struct btrfs_ordered_extent), 0,
+ SLAB_MEM_SPREAD,
+ NULL);
+ if (!btrfs_ordered_extent_cache)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void __cold ordered_data_exit(void)
+{
+ kmem_cache_destroy(btrfs_ordered_extent_cache);
+}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
new file mode 100644
index 000000000..cc3ca4bb9
--- /dev/null
+++ b/fs/btrfs/ordered-data.h
@@ -0,0 +1,215 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ */
+
+#ifndef BTRFS_ORDERED_DATA_H
+#define BTRFS_ORDERED_DATA_H
+
+/* one of these per inode */
+struct btrfs_ordered_inode_tree {
+ spinlock_t lock;
+ struct rb_root tree;
+ struct rb_node *last;
+};
+
+struct btrfs_ordered_sum {
+ /* bytenr is the start of this extent on disk */
+ u64 bytenr;
+
+ /*
+ * this is the length in bytes covered by the sums array below.
+ */
+ u32 len;
+ struct list_head list;
+ /* last field is a variable length array of csums */
+ u8 sums[];
+};
+
+/*
+ * Bits for btrfs_ordered_extent::flags.
+ *
+ * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written.
+ * It is used to make sure metadata is inserted into the tree only once
+ * per extent.
+ *
+ * BTRFS_ORDERED_COMPLETE is set when the extent is removed from the
+ * rbtree, just before waking any waiters. It is used to indicate the
+ * IO is done and any metadata is inserted into the tree.
+ */
+enum {
+ /*
+ * Different types for ordered extents, one and only one of the 4 types
+ * need to be set when creating ordered extent.
+ *
+ * REGULAR: For regular non-compressed COW write
+ * NOCOW: For NOCOW write into existing non-hole extent
+ * PREALLOC: For NOCOW write into preallocated extent
+ * COMPRESSED: For compressed COW write
+ */
+ BTRFS_ORDERED_REGULAR,
+ BTRFS_ORDERED_NOCOW,
+ BTRFS_ORDERED_PREALLOC,
+ BTRFS_ORDERED_COMPRESSED,
+
+ /*
+ * Extra bit for direct io, can only be set for
+ * REGULAR/NOCOW/PREALLOC. No direct io for compressed extent.
+ */
+ BTRFS_ORDERED_DIRECT,
+
+ /* Extra status bits for ordered extents */
+
+ /* set when all the pages are written */
+ BTRFS_ORDERED_IO_DONE,
+ /* set when removed from the tree */
+ BTRFS_ORDERED_COMPLETE,
+ /* We had an io error when writing this out */
+ BTRFS_ORDERED_IOERR,
+ /* Set when we have to truncate an extent */
+ BTRFS_ORDERED_TRUNCATED,
+ /* Used during fsync to track already logged extents */
+ BTRFS_ORDERED_LOGGED,
+ /* We have already logged all the csums of the ordered extent */
+ BTRFS_ORDERED_LOGGED_CSUM,
+ /* We wait for this extent to complete in the current transaction */
+ BTRFS_ORDERED_PENDING,
+ /* BTRFS_IOC_ENCODED_WRITE */
+ BTRFS_ORDERED_ENCODED,
+};
+
+/* BTRFS_ORDERED_* flags that specify the type of the extent. */
+#define BTRFS_ORDERED_TYPE_FLAGS ((1UL << BTRFS_ORDERED_REGULAR) | \
+ (1UL << BTRFS_ORDERED_NOCOW) | \
+ (1UL << BTRFS_ORDERED_PREALLOC) | \
+ (1UL << BTRFS_ORDERED_COMPRESSED) | \
+ (1UL << BTRFS_ORDERED_DIRECT) | \
+ (1UL << BTRFS_ORDERED_ENCODED))
+
+struct btrfs_ordered_extent {
+ /* logical offset in the file */
+ u64 file_offset;
+
+ /*
+ * These fields directly correspond to the same fields in
+ * btrfs_file_extent_item.
+ */
+ u64 num_bytes;
+ u64 ram_bytes;
+ u64 disk_bytenr;
+ u64 disk_num_bytes;
+ u64 offset;
+
+ /* number of bytes that still need writing */
+ u64 bytes_left;
+
+ /*
+ * the end of the ordered extent which is behind it but
+ * didn't update disk_i_size. Please see the comment of
+ * btrfs_ordered_update_i_size();
+ */
+ u64 outstanding_isize;
+
+ /*
+ * If we get truncated we need to adjust the file extent we enter for
+ * this ordered extent so that we do not expose stale data.
+ */
+ u64 truncated_len;
+
+ /* flags (described above) */
+ unsigned long flags;
+
+ /* compression algorithm */
+ int compress_type;
+
+ /* Qgroup reserved space */
+ int qgroup_rsv;
+
+ /* reference count */
+ refcount_t refs;
+
+ /* the inode we belong to */
+ struct inode *inode;
+
+ /* list of checksums for insertion when the extent io is done */
+ struct list_head list;
+
+ /* used for fast fsyncs */
+ struct list_head log_list;
+
+ /* used to wait for the BTRFS_ORDERED_COMPLETE bit */
+ wait_queue_head_t wait;
+
+ /* our friendly rbtree entry */
+ struct rb_node rb_node;
+
+ /* a per root list of all the pending ordered extents */
+ struct list_head root_extent_list;
+
+ struct btrfs_work work;
+
+ struct completion completion;
+ struct btrfs_work flush_work;
+ struct list_head work_list;
+
+ /*
+ * Used to reverse-map physical address returned from ZONE_APPEND write
+ * command in a workqueue context
+ */
+ u64 physical;
+ struct block_device *bdev;
+};
+
+static inline void
+btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
+{
+ spin_lock_init(&t->lock);
+ t->tree = RB_ROOT;
+ t->last = NULL;
+}
+
+int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
+
+void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
+void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
+ struct btrfs_ordered_extent *entry);
+void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
+ struct page *page, u64 file_offset,
+ u64 num_bytes, bool uptodate);
+bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
+ struct btrfs_ordered_extent **cached,
+ u64 file_offset, u64 io_size);
+int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
+ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
+ u64 disk_num_bytes, u64 offset, unsigned flags,
+ int compress_type);
+void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
+ struct btrfs_ordered_sum *sum);
+struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
+ u64 file_offset);
+void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait);
+int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
+struct btrfs_ordered_extent *
+btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset);
+struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
+ struct btrfs_inode *inode, u64 file_offset, u64 len);
+struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
+ struct btrfs_inode *inode,
+ u64 file_offset,
+ u64 len);
+void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
+ struct list_head *list);
+u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
+ const u64 range_start, const u64 range_len);
+void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
+ const u64 range_start, const u64 range_len);
+void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
+ u64 end,
+ struct extent_state **cached_state);
+bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end);
+int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
+ u64 post);
+int __init ordered_data_init(void);
+void __cold ordered_data_exit(void);
+
+#endif
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
new file mode 100644
index 000000000..aa534108c
--- /dev/null
+++ b/fs/btrfs/orphan.c
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2008 Red Hat. All rights reserved.
+ */
+
+#include "ctree.h"
+#include "disk-io.h"
+
+int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 offset)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret = 0;
+
+ key.objectid = BTRFS_ORPHAN_OBJECTID;
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
+ key.offset = offset;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 offset)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret = 0;
+
+ key.objectid = BTRFS_ORPHAN_OBJECTID;
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
+ key.offset = offset;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+ if (ret) { /* JDM: Really? */
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = btrfs_del_item(trans, root, path);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
new file mode 100644
index 000000000..228eeb04d
--- /dev/null
+++ b/fs/btrfs/print-tree.c
@@ -0,0 +1,411 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ */
+
+#include "ctree.h"
+#include "disk-io.h"
+#include "print-tree.h"
+
+struct root_name_map {
+ u64 id;
+ char name[16];
+};
+
+static const struct root_name_map root_map[] = {
+ { BTRFS_ROOT_TREE_OBJECTID, "ROOT_TREE" },
+ { BTRFS_EXTENT_TREE_OBJECTID, "EXTENT_TREE" },
+ { BTRFS_CHUNK_TREE_OBJECTID, "CHUNK_TREE" },
+ { BTRFS_DEV_TREE_OBJECTID, "DEV_TREE" },
+ { BTRFS_FS_TREE_OBJECTID, "FS_TREE" },
+ { BTRFS_CSUM_TREE_OBJECTID, "CSUM_TREE" },
+ { BTRFS_TREE_LOG_OBJECTID, "TREE_LOG" },
+ { BTRFS_QUOTA_TREE_OBJECTID, "QUOTA_TREE" },
+ { BTRFS_UUID_TREE_OBJECTID, "UUID_TREE" },
+ { BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" },
+ { BTRFS_BLOCK_GROUP_TREE_OBJECTID, "BLOCK_GROUP_TREE" },
+ { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" },
+};
+
+const char *btrfs_root_name(const struct btrfs_key *key, char *buf)
+{
+ int i;
+
+ if (key->objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ snprintf(buf, BTRFS_ROOT_NAME_BUF_LEN,
+ "TREE_RELOC offset=%llu", key->offset);
+ return buf;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(root_map); i++) {
+ if (root_map[i].id == key->objectid)
+ return root_map[i].name;
+ }
+
+ snprintf(buf, BTRFS_ROOT_NAME_BUF_LEN, "%llu", key->objectid);
+ return buf;
+}
+
+static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
+{
+ int num_stripes = btrfs_chunk_num_stripes(eb, chunk);
+ int i;
+ pr_info("\t\tchunk length %llu owner %llu type %llu num_stripes %d\n",
+ btrfs_chunk_length(eb, chunk), btrfs_chunk_owner(eb, chunk),
+ btrfs_chunk_type(eb, chunk), num_stripes);
+ for (i = 0 ; i < num_stripes ; i++) {
+ pr_info("\t\t\tstripe %d devid %llu offset %llu\n", i,
+ btrfs_stripe_devid_nr(eb, chunk, i),
+ btrfs_stripe_offset_nr(eb, chunk, i));
+ }
+}
+static void print_dev_item(struct extent_buffer *eb,
+ struct btrfs_dev_item *dev_item)
+{
+ pr_info("\t\tdev item devid %llu total_bytes %llu bytes used %llu\n",
+ btrfs_device_id(eb, dev_item),
+ btrfs_device_total_bytes(eb, dev_item),
+ btrfs_device_bytes_used(eb, dev_item));
+}
+static void print_extent_data_ref(struct extent_buffer *eb,
+ struct btrfs_extent_data_ref *ref)
+{
+ pr_cont("extent data backref root %llu objectid %llu offset %llu count %u\n",
+ btrfs_extent_data_ref_root(eb, ref),
+ btrfs_extent_data_ref_objectid(eb, ref),
+ btrfs_extent_data_ref_offset(eb, ref),
+ btrfs_extent_data_ref_count(eb, ref));
+}
+
+static void print_extent_item(struct extent_buffer *eb, int slot, int type)
+{
+ struct btrfs_extent_item *ei;
+ struct btrfs_extent_inline_ref *iref;
+ struct btrfs_extent_data_ref *dref;
+ struct btrfs_shared_data_ref *sref;
+ struct btrfs_disk_key key;
+ unsigned long end;
+ unsigned long ptr;
+ u32 item_size = btrfs_item_size(eb, slot);
+ u64 flags;
+ u64 offset;
+ int ref_index = 0;
+
+ if (unlikely(item_size < sizeof(*ei))) {
+ btrfs_print_v0_err(eb->fs_info);
+ btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL);
+ }
+
+ ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item);
+ flags = btrfs_extent_flags(eb, ei);
+
+ pr_info("\t\textent refs %llu gen %llu flags %llu\n",
+ btrfs_extent_refs(eb, ei), btrfs_extent_generation(eb, ei),
+ flags);
+
+ if ((type == BTRFS_EXTENT_ITEM_KEY) &&
+ flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ struct btrfs_tree_block_info *info;
+ info = (struct btrfs_tree_block_info *)(ei + 1);
+ btrfs_tree_block_key(eb, info, &key);
+ pr_info("\t\ttree block key (%llu %u %llu) level %d\n",
+ btrfs_disk_key_objectid(&key), key.type,
+ btrfs_disk_key_offset(&key),
+ btrfs_tree_block_level(eb, info));
+ iref = (struct btrfs_extent_inline_ref *)(info + 1);
+ } else {
+ iref = (struct btrfs_extent_inline_ref *)(ei + 1);
+ }
+
+ ptr = (unsigned long)iref;
+ end = (unsigned long)ei + item_size;
+ while (ptr < end) {
+ iref = (struct btrfs_extent_inline_ref *)ptr;
+ type = btrfs_extent_inline_ref_type(eb, iref);
+ offset = btrfs_extent_inline_ref_offset(eb, iref);
+ pr_info("\t\tref#%d: ", ref_index++);
+ switch (type) {
+ case BTRFS_TREE_BLOCK_REF_KEY:
+ pr_cont("tree block backref root %llu\n", offset);
+ break;
+ case BTRFS_SHARED_BLOCK_REF_KEY:
+ pr_cont("shared block backref parent %llu\n", offset);
+ /*
+ * offset is supposed to be a tree block which
+ * must be aligned to nodesize.
+ */
+ if (!IS_ALIGNED(offset, eb->fs_info->sectorsize))
+ pr_info(
+ "\t\t\t(parent %llu not aligned to sectorsize %u)\n",
+ offset, eb->fs_info->sectorsize);
+ break;
+ case BTRFS_EXTENT_DATA_REF_KEY:
+ dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+ print_extent_data_ref(eb, dref);
+ break;
+ case BTRFS_SHARED_DATA_REF_KEY:
+ sref = (struct btrfs_shared_data_ref *)(iref + 1);
+ pr_cont("shared data backref parent %llu count %u\n",
+ offset, btrfs_shared_data_ref_count(eb, sref));
+ /*
+ * Offset is supposed to be a tree block which must be
+ * aligned to sectorsize.
+ */
+ if (!IS_ALIGNED(offset, eb->fs_info->sectorsize))
+ pr_info(
+ "\t\t\t(parent %llu not aligned to sectorsize %u)\n",
+ offset, eb->fs_info->sectorsize);
+ break;
+ default:
+ pr_cont("(extent %llu has INVALID ref type %d)\n",
+ eb->start, type);
+ return;
+ }
+ ptr += btrfs_extent_inline_ref_size(type);
+ }
+ WARN_ON(ptr > end);
+}
+
+static void print_uuid_item(struct extent_buffer *l, unsigned long offset,
+ u32 item_size)
+{
+ if (!IS_ALIGNED(item_size, sizeof(u64))) {
+ pr_warn("BTRFS: uuid item with illegal size %lu!\n",
+ (unsigned long)item_size);
+ return;
+ }
+ while (item_size) {
+ __le64 subvol_id;
+
+ read_extent_buffer(l, &subvol_id, offset, sizeof(subvol_id));
+ pr_info("\t\tsubvol_id %llu\n", le64_to_cpu(subvol_id));
+ item_size -= sizeof(u64);
+ offset += sizeof(u64);
+ }
+}
+
+/*
+ * Helper to output refs and locking status of extent buffer. Useful to debug
+ * race condition related problems.
+ */
+static void print_eb_refs_lock(struct extent_buffer *eb)
+{
+#ifdef CONFIG_BTRFS_DEBUG
+ btrfs_info(eb->fs_info, "refs %u lock_owner %u current %u",
+ atomic_read(&eb->refs), eb->lock_owner, current->pid);
+#endif
+}
+
+void btrfs_print_leaf(struct extent_buffer *l)
+{
+ struct btrfs_fs_info *fs_info;
+ int i;
+ u32 type, nr;
+ struct btrfs_root_item *ri;
+ struct btrfs_dir_item *di;
+ struct btrfs_inode_item *ii;
+ struct btrfs_block_group_item *bi;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_extent_data_ref *dref;
+ struct btrfs_shared_data_ref *sref;
+ struct btrfs_dev_extent *dev_extent;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+
+ if (!l)
+ return;
+
+ fs_info = l->fs_info;
+ nr = btrfs_header_nritems(l);
+
+ btrfs_info(fs_info,
+ "leaf %llu gen %llu total ptrs %d free space %d owner %llu",
+ btrfs_header_bytenr(l), btrfs_header_generation(l), nr,
+ btrfs_leaf_free_space(l), btrfs_header_owner(l));
+ print_eb_refs_lock(l);
+ for (i = 0 ; i < nr ; i++) {
+ btrfs_item_key_to_cpu(l, &key, i);
+ type = key.type;
+ pr_info("\titem %d key (%llu %u %llu) itemoff %d itemsize %d\n",
+ i, key.objectid, type, key.offset,
+ btrfs_item_offset(l, i), btrfs_item_size(l, i));
+ switch (type) {
+ case BTRFS_INODE_ITEM_KEY:
+ ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
+ pr_info("\t\tinode generation %llu size %llu mode %o\n",
+ btrfs_inode_generation(l, ii),
+ btrfs_inode_size(l, ii),
+ btrfs_inode_mode(l, ii));
+ break;
+ case BTRFS_DIR_ITEM_KEY:
+ di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
+ btrfs_dir_item_key_to_cpu(l, di, &found_key);
+ pr_info("\t\tdir oid %llu type %u\n",
+ found_key.objectid,
+ btrfs_dir_type(l, di));
+ break;
+ case BTRFS_ROOT_ITEM_KEY:
+ ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
+ pr_info("\t\troot data bytenr %llu refs %u\n",
+ btrfs_disk_root_bytenr(l, ri),
+ btrfs_disk_root_refs(l, ri));
+ break;
+ case BTRFS_EXTENT_ITEM_KEY:
+ case BTRFS_METADATA_ITEM_KEY:
+ print_extent_item(l, i, type);
+ break;
+ case BTRFS_TREE_BLOCK_REF_KEY:
+ pr_info("\t\ttree block backref\n");
+ break;
+ case BTRFS_SHARED_BLOCK_REF_KEY:
+ pr_info("\t\tshared block backref\n");
+ break;
+ case BTRFS_EXTENT_DATA_REF_KEY:
+ dref = btrfs_item_ptr(l, i,
+ struct btrfs_extent_data_ref);
+ print_extent_data_ref(l, dref);
+ break;
+ case BTRFS_SHARED_DATA_REF_KEY:
+ sref = btrfs_item_ptr(l, i,
+ struct btrfs_shared_data_ref);
+ pr_info("\t\tshared data backref count %u\n",
+ btrfs_shared_data_ref_count(l, sref));
+ break;
+ case BTRFS_EXTENT_DATA_KEY:
+ fi = btrfs_item_ptr(l, i,
+ struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(l, fi) ==
+ BTRFS_FILE_EXTENT_INLINE) {
+ pr_info("\t\tinline extent data size %llu\n",
+ btrfs_file_extent_ram_bytes(l, fi));
+ break;
+ }
+ pr_info("\t\textent data disk bytenr %llu nr %llu\n",
+ btrfs_file_extent_disk_bytenr(l, fi),
+ btrfs_file_extent_disk_num_bytes(l, fi));
+ pr_info("\t\textent data offset %llu nr %llu ram %llu\n",
+ btrfs_file_extent_offset(l, fi),
+ btrfs_file_extent_num_bytes(l, fi),
+ btrfs_file_extent_ram_bytes(l, fi));
+ break;
+ case BTRFS_EXTENT_REF_V0_KEY:
+ btrfs_print_v0_err(fs_info);
+ btrfs_handle_fs_error(fs_info, -EINVAL, NULL);
+ break;
+ case BTRFS_BLOCK_GROUP_ITEM_KEY:
+ bi = btrfs_item_ptr(l, i,
+ struct btrfs_block_group_item);
+ pr_info(
+ "\t\tblock group used %llu chunk_objectid %llu flags %llu\n",
+ btrfs_block_group_used(l, bi),
+ btrfs_block_group_chunk_objectid(l, bi),
+ btrfs_block_group_flags(l, bi));
+ break;
+ case BTRFS_CHUNK_ITEM_KEY:
+ print_chunk(l, btrfs_item_ptr(l, i,
+ struct btrfs_chunk));
+ break;
+ case BTRFS_DEV_ITEM_KEY:
+ print_dev_item(l, btrfs_item_ptr(l, i,
+ struct btrfs_dev_item));
+ break;
+ case BTRFS_DEV_EXTENT_KEY:
+ dev_extent = btrfs_item_ptr(l, i,
+ struct btrfs_dev_extent);
+ pr_info("\t\tdev extent chunk_tree %llu\n\t\tchunk objectid %llu chunk offset %llu length %llu\n",
+ btrfs_dev_extent_chunk_tree(l, dev_extent),
+ btrfs_dev_extent_chunk_objectid(l, dev_extent),
+ btrfs_dev_extent_chunk_offset(l, dev_extent),
+ btrfs_dev_extent_length(l, dev_extent));
+ break;
+ case BTRFS_PERSISTENT_ITEM_KEY:
+ pr_info("\t\tpersistent item objectid %llu offset %llu\n",
+ key.objectid, key.offset);
+ switch (key.objectid) {
+ case BTRFS_DEV_STATS_OBJECTID:
+ pr_info("\t\tdevice stats\n");
+ break;
+ default:
+ pr_info("\t\tunknown persistent item\n");
+ }
+ break;
+ case BTRFS_TEMPORARY_ITEM_KEY:
+ pr_info("\t\ttemporary item objectid %llu offset %llu\n",
+ key.objectid, key.offset);
+ switch (key.objectid) {
+ case BTRFS_BALANCE_OBJECTID:
+ pr_info("\t\tbalance status\n");
+ break;
+ default:
+ pr_info("\t\tunknown temporary item\n");
+ }
+ break;
+ case BTRFS_DEV_REPLACE_KEY:
+ pr_info("\t\tdev replace\n");
+ break;
+ case BTRFS_UUID_KEY_SUBVOL:
+ case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
+ print_uuid_item(l, btrfs_item_ptr_offset(l, i),
+ btrfs_item_size(l, i));
+ break;
+ }
+ }
+}
+
+void btrfs_print_tree(struct extent_buffer *c, bool follow)
+{
+ struct btrfs_fs_info *fs_info;
+ int i; u32 nr;
+ struct btrfs_key key;
+ int level;
+
+ if (!c)
+ return;
+ fs_info = c->fs_info;
+ nr = btrfs_header_nritems(c);
+ level = btrfs_header_level(c);
+ if (level == 0) {
+ btrfs_print_leaf(c);
+ return;
+ }
+ btrfs_info(fs_info,
+ "node %llu level %d gen %llu total ptrs %d free spc %u owner %llu",
+ btrfs_header_bytenr(c), level, btrfs_header_generation(c),
+ nr, (u32)BTRFS_NODEPTRS_PER_BLOCK(fs_info) - nr,
+ btrfs_header_owner(c));
+ print_eb_refs_lock(c);
+ for (i = 0; i < nr; i++) {
+ btrfs_node_key_to_cpu(c, &key, i);
+ pr_info("\tkey %d (%llu %u %llu) block %llu gen %llu\n",
+ i, key.objectid, key.type, key.offset,
+ btrfs_node_blockptr(c, i),
+ btrfs_node_ptr_generation(c, i));
+ }
+ if (!follow)
+ return;
+ for (i = 0; i < nr; i++) {
+ struct btrfs_key first_key;
+ struct extent_buffer *next;
+
+ btrfs_node_key_to_cpu(c, &first_key, i);
+ next = read_tree_block(fs_info, btrfs_node_blockptr(c, i),
+ btrfs_header_owner(c),
+ btrfs_node_ptr_generation(c, i),
+ level - 1, &first_key);
+ if (IS_ERR(next))
+ continue;
+ if (!extent_buffer_uptodate(next)) {
+ free_extent_buffer(next);
+ continue;
+ }
+
+ if (btrfs_is_leaf(next) &&
+ level != 1)
+ BUG();
+ if (btrfs_header_level(next) !=
+ level - 1)
+ BUG();
+ btrfs_print_tree(next, follow);
+ free_extent_buffer(next);
+ }
+}
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
new file mode 100644
index 000000000..8c3e9319e
--- /dev/null
+++ b/fs/btrfs/print-tree.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ */
+
+#ifndef BTRFS_PRINT_TREE_H
+#define BTRFS_PRINT_TREE_H
+
+/* Buffer size to contain tree name and possibly additional data (offset) */
+#define BTRFS_ROOT_NAME_BUF_LEN 48
+
+void btrfs_print_leaf(struct extent_buffer *l);
+void btrfs_print_tree(struct extent_buffer *c, bool follow);
+const char *btrfs_root_name(const struct btrfs_key *key, char *buf);
+
+#endif
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
new file mode 100644
index 000000000..055a63127
--- /dev/null
+++ b/fs/btrfs/props.c
@@ -0,0 +1,467 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com>
+ */
+
+#include <linux/hashtable.h>
+#include "props.h"
+#include "btrfs_inode.h"
+#include "transaction.h"
+#include "ctree.h"
+#include "xattr.h"
+#include "compression.h"
+
+#define BTRFS_PROP_HANDLERS_HT_BITS 8
+static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS);
+
+struct prop_handler {
+ struct hlist_node node;
+ const char *xattr_name;
+ int (*validate)(const struct btrfs_inode *inode, const char *value,
+ size_t len);
+ int (*apply)(struct inode *inode, const char *value, size_t len);
+ const char *(*extract)(struct inode *inode);
+ bool (*ignore)(const struct btrfs_inode *inode);
+ int inheritable;
+};
+
+static const struct hlist_head *find_prop_handlers_by_hash(const u64 hash)
+{
+ struct hlist_head *h;
+
+ h = &prop_handlers_ht[hash_min(hash, BTRFS_PROP_HANDLERS_HT_BITS)];
+ if (hlist_empty(h))
+ return NULL;
+
+ return h;
+}
+
+static const struct prop_handler *
+find_prop_handler(const char *name,
+ const struct hlist_head *handlers)
+{
+ struct prop_handler *h;
+
+ if (!handlers) {
+ u64 hash = btrfs_name_hash(name, strlen(name));
+
+ handlers = find_prop_handlers_by_hash(hash);
+ if (!handlers)
+ return NULL;
+ }
+
+ hlist_for_each_entry(h, handlers, node)
+ if (!strcmp(h->xattr_name, name))
+ return h;
+
+ return NULL;
+}
+
+int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name,
+ const char *value, size_t value_len)
+{
+ const struct prop_handler *handler;
+
+ if (strlen(name) <= XATTR_BTRFS_PREFIX_LEN)
+ return -EINVAL;
+
+ handler = find_prop_handler(name, NULL);
+ if (!handler)
+ return -EINVAL;
+
+ if (value_len == 0)
+ return 0;
+
+ return handler->validate(inode, value, value_len);
+}
+
+/*
+ * Check if a property should be ignored (not set) for an inode.
+ *
+ * @inode: The target inode.
+ * @name: The property's name.
+ *
+ * The caller must be sure the given property name is valid, for example by
+ * having previously called btrfs_validate_prop().
+ *
+ * Returns: true if the property should be ignored for the given inode
+ * false if the property must not be ignored for the given inode
+ */
+bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name)
+{
+ const struct prop_handler *handler;
+
+ handler = find_prop_handler(name, NULL);
+ ASSERT(handler != NULL);
+
+ return handler->ignore(inode);
+}
+
+int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode,
+ const char *name, const char *value, size_t value_len,
+ int flags)
+{
+ const struct prop_handler *handler;
+ int ret;
+
+ handler = find_prop_handler(name, NULL);
+ if (!handler)
+ return -EINVAL;
+
+ if (value_len == 0) {
+ ret = btrfs_setxattr(trans, inode, handler->xattr_name,
+ NULL, 0, flags);
+ if (ret)
+ return ret;
+
+ ret = handler->apply(inode, NULL, 0);
+ ASSERT(ret == 0);
+
+ return ret;
+ }
+
+ ret = btrfs_setxattr(trans, inode, handler->xattr_name, value,
+ value_len, flags);
+ if (ret)
+ return ret;
+ ret = handler->apply(inode, value, value_len);
+ if (ret) {
+ btrfs_setxattr(trans, inode, handler->xattr_name, NULL,
+ 0, flags);
+ return ret;
+ }
+
+ set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags);
+
+ return 0;
+}
+
+static int iterate_object_props(struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 objectid,
+ void (*iterator)(void *,
+ const struct prop_handler *,
+ const char *,
+ size_t),
+ void *ctx)
+{
+ int ret;
+ char *name_buf = NULL;
+ char *value_buf = NULL;
+ int name_buf_len = 0;
+ int value_buf_len = 0;
+
+ while (1) {
+ struct btrfs_key key;
+ struct btrfs_dir_item *di;
+ struct extent_buffer *leaf;
+ u32 total_len, cur, this_len;
+ int slot;
+ const struct hlist_head *handlers;
+
+ slot = path->slots[0];
+ leaf = path->nodes[0];
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != objectid)
+ break;
+ if (key.type != BTRFS_XATTR_ITEM_KEY)
+ break;
+
+ handlers = find_prop_handlers_by_hash(key.offset);
+ if (!handlers)
+ goto next_slot;
+
+ di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+ cur = 0;
+ total_len = btrfs_item_size(leaf, slot);
+
+ while (cur < total_len) {
+ u32 name_len = btrfs_dir_name_len(leaf, di);
+ u32 data_len = btrfs_dir_data_len(leaf, di);
+ unsigned long name_ptr, data_ptr;
+ const struct prop_handler *handler;
+
+ this_len = sizeof(*di) + name_len + data_len;
+ name_ptr = (unsigned long)(di + 1);
+ data_ptr = name_ptr + name_len;
+
+ if (name_len <= XATTR_BTRFS_PREFIX_LEN ||
+ memcmp_extent_buffer(leaf, XATTR_BTRFS_PREFIX,
+ name_ptr,
+ XATTR_BTRFS_PREFIX_LEN))
+ goto next_dir_item;
+
+ if (name_len >= name_buf_len) {
+ kfree(name_buf);
+ name_buf_len = name_len + 1;
+ name_buf = kmalloc(name_buf_len, GFP_NOFS);
+ if (!name_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ read_extent_buffer(leaf, name_buf, name_ptr, name_len);
+ name_buf[name_len] = '\0';
+
+ handler = find_prop_handler(name_buf, handlers);
+ if (!handler)
+ goto next_dir_item;
+
+ if (data_len > value_buf_len) {
+ kfree(value_buf);
+ value_buf_len = data_len;
+ value_buf = kmalloc(data_len, GFP_NOFS);
+ if (!value_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ read_extent_buffer(leaf, value_buf, data_ptr, data_len);
+
+ iterator(ctx, handler, value_buf, data_len);
+next_dir_item:
+ cur += this_len;
+ di = (struct btrfs_dir_item *)((char *) di + this_len);
+ }
+
+next_slot:
+ path->slots[0]++;
+ }
+
+ ret = 0;
+out:
+ btrfs_release_path(path);
+ kfree(name_buf);
+ kfree(value_buf);
+
+ return ret;
+}
+
+static void inode_prop_iterator(void *ctx,
+ const struct prop_handler *handler,
+ const char *value,
+ size_t len)
+{
+ struct inode *inode = ctx;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ ret = handler->apply(inode, value, len);
+ if (unlikely(ret))
+ btrfs_warn(root->fs_info,
+ "error applying prop %s to ino %llu (root %llu): %d",
+ handler->xattr_name, btrfs_ino(BTRFS_I(inode)),
+ root->root_key.objectid, ret);
+ else
+ set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags);
+}
+
+int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 ino = btrfs_ino(BTRFS_I(inode));
+
+ return iterate_object_props(root, path, ino, inode_prop_iterator, inode);
+}
+
+static int prop_compression_validate(const struct btrfs_inode *inode,
+ const char *value, size_t len)
+{
+ if (!btrfs_inode_can_compress(inode))
+ return -EINVAL;
+
+ if (!value)
+ return 0;
+
+ if (btrfs_compress_is_valid_type(value, len))
+ return 0;
+
+ if ((len == 2 && strncmp("no", value, 2) == 0) ||
+ (len == 4 && strncmp("none", value, 4) == 0))
+ return 0;
+
+ return -EINVAL;
+}
+
+static int prop_compression_apply(struct inode *inode, const char *value,
+ size_t len)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ int type;
+
+ /* Reset to defaults */
+ if (len == 0) {
+ BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
+ BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
+ BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE;
+ return 0;
+ }
+
+ /* Set NOCOMPRESS flag */
+ if ((len == 2 && strncmp("no", value, 2) == 0) ||
+ (len == 4 && strncmp("none", value, 4) == 0)) {
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
+ BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
+ BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE;
+
+ return 0;
+ }
+
+ if (!strncmp("lzo", value, 3)) {
+ type = BTRFS_COMPRESS_LZO;
+ btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
+ } else if (!strncmp("zlib", value, 4)) {
+ type = BTRFS_COMPRESS_ZLIB;
+ } else if (!strncmp("zstd", value, 4)) {
+ type = BTRFS_COMPRESS_ZSTD;
+ btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
+ } else {
+ return -EINVAL;
+ }
+
+ BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
+ BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
+ BTRFS_I(inode)->prop_compress = type;
+
+ return 0;
+}
+
+static bool prop_compression_ignore(const struct btrfs_inode *inode)
+{
+ /*
+ * Compression only has effect for regular files, and for directories
+ * we set it just to propagate it to new files created inside them.
+ * Everything else (symlinks, devices, sockets, fifos) is pointless as
+ * it will do nothing, so don't waste metadata space on a compression
+ * xattr for anything that is neither a file nor a directory.
+ */
+ if (!S_ISREG(inode->vfs_inode.i_mode) &&
+ !S_ISDIR(inode->vfs_inode.i_mode))
+ return true;
+
+ return false;
+}
+
+static const char *prop_compression_extract(struct inode *inode)
+{
+ switch (BTRFS_I(inode)->prop_compress) {
+ case BTRFS_COMPRESS_ZLIB:
+ case BTRFS_COMPRESS_LZO:
+ case BTRFS_COMPRESS_ZSTD:
+ return btrfs_compress_type2str(BTRFS_I(inode)->prop_compress);
+ default:
+ break;
+ }
+
+ return NULL;
+}
+
+static struct prop_handler prop_handlers[] = {
+ {
+ .xattr_name = XATTR_BTRFS_PREFIX "compression",
+ .validate = prop_compression_validate,
+ .apply = prop_compression_apply,
+ .extract = prop_compression_extract,
+ .ignore = prop_compression_ignore,
+ .inheritable = 1
+ },
+};
+
+int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *parent)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret;
+ int i;
+ bool need_reserve = false;
+
+ if (!test_bit(BTRFS_INODE_HAS_PROPS,
+ &BTRFS_I(parent)->runtime_flags))
+ return 0;
+
+ for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) {
+ const struct prop_handler *h = &prop_handlers[i];
+ const char *value;
+ u64 num_bytes = 0;
+
+ if (!h->inheritable)
+ continue;
+
+ if (h->ignore(BTRFS_I(inode)))
+ continue;
+
+ value = h->extract(parent);
+ if (!value)
+ continue;
+
+ /*
+ * This is not strictly necessary as the property should be
+ * valid, but in case it isn't, don't propagate it further.
+ */
+ ret = h->validate(BTRFS_I(inode), value, strlen(value));
+ if (ret)
+ continue;
+
+ /*
+ * Currently callers should be reserving 1 item for properties,
+ * since we only have 1 property that we currently support. If
+ * we add more in the future we need to try and reserve more
+ * space for them. But we should also revisit how we do space
+ * reservations if we do add more properties in the future.
+ */
+ if (need_reserve) {
+ num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
+ ret = btrfs_block_rsv_add(fs_info, trans->block_rsv,
+ num_bytes,
+ BTRFS_RESERVE_NO_FLUSH);
+ if (ret)
+ return ret;
+ }
+
+ ret = btrfs_setxattr(trans, inode, h->xattr_name, value,
+ strlen(value), 0);
+ if (!ret) {
+ ret = h->apply(inode, value, strlen(value));
+ if (ret)
+ btrfs_setxattr(trans, inode, h->xattr_name,
+ NULL, 0, 0);
+ else
+ set_bit(BTRFS_INODE_HAS_PROPS,
+ &BTRFS_I(inode)->runtime_flags);
+ }
+
+ if (need_reserve) {
+ btrfs_block_rsv_release(fs_info, trans->block_rsv,
+ num_bytes, NULL);
+ if (ret)
+ return ret;
+ }
+ need_reserve = true;
+ }
+
+ return 0;
+}
+
+void __init btrfs_props_init(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) {
+ struct prop_handler *p = &prop_handlers[i];
+ u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name));
+
+ hash_add(prop_handlers_ht, &p->node, h);
+ }
+}
+
diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h
new file mode 100644
index 000000000..ca9dd3df1
--- /dev/null
+++ b/fs/btrfs/props.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com>
+ */
+
+#ifndef BTRFS_PROPS_H
+#define BTRFS_PROPS_H
+
+#include "ctree.h"
+
+void __init btrfs_props_init(void);
+
+int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode,
+ const char *name, const char *value, size_t value_len,
+ int flags);
+int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name,
+ const char *value, size_t value_len);
+bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name);
+
+int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path);
+
+int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct inode *dir);
+
+#endif
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
new file mode 100644
index 000000000..96ec9ccc2
--- /dev/null
+++ b/fs/btrfs/qgroup.c
@@ -0,0 +1,4431 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2011 STRATO. All rights reserved.
+ */
+
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/btrfs.h>
+#include <linux/sched/mm.h>
+
+#include "ctree.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "locking.h"
+#include "ulist.h"
+#include "backref.h"
+#include "extent_io.h"
+#include "qgroup.h"
+#include "block-group.h"
+#include "sysfs.h"
+#include "tree-mod-log.h"
+
+/*
+ * Helpers to access qgroup reservation
+ *
+ * Callers should ensure the lock context and type are valid
+ */
+
+static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup)
+{
+ u64 ret = 0;
+ int i;
+
+ for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
+ ret += qgroup->rsv.values[i];
+
+ return ret;
+}
+
+#ifdef CONFIG_BTRFS_DEBUG
+static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type)
+{
+ if (type == BTRFS_QGROUP_RSV_DATA)
+ return "data";
+ if (type == BTRFS_QGROUP_RSV_META_PERTRANS)
+ return "meta_pertrans";
+ if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
+ return "meta_prealloc";
+ return NULL;
+}
+#endif
+
+static void qgroup_rsv_add(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup, u64 num_bytes,
+ enum btrfs_qgroup_rsv_type type)
+{
+ trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type);
+ qgroup->rsv.values[type] += num_bytes;
+}
+
+static void qgroup_rsv_release(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup, u64 num_bytes,
+ enum btrfs_qgroup_rsv_type type)
+{
+ trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type);
+ if (qgroup->rsv.values[type] >= num_bytes) {
+ qgroup->rsv.values[type] -= num_bytes;
+ return;
+ }
+#ifdef CONFIG_BTRFS_DEBUG
+ WARN_RATELIMIT(1,
+ "qgroup %llu %s reserved space underflow, have %llu to free %llu",
+ qgroup->qgroupid, qgroup_rsv_type_str(type),
+ qgroup->rsv.values[type], num_bytes);
+#endif
+ qgroup->rsv.values[type] = 0;
+}
+
+static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *dest,
+ struct btrfs_qgroup *src)
+{
+ int i;
+
+ for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
+ qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i);
+}
+
+static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *dest,
+ struct btrfs_qgroup *src)
+{
+ int i;
+
+ for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
+ qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i);
+}
+
+static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
+ int mod)
+{
+ if (qg->old_refcnt < seq)
+ qg->old_refcnt = seq;
+ qg->old_refcnt += mod;
+}
+
+static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq,
+ int mod)
+{
+ if (qg->new_refcnt < seq)
+ qg->new_refcnt = seq;
+ qg->new_refcnt += mod;
+}
+
+static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq)
+{
+ if (qg->old_refcnt < seq)
+ return 0;
+ return qg->old_refcnt - seq;
+}
+
+static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq)
+{
+ if (qg->new_refcnt < seq)
+ return 0;
+ return qg->new_refcnt - seq;
+}
+
+/*
+ * glue structure to represent the relations between qgroups.
+ */
+struct btrfs_qgroup_list {
+ struct list_head next_group;
+ struct list_head next_member;
+ struct btrfs_qgroup *group;
+ struct btrfs_qgroup *member;
+};
+
+static inline u64 qgroup_to_aux(struct btrfs_qgroup *qg)
+{
+ return (u64)(uintptr_t)qg;
+}
+
+static inline struct btrfs_qgroup* unode_aux_to_qgroup(struct ulist_node *n)
+{
+ return (struct btrfs_qgroup *)(uintptr_t)n->aux;
+}
+
+static int
+qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
+ int init_flags);
+static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
+
+/* must be called with qgroup_ioctl_lock held */
+static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
+ u64 qgroupid)
+{
+ struct rb_node *n = fs_info->qgroup_tree.rb_node;
+ struct btrfs_qgroup *qgroup;
+
+ while (n) {
+ qgroup = rb_entry(n, struct btrfs_qgroup, node);
+ if (qgroup->qgroupid < qgroupid)
+ n = n->rb_left;
+ else if (qgroup->qgroupid > qgroupid)
+ n = n->rb_right;
+ else
+ return qgroup;
+ }
+ return NULL;
+}
+
+/* must be called with qgroup_lock held */
+static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
+ u64 qgroupid)
+{
+ struct rb_node **p = &fs_info->qgroup_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct btrfs_qgroup *qgroup;
+
+ while (*p) {
+ parent = *p;
+ qgroup = rb_entry(parent, struct btrfs_qgroup, node);
+
+ if (qgroup->qgroupid < qgroupid)
+ p = &(*p)->rb_left;
+ else if (qgroup->qgroupid > qgroupid)
+ p = &(*p)->rb_right;
+ else
+ return qgroup;
+ }
+
+ qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
+ if (!qgroup)
+ return ERR_PTR(-ENOMEM);
+
+ qgroup->qgroupid = qgroupid;
+ INIT_LIST_HEAD(&qgroup->groups);
+ INIT_LIST_HEAD(&qgroup->members);
+ INIT_LIST_HEAD(&qgroup->dirty);
+
+ rb_link_node(&qgroup->node, parent, p);
+ rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);
+
+ return qgroup;
+}
+
+static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup)
+{
+ struct btrfs_qgroup_list *list;
+
+ list_del(&qgroup->dirty);
+ while (!list_empty(&qgroup->groups)) {
+ list = list_first_entry(&qgroup->groups,
+ struct btrfs_qgroup_list, next_group);
+ list_del(&list->next_group);
+ list_del(&list->next_member);
+ kfree(list);
+ }
+
+ while (!list_empty(&qgroup->members)) {
+ list = list_first_entry(&qgroup->members,
+ struct btrfs_qgroup_list, next_member);
+ list_del(&list->next_group);
+ list_del(&list->next_member);
+ kfree(list);
+ }
+}
+
+/* must be called with qgroup_lock held */
+static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
+{
+ struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);
+
+ if (!qgroup)
+ return -ENOENT;
+
+ rb_erase(&qgroup->node, &fs_info->qgroup_tree);
+ __del_qgroup_rb(fs_info, qgroup);
+ return 0;
+}
+
+/*
+ * Add relation specified by two qgroups.
+ *
+ * Must be called with qgroup_lock held.
+ *
+ * Return: 0 on success
+ * -ENOENT if one of the qgroups is NULL
+ * <0 other errors
+ */
+static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *parent)
+{
+ struct btrfs_qgroup_list *list;
+
+ if (!member || !parent)
+ return -ENOENT;
+
+ list = kzalloc(sizeof(*list), GFP_ATOMIC);
+ if (!list)
+ return -ENOMEM;
+
+ list->group = parent;
+ list->member = member;
+ list_add_tail(&list->next_group, &member->groups);
+ list_add_tail(&list->next_member, &parent->members);
+
+ return 0;
+}
+
+/*
+ * Add relation specified by two qgroup ids.
+ *
+ * Must be called with qgroup_lock held.
+ *
+ * Return: 0 on success
+ * -ENOENT if one of the ids does not exist
+ * <0 other errors
+ */
+static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid)
+{
+ struct btrfs_qgroup *member;
+ struct btrfs_qgroup *parent;
+
+ member = find_qgroup_rb(fs_info, memberid);
+ parent = find_qgroup_rb(fs_info, parentid);
+
+ return __add_relation_rb(member, parent);
+}
+
+/* Must be called with qgroup_lock held */
+static int del_relation_rb(struct btrfs_fs_info *fs_info,
+ u64 memberid, u64 parentid)
+{
+ struct btrfs_qgroup *member;
+ struct btrfs_qgroup *parent;
+ struct btrfs_qgroup_list *list;
+
+ member = find_qgroup_rb(fs_info, memberid);
+ parent = find_qgroup_rb(fs_info, parentid);
+ if (!member || !parent)
+ return -ENOENT;
+
+ list_for_each_entry(list, &member->groups, next_group) {
+ if (list->group == parent) {
+ list_del(&list->next_group);
+ list_del(&list->next_member);
+ kfree(list);
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
+ u64 rfer, u64 excl)
+{
+ struct btrfs_qgroup *qgroup;
+
+ qgroup = find_qgroup_rb(fs_info, qgroupid);
+ if (!qgroup)
+ return -EINVAL;
+ if (qgroup->rfer != rfer || qgroup->excl != excl)
+ return -EINVAL;
+ return 0;
+}
+#endif
+
+static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info)
+{
+ fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT |
+ BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN |
+ BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING);
+}
+
+/*
+ * The full config is read in one go, only called from open_ctree()
+ * It doesn't use any locking, as at this point we're still single-threaded
+ */
+int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_root *quota_root = fs_info->quota_root;
+ struct btrfs_path *path = NULL;
+ struct extent_buffer *l;
+ int slot;
+ int ret = 0;
+ u64 flags = 0;
+ u64 rescan_progress = 0;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return 0;
+
+ fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
+ if (!fs_info->qgroup_ulist) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = btrfs_sysfs_add_qgroups(fs_info);
+ if (ret < 0)
+ goto out;
+ /* default this to quota off, in case no status key is found */
+ fs_info->qgroup_flags = 0;
+
+ /*
+ * pass 1: read status, all qgroup infos and limits
+ */
+ key.objectid = 0;
+ key.type = 0;
+ key.offset = 0;
+ ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1);
+ if (ret)
+ goto out;
+
+ while (1) {
+ struct btrfs_qgroup *qgroup;
+
+ slot = path->slots[0];
+ l = path->nodes[0];
+ btrfs_item_key_to_cpu(l, &found_key, slot);
+
+ if (found_key.type == BTRFS_QGROUP_STATUS_KEY) {
+ struct btrfs_qgroup_status_item *ptr;
+
+ ptr = btrfs_item_ptr(l, slot,
+ struct btrfs_qgroup_status_item);
+
+ if (btrfs_qgroup_status_version(l, ptr) !=
+ BTRFS_QGROUP_STATUS_VERSION) {
+ btrfs_err(fs_info,
+ "old qgroup version, quota disabled");
+ goto out;
+ }
+ if (btrfs_qgroup_status_generation(l, ptr) !=
+ fs_info->generation) {
+ qgroup_mark_inconsistent(fs_info);
+ btrfs_err(fs_info,
+ "qgroup generation mismatch, marked as inconsistent");
+ }
+ fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
+ ptr);
+ rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
+ goto next1;
+ }
+
+ if (found_key.type != BTRFS_QGROUP_INFO_KEY &&
+ found_key.type != BTRFS_QGROUP_LIMIT_KEY)
+ goto next1;
+
+ qgroup = find_qgroup_rb(fs_info, found_key.offset);
+ if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
+ (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
+ btrfs_err(fs_info, "inconsistent qgroup config");
+ qgroup_mark_inconsistent(fs_info);
+ }
+ if (!qgroup) {
+ qgroup = add_qgroup_rb(fs_info, found_key.offset);
+ if (IS_ERR(qgroup)) {
+ ret = PTR_ERR(qgroup);
+ goto out;
+ }
+ }
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
+ if (ret < 0)
+ goto out;
+
+ switch (found_key.type) {
+ case BTRFS_QGROUP_INFO_KEY: {
+ struct btrfs_qgroup_info_item *ptr;
+
+ ptr = btrfs_item_ptr(l, slot,
+ struct btrfs_qgroup_info_item);
+ qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr);
+ qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr);
+ qgroup->excl = btrfs_qgroup_info_excl(l, ptr);
+ qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr);
+ /* generation currently unused */
+ break;
+ }
+ case BTRFS_QGROUP_LIMIT_KEY: {
+ struct btrfs_qgroup_limit_item *ptr;
+
+ ptr = btrfs_item_ptr(l, slot,
+ struct btrfs_qgroup_limit_item);
+ qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr);
+ qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr);
+ qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr);
+ qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr);
+ qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr);
+ break;
+ }
+ }
+next1:
+ ret = btrfs_next_item(quota_root, path);
+ if (ret < 0)
+ goto out;
+ if (ret)
+ break;
+ }
+ btrfs_release_path(path);
+
+ /*
+ * pass 2: read all qgroup relations
+ */
+ key.objectid = 0;
+ key.type = BTRFS_QGROUP_RELATION_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0);
+ if (ret)
+ goto out;
+ while (1) {
+ slot = path->slots[0];
+ l = path->nodes[0];
+ btrfs_item_key_to_cpu(l, &found_key, slot);
+
+ if (found_key.type != BTRFS_QGROUP_RELATION_KEY)
+ goto next2;
+
+ if (found_key.objectid > found_key.offset) {
+ /* parent <- member, not needed to build config */
+ /* FIXME should we omit the key completely? */
+ goto next2;
+ }
+
+ ret = add_relation_rb(fs_info, found_key.objectid,
+ found_key.offset);
+ if (ret == -ENOENT) {
+ btrfs_warn(fs_info,
+ "orphan qgroup relation 0x%llx->0x%llx",
+ found_key.objectid, found_key.offset);
+ ret = 0; /* ignore the error */
+ }
+ if (ret)
+ goto out;
+next2:
+ ret = btrfs_next_item(quota_root, path);
+ if (ret < 0)
+ goto out;
+ if (ret)
+ break;
+ }
+out:
+ btrfs_free_path(path);
+ fs_info->qgroup_flags |= flags;
+ if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
+ clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+ else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
+ ret >= 0)
+ ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
+
+ if (ret < 0) {
+ ulist_free(fs_info->qgroup_ulist);
+ fs_info->qgroup_ulist = NULL;
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ btrfs_sysfs_del_qgroups(fs_info);
+ }
+
+ return ret < 0 ? ret : 0;
+}
+
+/*
+ * Called in close_ctree() when quota is still enabled. This verifies we don't
+ * leak some reserved space.
+ *
+ * Return false if no reserved space is left.
+ * Return true if some reserved space is leaked.
+ */
+bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info)
+{
+ struct rb_node *node;
+ bool ret = false;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return ret;
+ /*
+ * Since we're unmounting, there is no race and no need to grab qgroup
+ * lock. And here we don't go post-order to provide a more user
+ * friendly sorted result.
+ */
+ for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) {
+ struct btrfs_qgroup *qgroup;
+ int i;
+
+ qgroup = rb_entry(node, struct btrfs_qgroup, node);
+ for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) {
+ if (qgroup->rsv.values[i]) {
+ ret = true;
+ btrfs_warn(fs_info,
+ "qgroup %hu/%llu has unreleased space, type %d rsv %llu",
+ btrfs_qgroup_level(qgroup->qgroupid),
+ btrfs_qgroup_subvolid(qgroup->qgroupid),
+ i, qgroup->rsv.values[i]);
+ }
+ }
+ }
+ return ret;
+}
+
+/*
+ * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(),
+ * first two are in single-threaded paths.And for the third one, we have set
+ * quota_root to be null with qgroup_lock held before, so it is safe to clean
+ * up the in-memory structures without qgroup_lock held.
+ */
+void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
+{
+ struct rb_node *n;
+ struct btrfs_qgroup *qgroup;
+
+ while ((n = rb_first(&fs_info->qgroup_tree))) {
+ qgroup = rb_entry(n, struct btrfs_qgroup, node);
+ rb_erase(n, &fs_info->qgroup_tree);
+ __del_qgroup_rb(fs_info, qgroup);
+ btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
+ kfree(qgroup);
+ }
+ /*
+ * We call btrfs_free_qgroup_config() when unmounting
+ * filesystem and disabling quota, so we set qgroup_ulist
+ * to be null here to avoid double free.
+ */
+ ulist_free(fs_info->qgroup_ulist);
+ fs_info->qgroup_ulist = NULL;
+ btrfs_sysfs_del_qgroups(fs_info);
+}
+
+static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
+ u64 dst)
+{
+ int ret;
+ struct btrfs_root *quota_root = trans->fs_info->quota_root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = src;
+ key.type = BTRFS_QGROUP_RELATION_KEY;
+ key.offset = dst;
+
+ ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);
+
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
+ u64 dst)
+{
+ int ret;
+ struct btrfs_root *quota_root = trans->fs_info->quota_root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = src;
+ key.type = BTRFS_QGROUP_RELATION_KEY;
+ key.offset = dst;
+
+ ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = btrfs_del_item(trans, quota_root, path);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int add_qgroup_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *quota_root, u64 qgroupid)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_qgroup_info_item *qgroup_info;
+ struct btrfs_qgroup_limit_item *qgroup_limit;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+
+ if (btrfs_is_testing(quota_root->fs_info))
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = 0;
+ key.type = BTRFS_QGROUP_INFO_KEY;
+ key.offset = qgroupid;
+
+ /*
+ * Avoid a transaction abort by catching -EEXIST here. In that
+ * case, we proceed by re-initializing the existing structure
+ * on disk.
+ */
+
+ ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
+ sizeof(*qgroup_info));
+ if (ret && ret != -EEXIST)
+ goto out;
+
+ leaf = path->nodes[0];
+ qgroup_info = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_qgroup_info_item);
+ btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid);
+ btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0);
+ btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0);
+ btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);
+ btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0);
+
+ btrfs_mark_buffer_dirty(leaf);
+
+ btrfs_release_path(path);
+
+ key.type = BTRFS_QGROUP_LIMIT_KEY;
+ ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
+ sizeof(*qgroup_limit));
+ if (ret && ret != -EEXIST)
+ goto out;
+
+ leaf = path->nodes[0];
+ qgroup_limit = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_qgroup_limit_item);
+ btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0);
+ btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0);
+ btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0);
+ btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
+ btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);
+
+ btrfs_mark_buffer_dirty(leaf);
+
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid)
+{
+ int ret;
+ struct btrfs_root *quota_root = trans->fs_info->quota_root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = 0;
+ key.type = BTRFS_QGROUP_INFO_KEY;
+ key.offset = qgroupid;
+ ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = btrfs_del_item(trans, quota_root, path);
+ if (ret)
+ goto out;
+
+ btrfs_release_path(path);
+
+ key.type = BTRFS_QGROUP_LIMIT_KEY;
+ ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = btrfs_del_item(trans, quota_root, path);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
+ struct btrfs_qgroup *qgroup)
+{
+ struct btrfs_root *quota_root = trans->fs_info->quota_root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *l;
+ struct btrfs_qgroup_limit_item *qgroup_limit;
+ int ret;
+ int slot;
+
+ key.objectid = 0;
+ key.type = BTRFS_QGROUP_LIMIT_KEY;
+ key.offset = qgroup->qgroupid;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
+ if (ret > 0)
+ ret = -ENOENT;
+
+ if (ret)
+ goto out;
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+ qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item);
+ btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags);
+ btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer);
+ btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl);
+ btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
+ btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);
+
+ btrfs_mark_buffer_dirty(l);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
+ struct btrfs_qgroup *qgroup)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *quota_root = fs_info->quota_root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *l;
+ struct btrfs_qgroup_info_item *qgroup_info;
+ int ret;
+ int slot;
+
+ if (btrfs_is_testing(fs_info))
+ return 0;
+
+ key.objectid = 0;
+ key.type = BTRFS_QGROUP_INFO_KEY;
+ key.offset = qgroup->qgroupid;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
+ if (ret > 0)
+ ret = -ENOENT;
+
+ if (ret)
+ goto out;
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+ qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item);
+ btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid);
+ btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer);
+ btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
+ btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
+ btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);
+
+ btrfs_mark_buffer_dirty(l);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *quota_root = fs_info->quota_root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *l;
+ struct btrfs_qgroup_status_item *ptr;
+ int ret;
+ int slot;
+
+ key.objectid = 0;
+ key.type = BTRFS_QGROUP_STATUS_KEY;
+ key.offset = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
+ if (ret > 0)
+ ret = -ENOENT;
+
+ if (ret)
+ goto out;
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+ ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
+ btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags &
+ BTRFS_QGROUP_STATUS_FLAGS_MASK);
+ btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
+ btrfs_set_qgroup_status_rescan(l, ptr,
+ fs_info->qgroup_rescan_progress.objectid);
+
+ btrfs_mark_buffer_dirty(l);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * called with qgroup_lock held
+ */
+static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *leaf = NULL;
+ int ret;
+ int nr = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = 0;
+ key.offset = 0;
+ key.type = 0;
+
+ while (1) {
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+ leaf = path->nodes[0];
+ nr = btrfs_header_nritems(leaf);
+ if (!nr)
+ break;
+ /*
+ * delete the leaf one by one
+ * since the whole tree is going
+ * to be deleted.
+ */
+ path->slots[0] = 0;
+ ret = btrfs_del_items(trans, root, path, 0, nr);
+ if (ret)
+ goto out;
+
+ btrfs_release_path(path);
+ }
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *quota_root;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_path *path = NULL;
+ struct btrfs_qgroup_status_item *ptr;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_qgroup *qgroup = NULL;
+ struct btrfs_trans_handle *trans = NULL;
+ struct ulist *ulist = NULL;
+ int ret = 0;
+ int slot;
+
+ /*
+ * We need to have subvol_sem write locked, to prevent races between
+ * concurrent tasks trying to enable quotas, because we will unlock
+ * and relock qgroup_ioctl_lock before setting fs_info->quota_root
+ * and before setting BTRFS_FS_QUOTA_ENABLED.
+ */
+ lockdep_assert_held_write(&fs_info->subvol_sem);
+
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info,
+ "qgroups are currently unsupported in extent tree v2");
+ return -EINVAL;
+ }
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ if (fs_info->quota_root)
+ goto out;
+
+ ulist = ulist_alloc(GFP_KERNEL);
+ if (!ulist) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = btrfs_sysfs_add_qgroups(fs_info);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * Unlock qgroup_ioctl_lock before starting the transaction. This is to
+ * avoid lock acquisition inversion problems (reported by lockdep) between
+ * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we
+ * start a transaction.
+ * After we started the transaction lock qgroup_ioctl_lock again and
+ * check if someone else created the quota root in the meanwhile. If so,
+ * just return success and release the transaction handle.
+ *
+ * Also we don't need to worry about someone else calling
+ * btrfs_sysfs_add_qgroups() after we unlock and getting an error because
+ * that function returns 0 (success) when the sysfs entries already exist.
+ */
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+
+ /*
+ * 1 for quota root item
+ * 1 for BTRFS_QGROUP_STATUS item
+ *
+ * Yet we also need 2*n items for a QGROUP_INFO/QGROUP_LIMIT items
+ * per subvolume. However those are not currently reserved since it
+ * would be a lot of overkill.
+ */
+ trans = btrfs_start_transaction(tree_root, 2);
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto out;
+ }
+
+ if (fs_info->quota_root)
+ goto out;
+
+ fs_info->qgroup_ulist = ulist;
+ ulist = NULL;
+
+ /*
+ * initially create the quota tree
+ */
+ quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID);
+ if (IS_ERR(quota_root)) {
+ ret = PTR_ERR(quota_root);
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ btrfs_abort_transaction(trans, ret);
+ goto out_free_root;
+ }
+
+ key.objectid = 0;
+ key.type = BTRFS_QGROUP_STATUS_KEY;
+ key.offset = 0;
+
+ ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
+ sizeof(*ptr));
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_free_path;
+ }
+
+ leaf = path->nodes[0];
+ ptr = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_qgroup_status_item);
+ btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
+ btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
+ fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
+ BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags &
+ BTRFS_QGROUP_STATUS_FLAGS_MASK);
+ btrfs_set_qgroup_status_rescan(leaf, ptr, 0);
+
+ btrfs_mark_buffer_dirty(leaf);
+
+ key.objectid = 0;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = 0;
+
+ btrfs_release_path(path);
+ ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0);
+ if (ret > 0)
+ goto out_add_root;
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_free_path;
+ }
+
+ while (1) {
+ slot = path->slots[0];
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+ if (found_key.type == BTRFS_ROOT_REF_KEY) {
+
+ /* Release locks on tree_root before we access quota_root */
+ btrfs_release_path(path);
+
+ ret = add_qgroup_item(trans, quota_root,
+ found_key.offset);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_free_path;
+ }
+
+ qgroup = add_qgroup_rb(fs_info, found_key.offset);
+ if (IS_ERR(qgroup)) {
+ ret = PTR_ERR(qgroup);
+ btrfs_abort_transaction(trans, ret);
+ goto out_free_path;
+ }
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_free_path;
+ }
+ ret = btrfs_search_slot_for_read(tree_root, &found_key,
+ path, 1, 0);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_free_path;
+ }
+ if (ret > 0) {
+ /*
+ * Shouldn't happen, but in case it does we
+ * don't need to do the btrfs_next_item, just
+ * continue.
+ */
+ continue;
+ }
+ }
+ ret = btrfs_next_item(tree_root, path);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_free_path;
+ }
+ if (ret)
+ break;
+ }
+
+out_add_root:
+ btrfs_release_path(path);
+ ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_free_path;
+ }
+
+ qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID);
+ if (IS_ERR(qgroup)) {
+ ret = PTR_ERR(qgroup);
+ btrfs_abort_transaction(trans, ret);
+ goto out_free_path;
+ }
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_free_path;
+ }
+
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ /*
+ * Commit the transaction while not holding qgroup_ioctl_lock, to avoid
+ * a deadlock with tasks concurrently doing other qgroup operations, such
+ * adding/removing qgroups or adding/deleting qgroup relations for example,
+ * because all qgroup operations first start or join a transaction and then
+ * lock the qgroup_ioctl_lock mutex.
+ * We are safe from a concurrent task trying to enable quotas, by calling
+ * this function, since we are serialized by fs_info->subvol_sem.
+ */
+ ret = btrfs_commit_transaction(trans);
+ trans = NULL;
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ if (ret)
+ goto out_free_path;
+
+ /*
+ * Set quota enabled flag after committing the transaction, to avoid
+ * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot
+ * creation.
+ */
+ spin_lock(&fs_info->qgroup_lock);
+ fs_info->quota_root = quota_root;
+ set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+ spin_unlock(&fs_info->qgroup_lock);
+
+ ret = qgroup_rescan_init(fs_info, 0, 1);
+ if (!ret) {
+ qgroup_rescan_zero_tracking(fs_info);
+ fs_info->qgroup_rescan_running = true;
+ btrfs_queue_work(fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
+ } else {
+ /*
+ * We have set both BTRFS_FS_QUOTA_ENABLED and
+ * BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with
+ * -EINPROGRESS. That can happen because someone started the
+ * rescan worker by calling quota rescan ioctl before we
+ * attempted to initialize the rescan worker. Failure due to
+ * quotas disabled in the meanwhile is not possible, because
+ * we are holding a write lock on fs_info->subvol_sem, which
+ * is also acquired when disabling quotas.
+ * Ignore such error, and any other error would need to undo
+ * everything we did in the transaction we just committed.
+ */
+ ASSERT(ret == -EINPROGRESS);
+ ret = 0;
+ }
+
+out_free_path:
+ btrfs_free_path(path);
+out_free_root:
+ if (ret)
+ btrfs_put_root(quota_root);
+out:
+ if (ret) {
+ ulist_free(fs_info->qgroup_ulist);
+ fs_info->qgroup_ulist = NULL;
+ btrfs_sysfs_del_qgroups(fs_info);
+ }
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ if (ret && trans)
+ btrfs_end_transaction(trans);
+ else if (trans)
+ ret = btrfs_end_transaction(trans);
+ ulist_free(ulist);
+ return ret;
+}
+
+int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *quota_root;
+ struct btrfs_trans_handle *trans = NULL;
+ int ret = 0;
+
+ /*
+ * We need to have subvol_sem write locked to prevent races with
+ * snapshot creation.
+ */
+ lockdep_assert_held_write(&fs_info->subvol_sem);
+
+ /*
+ * Lock the cleaner mutex to prevent races with concurrent relocation,
+ * because relocation may be building backrefs for blocks of the quota
+ * root while we are deleting the root. This is like dropping fs roots
+ * of deleted snapshots/subvolumes, we need the same protection.
+ *
+ * This also prevents races between concurrent tasks trying to disable
+ * quotas, because we will unlock and relock qgroup_ioctl_lock across
+ * BTRFS_FS_QUOTA_ENABLED changes.
+ */
+ mutex_lock(&fs_info->cleaner_mutex);
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ if (!fs_info->quota_root)
+ goto out;
+
+ /*
+ * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to
+ * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs
+ * to lock that mutex while holding a transaction handle and the rescan
+ * worker needs to commit a transaction.
+ */
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+
+ /*
+ * Request qgroup rescan worker to complete and wait for it. This wait
+ * must be done before transaction start for quota disable since it may
+ * deadlock with transaction by the qgroup rescan worker.
+ */
+ clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+ btrfs_qgroup_wait_for_completion(fs_info, false);
+
+ /*
+ * 1 For the root item
+ *
+ * We should also reserve enough items for the quota tree deletion in
+ * btrfs_clean_quota_tree but this is not done.
+ *
+ * Also, we must always start a transaction without holding the mutex
+ * qgroup_ioctl_lock, see btrfs_quota_enable().
+ */
+ trans = btrfs_start_transaction(fs_info->tree_root, 1);
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+ goto out;
+ }
+
+ if (!fs_info->quota_root)
+ goto out;
+
+ spin_lock(&fs_info->qgroup_lock);
+ quota_root = fs_info->quota_root;
+ fs_info->quota_root = NULL;
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
+ fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
+ spin_unlock(&fs_info->qgroup_lock);
+
+ btrfs_free_qgroup_config(fs_info);
+
+ ret = btrfs_clean_quota_tree(trans, quota_root);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ ret = btrfs_del_root(trans, &quota_root->root_key);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ spin_lock(&fs_info->trans_lock);
+ list_del(&quota_root->dirty_list);
+ spin_unlock(&fs_info->trans_lock);
+
+ btrfs_tree_lock(quota_root->node);
+ btrfs_clean_tree_block(quota_root->node);
+ btrfs_tree_unlock(quota_root->node);
+ btrfs_free_tree_block(trans, btrfs_root_id(quota_root),
+ quota_root->node, 0, 1);
+
+ btrfs_put_root(quota_root);
+
+out:
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ if (ret && trans)
+ btrfs_end_transaction(trans);
+ else if (trans)
+ ret = btrfs_end_transaction(trans);
+ mutex_unlock(&fs_info->cleaner_mutex);
+
+ return ret;
+}
+
+static void qgroup_dirty(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup)
+{
+ if (list_empty(&qgroup->dirty))
+ list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
+}
+
+/*
+ * The easy accounting, we're updating qgroup relationship whose child qgroup
+ * only has exclusive extents.
+ *
+ * In this case, all exclusive extents will also be exclusive for parent, so
+ * excl/rfer just get added/removed.
+ *
+ * So is qgroup reservation space, which should also be added/removed to
+ * parent.
+ * Or when child tries to release reservation space, parent will underflow its
+ * reservation (for relationship adding case).
+ *
+ * Caller should hold fs_info->qgroup_lock.
+ */
+static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
+ struct ulist *tmp, u64 ref_root,
+ struct btrfs_qgroup *src, int sign)
+{
+ struct btrfs_qgroup *qgroup;
+ struct btrfs_qgroup_list *glist;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ u64 num_bytes = src->excl;
+ int ret = 0;
+
+ qgroup = find_qgroup_rb(fs_info, ref_root);
+ if (!qgroup)
+ goto out;
+
+ qgroup->rfer += sign * num_bytes;
+ qgroup->rfer_cmpr += sign * num_bytes;
+
+ WARN_ON(sign < 0 && qgroup->excl < num_bytes);
+ qgroup->excl += sign * num_bytes;
+ qgroup->excl_cmpr += sign * num_bytes;
+
+ if (sign > 0)
+ qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
+ else
+ qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
+
+ qgroup_dirty(fs_info, qgroup);
+
+ /* Get all of the parent groups that contain this qgroup */
+ list_for_each_entry(glist, &qgroup->groups, next_group) {
+ ret = ulist_add(tmp, glist->group->qgroupid,
+ qgroup_to_aux(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
+ }
+
+ /* Iterate all of the parents and adjust their reference counts */
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(tmp, &uiter))) {
+ qgroup = unode_aux_to_qgroup(unode);
+ qgroup->rfer += sign * num_bytes;
+ qgroup->rfer_cmpr += sign * num_bytes;
+ WARN_ON(sign < 0 && qgroup->excl < num_bytes);
+ qgroup->excl += sign * num_bytes;
+ if (sign > 0)
+ qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
+ else
+ qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
+ qgroup->excl_cmpr += sign * num_bytes;
+ qgroup_dirty(fs_info, qgroup);
+
+ /* Add any parents of the parents */
+ list_for_each_entry(glist, &qgroup->groups, next_group) {
+ ret = ulist_add(tmp, glist->group->qgroupid,
+ qgroup_to_aux(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
+ }
+ }
+ ret = 0;
+out:
+ return ret;
+}
+
+
+/*
+ * Quick path for updating qgroup with only excl refs.
+ *
+ * In that case, just update all parent will be enough.
+ * Or we needs to do a full rescan.
+ * Caller should also hold fs_info->qgroup_lock.
+ *
+ * Return 0 for quick update, return >0 for need to full rescan
+ * and mark INCONSISTENT flag.
+ * Return < 0 for other error.
+ */
+static int quick_update_accounting(struct btrfs_fs_info *fs_info,
+ struct ulist *tmp, u64 src, u64 dst,
+ int sign)
+{
+ struct btrfs_qgroup *qgroup;
+ int ret = 1;
+ int err = 0;
+
+ qgroup = find_qgroup_rb(fs_info, src);
+ if (!qgroup)
+ goto out;
+ if (qgroup->excl == qgroup->rfer) {
+ ret = 0;
+ err = __qgroup_excl_accounting(fs_info, tmp, dst,
+ qgroup, sign);
+ if (err < 0) {
+ ret = err;
+ goto out;
+ }
+ }
+out:
+ if (ret)
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ return ret;
+}
+
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
+ u64 dst)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_qgroup *parent;
+ struct btrfs_qgroup *member;
+ struct btrfs_qgroup_list *list;
+ struct ulist *tmp;
+ unsigned int nofs_flag;
+ int ret = 0;
+
+ /* Check the level of src and dst first */
+ if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
+ return -EINVAL;
+
+ /* We hold a transaction handle open, must do a NOFS allocation. */
+ nofs_flag = memalloc_nofs_save();
+ tmp = ulist_alloc(GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
+ if (!tmp)
+ return -ENOMEM;
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ if (!fs_info->quota_root) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+ member = find_qgroup_rb(fs_info, src);
+ parent = find_qgroup_rb(fs_info, dst);
+ if (!member || !parent) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* check if such qgroup relation exist firstly */
+ list_for_each_entry(list, &member->groups, next_group) {
+ if (list->group == parent) {
+ ret = -EEXIST;
+ goto out;
+ }
+ }
+
+ ret = add_qgroup_relation_item(trans, src, dst);
+ if (ret)
+ goto out;
+
+ ret = add_qgroup_relation_item(trans, dst, src);
+ if (ret) {
+ del_qgroup_relation_item(trans, src, dst);
+ goto out;
+ }
+
+ spin_lock(&fs_info->qgroup_lock);
+ ret = __add_relation_rb(member, parent);
+ if (ret < 0) {
+ spin_unlock(&fs_info->qgroup_lock);
+ goto out;
+ }
+ ret = quick_update_accounting(fs_info, tmp, src, dst, 1);
+ spin_unlock(&fs_info->qgroup_lock);
+out:
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ ulist_free(tmp);
+ return ret;
+}
+
+static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
+ u64 dst)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_qgroup *parent;
+ struct btrfs_qgroup *member;
+ struct btrfs_qgroup_list *list;
+ struct ulist *tmp;
+ bool found = false;
+ unsigned int nofs_flag;
+ int ret = 0;
+ int ret2;
+
+ /* We hold a transaction handle open, must do a NOFS allocation. */
+ nofs_flag = memalloc_nofs_save();
+ tmp = ulist_alloc(GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
+ if (!tmp)
+ return -ENOMEM;
+
+ if (!fs_info->quota_root) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+ member = find_qgroup_rb(fs_info, src);
+ parent = find_qgroup_rb(fs_info, dst);
+ /*
+ * The parent/member pair doesn't exist, then try to delete the dead
+ * relation items only.
+ */
+ if (!member || !parent)
+ goto delete_item;
+
+ /* check if such qgroup relation exist firstly */
+ list_for_each_entry(list, &member->groups, next_group) {
+ if (list->group == parent) {
+ found = true;
+ break;
+ }
+ }
+
+delete_item:
+ ret = del_qgroup_relation_item(trans, src, dst);
+ if (ret < 0 && ret != -ENOENT)
+ goto out;
+ ret2 = del_qgroup_relation_item(trans, dst, src);
+ if (ret2 < 0 && ret2 != -ENOENT)
+ goto out;
+
+ /* At least one deletion succeeded, return 0 */
+ if (!ret || !ret2)
+ ret = 0;
+
+ if (found) {
+ spin_lock(&fs_info->qgroup_lock);
+ del_relation_rb(fs_info, src, dst);
+ ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
+ spin_unlock(&fs_info->qgroup_lock);
+ }
+out:
+ ulist_free(tmp);
+ return ret;
+}
+
+int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
+ u64 dst)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ int ret = 0;
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ ret = __del_qgroup_relation(trans, src, dst);
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+
+ return ret;
+}
+
+int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *quota_root;
+ struct btrfs_qgroup *qgroup;
+ int ret = 0;
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ if (!fs_info->quota_root) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+ quota_root = fs_info->quota_root;
+ qgroup = find_qgroup_rb(fs_info, qgroupid);
+ if (qgroup) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ ret = add_qgroup_item(trans, quota_root, qgroupid);
+ if (ret)
+ goto out;
+
+ spin_lock(&fs_info->qgroup_lock);
+ qgroup = add_qgroup_rb(fs_info, qgroupid);
+ spin_unlock(&fs_info->qgroup_lock);
+
+ if (IS_ERR(qgroup)) {
+ ret = PTR_ERR(qgroup);
+ goto out;
+ }
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
+out:
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ return ret;
+}
+
+int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_qgroup *qgroup;
+ struct btrfs_qgroup_list *list;
+ int ret = 0;
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ if (!fs_info->quota_root) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+ qgroup = find_qgroup_rb(fs_info, qgroupid);
+ if (!qgroup) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ /* Check if there are no children of this qgroup */
+ if (!list_empty(&qgroup->members)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ ret = del_qgroup_item(trans, qgroupid);
+ if (ret && ret != -ENOENT)
+ goto out;
+
+ while (!list_empty(&qgroup->groups)) {
+ list = list_first_entry(&qgroup->groups,
+ struct btrfs_qgroup_list, next_group);
+ ret = __del_qgroup_relation(trans, qgroupid,
+ list->group->qgroupid);
+ if (ret)
+ goto out;
+ }
+
+ spin_lock(&fs_info->qgroup_lock);
+ del_qgroup_rb(fs_info, qgroupid);
+ spin_unlock(&fs_info->qgroup_lock);
+
+ /*
+ * Remove the qgroup from sysfs now without holding the qgroup_lock
+ * spinlock, since the sysfs_remove_group() function needs to take
+ * the mutex kernfs_mutex through kernfs_remove_by_name_ns().
+ */
+ btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
+ kfree(qgroup);
+out:
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ return ret;
+}
+
+int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
+ struct btrfs_qgroup_limit *limit)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_qgroup *qgroup;
+ int ret = 0;
+ /* Sometimes we would want to clear the limit on this qgroup.
+ * To meet this requirement, we treat the -1 as a special value
+ * which tell kernel to clear the limit on this qgroup.
+ */
+ const u64 CLEAR_VALUE = -1;
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ if (!fs_info->quota_root) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+ qgroup = find_qgroup_rb(fs_info, qgroupid);
+ if (!qgroup) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ spin_lock(&fs_info->qgroup_lock);
+ if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) {
+ if (limit->max_rfer == CLEAR_VALUE) {
+ qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
+ limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
+ qgroup->max_rfer = 0;
+ } else {
+ qgroup->max_rfer = limit->max_rfer;
+ }
+ }
+ if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
+ if (limit->max_excl == CLEAR_VALUE) {
+ qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
+ limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
+ qgroup->max_excl = 0;
+ } else {
+ qgroup->max_excl = limit->max_excl;
+ }
+ }
+ if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) {
+ if (limit->rsv_rfer == CLEAR_VALUE) {
+ qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
+ limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
+ qgroup->rsv_rfer = 0;
+ } else {
+ qgroup->rsv_rfer = limit->rsv_rfer;
+ }
+ }
+ if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) {
+ if (limit->rsv_excl == CLEAR_VALUE) {
+ qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
+ limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
+ qgroup->rsv_excl = 0;
+ } else {
+ qgroup->rsv_excl = limit->rsv_excl;
+ }
+ }
+ qgroup->lim_flags |= limit->flags;
+
+ spin_unlock(&fs_info->qgroup_lock);
+
+ ret = update_qgroup_limit_item(trans, qgroup);
+ if (ret) {
+ qgroup_mark_inconsistent(fs_info);
+ btrfs_info(fs_info, "unable to update quota limit for %llu",
+ qgroupid);
+ }
+
+out:
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ return ret;
+}
+
+int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_qgroup_extent_record *record)
+{
+ struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
+ struct rb_node *parent_node = NULL;
+ struct btrfs_qgroup_extent_record *entry;
+ u64 bytenr = record->bytenr;
+
+ lockdep_assert_held(&delayed_refs->lock);
+ trace_btrfs_qgroup_trace_extent(fs_info, record);
+
+ while (*p) {
+ parent_node = *p;
+ entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
+ node);
+ if (bytenr < entry->bytenr) {
+ p = &(*p)->rb_left;
+ } else if (bytenr > entry->bytenr) {
+ p = &(*p)->rb_right;
+ } else {
+ if (record->data_rsv && !entry->data_rsv) {
+ entry->data_rsv = record->data_rsv;
+ entry->data_rsv_refroot =
+ record->data_rsv_refroot;
+ }
+ return 1;
+ }
+ }
+
+ rb_link_node(&record->node, parent_node, p);
+ rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
+ return 0;
+}
+
+int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
+ struct btrfs_qgroup_extent_record *qrecord)
+{
+ struct ulist *old_root;
+ u64 bytenr = qrecord->bytenr;
+ int ret;
+
+ /*
+ * We are always called in a context where we are already holding a
+ * transaction handle. Often we are called when adding a data delayed
+ * reference from btrfs_truncate_inode_items() (truncating or unlinking),
+ * in which case we will be holding a write lock on extent buffer from a
+ * subvolume tree. In this case we can't allow btrfs_find_all_roots() to
+ * acquire fs_info->commit_root_sem, because that is a higher level lock
+ * that must be acquired before locking any extent buffers.
+ *
+ * So we want btrfs_find_all_roots() to not acquire the commit_root_sem
+ * but we can't pass it a non-NULL transaction handle, because otherwise
+ * it would not use commit roots and would lock extent buffers, causing
+ * a deadlock if it ends up trying to read lock the same extent buffer
+ * that was previously write locked at btrfs_truncate_inode_items().
+ *
+ * So pass a NULL transaction handle to btrfs_find_all_roots() and
+ * explicitly tell it to not acquire the commit_root_sem - if we are
+ * holding a transaction handle we don't need its protection.
+ */
+ ASSERT(trans != NULL);
+
+ if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
+ return 0;
+
+ ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root,
+ true);
+ if (ret < 0) {
+ qgroup_mark_inconsistent(trans->fs_info);
+ btrfs_warn(trans->fs_info,
+"error accounting new delayed refs extent (err code: %d), quota inconsistent",
+ ret);
+ return 0;
+ }
+
+ /*
+ * Here we don't need to get the lock of
+ * trans->transaction->delayed_refs, since inserted qrecord won't
+ * be deleted, only qrecord->node may be modified (new qrecord insert)
+ *
+ * So modifying qrecord->old_roots is safe here
+ */
+ qrecord->old_roots = old_root;
+ return 0;
+}
+
+int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
+ u64 num_bytes, gfp_t gfp_flag)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_qgroup_extent_record *record;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ int ret;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
+ || bytenr == 0 || num_bytes == 0)
+ return 0;
+ record = kzalloc(sizeof(*record), gfp_flag);
+ if (!record)
+ return -ENOMEM;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ record->bytenr = bytenr;
+ record->num_bytes = num_bytes;
+ record->old_roots = NULL;
+
+ spin_lock(&delayed_refs->lock);
+ ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record);
+ spin_unlock(&delayed_refs->lock);
+ if (ret > 0) {
+ kfree(record);
+ return 0;
+ }
+ return btrfs_qgroup_trace_extent_post(trans, record);
+}
+
+int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
+ struct extent_buffer *eb)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ int nr = btrfs_header_nritems(eb);
+ int i, extent_type, ret;
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *fi;
+ u64 bytenr, num_bytes;
+
+ /* We can be called directly from walk_up_proc() */
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return 0;
+
+ for (i = 0; i < nr; i++) {
+ btrfs_item_key_to_cpu(eb, &key, i);
+
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+
+ fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
+ /* filter out non qgroup-accountable extents */
+ extent_type = btrfs_file_extent_type(eb, fi);
+
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+ continue;
+
+ bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
+ if (!bytenr)
+ continue;
+
+ num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
+
+ ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes,
+ GFP_NOFS);
+ if (ret)
+ return ret;
+ }
+ cond_resched();
+ return 0;
+}
+
+/*
+ * Walk up the tree from the bottom, freeing leaves and any interior
+ * nodes which have had all slots visited. If a node (leaf or
+ * interior) is freed, the node above it will have it's slot
+ * incremented. The root node will never be freed.
+ *
+ * At the end of this function, we should have a path which has all
+ * slots incremented to the next position for a search. If we need to
+ * read a new node it will be NULL and the node above it will have the
+ * correct slot selected for a later read.
+ *
+ * If we increment the root nodes slot counter past the number of
+ * elements, 1 is returned to signal completion of the search.
+ */
+static int adjust_slots_upwards(struct btrfs_path *path, int root_level)
+{
+ int level = 0;
+ int nr, slot;
+ struct extent_buffer *eb;
+
+ if (root_level == 0)
+ return 1;
+
+ while (level <= root_level) {
+ eb = path->nodes[level];
+ nr = btrfs_header_nritems(eb);
+ path->slots[level]++;
+ slot = path->slots[level];
+ if (slot >= nr || level == 0) {
+ /*
+ * Don't free the root - we will detect this
+ * condition after our loop and return a
+ * positive value for caller to stop walking the tree.
+ */
+ if (level != root_level) {
+ btrfs_tree_unlock_rw(eb, path->locks[level]);
+ path->locks[level] = 0;
+
+ free_extent_buffer(eb);
+ path->nodes[level] = NULL;
+ path->slots[level] = 0;
+ }
+ } else {
+ /*
+ * We have a valid slot to walk back down
+ * from. Stop here so caller can process these
+ * new nodes.
+ */
+ break;
+ }
+
+ level++;
+ }
+
+ eb = path->nodes[root_level];
+ if (path->slots[root_level] >= btrfs_header_nritems(eb))
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Helper function to trace a subtree tree block swap.
+ *
+ * The swap will happen in highest tree block, but there may be a lot of
+ * tree blocks involved.
+ *
+ * For example:
+ * OO = Old tree blocks
+ * NN = New tree blocks allocated during balance
+ *
+ * File tree (257) Reloc tree for 257
+ * L2 OO NN
+ * / \ / \
+ * L1 OO OO (a) OO NN (a)
+ * / \ / \ / \ / \
+ * L0 OO OO OO OO OO OO NN NN
+ * (b) (c) (b) (c)
+ *
+ * When calling qgroup_trace_extent_swap(), we will pass:
+ * @src_eb = OO(a)
+ * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ]
+ * @dst_level = 0
+ * @root_level = 1
+ *
+ * In that case, qgroup_trace_extent_swap() will search from OO(a) to
+ * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty.
+ *
+ * The main work of qgroup_trace_extent_swap() can be split into 3 parts:
+ *
+ * 1) Tree search from @src_eb
+ * It should acts as a simplified btrfs_search_slot().
+ * The key for search can be extracted from @dst_path->nodes[dst_level]
+ * (first key).
+ *
+ * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty
+ * NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty.
+ * They should be marked during previous (@dst_level = 1) iteration.
+ *
+ * 3) Mark file extents in leaves dirty
+ * We don't have good way to pick out new file extents only.
+ * So we still follow the old method by scanning all file extents in
+ * the leave.
+ *
+ * This function can free us from keeping two paths, thus later we only need
+ * to care about how to iterate all new tree blocks in reloc tree.
+ */
+static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
+ struct extent_buffer *src_eb,
+ struct btrfs_path *dst_path,
+ int dst_level, int root_level,
+ bool trace_leaf)
+{
+ struct btrfs_key key;
+ struct btrfs_path *src_path;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ u32 nodesize = fs_info->nodesize;
+ int cur_level = root_level;
+ int ret;
+
+ BUG_ON(dst_level > root_level);
+ /* Level mismatch */
+ if (btrfs_header_level(src_eb) != root_level)
+ return -EINVAL;
+
+ src_path = btrfs_alloc_path();
+ if (!src_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (dst_level)
+ btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
+ else
+ btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
+
+ /* For src_path */
+ atomic_inc(&src_eb->refs);
+ src_path->nodes[root_level] = src_eb;
+ src_path->slots[root_level] = dst_path->slots[root_level];
+ src_path->locks[root_level] = 0;
+
+ /* A simplified version of btrfs_search_slot() */
+ while (cur_level >= dst_level) {
+ struct btrfs_key src_key;
+ struct btrfs_key dst_key;
+
+ if (src_path->nodes[cur_level] == NULL) {
+ struct extent_buffer *eb;
+ int parent_slot;
+
+ eb = src_path->nodes[cur_level + 1];
+ parent_slot = src_path->slots[cur_level + 1];
+
+ eb = btrfs_read_node_slot(eb, parent_slot);
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
+ goto out;
+ }
+
+ src_path->nodes[cur_level] = eb;
+
+ btrfs_tree_read_lock(eb);
+ src_path->locks[cur_level] = BTRFS_READ_LOCK;
+ }
+
+ src_path->slots[cur_level] = dst_path->slots[cur_level];
+ if (cur_level) {
+ btrfs_node_key_to_cpu(dst_path->nodes[cur_level],
+ &dst_key, dst_path->slots[cur_level]);
+ btrfs_node_key_to_cpu(src_path->nodes[cur_level],
+ &src_key, src_path->slots[cur_level]);
+ } else {
+ btrfs_item_key_to_cpu(dst_path->nodes[cur_level],
+ &dst_key, dst_path->slots[cur_level]);
+ btrfs_item_key_to_cpu(src_path->nodes[cur_level],
+ &src_key, src_path->slots[cur_level]);
+ }
+ /* Content mismatch, something went wrong */
+ if (btrfs_comp_cpu_keys(&dst_key, &src_key)) {
+ ret = -ENOENT;
+ goto out;
+ }
+ cur_level--;
+ }
+
+ /*
+ * Now both @dst_path and @src_path have been populated, record the tree
+ * blocks for qgroup accounting.
+ */
+ ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
+ nodesize, GFP_NOFS);
+ if (ret < 0)
+ goto out;
+ ret = btrfs_qgroup_trace_extent(trans,
+ dst_path->nodes[dst_level]->start,
+ nodesize, GFP_NOFS);
+ if (ret < 0)
+ goto out;
+
+ /* Record leaf file extents */
+ if (dst_level == 0 && trace_leaf) {
+ ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]);
+ if (ret < 0)
+ goto out;
+ ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]);
+ }
+out:
+ btrfs_free_path(src_path);
+ return ret;
+}
+
+/*
+ * Helper function to do recursive generation-aware depth-first search, to
+ * locate all new tree blocks in a subtree of reloc tree.
+ *
+ * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot)
+ * reloc tree
+ * L2 NN (a)
+ * / \
+ * L1 OO NN (b)
+ * / \ / \
+ * L0 OO OO OO NN
+ * (c) (d)
+ * If we pass:
+ * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ],
+ * @cur_level = 1
+ * @root_level = 1
+ *
+ * We will iterate through tree blocks NN(b), NN(d) and info qgroup to trace
+ * above tree blocks along with their counter parts in file tree.
+ * While during search, old tree blocks OO(c) will be skipped as tree block swap
+ * won't affect OO(c).
+ */
+static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
+ struct extent_buffer *src_eb,
+ struct btrfs_path *dst_path,
+ int cur_level, int root_level,
+ u64 last_snapshot, bool trace_leaf)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct extent_buffer *eb;
+ bool need_cleanup = false;
+ int ret = 0;
+ int i;
+
+ /* Level sanity check */
+ if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 ||
+ root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 ||
+ root_level < cur_level) {
+ btrfs_err_rl(fs_info,
+ "%s: bad levels, cur_level=%d root_level=%d",
+ __func__, cur_level, root_level);
+ return -EUCLEAN;
+ }
+
+ /* Read the tree block if needed */
+ if (dst_path->nodes[cur_level] == NULL) {
+ int parent_slot;
+ u64 child_gen;
+
+ /*
+ * dst_path->nodes[root_level] must be initialized before
+ * calling this function.
+ */
+ if (cur_level == root_level) {
+ btrfs_err_rl(fs_info,
+ "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d",
+ __func__, root_level, root_level, cur_level);
+ return -EUCLEAN;
+ }
+
+ /*
+ * We need to get child blockptr/gen from parent before we can
+ * read it.
+ */
+ eb = dst_path->nodes[cur_level + 1];
+ parent_slot = dst_path->slots[cur_level + 1];
+ child_gen = btrfs_node_ptr_generation(eb, parent_slot);
+
+ /* This node is old, no need to trace */
+ if (child_gen < last_snapshot)
+ goto out;
+
+ eb = btrfs_read_node_slot(eb, parent_slot);
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
+ goto out;
+ }
+
+ dst_path->nodes[cur_level] = eb;
+ dst_path->slots[cur_level] = 0;
+
+ btrfs_tree_read_lock(eb);
+ dst_path->locks[cur_level] = BTRFS_READ_LOCK;
+ need_cleanup = true;
+ }
+
+ /* Now record this tree block and its counter part for qgroups */
+ ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level,
+ root_level, trace_leaf);
+ if (ret < 0)
+ goto cleanup;
+
+ eb = dst_path->nodes[cur_level];
+
+ if (cur_level > 0) {
+ /* Iterate all child tree blocks */
+ for (i = 0; i < btrfs_header_nritems(eb); i++) {
+ /* Skip old tree blocks as they won't be swapped */
+ if (btrfs_node_ptr_generation(eb, i) < last_snapshot)
+ continue;
+ dst_path->slots[cur_level] = i;
+
+ /* Recursive call (at most 7 times) */
+ ret = qgroup_trace_new_subtree_blocks(trans, src_eb,
+ dst_path, cur_level - 1, root_level,
+ last_snapshot, trace_leaf);
+ if (ret < 0)
+ goto cleanup;
+ }
+ }
+
+cleanup:
+ if (need_cleanup) {
+ /* Clean up */
+ btrfs_tree_unlock_rw(dst_path->nodes[cur_level],
+ dst_path->locks[cur_level]);
+ free_extent_buffer(dst_path->nodes[cur_level]);
+ dst_path->nodes[cur_level] = NULL;
+ dst_path->slots[cur_level] = 0;
+ dst_path->locks[cur_level] = 0;
+ }
+out:
+ return ret;
+}
+
+static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
+ struct extent_buffer *src_eb,
+ struct extent_buffer *dst_eb,
+ u64 last_snapshot, bool trace_leaf)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_path *dst_path = NULL;
+ int level;
+ int ret;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return 0;
+
+ /* Wrong parameter order */
+ if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) {
+ btrfs_err_rl(fs_info,
+ "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
+ btrfs_header_generation(src_eb),
+ btrfs_header_generation(dst_eb));
+ return -EUCLEAN;
+ }
+
+ if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
+ ret = -EIO;
+ goto out;
+ }
+
+ level = btrfs_header_level(dst_eb);
+ dst_path = btrfs_alloc_path();
+ if (!dst_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ /* For dst_path */
+ atomic_inc(&dst_eb->refs);
+ dst_path->nodes[level] = dst_eb;
+ dst_path->slots[level] = 0;
+ dst_path->locks[level] = 0;
+
+ /* Do the generation aware breadth-first search */
+ ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
+ level, last_snapshot, trace_leaf);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+
+out:
+ btrfs_free_path(dst_path);
+ if (ret < 0)
+ qgroup_mark_inconsistent(fs_info);
+ return ret;
+}
+
+int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
+ struct extent_buffer *root_eb,
+ u64 root_gen, int root_level)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ int ret = 0;
+ int level;
+ u8 drop_subptree_thres;
+ struct extent_buffer *eb = root_eb;
+ struct btrfs_path *path = NULL;
+
+ BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL);
+ BUG_ON(root_eb == NULL);
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return 0;
+
+ spin_lock(&fs_info->qgroup_lock);
+ drop_subptree_thres = fs_info->qgroup_drop_subtree_thres;
+ spin_unlock(&fs_info->qgroup_lock);
+
+ /*
+ * This function only gets called for snapshot drop, if we hit a high
+ * node here, it means we are going to change ownership for quite a lot
+ * of extents, which will greatly slow down btrfs_commit_transaction().
+ *
+ * So here if we find a high tree here, we just skip the accounting and
+ * mark qgroup inconsistent.
+ */
+ if (root_level >= drop_subptree_thres) {
+ qgroup_mark_inconsistent(fs_info);
+ return 0;
+ }
+
+ if (!extent_buffer_uptodate(root_eb)) {
+ ret = btrfs_read_extent_buffer(root_eb, root_gen, root_level, NULL);
+ if (ret)
+ goto out;
+ }
+
+ if (root_level == 0) {
+ ret = btrfs_qgroup_trace_leaf_items(trans, root_eb);
+ goto out;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /*
+ * Walk down the tree. Missing extent blocks are filled in as
+ * we go. Metadata is accounted every time we read a new
+ * extent block.
+ *
+ * When we reach a leaf, we account for file extent items in it,
+ * walk back up the tree (adjusting slot pointers as we go)
+ * and restart the search process.
+ */
+ atomic_inc(&root_eb->refs); /* For path */
+ path->nodes[root_level] = root_eb;
+ path->slots[root_level] = 0;
+ path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
+walk_down:
+ level = root_level;
+ while (level >= 0) {
+ if (path->nodes[level] == NULL) {
+ int parent_slot;
+ u64 child_bytenr;
+
+ /*
+ * We need to get child blockptr from parent before we
+ * can read it.
+ */
+ eb = path->nodes[level + 1];
+ parent_slot = path->slots[level + 1];
+ child_bytenr = btrfs_node_blockptr(eb, parent_slot);
+
+ eb = btrfs_read_node_slot(eb, parent_slot);
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
+ goto out;
+ }
+
+ path->nodes[level] = eb;
+ path->slots[level] = 0;
+
+ btrfs_tree_read_lock(eb);
+ path->locks[level] = BTRFS_READ_LOCK;
+
+ ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
+ fs_info->nodesize,
+ GFP_NOFS);
+ if (ret)
+ goto out;
+ }
+
+ if (level == 0) {
+ ret = btrfs_qgroup_trace_leaf_items(trans,
+ path->nodes[level]);
+ if (ret)
+ goto out;
+
+ /* Nonzero return here means we completed our search */
+ ret = adjust_slots_upwards(path, root_level);
+ if (ret)
+ break;
+
+ /* Restart search with new slots */
+ goto walk_down;
+ }
+
+ level--;
+ }
+
+ ret = 0;
+out:
+ btrfs_free_path(path);
+
+ return ret;
+}
+
+#define UPDATE_NEW 0
+#define UPDATE_OLD 1
+/*
+ * Walk all of the roots that points to the bytenr and adjust their refcnts.
+ */
+static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
+ struct ulist *roots, struct ulist *tmp,
+ struct ulist *qgroups, u64 seq, int update_old)
+{
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ struct ulist_node *tmp_unode;
+ struct ulist_iterator tmp_uiter;
+ struct btrfs_qgroup *qg;
+ int ret = 0;
+
+ if (!roots)
+ return 0;
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(roots, &uiter))) {
+ qg = find_qgroup_rb(fs_info, unode->val);
+ if (!qg)
+ continue;
+
+ ulist_reinit(tmp);
+ ret = ulist_add(qgroups, qg->qgroupid, qgroup_to_aux(qg),
+ GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ ret = ulist_add(tmp, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ ULIST_ITER_INIT(&tmp_uiter);
+ while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
+ struct btrfs_qgroup_list *glist;
+
+ qg = unode_aux_to_qgroup(tmp_unode);
+ if (update_old)
+ btrfs_qgroup_update_old_refcnt(qg, seq, 1);
+ else
+ btrfs_qgroup_update_new_refcnt(qg, seq, 1);
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ ret = ulist_add(qgroups, glist->group->qgroupid,
+ qgroup_to_aux(glist->group),
+ GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ ret = ulist_add(tmp, glist->group->qgroupid,
+ qgroup_to_aux(glist->group),
+ GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ }
+ return 0;
+}
+
+/*
+ * Update qgroup rfer/excl counters.
+ * Rfer update is easy, codes can explain themselves.
+ *
+ * Excl update is tricky, the update is split into 2 parts.
+ * Part 1: Possible exclusive <-> sharing detect:
+ * | A | !A |
+ * -------------------------------------
+ * B | * | - |
+ * -------------------------------------
+ * !B | + | ** |
+ * -------------------------------------
+ *
+ * Conditions:
+ * A: cur_old_roots < nr_old_roots (not exclusive before)
+ * !A: cur_old_roots == nr_old_roots (possible exclusive before)
+ * B: cur_new_roots < nr_new_roots (not exclusive now)
+ * !B: cur_new_roots == nr_new_roots (possible exclusive now)
+ *
+ * Results:
+ * +: Possible sharing -> exclusive -: Possible exclusive -> sharing
+ * *: Definitely not changed. **: Possible unchanged.
+ *
+ * For !A and !B condition, the exception is cur_old/new_roots == 0 case.
+ *
+ * To make the logic clear, we first use condition A and B to split
+ * combination into 4 results.
+ *
+ * Then, for result "+" and "-", check old/new_roots == 0 case, as in them
+ * only on variant maybe 0.
+ *
+ * Lastly, check result **, since there are 2 variants maybe 0, split them
+ * again(2x2).
+ * But this time we don't need to consider other things, the codes and logic
+ * is easy to understand now.
+ */
+static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
+ struct ulist *qgroups,
+ u64 nr_old_roots,
+ u64 nr_new_roots,
+ u64 num_bytes, u64 seq)
+{
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ struct btrfs_qgroup *qg;
+ u64 cur_new_count, cur_old_count;
+
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(qgroups, &uiter))) {
+ bool dirty = false;
+
+ qg = unode_aux_to_qgroup(unode);
+ cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
+ cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
+
+ trace_qgroup_update_counters(fs_info, qg, cur_old_count,
+ cur_new_count);
+
+ /* Rfer update part */
+ if (cur_old_count == 0 && cur_new_count > 0) {
+ qg->rfer += num_bytes;
+ qg->rfer_cmpr += num_bytes;
+ dirty = true;
+ }
+ if (cur_old_count > 0 && cur_new_count == 0) {
+ qg->rfer -= num_bytes;
+ qg->rfer_cmpr -= num_bytes;
+ dirty = true;
+ }
+
+ /* Excl update part */
+ /* Exclusive/none -> shared case */
+ if (cur_old_count == nr_old_roots &&
+ cur_new_count < nr_new_roots) {
+ /* Exclusive -> shared */
+ if (cur_old_count != 0) {
+ qg->excl -= num_bytes;
+ qg->excl_cmpr -= num_bytes;
+ dirty = true;
+ }
+ }
+
+ /* Shared -> exclusive/none case */
+ if (cur_old_count < nr_old_roots &&
+ cur_new_count == nr_new_roots) {
+ /* Shared->exclusive */
+ if (cur_new_count != 0) {
+ qg->excl += num_bytes;
+ qg->excl_cmpr += num_bytes;
+ dirty = true;
+ }
+ }
+
+ /* Exclusive/none -> exclusive/none case */
+ if (cur_old_count == nr_old_roots &&
+ cur_new_count == nr_new_roots) {
+ if (cur_old_count == 0) {
+ /* None -> exclusive/none */
+
+ if (cur_new_count != 0) {
+ /* None -> exclusive */
+ qg->excl += num_bytes;
+ qg->excl_cmpr += num_bytes;
+ dirty = true;
+ }
+ /* None -> none, nothing changed */
+ } else {
+ /* Exclusive -> exclusive/none */
+
+ if (cur_new_count == 0) {
+ /* Exclusive -> none */
+ qg->excl -= num_bytes;
+ qg->excl_cmpr -= num_bytes;
+ dirty = true;
+ }
+ /* Exclusive -> exclusive, nothing changed */
+ }
+ }
+
+ if (dirty)
+ qgroup_dirty(fs_info, qg);
+ }
+ return 0;
+}
+
+/*
+ * Check if the @roots potentially is a list of fs tree roots
+ *
+ * Return 0 for definitely not a fs/subvol tree roots ulist
+ * Return 1 for possible fs/subvol tree roots in the list (considering an empty
+ * one as well)
+ */
+static int maybe_fs_roots(struct ulist *roots)
+{
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+
+ /* Empty one, still possible for fs roots */
+ if (!roots || roots->nnodes == 0)
+ return 1;
+
+ ULIST_ITER_INIT(&uiter);
+ unode = ulist_next(roots, &uiter);
+ if (!unode)
+ return 1;
+
+ /*
+ * If it contains fs tree roots, then it must belong to fs/subvol
+ * trees.
+ * If it contains a non-fs tree, it won't be shared with fs/subvol trees.
+ */
+ return is_fstree(unode->val);
+}
+
+int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
+ u64 num_bytes, struct ulist *old_roots,
+ struct ulist *new_roots)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct ulist *qgroups = NULL;
+ struct ulist *tmp = NULL;
+ u64 seq;
+ u64 nr_new_roots = 0;
+ u64 nr_old_roots = 0;
+ int ret = 0;
+
+ /*
+ * If quotas get disabled meanwhile, the resources need to be freed and
+ * we can't just exit here.
+ */
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+ fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
+ goto out_free;
+
+ if (new_roots) {
+ if (!maybe_fs_roots(new_roots))
+ goto out_free;
+ nr_new_roots = new_roots->nnodes;
+ }
+ if (old_roots) {
+ if (!maybe_fs_roots(old_roots))
+ goto out_free;
+ nr_old_roots = old_roots->nnodes;
+ }
+
+ /* Quick exit, either not fs tree roots, or won't affect any qgroup */
+ if (nr_old_roots == 0 && nr_new_roots == 0)
+ goto out_free;
+
+ BUG_ON(!fs_info->quota_root);
+
+ trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr,
+ num_bytes, nr_old_roots, nr_new_roots);
+
+ qgroups = ulist_alloc(GFP_NOFS);
+ if (!qgroups) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
+ tmp = ulist_alloc(GFP_NOFS);
+ if (!tmp) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
+
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+ if (fs_info->qgroup_rescan_progress.objectid <= bytenr) {
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+ ret = 0;
+ goto out_free;
+ }
+ }
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ spin_lock(&fs_info->qgroup_lock);
+ seq = fs_info->qgroup_seq;
+
+ /* Update old refcnts using old_roots */
+ ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq,
+ UPDATE_OLD);
+ if (ret < 0)
+ goto out;
+
+ /* Update new refcnts using new_roots */
+ ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq,
+ UPDATE_NEW);
+ if (ret < 0)
+ goto out;
+
+ qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots,
+ num_bytes, seq);
+
+ /*
+ * Bump qgroup_seq to avoid seq overlap
+ */
+ fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
+out:
+ spin_unlock(&fs_info->qgroup_lock);
+out_free:
+ ulist_free(tmp);
+ ulist_free(qgroups);
+ ulist_free(old_roots);
+ ulist_free(new_roots);
+ return ret;
+}
+
+int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_qgroup_extent_record *record;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct ulist *new_roots = NULL;
+ struct rb_node *node;
+ u64 num_dirty_extents = 0;
+ u64 qgroup_to_skip;
+ int ret = 0;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ qgroup_to_skip = delayed_refs->qgroup_to_skip;
+ while ((node = rb_first(&delayed_refs->dirty_extent_root))) {
+ record = rb_entry(node, struct btrfs_qgroup_extent_record,
+ node);
+
+ num_dirty_extents++;
+ trace_btrfs_qgroup_account_extents(fs_info, record);
+
+ if (!ret && !(fs_info->qgroup_flags &
+ BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) {
+ /*
+ * Old roots should be searched when inserting qgroup
+ * extent record.
+ *
+ * But for INCONSISTENT (NO_ACCOUNTING) -> rescan case,
+ * we may have some record inserted during
+ * NO_ACCOUNTING (thus no old_roots populated), but
+ * later we start rescan, which clears NO_ACCOUNTING,
+ * leaving some inserted records without old_roots
+ * populated.
+ *
+ * Those cases are rare and should not cause too much
+ * time spent during commit_transaction().
+ */
+ if (!record->old_roots) {
+ /* Search commit root to find old_roots */
+ ret = btrfs_find_all_roots(NULL, fs_info,
+ record->bytenr, 0,
+ &record->old_roots, false);
+ if (ret < 0)
+ goto cleanup;
+ }
+
+ /* Free the reserved data space */
+ btrfs_qgroup_free_refroot(fs_info,
+ record->data_rsv_refroot,
+ record->data_rsv,
+ BTRFS_QGROUP_RSV_DATA);
+ /*
+ * Use BTRFS_SEQ_LAST as time_seq to do special search,
+ * which doesn't lock tree or delayed_refs and search
+ * current root. It's safe inside commit_transaction().
+ */
+ ret = btrfs_find_all_roots(trans, fs_info,
+ record->bytenr, BTRFS_SEQ_LAST, &new_roots, false);
+ if (ret < 0)
+ goto cleanup;
+ if (qgroup_to_skip) {
+ ulist_del(new_roots, qgroup_to_skip, 0);
+ ulist_del(record->old_roots, qgroup_to_skip,
+ 0);
+ }
+ ret = btrfs_qgroup_account_extent(trans, record->bytenr,
+ record->num_bytes,
+ record->old_roots,
+ new_roots);
+ record->old_roots = NULL;
+ new_roots = NULL;
+ }
+cleanup:
+ ulist_free(record->old_roots);
+ ulist_free(new_roots);
+ new_roots = NULL;
+ rb_erase(node, &delayed_refs->dirty_extent_root);
+ kfree(record);
+
+ }
+ trace_qgroup_num_dirty_extents(fs_info, trans->transid,
+ num_dirty_extents);
+ return ret;
+}
+
+/*
+ * Writes all changed qgroups to disk.
+ * Called by the transaction commit path and the qgroup assign ioctl.
+ */
+int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ int ret = 0;
+
+ /*
+ * In case we are called from the qgroup assign ioctl, assert that we
+ * are holding the qgroup_ioctl_lock, otherwise we can race with a quota
+ * disable operation (ioctl) and access a freed quota root.
+ */
+ if (trans->transaction->state != TRANS_STATE_COMMIT_DOING)
+ lockdep_assert_held(&fs_info->qgroup_ioctl_lock);
+
+ if (!fs_info->quota_root)
+ return ret;
+
+ spin_lock(&fs_info->qgroup_lock);
+ while (!list_empty(&fs_info->dirty_qgroups)) {
+ struct btrfs_qgroup *qgroup;
+ qgroup = list_first_entry(&fs_info->dirty_qgroups,
+ struct btrfs_qgroup, dirty);
+ list_del_init(&qgroup->dirty);
+ spin_unlock(&fs_info->qgroup_lock);
+ ret = update_qgroup_info_item(trans, qgroup);
+ if (ret)
+ qgroup_mark_inconsistent(fs_info);
+ ret = update_qgroup_limit_item(trans, qgroup);
+ if (ret)
+ qgroup_mark_inconsistent(fs_info);
+ spin_lock(&fs_info->qgroup_lock);
+ }
+ if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
+ else
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
+ spin_unlock(&fs_info->qgroup_lock);
+
+ ret = update_qgroup_status_item(trans);
+ if (ret)
+ qgroup_mark_inconsistent(fs_info);
+
+ return ret;
+}
+
+/*
+ * Copy the accounting information between qgroups. This is necessary
+ * when a snapshot or a subvolume is created. Throwing an error will
+ * cause a transaction abort so we take extra care here to only error
+ * when a readonly fs is a reasonable outcome.
+ */
+int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
+ u64 objectid, struct btrfs_qgroup_inherit *inherit)
+{
+ int ret = 0;
+ int i;
+ u64 *i_qgroups;
+ bool committing = false;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *quota_root;
+ struct btrfs_qgroup *srcgroup;
+ struct btrfs_qgroup *dstgroup;
+ bool need_rescan = false;
+ u32 level_size = 0;
+ u64 nums;
+
+ /*
+ * There are only two callers of this function.
+ *
+ * One in create_subvol() in the ioctl context, which needs to hold
+ * the qgroup_ioctl_lock.
+ *
+ * The other one in create_pending_snapshot() where no other qgroup
+ * code can modify the fs as they all need to either start a new trans
+ * or hold a trans handler, thus we don't need to hold
+ * qgroup_ioctl_lock.
+ * This would avoid long and complex lock chain and make lockdep happy.
+ */
+ spin_lock(&fs_info->trans_lock);
+ if (trans->transaction->state == TRANS_STATE_COMMIT_DOING)
+ committing = true;
+ spin_unlock(&fs_info->trans_lock);
+
+ if (!committing)
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ goto out;
+
+ quota_root = fs_info->quota_root;
+ if (!quota_root) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (inherit) {
+ i_qgroups = (u64 *)(inherit + 1);
+ nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
+ 2 * inherit->num_excl_copies;
+ for (i = 0; i < nums; ++i) {
+ srcgroup = find_qgroup_rb(fs_info, *i_qgroups);
+
+ /*
+ * Zero out invalid groups so we can ignore
+ * them later.
+ */
+ if (!srcgroup ||
+ ((srcgroup->qgroupid >> 48) <= (objectid >> 48)))
+ *i_qgroups = 0ULL;
+
+ ++i_qgroups;
+ }
+ }
+
+ /*
+ * create a tracking group for the subvol itself
+ */
+ ret = add_qgroup_item(trans, quota_root, objectid);
+ if (ret)
+ goto out;
+
+ /*
+ * add qgroup to all inherited groups
+ */
+ if (inherit) {
+ i_qgroups = (u64 *)(inherit + 1);
+ for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) {
+ if (*i_qgroups == 0)
+ continue;
+ ret = add_qgroup_relation_item(trans, objectid,
+ *i_qgroups);
+ if (ret && ret != -EEXIST)
+ goto out;
+ ret = add_qgroup_relation_item(trans, *i_qgroups,
+ objectid);
+ if (ret && ret != -EEXIST)
+ goto out;
+ }
+ ret = 0;
+ }
+
+
+ spin_lock(&fs_info->qgroup_lock);
+
+ dstgroup = add_qgroup_rb(fs_info, objectid);
+ if (IS_ERR(dstgroup)) {
+ ret = PTR_ERR(dstgroup);
+ goto unlock;
+ }
+
+ if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
+ dstgroup->lim_flags = inherit->lim.flags;
+ dstgroup->max_rfer = inherit->lim.max_rfer;
+ dstgroup->max_excl = inherit->lim.max_excl;
+ dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
+ dstgroup->rsv_excl = inherit->lim.rsv_excl;
+
+ qgroup_dirty(fs_info, dstgroup);
+ }
+
+ if (srcid) {
+ srcgroup = find_qgroup_rb(fs_info, srcid);
+ if (!srcgroup)
+ goto unlock;
+
+ /*
+ * We call inherit after we clone the root in order to make sure
+ * our counts don't go crazy, so at this point the only
+ * difference between the two roots should be the root node.
+ */
+ level_size = fs_info->nodesize;
+ dstgroup->rfer = srcgroup->rfer;
+ dstgroup->rfer_cmpr = srcgroup->rfer_cmpr;
+ dstgroup->excl = level_size;
+ dstgroup->excl_cmpr = level_size;
+ srcgroup->excl = level_size;
+ srcgroup->excl_cmpr = level_size;
+
+ /* inherit the limit info */
+ dstgroup->lim_flags = srcgroup->lim_flags;
+ dstgroup->max_rfer = srcgroup->max_rfer;
+ dstgroup->max_excl = srcgroup->max_excl;
+ dstgroup->rsv_rfer = srcgroup->rsv_rfer;
+ dstgroup->rsv_excl = srcgroup->rsv_excl;
+
+ qgroup_dirty(fs_info, dstgroup);
+ qgroup_dirty(fs_info, srcgroup);
+ }
+
+ if (!inherit)
+ goto unlock;
+
+ i_qgroups = (u64 *)(inherit + 1);
+ for (i = 0; i < inherit->num_qgroups; ++i) {
+ if (*i_qgroups) {
+ ret = add_relation_rb(fs_info, objectid, *i_qgroups);
+ if (ret)
+ goto unlock;
+ }
+ ++i_qgroups;
+
+ /*
+ * If we're doing a snapshot, and adding the snapshot to a new
+ * qgroup, the numbers are guaranteed to be incorrect.
+ */
+ if (srcid)
+ need_rescan = true;
+ }
+
+ for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) {
+ struct btrfs_qgroup *src;
+ struct btrfs_qgroup *dst;
+
+ if (!i_qgroups[0] || !i_qgroups[1])
+ continue;
+
+ src = find_qgroup_rb(fs_info, i_qgroups[0]);
+ dst = find_qgroup_rb(fs_info, i_qgroups[1]);
+
+ if (!src || !dst) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ dst->rfer = src->rfer - level_size;
+ dst->rfer_cmpr = src->rfer_cmpr - level_size;
+
+ /* Manually tweaking numbers certainly needs a rescan */
+ need_rescan = true;
+ }
+ for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) {
+ struct btrfs_qgroup *src;
+ struct btrfs_qgroup *dst;
+
+ if (!i_qgroups[0] || !i_qgroups[1])
+ continue;
+
+ src = find_qgroup_rb(fs_info, i_qgroups[0]);
+ dst = find_qgroup_rb(fs_info, i_qgroups[1]);
+
+ if (!src || !dst) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ dst->excl = src->excl + level_size;
+ dst->excl_cmpr = src->excl_cmpr + level_size;
+ need_rescan = true;
+ }
+
+unlock:
+ spin_unlock(&fs_info->qgroup_lock);
+ if (!ret)
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup);
+out:
+ if (!committing)
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ if (need_rescan)
+ qgroup_mark_inconsistent(fs_info);
+ return ret;
+}
+
+static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
+{
+ if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
+ qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer)
+ return false;
+
+ if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
+ qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl)
+ return false;
+
+ return true;
+}
+
+static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
+ enum btrfs_qgroup_rsv_type type)
+{
+ struct btrfs_qgroup *qgroup;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 ref_root = root->root_key.objectid;
+ int ret = 0;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+
+ if (!is_fstree(ref_root))
+ return 0;
+
+ if (num_bytes == 0)
+ return 0;
+
+ if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) &&
+ capable(CAP_SYS_RESOURCE))
+ enforce = false;
+
+ spin_lock(&fs_info->qgroup_lock);
+ if (!fs_info->quota_root)
+ goto out;
+
+ qgroup = find_qgroup_rb(fs_info, ref_root);
+ if (!qgroup)
+ goto out;
+
+ /*
+ * in a first step, we check all affected qgroups if any limits would
+ * be exceeded
+ */
+ ulist_reinit(fs_info->qgroup_ulist);
+ ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
+ qgroup_to_aux(qgroup), GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
+ struct btrfs_qgroup *qg;
+ struct btrfs_qgroup_list *glist;
+
+ qg = unode_aux_to_qgroup(unode);
+
+ if (enforce && !qgroup_check_limits(qg, num_bytes)) {
+ ret = -EDQUOT;
+ goto out;
+ }
+
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ ret = ulist_add(fs_info->qgroup_ulist,
+ glist->group->qgroupid,
+ qgroup_to_aux(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
+ }
+ }
+ ret = 0;
+ /*
+ * no limits exceeded, now record the reservation into all qgroups
+ */
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
+ struct btrfs_qgroup *qg;
+
+ qg = unode_aux_to_qgroup(unode);
+
+ qgroup_rsv_add(fs_info, qg, num_bytes, type);
+ }
+
+out:
+ spin_unlock(&fs_info->qgroup_lock);
+ return ret;
+}
+
+/*
+ * Free @num_bytes of reserved space with @type for qgroup. (Normally level 0
+ * qgroup).
+ *
+ * Will handle all higher level qgroup too.
+ *
+ * NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup.
+ * This special case is only used for META_PERTRANS type.
+ */
+void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
+ u64 ref_root, u64 num_bytes,
+ enum btrfs_qgroup_rsv_type type)
+{
+ struct btrfs_qgroup *qgroup;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ int ret = 0;
+
+ if (!is_fstree(ref_root))
+ return;
+
+ if (num_bytes == 0)
+ return;
+
+ if (num_bytes == (u64)-1 && type != BTRFS_QGROUP_RSV_META_PERTRANS) {
+ WARN(1, "%s: Invalid type to free", __func__);
+ return;
+ }
+ spin_lock(&fs_info->qgroup_lock);
+
+ if (!fs_info->quota_root)
+ goto out;
+
+ qgroup = find_qgroup_rb(fs_info, ref_root);
+ if (!qgroup)
+ goto out;
+
+ if (num_bytes == (u64)-1)
+ /*
+ * We're freeing all pertrans rsv, get reserved value from
+ * level 0 qgroup as real num_bytes to free.
+ */
+ num_bytes = qgroup->rsv.values[type];
+
+ ulist_reinit(fs_info->qgroup_ulist);
+ ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
+ qgroup_to_aux(qgroup), GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
+ struct btrfs_qgroup *qg;
+ struct btrfs_qgroup_list *glist;
+
+ qg = unode_aux_to_qgroup(unode);
+
+ qgroup_rsv_release(fs_info, qg, num_bytes, type);
+
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ ret = ulist_add(fs_info->qgroup_ulist,
+ glist->group->qgroupid,
+ qgroup_to_aux(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
+ }
+ }
+
+out:
+ spin_unlock(&fs_info->qgroup_lock);
+}
+
+/*
+ * Check if the leaf is the last leaf. Which means all node pointers
+ * are at their last position.
+ */
+static bool is_last_leaf(struct btrfs_path *path)
+{
+ int i;
+
+ for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
+ if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1)
+ return false;
+ }
+ return true;
+}
+
+/*
+ * returns < 0 on error, 0 when more leafs are to be scanned.
+ * returns 1 when done.
+ */
+static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *extent_root;
+ struct btrfs_key found;
+ struct extent_buffer *scratch_leaf = NULL;
+ struct ulist *roots = NULL;
+ u64 num_bytes;
+ bool done;
+ int slot;
+ int ret;
+
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ extent_root = btrfs_extent_root(fs_info,
+ fs_info->qgroup_rescan_progress.objectid);
+ ret = btrfs_search_slot_for_read(extent_root,
+ &fs_info->qgroup_rescan_progress,
+ path, 1, 0);
+
+ btrfs_debug(fs_info,
+ "current progress key (%llu %u %llu), search_slot ret %d",
+ fs_info->qgroup_rescan_progress.objectid,
+ fs_info->qgroup_rescan_progress.type,
+ fs_info->qgroup_rescan_progress.offset, ret);
+
+ if (ret) {
+ /*
+ * The rescan is about to end, we will not be scanning any
+ * further blocks. We cannot unset the RESCAN flag here, because
+ * we want to commit the transaction if everything went well.
+ * To make the live accounting work in this phase, we set our
+ * scan progress pointer such that every real extent objectid
+ * will be smaller.
+ */
+ fs_info->qgroup_rescan_progress.objectid = (u64)-1;
+ btrfs_release_path(path);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+ return ret;
+ }
+ done = is_last_leaf(path);
+
+ btrfs_item_key_to_cpu(path->nodes[0], &found,
+ btrfs_header_nritems(path->nodes[0]) - 1);
+ fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;
+
+ scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!scratch_leaf) {
+ ret = -ENOMEM;
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+ goto out;
+ }
+ slot = path->slots[0];
+ btrfs_release_path(path);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
+ btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
+ if (found.type != BTRFS_EXTENT_ITEM_KEY &&
+ found.type != BTRFS_METADATA_ITEM_KEY)
+ continue;
+ if (found.type == BTRFS_METADATA_ITEM_KEY)
+ num_bytes = fs_info->nodesize;
+ else
+ num_bytes = found.offset;
+
+ ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
+ &roots, false);
+ if (ret < 0)
+ goto out;
+ /* For rescan, just pass old_roots as NULL */
+ ret = btrfs_qgroup_account_extent(trans, found.objectid,
+ num_bytes, NULL, roots);
+ if (ret < 0)
+ goto out;
+ }
+out:
+ if (scratch_leaf)
+ free_extent_buffer(scratch_leaf);
+
+ if (done && !ret) {
+ ret = 1;
+ fs_info->qgroup_rescan_progress.objectid = (u64)-1;
+ }
+ return ret;
+}
+
+static bool rescan_should_stop(struct btrfs_fs_info *fs_info)
+{
+ return btrfs_fs_closing(fs_info) ||
+ test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) ||
+ !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+ fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN;
+}
+
+static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
+{
+ struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
+ qgroup_rescan_work);
+ struct btrfs_path *path;
+ struct btrfs_trans_handle *trans = NULL;
+ int err = -ENOMEM;
+ int ret = 0;
+ bool stopped = false;
+ bool did_leaf_rescans = false;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ goto out;
+ /*
+ * Rescan should only search for commit root, and any later difference
+ * should be recorded by qgroup
+ */
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+
+ err = 0;
+ while (!err && !(stopped = rescan_should_stop(fs_info))) {
+ trans = btrfs_start_transaction(fs_info->fs_root, 0);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ break;
+ }
+
+ err = qgroup_rescan_leaf(trans, path);
+ did_leaf_rescans = true;
+
+ if (err > 0)
+ btrfs_commit_transaction(trans);
+ else
+ btrfs_end_transaction(trans);
+ }
+
+out:
+ btrfs_free_path(path);
+
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ if (err > 0 &&
+ fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ } else if (err < 0 || stopped) {
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ }
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ /*
+ * Only update status, since the previous part has already updated the
+ * qgroup info, and only if we did any actual work. This also prevents
+ * race with a concurrent quota disable, which has already set
+ * fs_info->quota_root to NULL and cleared BTRFS_FS_QUOTA_ENABLED at
+ * btrfs_quota_disable().
+ */
+ if (did_leaf_rescans) {
+ trans = btrfs_start_transaction(fs_info->quota_root, 1);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ trans = NULL;
+ btrfs_err(fs_info,
+ "fail to start transaction for status update: %d",
+ err);
+ }
+ } else {
+ trans = NULL;
+ }
+
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ if (!stopped ||
+ fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN)
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ if (trans) {
+ ret = update_qgroup_status_item(trans);
+ if (ret < 0) {
+ err = ret;
+ btrfs_err(fs_info, "fail to update qgroup status: %d",
+ err);
+ }
+ }
+ fs_info->qgroup_rescan_running = false;
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN;
+ complete_all(&fs_info->qgroup_rescan_completion);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ if (!trans)
+ return;
+
+ btrfs_end_transaction(trans);
+
+ if (stopped) {
+ btrfs_info(fs_info, "qgroup scan paused");
+ } else if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) {
+ btrfs_info(fs_info, "qgroup scan cancelled");
+ } else if (err >= 0) {
+ btrfs_info(fs_info, "qgroup scan completed%s",
+ err > 0 ? " (inconsistency flag cleared)" : "");
+ } else {
+ btrfs_err(fs_info, "qgroup scan failed with %d", err);
+ }
+}
+
+/*
+ * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all
+ * memory required for the rescan context.
+ */
+static int
+qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
+ int init_flags)
+{
+ int ret = 0;
+
+ if (!init_flags) {
+ /* we're resuming qgroup rescan at mount time */
+ if (!(fs_info->qgroup_flags &
+ BTRFS_QGROUP_STATUS_FLAG_RESCAN)) {
+ btrfs_warn(fs_info,
+ "qgroup rescan init failed, qgroup rescan is not queued");
+ ret = -EINVAL;
+ } else if (!(fs_info->qgroup_flags &
+ BTRFS_QGROUP_STATUS_FLAG_ON)) {
+ btrfs_warn(fs_info,
+ "qgroup rescan init failed, qgroup is not enabled");
+ ret = -EINVAL;
+ }
+
+ if (ret)
+ return ret;
+ }
+
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+
+ if (init_flags) {
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+ btrfs_warn(fs_info,
+ "qgroup rescan is already in progress");
+ ret = -EINPROGRESS;
+ } else if (!(fs_info->qgroup_flags &
+ BTRFS_QGROUP_STATUS_FLAG_ON)) {
+ btrfs_warn(fs_info,
+ "qgroup rescan init failed, qgroup is not enabled");
+ ret = -EINVAL;
+ } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
+ /* Quota disable is in progress */
+ ret = -EBUSY;
+ }
+
+ if (ret) {
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+ return ret;
+ }
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ }
+
+ memset(&fs_info->qgroup_rescan_progress, 0,
+ sizeof(fs_info->qgroup_rescan_progress));
+ fs_info->qgroup_flags &= ~(BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN |
+ BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING);
+ fs_info->qgroup_rescan_progress.objectid = progress_objectid;
+ init_completion(&fs_info->qgroup_rescan_completion);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ btrfs_init_work(&fs_info->qgroup_rescan_work,
+ btrfs_qgroup_rescan_worker, NULL, NULL);
+ return 0;
+}
+
+static void
+qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
+{
+ struct rb_node *n;
+ struct btrfs_qgroup *qgroup;
+
+ spin_lock(&fs_info->qgroup_lock);
+ /* clear all current qgroup tracking information */
+ for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
+ qgroup = rb_entry(n, struct btrfs_qgroup, node);
+ qgroup->rfer = 0;
+ qgroup->rfer_cmpr = 0;
+ qgroup->excl = 0;
+ qgroup->excl_cmpr = 0;
+ qgroup_dirty(fs_info, qgroup);
+ }
+ spin_unlock(&fs_info->qgroup_lock);
+}
+
+int
+btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
+{
+ int ret = 0;
+ struct btrfs_trans_handle *trans;
+
+ ret = qgroup_rescan_init(fs_info, 0, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * We have set the rescan_progress to 0, which means no more
+ * delayed refs will be accounted by btrfs_qgroup_account_ref.
+ * However, btrfs_qgroup_account_ref may be right after its call
+ * to btrfs_find_all_roots, in which case it would still do the
+ * accounting.
+ * To solve this, we're committing the transaction, which will
+ * ensure we run all delayed refs and only after that, we are
+ * going to clear all tracking information for a clean start.
+ */
+
+ trans = btrfs_join_transaction(fs_info->fs_root);
+ if (IS_ERR(trans)) {
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ return PTR_ERR(trans);
+ }
+ ret = btrfs_commit_transaction(trans);
+ if (ret) {
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ return ret;
+ }
+
+ qgroup_rescan_zero_tracking(fs_info);
+
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ fs_info->qgroup_rescan_running = true;
+ btrfs_queue_work(fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ return 0;
+}
+
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
+ bool interruptible)
+{
+ int running;
+ int ret = 0;
+
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ running = fs_info->qgroup_rescan_running;
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ if (!running)
+ return 0;
+
+ if (interruptible)
+ ret = wait_for_completion_interruptible(
+ &fs_info->qgroup_rescan_completion);
+ else
+ wait_for_completion(&fs_info->qgroup_rescan_completion);
+
+ return ret;
+}
+
+/*
+ * this is only called from open_ctree where we're still single threaded, thus
+ * locking is omitted here.
+ */
+void
+btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
+{
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ fs_info->qgroup_rescan_running = true;
+ btrfs_queue_work(fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+ }
+}
+
+#define rbtree_iterate_from_safe(node, next, start) \
+ for (node = start; node && ({ next = rb_next(node); 1;}); node = next)
+
+static int qgroup_unreserve_range(struct btrfs_inode *inode,
+ struct extent_changeset *reserved, u64 start,
+ u64 len)
+{
+ struct rb_node *node;
+ struct rb_node *next;
+ struct ulist_node *entry;
+ int ret = 0;
+
+ node = reserved->range_changed.root.rb_node;
+ if (!node)
+ return 0;
+ while (node) {
+ entry = rb_entry(node, struct ulist_node, rb_node);
+ if (entry->val < start)
+ node = node->rb_right;
+ else
+ node = node->rb_left;
+ }
+
+ if (entry->val > start && rb_prev(&entry->rb_node))
+ entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node,
+ rb_node);
+
+ rbtree_iterate_from_safe(node, next, &entry->rb_node) {
+ u64 entry_start;
+ u64 entry_end;
+ u64 entry_len;
+ int clear_ret;
+
+ entry = rb_entry(node, struct ulist_node, rb_node);
+ entry_start = entry->val;
+ entry_end = entry->aux;
+ entry_len = entry_end - entry_start + 1;
+
+ if (entry_start >= start + len)
+ break;
+ if (entry_start + entry_len <= start)
+ continue;
+ /*
+ * Now the entry is in [start, start + len), revert the
+ * EXTENT_QGROUP_RESERVED bit.
+ */
+ clear_ret = clear_extent_bits(&inode->io_tree, entry_start,
+ entry_end, EXTENT_QGROUP_RESERVED);
+ if (!ret && clear_ret < 0)
+ ret = clear_ret;
+
+ ulist_del(&reserved->range_changed, entry->val, entry->aux);
+ if (likely(reserved->bytes_changed >= entry_len)) {
+ reserved->bytes_changed -= entry_len;
+ } else {
+ WARN_ON(1);
+ reserved->bytes_changed = 0;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Try to free some space for qgroup.
+ *
+ * For qgroup, there are only 3 ways to free qgroup space:
+ * - Flush nodatacow write
+ * Any nodatacow write will free its reserved data space at run_delalloc_range().
+ * In theory, we should only flush nodatacow inodes, but it's not yet
+ * possible, so we need to flush the whole root.
+ *
+ * - Wait for ordered extents
+ * When ordered extents are finished, their reserved metadata is finally
+ * converted to per_trans status, which can be freed by later commit
+ * transaction.
+ *
+ * - Commit transaction
+ * This would free the meta_per_trans space.
+ * In theory this shouldn't provide much space, but any more qgroup space
+ * is needed.
+ */
+static int try_flush_qgroup(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ /* Can't hold an open transaction or we run the risk of deadlocking. */
+ ASSERT(current->journal_info == NULL);
+ if (WARN_ON(current->journal_info))
+ return 0;
+
+ /*
+ * We don't want to run flush again and again, so if there is a running
+ * one, we won't try to start a new flush, but exit directly.
+ */
+ if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
+ wait_event(root->qgroup_flush_wait,
+ !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
+ return 0;
+ }
+
+ ret = btrfs_start_delalloc_snapshot(root, true);
+ if (ret < 0)
+ goto out;
+ btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ ret = btrfs_commit_transaction(trans);
+out:
+ clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
+ wake_up(&root->qgroup_flush_wait);
+ return ret;
+}
+
+static int qgroup_reserve_data(struct btrfs_inode *inode,
+ struct extent_changeset **reserved_ret, u64 start,
+ u64 len)
+{
+ struct btrfs_root *root = inode->root;
+ struct extent_changeset *reserved;
+ bool new_reserved = false;
+ u64 orig_reserved;
+ u64 to_reserve;
+ int ret;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
+ !is_fstree(root->root_key.objectid) || len == 0)
+ return 0;
+
+ /* @reserved parameter is mandatory for qgroup */
+ if (WARN_ON(!reserved_ret))
+ return -EINVAL;
+ if (!*reserved_ret) {
+ new_reserved = true;
+ *reserved_ret = extent_changeset_alloc();
+ if (!*reserved_ret)
+ return -ENOMEM;
+ }
+ reserved = *reserved_ret;
+ /* Record already reserved space */
+ orig_reserved = reserved->bytes_changed;
+ ret = set_record_extent_bits(&inode->io_tree, start,
+ start + len -1, EXTENT_QGROUP_RESERVED, reserved);
+
+ /* Newly reserved space */
+ to_reserve = reserved->bytes_changed - orig_reserved;
+ trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len,
+ to_reserve, QGROUP_RESERVE);
+ if (ret < 0)
+ goto out;
+ ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA);
+ if (ret < 0)
+ goto cleanup;
+
+ return ret;
+
+cleanup:
+ qgroup_unreserve_range(inode, reserved, start, len);
+out:
+ if (new_reserved) {
+ extent_changeset_free(reserved);
+ *reserved_ret = NULL;
+ }
+ return ret;
+}
+
+/*
+ * Reserve qgroup space for range [start, start + len).
+ *
+ * This function will either reserve space from related qgroups or do nothing
+ * if the range is already reserved.
+ *
+ * Return 0 for successful reservation
+ * Return <0 for error (including -EQUOT)
+ *
+ * NOTE: This function may sleep for memory allocation, dirty page flushing and
+ * commit transaction. So caller should not hold any dirty page locked.
+ */
+int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
+ struct extent_changeset **reserved_ret, u64 start,
+ u64 len)
+{
+ int ret;
+
+ ret = qgroup_reserve_data(inode, reserved_ret, start, len);
+ if (ret <= 0 && ret != -EDQUOT)
+ return ret;
+
+ ret = try_flush_qgroup(inode->root);
+ if (ret < 0)
+ return ret;
+ return qgroup_reserve_data(inode, reserved_ret, start, len);
+}
+
+/* Free ranges specified by @reserved, normally in error path */
+static int qgroup_free_reserved_data(struct btrfs_inode *inode,
+ struct extent_changeset *reserved,
+ u64 start, u64 len, u64 *freed_ret)
+{
+ struct btrfs_root *root = inode->root;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ struct extent_changeset changeset;
+ u64 freed = 0;
+ int ret;
+
+ extent_changeset_init(&changeset);
+ len = round_up(start + len, root->fs_info->sectorsize);
+ start = round_down(start, root->fs_info->sectorsize);
+
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(&reserved->range_changed, &uiter))) {
+ u64 range_start = unode->val;
+ /* unode->aux is the inclusive end */
+ u64 range_len = unode->aux - range_start + 1;
+ u64 free_start;
+ u64 free_len;
+
+ extent_changeset_release(&changeset);
+
+ /* Only free range in range [start, start + len) */
+ if (range_start >= start + len ||
+ range_start + range_len <= start)
+ continue;
+ free_start = max(range_start, start);
+ free_len = min(start + len, range_start + range_len) -
+ free_start;
+ /*
+ * TODO: To also modify reserved->ranges_reserved to reflect
+ * the modification.
+ *
+ * However as long as we free qgroup reserved according to
+ * EXTENT_QGROUP_RESERVED, we won't double free.
+ * So not need to rush.
+ */
+ ret = clear_record_extent_bits(&inode->io_tree, free_start,
+ free_start + free_len - 1,
+ EXTENT_QGROUP_RESERVED, &changeset);
+ if (ret < 0)
+ goto out;
+ freed += changeset.bytes_changed;
+ }
+ btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed,
+ BTRFS_QGROUP_RSV_DATA);
+ if (freed_ret)
+ *freed_ret = freed;
+ ret = 0;
+out:
+ extent_changeset_release(&changeset);
+ return ret;
+}
+
+static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
+ struct extent_changeset *reserved, u64 start, u64 len,
+ u64 *released, int free)
+{
+ struct extent_changeset changeset;
+ int trace_op = QGROUP_RELEASE;
+ int ret;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags))
+ return 0;
+
+ /* In release case, we shouldn't have @reserved */
+ WARN_ON(!free && reserved);
+ if (free && reserved)
+ return qgroup_free_reserved_data(inode, reserved, start, len, released);
+ extent_changeset_init(&changeset);
+ ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1,
+ EXTENT_QGROUP_RESERVED, &changeset);
+ if (ret < 0)
+ goto out;
+
+ if (free)
+ trace_op = QGROUP_FREE;
+ trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len,
+ changeset.bytes_changed, trace_op);
+ if (free)
+ btrfs_qgroup_free_refroot(inode->root->fs_info,
+ inode->root->root_key.objectid,
+ changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
+ if (released)
+ *released = changeset.bytes_changed;
+out:
+ extent_changeset_release(&changeset);
+ return ret;
+}
+
+/*
+ * Free a reserved space range from io_tree and related qgroups
+ *
+ * Should be called when a range of pages get invalidated before reaching disk.
+ * Or for error cleanup case.
+ * if @reserved is given, only reserved range in [@start, @start + @len) will
+ * be freed.
+ *
+ * For data written to disk, use btrfs_qgroup_release_data().
+ *
+ * NOTE: This function may sleep for memory allocation.
+ */
+int btrfs_qgroup_free_data(struct btrfs_inode *inode,
+ struct extent_changeset *reserved,
+ u64 start, u64 len, u64 *freed)
+{
+ return __btrfs_qgroup_release_data(inode, reserved, start, len, freed, 1);
+}
+
+/*
+ * Release a reserved space range from io_tree only.
+ *
+ * Should be called when a range of pages get written to disk and corresponding
+ * FILE_EXTENT is inserted into corresponding root.
+ *
+ * Since new qgroup accounting framework will only update qgroup numbers at
+ * commit_transaction() time, its reserved space shouldn't be freed from
+ * related qgroups.
+ *
+ * But we should release the range from io_tree, to allow further write to be
+ * COWed.
+ *
+ * NOTE: This function may sleep for memory allocation.
+ */
+int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released)
+{
+ return __btrfs_qgroup_release_data(inode, NULL, start, len, released, 0);
+}
+
+static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type)
+{
+ if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
+ type != BTRFS_QGROUP_RSV_META_PERTRANS)
+ return;
+ if (num_bytes == 0)
+ return;
+
+ spin_lock(&root->qgroup_meta_rsv_lock);
+ if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
+ root->qgroup_meta_rsv_prealloc += num_bytes;
+ else
+ root->qgroup_meta_rsv_pertrans += num_bytes;
+ spin_unlock(&root->qgroup_meta_rsv_lock);
+}
+
+static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type)
+{
+ if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
+ type != BTRFS_QGROUP_RSV_META_PERTRANS)
+ return 0;
+ if (num_bytes == 0)
+ return 0;
+
+ spin_lock(&root->qgroup_meta_rsv_lock);
+ if (type == BTRFS_QGROUP_RSV_META_PREALLOC) {
+ num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc,
+ num_bytes);
+ root->qgroup_meta_rsv_prealloc -= num_bytes;
+ } else {
+ num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans,
+ num_bytes);
+ root->qgroup_meta_rsv_pertrans -= num_bytes;
+ }
+ spin_unlock(&root->qgroup_meta_rsv_lock);
+ return num_bytes;
+}
+
+int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type, bool enforce)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+ !is_fstree(root->root_key.objectid) || num_bytes == 0)
+ return 0;
+
+ BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
+ trace_qgroup_meta_reserve(root, (s64)num_bytes, type);
+ ret = qgroup_reserve(root, num_bytes, enforce, type);
+ if (ret < 0)
+ return ret;
+ /*
+ * Record what we have reserved into root.
+ *
+ * To avoid quota disabled->enabled underflow.
+ * In that case, we may try to free space we haven't reserved
+ * (since quota was disabled), so record what we reserved into root.
+ * And ensure later release won't underflow this number.
+ */
+ add_root_meta_rsv(root, num_bytes, type);
+ return ret;
+}
+
+int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type, bool enforce,
+ bool noflush)
+{
+ int ret;
+
+ ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
+ if ((ret <= 0 && ret != -EDQUOT) || noflush)
+ return ret;
+
+ ret = try_flush_qgroup(root);
+ if (ret < 0)
+ return ret;
+ return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
+}
+
+void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+ !is_fstree(root->root_key.objectid))
+ return;
+
+ /* TODO: Update trace point to handle such free */
+ trace_qgroup_meta_free_all_pertrans(root);
+ /* Special value -1 means to free all reserved space */
+ btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1,
+ BTRFS_QGROUP_RSV_META_PERTRANS);
+}
+
+void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+ !is_fstree(root->root_key.objectid))
+ return;
+
+ /*
+ * reservation for META_PREALLOC can happen before quota is enabled,
+ * which can lead to underflow.
+ * Here ensure we will only free what we really have reserved.
+ */
+ num_bytes = sub_root_meta_rsv(root, num_bytes, type);
+ BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
+ trace_qgroup_meta_reserve(root, -(s64)num_bytes, type);
+ btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid,
+ num_bytes, type);
+}
+
+static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
+ int num_bytes)
+{
+ struct btrfs_qgroup *qgroup;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ int ret = 0;
+
+ if (num_bytes == 0)
+ return;
+ if (!fs_info->quota_root)
+ return;
+
+ spin_lock(&fs_info->qgroup_lock);
+ qgroup = find_qgroup_rb(fs_info, ref_root);
+ if (!qgroup)
+ goto out;
+ ulist_reinit(fs_info->qgroup_ulist);
+ ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
+ qgroup_to_aux(qgroup), GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
+ struct btrfs_qgroup *qg;
+ struct btrfs_qgroup_list *glist;
+
+ qg = unode_aux_to_qgroup(unode);
+
+ qgroup_rsv_release(fs_info, qg, num_bytes,
+ BTRFS_QGROUP_RSV_META_PREALLOC);
+ qgroup_rsv_add(fs_info, qg, num_bytes,
+ BTRFS_QGROUP_RSV_META_PERTRANS);
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ ret = ulist_add(fs_info->qgroup_ulist,
+ glist->group->qgroupid,
+ qgroup_to_aux(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
+ }
+ }
+out:
+ spin_unlock(&fs_info->qgroup_lock);
+}
+
+void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+ !is_fstree(root->root_key.objectid))
+ return;
+ /* Same as btrfs_qgroup_free_meta_prealloc() */
+ num_bytes = sub_root_meta_rsv(root, num_bytes,
+ BTRFS_QGROUP_RSV_META_PREALLOC);
+ trace_qgroup_meta_convert(root, num_bytes);
+ qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes);
+}
+
+/*
+ * Check qgroup reserved space leaking, normally at destroy inode
+ * time
+ */
+void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
+{
+ struct extent_changeset changeset;
+ struct ulist_node *unode;
+ struct ulist_iterator iter;
+ int ret;
+
+ extent_changeset_init(&changeset);
+ ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1,
+ EXTENT_QGROUP_RESERVED, &changeset);
+
+ WARN_ON(ret < 0);
+ if (WARN_ON(changeset.bytes_changed)) {
+ ULIST_ITER_INIT(&iter);
+ while ((unode = ulist_next(&changeset.range_changed, &iter))) {
+ btrfs_warn(inode->root->fs_info,
+ "leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu",
+ btrfs_ino(inode), unode->val, unode->aux);
+ }
+ btrfs_qgroup_free_refroot(inode->root->fs_info,
+ inode->root->root_key.objectid,
+ changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
+
+ }
+ extent_changeset_release(&changeset);
+}
+
+void btrfs_qgroup_init_swapped_blocks(
+ struct btrfs_qgroup_swapped_blocks *swapped_blocks)
+{
+ int i;
+
+ spin_lock_init(&swapped_blocks->lock);
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++)
+ swapped_blocks->blocks[i] = RB_ROOT;
+ swapped_blocks->swapped = false;
+}
+
+/*
+ * Delete all swapped blocks record of @root.
+ * Every record here means we skipped a full subtree scan for qgroup.
+ *
+ * Gets called when committing one transaction.
+ */
+void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
+{
+ struct btrfs_qgroup_swapped_blocks *swapped_blocks;
+ int i;
+
+ swapped_blocks = &root->swapped_blocks;
+
+ spin_lock(&swapped_blocks->lock);
+ if (!swapped_blocks->swapped)
+ goto out;
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+ struct rb_root *cur_root = &swapped_blocks->blocks[i];
+ struct btrfs_qgroup_swapped_block *entry;
+ struct btrfs_qgroup_swapped_block *next;
+
+ rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
+ node)
+ kfree(entry);
+ swapped_blocks->blocks[i] = RB_ROOT;
+ }
+ swapped_blocks->swapped = false;
+out:
+ spin_unlock(&swapped_blocks->lock);
+}
+
+/*
+ * Add subtree roots record into @subvol_root.
+ *
+ * @subvol_root: tree root of the subvolume tree get swapped
+ * @bg: block group under balance
+ * @subvol_parent/slot: pointer to the subtree root in subvolume tree
+ * @reloc_parent/slot: pointer to the subtree root in reloc tree
+ * BOTH POINTERS ARE BEFORE TREE SWAP
+ * @last_snapshot: last snapshot generation of the subvolume tree
+ */
+int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
+ struct btrfs_root *subvol_root,
+ struct btrfs_block_group *bg,
+ struct extent_buffer *subvol_parent, int subvol_slot,
+ struct extent_buffer *reloc_parent, int reloc_slot,
+ u64 last_snapshot)
+{
+ struct btrfs_fs_info *fs_info = subvol_root->fs_info;
+ struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
+ struct btrfs_qgroup_swapped_block *block;
+ struct rb_node **cur;
+ struct rb_node *parent = NULL;
+ int level = btrfs_header_level(subvol_parent) - 1;
+ int ret = 0;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return 0;
+
+ if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
+ btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
+ btrfs_err_rl(fs_info,
+ "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
+ __func__,
+ btrfs_node_ptr_generation(subvol_parent, subvol_slot),
+ btrfs_node_ptr_generation(reloc_parent, reloc_slot));
+ return -EUCLEAN;
+ }
+
+ block = kmalloc(sizeof(*block), GFP_NOFS);
+ if (!block) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * @reloc_parent/slot is still before swap, while @block is going to
+ * record the bytenr after swap, so we do the swap here.
+ */
+ block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
+ block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
+ reloc_slot);
+ block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
+ block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
+ subvol_slot);
+ block->last_snapshot = last_snapshot;
+ block->level = level;
+
+ /*
+ * If we have bg == NULL, we're called from btrfs_recover_relocation(),
+ * no one else can modify tree blocks thus we qgroup will not change
+ * no matter the value of trace_leaf.
+ */
+ if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA)
+ block->trace_leaf = true;
+ else
+ block->trace_leaf = false;
+ btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);
+
+ /* Insert @block into @blocks */
+ spin_lock(&blocks->lock);
+ cur = &blocks->blocks[level].rb_node;
+ while (*cur) {
+ struct btrfs_qgroup_swapped_block *entry;
+
+ parent = *cur;
+ entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
+ node);
+
+ if (entry->subvol_bytenr < block->subvol_bytenr) {
+ cur = &(*cur)->rb_left;
+ } else if (entry->subvol_bytenr > block->subvol_bytenr) {
+ cur = &(*cur)->rb_right;
+ } else {
+ if (entry->subvol_generation !=
+ block->subvol_generation ||
+ entry->reloc_bytenr != block->reloc_bytenr ||
+ entry->reloc_generation !=
+ block->reloc_generation) {
+ /*
+ * Duplicated but mismatch entry found.
+ * Shouldn't happen.
+ *
+ * Marking qgroup inconsistent should be enough
+ * for end users.
+ */
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ ret = -EEXIST;
+ }
+ kfree(block);
+ goto out_unlock;
+ }
+ }
+ rb_link_node(&block->node, parent, cur);
+ rb_insert_color(&block->node, &blocks->blocks[level]);
+ blocks->swapped = true;
+out_unlock:
+ spin_unlock(&blocks->lock);
+out:
+ if (ret < 0)
+ qgroup_mark_inconsistent(fs_info);
+ return ret;
+}
+
+/*
+ * Check if the tree block is a subtree root, and if so do the needed
+ * delayed subtree trace for qgroup.
+ *
+ * This is called during btrfs_cow_block().
+ */
+int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *subvol_eb)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
+ struct btrfs_qgroup_swapped_block *block;
+ struct extent_buffer *reloc_eb = NULL;
+ struct rb_node *node;
+ bool found = false;
+ bool swapped = false;
+ int level = btrfs_header_level(subvol_eb);
+ int ret = 0;
+ int i;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return 0;
+ if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
+ return 0;
+
+ spin_lock(&blocks->lock);
+ if (!blocks->swapped) {
+ spin_unlock(&blocks->lock);
+ return 0;
+ }
+ node = blocks->blocks[level].rb_node;
+
+ while (node) {
+ block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
+ if (block->subvol_bytenr < subvol_eb->start) {
+ node = node->rb_left;
+ } else if (block->subvol_bytenr > subvol_eb->start) {
+ node = node->rb_right;
+ } else {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ spin_unlock(&blocks->lock);
+ goto out;
+ }
+ /* Found one, remove it from @blocks first and update blocks->swapped */
+ rb_erase(&block->node, &blocks->blocks[level]);
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+ if (RB_EMPTY_ROOT(&blocks->blocks[i])) {
+ swapped = true;
+ break;
+ }
+ }
+ blocks->swapped = swapped;
+ spin_unlock(&blocks->lock);
+
+ /* Read out reloc subtree root */
+ reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, 0,
+ block->reloc_generation, block->level,
+ &block->first_key);
+ if (IS_ERR(reloc_eb)) {
+ ret = PTR_ERR(reloc_eb);
+ reloc_eb = NULL;
+ goto free_out;
+ }
+ if (!extent_buffer_uptodate(reloc_eb)) {
+ ret = -EIO;
+ goto free_out;
+ }
+
+ ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
+ block->last_snapshot, block->trace_leaf);
+free_out:
+ kfree(block);
+ free_extent_buffer(reloc_eb);
+out:
+ if (ret < 0) {
+ btrfs_err_rl(fs_info,
+ "failed to account subtree at bytenr %llu: %d",
+ subvol_eb->start, ret);
+ qgroup_mark_inconsistent(fs_info);
+ }
+ return ret;
+}
+
+void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
+{
+ struct btrfs_qgroup_extent_record *entry;
+ struct btrfs_qgroup_extent_record *next;
+ struct rb_root *root;
+
+ root = &trans->delayed_refs.dirty_extent_root;
+ rbtree_postorder_for_each_entry_safe(entry, next, root, node) {
+ ulist_free(entry->old_roots);
+ kfree(entry);
+ }
+ *root = RB_ROOT;
+}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
new file mode 100644
index 000000000..c382923f7
--- /dev/null
+++ b/fs/btrfs/qgroup.h
@@ -0,0 +1,440 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2014 Facebook. All rights reserved.
+ */
+
+#ifndef BTRFS_QGROUP_H
+#define BTRFS_QGROUP_H
+
+#include <linux/spinlock.h>
+#include <linux/rbtree.h>
+#include <linux/kobject.h>
+#include "ulist.h"
+#include "delayed-ref.h"
+
+/*
+ * Btrfs qgroup overview
+ *
+ * Btrfs qgroup splits into 3 main part:
+ * 1) Reserve
+ * Reserve metadata/data space for incoming operations
+ * Affect how qgroup limit works
+ *
+ * 2) Trace
+ * Tell btrfs qgroup to trace dirty extents.
+ *
+ * Dirty extents including:
+ * - Newly allocated extents
+ * - Extents going to be deleted (in this trans)
+ * - Extents whose owner is going to be modified
+ *
+ * This is the main part affects whether qgroup numbers will stay
+ * consistent.
+ * Btrfs qgroup can trace clean extents and won't cause any problem,
+ * but it will consume extra CPU time, it should be avoided if possible.
+ *
+ * 3) Account
+ * Btrfs qgroup will updates its numbers, based on dirty extents traced
+ * in previous step.
+ *
+ * Normally at qgroup rescan and transaction commit time.
+ */
+
+/*
+ * Special performance optimization for balance.
+ *
+ * For balance, we need to swap subtree of subvolume and reloc trees.
+ * In theory, we need to trace all subtree blocks of both subvolume and reloc
+ * trees, since their owner has changed during such swap.
+ *
+ * However since balance has ensured that both subtrees are containing the
+ * same contents and have the same tree structures, such swap won't cause
+ * qgroup number change.
+ *
+ * But there is a race window between subtree swap and transaction commit,
+ * during that window, if we increase/decrease tree level or merge/split tree
+ * blocks, we still need to trace the original subtrees.
+ *
+ * So for balance, we use a delayed subtree tracing, whose workflow is:
+ *
+ * 1) Record the subtree root block get swapped.
+ *
+ * During subtree swap:
+ * O = Old tree blocks
+ * N = New tree blocks
+ * reloc tree subvolume tree X
+ * Root Root
+ * / \ / \
+ * NA OB OA OB
+ * / | | \ / | | \
+ * NC ND OE OF OC OD OE OF
+ *
+ * In this case, NA and OA are going to be swapped, record (NA, OA) into
+ * subvolume tree X.
+ *
+ * 2) After subtree swap.
+ * reloc tree subvolume tree X
+ * Root Root
+ * / \ / \
+ * OA OB NA OB
+ * / | | \ / | | \
+ * OC OD OE OF NC ND OE OF
+ *
+ * 3a) COW happens for OB
+ * If we are going to COW tree block OB, we check OB's bytenr against
+ * tree X's swapped_blocks structure.
+ * If it doesn't fit any, nothing will happen.
+ *
+ * 3b) COW happens for NA
+ * Check NA's bytenr against tree X's swapped_blocks, and get a hit.
+ * Then we do subtree scan on both subtrees OA and NA.
+ * Resulting 6 tree blocks to be scanned (OA, OC, OD, NA, NC, ND).
+ *
+ * Then no matter what we do to subvolume tree X, qgroup numbers will
+ * still be correct.
+ * Then NA's record gets removed from X's swapped_blocks.
+ *
+ * 4) Transaction commit
+ * Any record in X's swapped_blocks gets removed, since there is no
+ * modification to the swapped subtrees, no need to trigger heavy qgroup
+ * subtree rescan for them.
+ */
+
+#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1UL << 3)
+#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1UL << 4)
+
+/*
+ * Record a dirty extent, and info qgroup to update quota on it
+ * TODO: Use kmem cache to alloc it.
+ */
+struct btrfs_qgroup_extent_record {
+ struct rb_node node;
+ u64 bytenr;
+ u64 num_bytes;
+
+ /*
+ * For qgroup reserved data space freeing.
+ *
+ * @data_rsv_refroot and @data_rsv will be recorded after
+ * BTRFS_ADD_DELAYED_EXTENT is called.
+ * And will be used to free reserved qgroup space at
+ * transaction commit time.
+ */
+ u32 data_rsv; /* reserved data space needs to be freed */
+ u64 data_rsv_refroot; /* which root the reserved data belongs to */
+ struct ulist *old_roots;
+};
+
+struct btrfs_qgroup_swapped_block {
+ struct rb_node node;
+
+ int level;
+ bool trace_leaf;
+
+ /* bytenr/generation of the tree block in subvolume tree after swap */
+ u64 subvol_bytenr;
+ u64 subvol_generation;
+
+ /* bytenr/generation of the tree block in reloc tree after swap */
+ u64 reloc_bytenr;
+ u64 reloc_generation;
+
+ u64 last_snapshot;
+ struct btrfs_key first_key;
+};
+
+/*
+ * Qgroup reservation types:
+ *
+ * DATA:
+ * space reserved for data
+ *
+ * META_PERTRANS:
+ * Space reserved for metadata (per-transaction)
+ * Due to the fact that qgroup data is only updated at transaction commit
+ * time, reserved space for metadata must be kept until transaction
+ * commits.
+ * Any metadata reserved that are used in btrfs_start_transaction() should
+ * be of this type.
+ *
+ * META_PREALLOC:
+ * There are cases where metadata space is reserved before starting
+ * transaction, and then btrfs_join_transaction() to get a trans handle.
+ * Any metadata reserved for such usage should be of this type.
+ * And after join_transaction() part (or all) of such reservation should
+ * be converted into META_PERTRANS.
+ */
+enum btrfs_qgroup_rsv_type {
+ BTRFS_QGROUP_RSV_DATA,
+ BTRFS_QGROUP_RSV_META_PERTRANS,
+ BTRFS_QGROUP_RSV_META_PREALLOC,
+ BTRFS_QGROUP_RSV_LAST,
+};
+
+/*
+ * Represents how many bytes we have reserved for this qgroup.
+ *
+ * Each type should have different reservation behavior.
+ * E.g, data follows its io_tree flag modification, while
+ * *currently* meta is just reserve-and-clear during transaction.
+ *
+ * TODO: Add new type for reservation which can survive transaction commit.
+ * Current metadata reservation behavior is not suitable for such case.
+ */
+struct btrfs_qgroup_rsv {
+ u64 values[BTRFS_QGROUP_RSV_LAST];
+};
+
+/*
+ * one struct for each qgroup, organized in fs_info->qgroup_tree.
+ */
+struct btrfs_qgroup {
+ u64 qgroupid;
+
+ /*
+ * state
+ */
+ u64 rfer; /* referenced */
+ u64 rfer_cmpr; /* referenced compressed */
+ u64 excl; /* exclusive */
+ u64 excl_cmpr; /* exclusive compressed */
+
+ /*
+ * limits
+ */
+ u64 lim_flags; /* which limits are set */
+ u64 max_rfer;
+ u64 max_excl;
+ u64 rsv_rfer;
+ u64 rsv_excl;
+
+ /*
+ * reservation tracking
+ */
+ struct btrfs_qgroup_rsv rsv;
+
+ /*
+ * lists
+ */
+ struct list_head groups; /* groups this group is member of */
+ struct list_head members; /* groups that are members of this group */
+ struct list_head dirty; /* dirty groups */
+ struct rb_node node; /* tree of qgroups */
+
+ /*
+ * temp variables for accounting operations
+ * Refer to qgroup_shared_accounting() for details.
+ */
+ u64 old_refcnt;
+ u64 new_refcnt;
+
+ /*
+ * Sysfs kobjectid
+ */
+ struct kobject kobj;
+};
+
+static inline u64 btrfs_qgroup_subvolid(u64 qgroupid)
+{
+ return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1));
+}
+
+/*
+ * For qgroup event trace points only
+ */
+#define QGROUP_RESERVE (1<<0)
+#define QGROUP_RELEASE (1<<1)
+#define QGROUP_FREE (1<<2)
+
+int btrfs_quota_enable(struct btrfs_fs_info *fs_info);
+int btrfs_quota_disable(struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
+void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
+ bool interruptible);
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
+ u64 dst);
+int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
+ u64 dst);
+int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid);
+int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid);
+int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
+ struct btrfs_qgroup_limit *limit);
+int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
+void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
+struct btrfs_delayed_extent_op;
+
+/*
+ * Inform qgroup to trace one dirty extent, its info is recorded in @record.
+ * So qgroup can account it at transaction committing time.
+ *
+ * No lock version, caller must acquire delayed ref lock and allocated memory,
+ * then call btrfs_qgroup_trace_extent_post() after exiting lock context.
+ *
+ * Return 0 for success insert
+ * Return >0 for existing record, caller can free @record safely.
+ * Error is not possible
+ */
+int btrfs_qgroup_trace_extent_nolock(
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_qgroup_extent_record *record);
+
+/*
+ * Post handler after qgroup_trace_extent_nolock().
+ *
+ * NOTE: Current qgroup does the expensive backref walk at transaction
+ * committing time with TRANS_STATE_COMMIT_DOING, this blocks incoming
+ * new transaction.
+ * This is designed to allow btrfs_find_all_roots() to get correct new_roots
+ * result.
+ *
+ * However for old_roots there is no need to do backref walk at that time,
+ * since we search commit roots to walk backref and result will always be
+ * correct.
+ *
+ * Due to the nature of no lock version, we can't do backref there.
+ * So we must call btrfs_qgroup_trace_extent_post() after exiting
+ * spinlock context.
+ *
+ * TODO: If we can fix and prove btrfs_find_all_roots() can get correct result
+ * using current root, then we can move all expensive backref walk out of
+ * transaction committing, but not now as qgroup accounting will be wrong again.
+ */
+int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
+ struct btrfs_qgroup_extent_record *qrecord);
+
+/*
+ * Inform qgroup to trace one dirty extent, specified by @bytenr and
+ * @num_bytes.
+ * So qgroup can account it at commit trans time.
+ *
+ * Better encapsulated version, with memory allocation and backref walk for
+ * commit roots.
+ * So this can sleep.
+ *
+ * Return 0 if the operation is done.
+ * Return <0 for error, like memory allocation failure or invalid parameter
+ * (NULL trans)
+ */
+int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
+ u64 num_bytes, gfp_t gfp_flag);
+
+/*
+ * Inform qgroup to trace all leaf items of data
+ *
+ * Return 0 for success
+ * Return <0 for error(ENOMEM)
+ */
+int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
+ struct extent_buffer *eb);
+/*
+ * Inform qgroup to trace a whole subtree, including all its child tree
+ * blocks and data.
+ * The root tree block is specified by @root_eb.
+ *
+ * Normally used by relocation(tree block swap) and subvolume deletion.
+ *
+ * Return 0 for success
+ * Return <0 for error(ENOMEM or tree search error)
+ */
+int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
+ struct extent_buffer *root_eb,
+ u64 root_gen, int root_level);
+int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
+ u64 num_bytes, struct ulist *old_roots,
+ struct ulist *new_roots);
+int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans);
+int btrfs_run_qgroups(struct btrfs_trans_handle *trans);
+int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
+ u64 objectid, struct btrfs_qgroup_inherit *inherit);
+void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
+ u64 ref_root, u64 num_bytes,
+ enum btrfs_qgroup_rsv_type type);
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
+ u64 rfer, u64 excl);
+#endif
+
+/* New io_tree based accurate qgroup reserve API */
+int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
+ struct extent_changeset **reserved, u64 start, u64 len);
+int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released);
+int btrfs_qgroup_free_data(struct btrfs_inode *inode,
+ struct extent_changeset *reserved, u64 start,
+ u64 len, u64 *freed);
+int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type, bool enforce);
+int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type, bool enforce,
+ bool noflush);
+/* Reserve metadata space for pertrans and prealloc type */
+static inline int btrfs_qgroup_reserve_meta_pertrans(struct btrfs_root *root,
+ int num_bytes, bool enforce)
+{
+ return __btrfs_qgroup_reserve_meta(root, num_bytes,
+ BTRFS_QGROUP_RSV_META_PERTRANS,
+ enforce, false);
+}
+static inline int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root,
+ int num_bytes, bool enforce,
+ bool noflush)
+{
+ return __btrfs_qgroup_reserve_meta(root, num_bytes,
+ BTRFS_QGROUP_RSV_META_PREALLOC,
+ enforce, noflush);
+}
+
+void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type);
+
+/* Free per-transaction meta reservation for error handling */
+static inline void btrfs_qgroup_free_meta_pertrans(struct btrfs_root *root,
+ int num_bytes)
+{
+ __btrfs_qgroup_free_meta(root, num_bytes,
+ BTRFS_QGROUP_RSV_META_PERTRANS);
+}
+
+/* Pre-allocated meta reservation can be freed at need */
+static inline void btrfs_qgroup_free_meta_prealloc(struct btrfs_root *root,
+ int num_bytes)
+{
+ __btrfs_qgroup_free_meta(root, num_bytes,
+ BTRFS_QGROUP_RSV_META_PREALLOC);
+}
+
+/*
+ * Per-transaction meta reservation should be all freed at transaction commit
+ * time
+ */
+void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root);
+
+/*
+ * Convert @num_bytes of META_PREALLOCATED reservation to META_PERTRANS.
+ *
+ * This is called when preallocated meta reservation needs to be used.
+ * Normally after btrfs_join_transaction() call.
+ */
+void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes);
+
+void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode);
+
+/* btrfs_qgroup_swapped_blocks related functions */
+void btrfs_qgroup_init_swapped_blocks(
+ struct btrfs_qgroup_swapped_blocks *swapped_blocks);
+
+void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root);
+int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
+ struct btrfs_root *subvol_root,
+ struct btrfs_block_group *bg,
+ struct extent_buffer *subvol_parent, int subvol_slot,
+ struct extent_buffer *reloc_parent, int reloc_slot,
+ u64 last_snapshot);
+int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *eb);
+void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans);
+bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info);
+
+#endif
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
new file mode 100644
index 000000000..82c8e9913
--- /dev/null
+++ b/fs/btrfs/raid56.c
@@ -0,0 +1,2761 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2012 Fusion-io All rights reserved.
+ * Copyright (C) 2012 Intel Corp. All rights reserved.
+ */
+
+#include <linux/sched.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/raid/pq.h>
+#include <linux/hash.h>
+#include <linux/list_sort.h>
+#include <linux/raid/xor.h>
+#include <linux/mm.h>
+#include "misc.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "volumes.h"
+#include "raid56.h"
+#include "async-thread.h"
+
+/* set when additional merges to this rbio are not allowed */
+#define RBIO_RMW_LOCKED_BIT 1
+
+/*
+ * set when this rbio is sitting in the hash, but it is just a cache
+ * of past RMW
+ */
+#define RBIO_CACHE_BIT 2
+
+/*
+ * set when it is safe to trust the stripe_pages for caching
+ */
+#define RBIO_CACHE_READY_BIT 3
+
+#define RBIO_CACHE_SIZE 1024
+
+#define BTRFS_STRIPE_HASH_TABLE_BITS 11
+
+/* Used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash {
+ struct list_head hash_list;
+ spinlock_t lock;
+};
+
+/* Used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash_table {
+ struct list_head stripe_cache;
+ spinlock_t cache_lock;
+ int cache_size;
+ struct btrfs_stripe_hash table[];
+};
+
+/*
+ * A bvec like structure to present a sector inside a page.
+ *
+ * Unlike bvec we don't need bvlen, as it's fixed to sectorsize.
+ */
+struct sector_ptr {
+ struct page *page;
+ unsigned int pgoff:24;
+ unsigned int uptodate:8;
+};
+
+static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
+static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
+static void rmw_work(struct work_struct *work);
+static void read_rebuild_work(struct work_struct *work);
+static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
+static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
+static void __free_raid_bio(struct btrfs_raid_bio *rbio);
+static void index_rbio_pages(struct btrfs_raid_bio *rbio);
+static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
+
+static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
+ int need_check);
+static void scrub_parity_work(struct work_struct *work);
+
+static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
+{
+ INIT_WORK(&rbio->work, work_func);
+ queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
+}
+
+/*
+ * the stripe hash table is used for locking, and to collect
+ * bios in hopes of making a full stripe
+ */
+int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
+{
+ struct btrfs_stripe_hash_table *table;
+ struct btrfs_stripe_hash_table *x;
+ struct btrfs_stripe_hash *cur;
+ struct btrfs_stripe_hash *h;
+ int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
+ int i;
+
+ if (info->stripe_hash_table)
+ return 0;
+
+ /*
+ * The table is large, starting with order 4 and can go as high as
+ * order 7 in case lock debugging is turned on.
+ *
+ * Try harder to allocate and fallback to vmalloc to lower the chance
+ * of a failing mount.
+ */
+ table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
+ if (!table)
+ return -ENOMEM;
+
+ spin_lock_init(&table->cache_lock);
+ INIT_LIST_HEAD(&table->stripe_cache);
+
+ h = table->table;
+
+ for (i = 0; i < num_entries; i++) {
+ cur = h + i;
+ INIT_LIST_HEAD(&cur->hash_list);
+ spin_lock_init(&cur->lock);
+ }
+
+ x = cmpxchg(&info->stripe_hash_table, NULL, table);
+ kvfree(x);
+ return 0;
+}
+
+/*
+ * caching an rbio means to copy anything from the
+ * bio_sectors array into the stripe_pages array. We
+ * use the page uptodate bit in the stripe cache array
+ * to indicate if it has valid data
+ *
+ * once the caching is done, we set the cache ready
+ * bit.
+ */
+static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
+{
+ int i;
+ int ret;
+
+ ret = alloc_rbio_pages(rbio);
+ if (ret)
+ return;
+
+ for (i = 0; i < rbio->nr_sectors; i++) {
+ /* Some range not covered by bio (partial write), skip it */
+ if (!rbio->bio_sectors[i].page)
+ continue;
+
+ ASSERT(rbio->stripe_sectors[i].page);
+ memcpy_page(rbio->stripe_sectors[i].page,
+ rbio->stripe_sectors[i].pgoff,
+ rbio->bio_sectors[i].page,
+ rbio->bio_sectors[i].pgoff,
+ rbio->bioc->fs_info->sectorsize);
+ rbio->stripe_sectors[i].uptodate = 1;
+ }
+ set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+}
+
+/*
+ * we hash on the first logical address of the stripe
+ */
+static int rbio_bucket(struct btrfs_raid_bio *rbio)
+{
+ u64 num = rbio->bioc->raid_map[0];
+
+ /*
+ * we shift down quite a bit. We're using byte
+ * addressing, and most of the lower bits are zeros.
+ * This tends to upset hash_64, and it consistently
+ * returns just one or two different values.
+ *
+ * shifting off the lower bits fixes things.
+ */
+ return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
+}
+
+static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
+ unsigned int page_nr)
+{
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ const u32 sectors_per_page = PAGE_SIZE / sectorsize;
+ int i;
+
+ ASSERT(page_nr < rbio->nr_pages);
+
+ for (i = sectors_per_page * page_nr;
+ i < sectors_per_page * page_nr + sectors_per_page;
+ i++) {
+ if (!rbio->stripe_sectors[i].uptodate)
+ return false;
+ }
+ return true;
+}
+
+/*
+ * Update the stripe_sectors[] array to use correct page and pgoff
+ *
+ * Should be called every time any page pointer in stripes_pages[] got modified.
+ */
+static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
+{
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ u32 offset;
+ int i;
+
+ for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
+ int page_index = offset >> PAGE_SHIFT;
+
+ ASSERT(page_index < rbio->nr_pages);
+ rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index];
+ rbio->stripe_sectors[i].pgoff = offset_in_page(offset);
+ }
+}
+
+static void steal_rbio_page(struct btrfs_raid_bio *src,
+ struct btrfs_raid_bio *dest, int page_nr)
+{
+ const u32 sectorsize = src->bioc->fs_info->sectorsize;
+ const u32 sectors_per_page = PAGE_SIZE / sectorsize;
+ int i;
+
+ if (dest->stripe_pages[page_nr])
+ __free_page(dest->stripe_pages[page_nr]);
+ dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
+ src->stripe_pages[page_nr] = NULL;
+
+ /* Also update the sector->uptodate bits. */
+ for (i = sectors_per_page * page_nr;
+ i < sectors_per_page * page_nr + sectors_per_page; i++)
+ dest->stripe_sectors[i].uptodate = true;
+}
+
+/*
+ * Stealing an rbio means taking all the uptodate pages from the stripe array
+ * in the source rbio and putting them into the destination rbio.
+ *
+ * This will also update the involved stripe_sectors[] which are referring to
+ * the old pages.
+ */
+static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
+{
+ int i;
+ struct page *s;
+
+ if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
+ return;
+
+ for (i = 0; i < dest->nr_pages; i++) {
+ s = src->stripe_pages[i];
+ if (!s || !full_page_sectors_uptodate(src, i))
+ continue;
+
+ steal_rbio_page(src, dest, i);
+ }
+ index_stripe_sectors(dest);
+ index_stripe_sectors(src);
+}
+
+/*
+ * merging means we take the bio_list from the victim and
+ * splice it into the destination. The victim should
+ * be discarded afterwards.
+ *
+ * must be called with dest->rbio_list_lock held
+ */
+static void merge_rbio(struct btrfs_raid_bio *dest,
+ struct btrfs_raid_bio *victim)
+{
+ bio_list_merge(&dest->bio_list, &victim->bio_list);
+ dest->bio_list_bytes += victim->bio_list_bytes;
+ /* Also inherit the bitmaps from @victim. */
+ bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
+ dest->stripe_nsectors);
+ bio_list_init(&victim->bio_list);
+}
+
+/*
+ * used to prune items that are in the cache. The caller
+ * must hold the hash table lock.
+ */
+static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
+{
+ int bucket = rbio_bucket(rbio);
+ struct btrfs_stripe_hash_table *table;
+ struct btrfs_stripe_hash *h;
+ int freeit = 0;
+
+ /*
+ * check the bit again under the hash table lock.
+ */
+ if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
+ return;
+
+ table = rbio->bioc->fs_info->stripe_hash_table;
+ h = table->table + bucket;
+
+ /* hold the lock for the bucket because we may be
+ * removing it from the hash table
+ */
+ spin_lock(&h->lock);
+
+ /*
+ * hold the lock for the bio list because we need
+ * to make sure the bio list is empty
+ */
+ spin_lock(&rbio->bio_list_lock);
+
+ if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
+ list_del_init(&rbio->stripe_cache);
+ table->cache_size -= 1;
+ freeit = 1;
+
+ /* if the bio list isn't empty, this rbio is
+ * still involved in an IO. We take it out
+ * of the cache list, and drop the ref that
+ * was held for the list.
+ *
+ * If the bio_list was empty, we also remove
+ * the rbio from the hash_table, and drop
+ * the corresponding ref
+ */
+ if (bio_list_empty(&rbio->bio_list)) {
+ if (!list_empty(&rbio->hash_list)) {
+ list_del_init(&rbio->hash_list);
+ refcount_dec(&rbio->refs);
+ BUG_ON(!list_empty(&rbio->plug_list));
+ }
+ }
+ }
+
+ spin_unlock(&rbio->bio_list_lock);
+ spin_unlock(&h->lock);
+
+ if (freeit)
+ __free_raid_bio(rbio);
+}
+
+/*
+ * prune a given rbio from the cache
+ */
+static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
+{
+ struct btrfs_stripe_hash_table *table;
+ unsigned long flags;
+
+ if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
+ return;
+
+ table = rbio->bioc->fs_info->stripe_hash_table;
+
+ spin_lock_irqsave(&table->cache_lock, flags);
+ __remove_rbio_from_cache(rbio);
+ spin_unlock_irqrestore(&table->cache_lock, flags);
+}
+
+/*
+ * remove everything in the cache
+ */
+static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
+{
+ struct btrfs_stripe_hash_table *table;
+ unsigned long flags;
+ struct btrfs_raid_bio *rbio;
+
+ table = info->stripe_hash_table;
+
+ spin_lock_irqsave(&table->cache_lock, flags);
+ while (!list_empty(&table->stripe_cache)) {
+ rbio = list_entry(table->stripe_cache.next,
+ struct btrfs_raid_bio,
+ stripe_cache);
+ __remove_rbio_from_cache(rbio);
+ }
+ spin_unlock_irqrestore(&table->cache_lock, flags);
+}
+
+/*
+ * remove all cached entries and free the hash table
+ * used by unmount
+ */
+void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
+{
+ if (!info->stripe_hash_table)
+ return;
+ btrfs_clear_rbio_cache(info);
+ kvfree(info->stripe_hash_table);
+ info->stripe_hash_table = NULL;
+}
+
+/*
+ * insert an rbio into the stripe cache. It
+ * must have already been prepared by calling
+ * cache_rbio_pages
+ *
+ * If this rbio was already cached, it gets
+ * moved to the front of the lru.
+ *
+ * If the size of the rbio cache is too big, we
+ * prune an item.
+ */
+static void cache_rbio(struct btrfs_raid_bio *rbio)
+{
+ struct btrfs_stripe_hash_table *table;
+ unsigned long flags;
+
+ if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
+ return;
+
+ table = rbio->bioc->fs_info->stripe_hash_table;
+
+ spin_lock_irqsave(&table->cache_lock, flags);
+ spin_lock(&rbio->bio_list_lock);
+
+ /* bump our ref if we were not in the list before */
+ if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
+ refcount_inc(&rbio->refs);
+
+ if (!list_empty(&rbio->stripe_cache)){
+ list_move(&rbio->stripe_cache, &table->stripe_cache);
+ } else {
+ list_add(&rbio->stripe_cache, &table->stripe_cache);
+ table->cache_size += 1;
+ }
+
+ spin_unlock(&rbio->bio_list_lock);
+
+ if (table->cache_size > RBIO_CACHE_SIZE) {
+ struct btrfs_raid_bio *found;
+
+ found = list_entry(table->stripe_cache.prev,
+ struct btrfs_raid_bio,
+ stripe_cache);
+
+ if (found != rbio)
+ __remove_rbio_from_cache(found);
+ }
+
+ spin_unlock_irqrestore(&table->cache_lock, flags);
+}
+
+/*
+ * helper function to run the xor_blocks api. It is only
+ * able to do MAX_XOR_BLOCKS at a time, so we need to
+ * loop through.
+ */
+static void run_xor(void **pages, int src_cnt, ssize_t len)
+{
+ int src_off = 0;
+ int xor_src_cnt = 0;
+ void *dest = pages[src_cnt];
+
+ while(src_cnt > 0) {
+ xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
+ xor_blocks(xor_src_cnt, len, dest, pages + src_off);
+
+ src_cnt -= xor_src_cnt;
+ src_off += xor_src_cnt;
+ }
+}
+
+/*
+ * Returns true if the bio list inside this rbio covers an entire stripe (no
+ * rmw required).
+ */
+static int rbio_is_full(struct btrfs_raid_bio *rbio)
+{
+ unsigned long flags;
+ unsigned long size = rbio->bio_list_bytes;
+ int ret = 1;
+
+ spin_lock_irqsave(&rbio->bio_list_lock, flags);
+ if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
+ ret = 0;
+ BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
+ spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
+
+ return ret;
+}
+
+/*
+ * returns 1 if it is safe to merge two rbios together.
+ * The merging is safe if the two rbios correspond to
+ * the same stripe and if they are both going in the same
+ * direction (read vs write), and if neither one is
+ * locked for final IO
+ *
+ * The caller is responsible for locking such that
+ * rmw_locked is safe to test
+ */
+static int rbio_can_merge(struct btrfs_raid_bio *last,
+ struct btrfs_raid_bio *cur)
+{
+ if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
+ test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
+ return 0;
+
+ /*
+ * we can't merge with cached rbios, since the
+ * idea is that when we merge the destination
+ * rbio is going to run our IO for us. We can
+ * steal from cached rbios though, other functions
+ * handle that.
+ */
+ if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
+ test_bit(RBIO_CACHE_BIT, &cur->flags))
+ return 0;
+
+ if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
+ return 0;
+
+ /* we can't merge with different operations */
+ if (last->operation != cur->operation)
+ return 0;
+ /*
+ * We've need read the full stripe from the drive.
+ * check and repair the parity and write the new results.
+ *
+ * We're not allowed to add any new bios to the
+ * bio list here, anyone else that wants to
+ * change this stripe needs to do their own rmw.
+ */
+ if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
+ return 0;
+
+ if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
+ return 0;
+
+ if (last->operation == BTRFS_RBIO_READ_REBUILD) {
+ int fa = last->faila;
+ int fb = last->failb;
+ int cur_fa = cur->faila;
+ int cur_fb = cur->failb;
+
+ if (last->faila >= last->failb) {
+ fa = last->failb;
+ fb = last->faila;
+ }
+
+ if (cur->faila >= cur->failb) {
+ cur_fa = cur->failb;
+ cur_fb = cur->faila;
+ }
+
+ if (fa != cur_fa || fb != cur_fb)
+ return 0;
+ }
+ return 1;
+}
+
+static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
+ unsigned int stripe_nr,
+ unsigned int sector_nr)
+{
+ ASSERT(stripe_nr < rbio->real_stripes);
+ ASSERT(sector_nr < rbio->stripe_nsectors);
+
+ return stripe_nr * rbio->stripe_nsectors + sector_nr;
+}
+
+/* Return a sector from rbio->stripe_sectors, not from the bio list */
+static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
+ unsigned int stripe_nr,
+ unsigned int sector_nr)
+{
+ return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
+ sector_nr)];
+}
+
+/* Grab a sector inside P stripe */
+static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
+ unsigned int sector_nr)
+{
+ return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
+}
+
+/* Grab a sector inside Q stripe, return NULL if not RAID6 */
+static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
+ unsigned int sector_nr)
+{
+ if (rbio->nr_data + 1 == rbio->real_stripes)
+ return NULL;
+ return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
+}
+
+/*
+ * The first stripe in the table for a logical address
+ * has the lock. rbios are added in one of three ways:
+ *
+ * 1) Nobody has the stripe locked yet. The rbio is given
+ * the lock and 0 is returned. The caller must start the IO
+ * themselves.
+ *
+ * 2) Someone has the stripe locked, but we're able to merge
+ * with the lock owner. The rbio is freed and the IO will
+ * start automatically along with the existing rbio. 1 is returned.
+ *
+ * 3) Someone has the stripe locked, but we're not able to merge.
+ * The rbio is added to the lock owner's plug list, or merged into
+ * an rbio already on the plug list. When the lock owner unlocks,
+ * the next rbio on the list is run and the IO is started automatically.
+ * 1 is returned
+ *
+ * If we return 0, the caller still owns the rbio and must continue with
+ * IO submission. If we return 1, the caller must assume the rbio has
+ * already been freed.
+ */
+static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
+{
+ struct btrfs_stripe_hash *h;
+ struct btrfs_raid_bio *cur;
+ struct btrfs_raid_bio *pending;
+ unsigned long flags;
+ struct btrfs_raid_bio *freeit = NULL;
+ struct btrfs_raid_bio *cache_drop = NULL;
+ int ret = 0;
+
+ h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
+
+ spin_lock_irqsave(&h->lock, flags);
+ list_for_each_entry(cur, &h->hash_list, hash_list) {
+ if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0])
+ continue;
+
+ spin_lock(&cur->bio_list_lock);
+
+ /* Can we steal this cached rbio's pages? */
+ if (bio_list_empty(&cur->bio_list) &&
+ list_empty(&cur->plug_list) &&
+ test_bit(RBIO_CACHE_BIT, &cur->flags) &&
+ !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
+ list_del_init(&cur->hash_list);
+ refcount_dec(&cur->refs);
+
+ steal_rbio(cur, rbio);
+ cache_drop = cur;
+ spin_unlock(&cur->bio_list_lock);
+
+ goto lockit;
+ }
+
+ /* Can we merge into the lock owner? */
+ if (rbio_can_merge(cur, rbio)) {
+ merge_rbio(cur, rbio);
+ spin_unlock(&cur->bio_list_lock);
+ freeit = rbio;
+ ret = 1;
+ goto out;
+ }
+
+
+ /*
+ * We couldn't merge with the running rbio, see if we can merge
+ * with the pending ones. We don't have to check for rmw_locked
+ * because there is no way they are inside finish_rmw right now
+ */
+ list_for_each_entry(pending, &cur->plug_list, plug_list) {
+ if (rbio_can_merge(pending, rbio)) {
+ merge_rbio(pending, rbio);
+ spin_unlock(&cur->bio_list_lock);
+ freeit = rbio;
+ ret = 1;
+ goto out;
+ }
+ }
+
+ /*
+ * No merging, put us on the tail of the plug list, our rbio
+ * will be started with the currently running rbio unlocks
+ */
+ list_add_tail(&rbio->plug_list, &cur->plug_list);
+ spin_unlock(&cur->bio_list_lock);
+ ret = 1;
+ goto out;
+ }
+lockit:
+ refcount_inc(&rbio->refs);
+ list_add(&rbio->hash_list, &h->hash_list);
+out:
+ spin_unlock_irqrestore(&h->lock, flags);
+ if (cache_drop)
+ remove_rbio_from_cache(cache_drop);
+ if (freeit)
+ __free_raid_bio(freeit);
+ return ret;
+}
+
+/*
+ * called as rmw or parity rebuild is completed. If the plug list has more
+ * rbios waiting for this stripe, the next one on the list will be started
+ */
+static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
+{
+ int bucket;
+ struct btrfs_stripe_hash *h;
+ unsigned long flags;
+ int keep_cache = 0;
+
+ bucket = rbio_bucket(rbio);
+ h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;
+
+ if (list_empty(&rbio->plug_list))
+ cache_rbio(rbio);
+
+ spin_lock_irqsave(&h->lock, flags);
+ spin_lock(&rbio->bio_list_lock);
+
+ if (!list_empty(&rbio->hash_list)) {
+ /*
+ * if we're still cached and there is no other IO
+ * to perform, just leave this rbio here for others
+ * to steal from later
+ */
+ if (list_empty(&rbio->plug_list) &&
+ test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
+ keep_cache = 1;
+ clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+ BUG_ON(!bio_list_empty(&rbio->bio_list));
+ goto done;
+ }
+
+ list_del_init(&rbio->hash_list);
+ refcount_dec(&rbio->refs);
+
+ /*
+ * we use the plug list to hold all the rbios
+ * waiting for the chance to lock this stripe.
+ * hand the lock over to one of them.
+ */
+ if (!list_empty(&rbio->plug_list)) {
+ struct btrfs_raid_bio *next;
+ struct list_head *head = rbio->plug_list.next;
+
+ next = list_entry(head, struct btrfs_raid_bio,
+ plug_list);
+
+ list_del_init(&rbio->plug_list);
+
+ list_add(&next->hash_list, &h->hash_list);
+ refcount_inc(&next->refs);
+ spin_unlock(&rbio->bio_list_lock);
+ spin_unlock_irqrestore(&h->lock, flags);
+
+ if (next->operation == BTRFS_RBIO_READ_REBUILD)
+ start_async_work(next, read_rebuild_work);
+ else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
+ steal_rbio(rbio, next);
+ start_async_work(next, read_rebuild_work);
+ } else if (next->operation == BTRFS_RBIO_WRITE) {
+ steal_rbio(rbio, next);
+ start_async_work(next, rmw_work);
+ } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
+ steal_rbio(rbio, next);
+ start_async_work(next, scrub_parity_work);
+ }
+
+ goto done_nolock;
+ }
+ }
+done:
+ spin_unlock(&rbio->bio_list_lock);
+ spin_unlock_irqrestore(&h->lock, flags);
+
+done_nolock:
+ if (!keep_cache)
+ remove_rbio_from_cache(rbio);
+}
+
+static void __free_raid_bio(struct btrfs_raid_bio *rbio)
+{
+ int i;
+
+ if (!refcount_dec_and_test(&rbio->refs))
+ return;
+
+ WARN_ON(!list_empty(&rbio->stripe_cache));
+ WARN_ON(!list_empty(&rbio->hash_list));
+ WARN_ON(!bio_list_empty(&rbio->bio_list));
+
+ for (i = 0; i < rbio->nr_pages; i++) {
+ if (rbio->stripe_pages[i]) {
+ __free_page(rbio->stripe_pages[i]);
+ rbio->stripe_pages[i] = NULL;
+ }
+ }
+
+ btrfs_put_bioc(rbio->bioc);
+ kfree(rbio);
+}
+
+static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
+{
+ struct bio *next;
+
+ while (cur) {
+ next = cur->bi_next;
+ cur->bi_next = NULL;
+ cur->bi_status = err;
+ bio_endio(cur);
+ cur = next;
+ }
+}
+
+/*
+ * this frees the rbio and runs through all the bios in the
+ * bio_list and calls end_io on them
+ */
+static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
+{
+ struct bio *cur = bio_list_get(&rbio->bio_list);
+ struct bio *extra;
+
+ /*
+ * Clear the data bitmap, as the rbio may be cached for later usage.
+ * do this before before unlock_stripe() so there will be no new bio
+ * for this bio.
+ */
+ bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);
+
+ /*
+ * At this moment, rbio->bio_list is empty, however since rbio does not
+ * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
+ * hash list, rbio may be merged with others so that rbio->bio_list
+ * becomes non-empty.
+ * Once unlock_stripe() is done, rbio->bio_list will not be updated any
+ * more and we can call bio_endio() on all queued bios.
+ */
+ unlock_stripe(rbio);
+ extra = bio_list_get(&rbio->bio_list);
+ __free_raid_bio(rbio);
+
+ rbio_endio_bio_list(cur, err);
+ if (extra)
+ rbio_endio_bio_list(extra, err);
+}
+
+/*
+ * end io function used by finish_rmw. When we finally
+ * get here, we've written a full stripe
+ */
+static void raid_write_end_io(struct bio *bio)
+{
+ struct btrfs_raid_bio *rbio = bio->bi_private;
+ blk_status_t err = bio->bi_status;
+ int max_errors;
+
+ if (err)
+ fail_bio_stripe(rbio, bio);
+
+ bio_put(bio);
+
+ if (!atomic_dec_and_test(&rbio->stripes_pending))
+ return;
+
+ err = BLK_STS_OK;
+
+ /* OK, we have read all the stripes we need to. */
+ max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
+ 0 : rbio->bioc->max_errors;
+ if (atomic_read(&rbio->error) > max_errors)
+ err = BLK_STS_IOERR;
+
+ rbio_orig_end_io(rbio, err);
+}
+
+/**
+ * Get a sector pointer specified by its @stripe_nr and @sector_nr
+ *
+ * @rbio: The raid bio
+ * @stripe_nr: Stripe number, valid range [0, real_stripe)
+ * @sector_nr: Sector number inside the stripe,
+ * valid range [0, stripe_nsectors)
+ * @bio_list_only: Whether to use sectors inside the bio list only.
+ *
+ * The read/modify/write code wants to reuse the original bio page as much
+ * as possible, and only use stripe_sectors as fallback.
+ */
+static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
+ int stripe_nr, int sector_nr,
+ bool bio_list_only)
+{
+ struct sector_ptr *sector;
+ int index;
+
+ ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes);
+ ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
+
+ index = stripe_nr * rbio->stripe_nsectors + sector_nr;
+ ASSERT(index >= 0 && index < rbio->nr_sectors);
+
+ spin_lock_irq(&rbio->bio_list_lock);
+ sector = &rbio->bio_sectors[index];
+ if (sector->page || bio_list_only) {
+ /* Don't return sector without a valid page pointer */
+ if (!sector->page)
+ sector = NULL;
+ spin_unlock_irq(&rbio->bio_list_lock);
+ return sector;
+ }
+ spin_unlock_irq(&rbio->bio_list_lock);
+
+ return &rbio->stripe_sectors[index];
+}
+
+/*
+ * allocation and initial setup for the btrfs_raid_bio. Not
+ * this does not allocate any pages for rbio->pages.
+ */
+static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
+ struct btrfs_io_context *bioc)
+{
+ const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
+ const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
+ const unsigned int num_pages = stripe_npages * real_stripes;
+ const unsigned int stripe_nsectors =
+ BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
+ const unsigned int num_sectors = stripe_nsectors * real_stripes;
+ struct btrfs_raid_bio *rbio;
+ void *p;
+
+ /* PAGE_SIZE must also be aligned to sectorsize for subpage support */
+ ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
+ /*
+ * Our current stripe len should be fixed to 64k thus stripe_nsectors
+ * (at most 16) should be no larger than BITS_PER_LONG.
+ */
+ ASSERT(stripe_nsectors <= BITS_PER_LONG);
+
+ rbio = kzalloc(sizeof(*rbio) +
+ sizeof(*rbio->stripe_pages) * num_pages +
+ sizeof(*rbio->bio_sectors) * num_sectors +
+ sizeof(*rbio->stripe_sectors) * num_sectors +
+ sizeof(*rbio->finish_pointers) * real_stripes,
+ GFP_NOFS);
+ if (!rbio)
+ return ERR_PTR(-ENOMEM);
+
+ bio_list_init(&rbio->bio_list);
+ INIT_LIST_HEAD(&rbio->plug_list);
+ spin_lock_init(&rbio->bio_list_lock);
+ INIT_LIST_HEAD(&rbio->stripe_cache);
+ INIT_LIST_HEAD(&rbio->hash_list);
+ btrfs_get_bioc(bioc);
+ rbio->bioc = bioc;
+ rbio->nr_pages = num_pages;
+ rbio->nr_sectors = num_sectors;
+ rbio->real_stripes = real_stripes;
+ rbio->stripe_npages = stripe_npages;
+ rbio->stripe_nsectors = stripe_nsectors;
+ rbio->faila = -1;
+ rbio->failb = -1;
+ refcount_set(&rbio->refs, 1);
+ atomic_set(&rbio->error, 0);
+ atomic_set(&rbio->stripes_pending, 0);
+
+ /*
+ * The stripe_pages, bio_sectors, etc arrays point to the extra memory
+ * we allocated past the end of the rbio.
+ */
+ p = rbio + 1;
+#define CONSUME_ALLOC(ptr, count) do { \
+ ptr = p; \
+ p = (unsigned char *)p + sizeof(*(ptr)) * (count); \
+ } while (0)
+ CONSUME_ALLOC(rbio->stripe_pages, num_pages);
+ CONSUME_ALLOC(rbio->bio_sectors, num_sectors);
+ CONSUME_ALLOC(rbio->stripe_sectors, num_sectors);
+ CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
+#undef CONSUME_ALLOC
+
+ ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
+ rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);
+
+ return rbio;
+}
+
+/* allocate pages for all the stripes in the bio, including parity */
+static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
+{
+ int ret;
+
+ ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages);
+ if (ret < 0)
+ return ret;
+ /* Mapping all sectors */
+ index_stripe_sectors(rbio);
+ return 0;
+}
+
+/* only allocate pages for p/q stripes */
+static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
+{
+ const int data_pages = rbio->nr_data * rbio->stripe_npages;
+ int ret;
+
+ ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
+ rbio->stripe_pages + data_pages);
+ if (ret < 0)
+ return ret;
+
+ index_stripe_sectors(rbio);
+ return 0;
+}
+
+/*
+ * Add a single sector @sector into our list of bios for IO.
+ *
+ * Return 0 if everything went well.
+ * Return <0 for error.
+ */
+static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
+ struct bio_list *bio_list,
+ struct sector_ptr *sector,
+ unsigned int stripe_nr,
+ unsigned int sector_nr,
+ enum req_op op)
+{
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ struct bio *last = bio_list->tail;
+ int ret;
+ struct bio *bio;
+ struct btrfs_io_stripe *stripe;
+ u64 disk_start;
+
+ /*
+ * Note: here stripe_nr has taken device replace into consideration,
+ * thus it can be larger than rbio->real_stripe.
+ * So here we check against bioc->num_stripes, not rbio->real_stripes.
+ */
+ ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes);
+ ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
+ ASSERT(sector->page);
+
+ stripe = &rbio->bioc->stripes[stripe_nr];
+ disk_start = stripe->physical + sector_nr * sectorsize;
+
+ /* if the device is missing, just fail this stripe */
+ if (!stripe->dev->bdev)
+ return fail_rbio_index(rbio, stripe_nr);
+
+ /* see if we can add this page onto our existing bio */
+ if (last) {
+ u64 last_end = last->bi_iter.bi_sector << 9;
+ last_end += last->bi_iter.bi_size;
+
+ /*
+ * we can't merge these if they are from different
+ * devices or if they are not contiguous
+ */
+ if (last_end == disk_start && !last->bi_status &&
+ last->bi_bdev == stripe->dev->bdev) {
+ ret = bio_add_page(last, sector->page, sectorsize,
+ sector->pgoff);
+ if (ret == sectorsize)
+ return 0;
+ }
+ }
+
+ /* put a new bio on the list */
+ bio = bio_alloc(stripe->dev->bdev,
+ max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1),
+ op, GFP_NOFS);
+ bio->bi_iter.bi_sector = disk_start >> 9;
+ bio->bi_private = rbio;
+
+ bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
+ bio_list_add(bio_list, bio);
+ return 0;
+}
+
+/*
+ * while we're doing the read/modify/write cycle, we could
+ * have errors in reading pages off the disk. This checks
+ * for errors and if we're not able to read the page it'll
+ * trigger parity reconstruction. The rmw will be finished
+ * after we've reconstructed the failed stripes
+ */
+static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
+{
+ if (rbio->faila >= 0 || rbio->failb >= 0) {
+ BUG_ON(rbio->faila == rbio->real_stripes - 1);
+ __raid56_parity_recover(rbio);
+ } else {
+ finish_rmw(rbio);
+ }
+}
+
+static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
+{
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
+ u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
+ rbio->bioc->raid_map[0];
+
+ bio_for_each_segment(bvec, bio, iter) {
+ u32 bvec_offset;
+
+ for (bvec_offset = 0; bvec_offset < bvec.bv_len;
+ bvec_offset += sectorsize, offset += sectorsize) {
+ int index = offset / sectorsize;
+ struct sector_ptr *sector = &rbio->bio_sectors[index];
+
+ sector->page = bvec.bv_page;
+ sector->pgoff = bvec.bv_offset + bvec_offset;
+ ASSERT(sector->pgoff < PAGE_SIZE);
+ }
+ }
+}
+
+/*
+ * helper function to walk our bio list and populate the bio_pages array with
+ * the result. This seems expensive, but it is faster than constantly
+ * searching through the bio list as we setup the IO in finish_rmw or stripe
+ * reconstruction.
+ *
+ * This must be called before you trust the answers from page_in_rbio
+ */
+static void index_rbio_pages(struct btrfs_raid_bio *rbio)
+{
+ struct bio *bio;
+
+ spin_lock_irq(&rbio->bio_list_lock);
+ bio_list_for_each(bio, &rbio->bio_list)
+ index_one_bio(rbio, bio);
+
+ spin_unlock_irq(&rbio->bio_list_lock);
+}
+
+static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
+ struct raid56_bio_trace_info *trace_info)
+{
+ const struct btrfs_io_context *bioc = rbio->bioc;
+ int i;
+
+ ASSERT(bioc);
+
+ /* We rely on bio->bi_bdev to find the stripe number. */
+ if (!bio->bi_bdev)
+ goto not_found;
+
+ for (i = 0; i < bioc->num_stripes; i++) {
+ if (bio->bi_bdev != bioc->stripes[i].dev->bdev)
+ continue;
+ trace_info->stripe_nr = i;
+ trace_info->devid = bioc->stripes[i].dev->devid;
+ trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
+ bioc->stripes[i].physical;
+ return;
+ }
+
+not_found:
+ trace_info->devid = -1;
+ trace_info->offset = -1;
+ trace_info->stripe_nr = -1;
+}
+
+/*
+ * this is called from one of two situations. We either
+ * have a full stripe from the higher layers, or we've read all
+ * the missing bits off disk.
+ *
+ * This will calculate the parity and then send down any
+ * changed blocks.
+ */
+static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
+{
+ struct btrfs_io_context *bioc = rbio->bioc;
+ const u32 sectorsize = bioc->fs_info->sectorsize;
+ void **pointers = rbio->finish_pointers;
+ int nr_data = rbio->nr_data;
+ /* The total sector number inside the full stripe. */
+ int total_sector_nr;
+ int stripe;
+ /* Sector number inside a stripe. */
+ int sectornr;
+ bool has_qstripe;
+ struct bio_list bio_list;
+ struct bio *bio;
+ int ret;
+
+ bio_list_init(&bio_list);
+
+ if (rbio->real_stripes - rbio->nr_data == 1)
+ has_qstripe = false;
+ else if (rbio->real_stripes - rbio->nr_data == 2)
+ has_qstripe = true;
+ else
+ BUG();
+
+ /* We should have at least one data sector. */
+ ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));
+
+ /* at this point we either have a full stripe,
+ * or we've read the full stripe from the drive.
+ * recalculate the parity and write the new results.
+ *
+ * We're not allowed to add any new bios to the
+ * bio list here, anyone else that wants to
+ * change this stripe needs to do their own rmw.
+ */
+ spin_lock_irq(&rbio->bio_list_lock);
+ set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+ spin_unlock_irq(&rbio->bio_list_lock);
+
+ atomic_set(&rbio->error, 0);
+
+ /*
+ * now that we've set rmw_locked, run through the
+ * bio list one last time and map the page pointers
+ *
+ * We don't cache full rbios because we're assuming
+ * the higher layers are unlikely to use this area of
+ * the disk again soon. If they do use it again,
+ * hopefully they will send another full bio.
+ */
+ index_rbio_pages(rbio);
+ if (!rbio_is_full(rbio))
+ cache_rbio_pages(rbio);
+ else
+ clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
+ struct sector_ptr *sector;
+
+ /* First collect one sector from each data stripe */
+ for (stripe = 0; stripe < nr_data; stripe++) {
+ sector = sector_in_rbio(rbio, stripe, sectornr, 0);
+ pointers[stripe] = kmap_local_page(sector->page) +
+ sector->pgoff;
+ }
+
+ /* Then add the parity stripe */
+ sector = rbio_pstripe_sector(rbio, sectornr);
+ sector->uptodate = 1;
+ pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;
+
+ if (has_qstripe) {
+ /*
+ * RAID6, add the qstripe and call the library function
+ * to fill in our p/q
+ */
+ sector = rbio_qstripe_sector(rbio, sectornr);
+ sector->uptodate = 1;
+ pointers[stripe++] = kmap_local_page(sector->page) +
+ sector->pgoff;
+
+ raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
+ pointers);
+ } else {
+ /* raid5 */
+ memcpy(pointers[nr_data], pointers[0], sectorsize);
+ run_xor(pointers + 1, nr_data - 1, sectorsize);
+ }
+ for (stripe = stripe - 1; stripe >= 0; stripe--)
+ kunmap_local(pointers[stripe]);
+ }
+
+ /*
+ * Start writing. Make bios for everything from the higher layers (the
+ * bio_list in our rbio) and our P/Q. Ignore everything else.
+ */
+ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
+ total_sector_nr++) {
+ struct sector_ptr *sector;
+
+ stripe = total_sector_nr / rbio->stripe_nsectors;
+ sectornr = total_sector_nr % rbio->stripe_nsectors;
+
+ /* This vertical stripe has no data, skip it. */
+ if (!test_bit(sectornr, &rbio->dbitmap))
+ continue;
+
+ if (stripe < rbio->nr_data) {
+ sector = sector_in_rbio(rbio, stripe, sectornr, 1);
+ if (!sector)
+ continue;
+ } else {
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
+ }
+
+ ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
+ sectornr, REQ_OP_WRITE);
+ if (ret)
+ goto cleanup;
+ }
+
+ if (likely(!bioc->num_tgtdevs))
+ goto write_data;
+
+ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
+ total_sector_nr++) {
+ struct sector_ptr *sector;
+
+ stripe = total_sector_nr / rbio->stripe_nsectors;
+ sectornr = total_sector_nr % rbio->stripe_nsectors;
+
+ if (!bioc->tgtdev_map[stripe]) {
+ /*
+ * We can skip the whole stripe completely, note
+ * total_sector_nr will be increased by one anyway.
+ */
+ ASSERT(sectornr == 0);
+ total_sector_nr += rbio->stripe_nsectors - 1;
+ continue;
+ }
+
+ /* This vertical stripe has no data, skip it. */
+ if (!test_bit(sectornr, &rbio->dbitmap))
+ continue;
+
+ if (stripe < rbio->nr_data) {
+ sector = sector_in_rbio(rbio, stripe, sectornr, 1);
+ if (!sector)
+ continue;
+ } else {
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
+ }
+
+ ret = rbio_add_io_sector(rbio, &bio_list, sector,
+ rbio->bioc->tgtdev_map[stripe],
+ sectornr, REQ_OP_WRITE);
+ if (ret)
+ goto cleanup;
+ }
+
+write_data:
+ atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
+ BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
+
+ while ((bio = bio_list_pop(&bio_list))) {
+ bio->bi_end_io = raid_write_end_io;
+
+ if (trace_raid56_write_stripe_enabled()) {
+ struct raid56_bio_trace_info trace_info = { 0 };
+
+ bio_get_trace_info(rbio, bio, &trace_info);
+ trace_raid56_write_stripe(rbio, bio, &trace_info);
+ }
+ submit_bio(bio);
+ }
+ return;
+
+cleanup:
+ rbio_orig_end_io(rbio, BLK_STS_IOERR);
+
+ while ((bio = bio_list_pop(&bio_list)))
+ bio_put(bio);
+}
+
+/*
+ * helper to find the stripe number for a given bio. Used to figure out which
+ * stripe has failed. This expects the bio to correspond to a physical disk,
+ * so it looks up based on physical sector numbers.
+ */
+static int find_bio_stripe(struct btrfs_raid_bio *rbio,
+ struct bio *bio)
+{
+ u64 physical = bio->bi_iter.bi_sector;
+ int i;
+ struct btrfs_io_stripe *stripe;
+
+ physical <<= 9;
+
+ for (i = 0; i < rbio->bioc->num_stripes; i++) {
+ stripe = &rbio->bioc->stripes[i];
+ if (in_range(physical, stripe->physical, BTRFS_STRIPE_LEN) &&
+ stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
+ return i;
+ }
+ }
+ return -1;
+}
+
+/*
+ * helper to find the stripe number for a given
+ * bio (before mapping). Used to figure out which stripe has
+ * failed. This looks up based on logical block numbers.
+ */
+static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
+ struct bio *bio)
+{
+ u64 logical = bio->bi_iter.bi_sector << 9;
+ int i;
+
+ for (i = 0; i < rbio->nr_data; i++) {
+ u64 stripe_start = rbio->bioc->raid_map[i];
+
+ if (in_range(logical, stripe_start, BTRFS_STRIPE_LEN))
+ return i;
+ }
+ return -1;
+}
+
+/*
+ * returns -EIO if we had too many failures
+ */
+static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ spin_lock_irqsave(&rbio->bio_list_lock, flags);
+
+ /* we already know this stripe is bad, move on */
+ if (rbio->faila == failed || rbio->failb == failed)
+ goto out;
+
+ if (rbio->faila == -1) {
+ /* first failure on this rbio */
+ rbio->faila = failed;
+ atomic_inc(&rbio->error);
+ } else if (rbio->failb == -1) {
+ /* second failure on this rbio */
+ rbio->failb = failed;
+ atomic_inc(&rbio->error);
+ } else {
+ ret = -EIO;
+ }
+out:
+ spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
+
+ return ret;
+}
+
+/*
+ * helper to fail a stripe based on a physical disk
+ * bio.
+ */
+static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
+ struct bio *bio)
+{
+ int failed = find_bio_stripe(rbio, bio);
+
+ if (failed < 0)
+ return -EIO;
+
+ return fail_rbio_index(rbio, failed);
+}
+
+/*
+ * For subpage case, we can no longer set page Uptodate directly for
+ * stripe_pages[], thus we need to locate the sector.
+ */
+static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
+ struct page *page,
+ unsigned int pgoff)
+{
+ int i;
+
+ for (i = 0; i < rbio->nr_sectors; i++) {
+ struct sector_ptr *sector = &rbio->stripe_sectors[i];
+
+ if (sector->page == page && sector->pgoff == pgoff)
+ return sector;
+ }
+ return NULL;
+}
+
+/*
+ * this sets each page in the bio uptodate. It should only be used on private
+ * rbio pages, nothing that comes in from the higher layers
+ */
+static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
+{
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+
+ ASSERT(!bio_flagged(bio, BIO_CLONED));
+
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ struct sector_ptr *sector;
+ int pgoff;
+
+ for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len;
+ pgoff += sectorsize) {
+ sector = find_stripe_sector(rbio, bvec->bv_page, pgoff);
+ ASSERT(sector);
+ if (sector)
+ sector->uptodate = 1;
+ }
+ }
+}
+
+static void raid56_bio_end_io(struct bio *bio)
+{
+ struct btrfs_raid_bio *rbio = bio->bi_private;
+
+ if (bio->bi_status)
+ fail_bio_stripe(rbio, bio);
+ else
+ set_bio_pages_uptodate(rbio, bio);
+
+ bio_put(bio);
+
+ if (atomic_dec_and_test(&rbio->stripes_pending))
+ queue_work(rbio->bioc->fs_info->endio_raid56_workers,
+ &rbio->end_io_work);
+}
+
+/*
+ * End io handler for the read phase of the RMW cycle. All the bios here are
+ * physical stripe bios we've read from the disk so we can recalculate the
+ * parity of the stripe.
+ *
+ * This will usually kick off finish_rmw once all the bios are read in, but it
+ * may trigger parity reconstruction if we had any errors along the way
+ */
+static void raid56_rmw_end_io_work(struct work_struct *work)
+{
+ struct btrfs_raid_bio *rbio =
+ container_of(work, struct btrfs_raid_bio, end_io_work);
+
+ if (atomic_read(&rbio->error) > rbio->bioc->max_errors) {
+ rbio_orig_end_io(rbio, BLK_STS_IOERR);
+ return;
+ }
+
+ /*
+ * This will normally call finish_rmw to start our write but if there
+ * are any failed stripes we'll reconstruct from parity first.
+ */
+ validate_rbio_for_rmw(rbio);
+}
+
+/*
+ * the stripe must be locked by the caller. It will
+ * unlock after all the writes are done
+ */
+static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
+{
+ int bios_to_read = 0;
+ struct bio_list bio_list;
+ const int nr_data_sectors = rbio->stripe_nsectors * rbio->nr_data;
+ int ret;
+ int total_sector_nr;
+ struct bio *bio;
+
+ bio_list_init(&bio_list);
+
+ ret = alloc_rbio_pages(rbio);
+ if (ret)
+ goto cleanup;
+
+ index_rbio_pages(rbio);
+
+ atomic_set(&rbio->error, 0);
+ /* Build a list of bios to read all the missing data sectors. */
+ for (total_sector_nr = 0; total_sector_nr < nr_data_sectors;
+ total_sector_nr++) {
+ struct sector_ptr *sector;
+ int stripe = total_sector_nr / rbio->stripe_nsectors;
+ int sectornr = total_sector_nr % rbio->stripe_nsectors;
+
+ /*
+ * We want to find all the sectors missing from the rbio and
+ * read them from the disk. If sector_in_rbio() finds a page
+ * in the bio list we don't need to read it off the stripe.
+ */
+ sector = sector_in_rbio(rbio, stripe, sectornr, 1);
+ if (sector)
+ continue;
+
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
+ /*
+ * The bio cache may have handed us an uptodate page. If so,
+ * use it.
+ */
+ if (sector->uptodate)
+ continue;
+
+ ret = rbio_add_io_sector(rbio, &bio_list, sector,
+ stripe, sectornr, REQ_OP_READ);
+ if (ret)
+ goto cleanup;
+ }
+
+ bios_to_read = bio_list_size(&bio_list);
+ if (!bios_to_read) {
+ /*
+ * this can happen if others have merged with
+ * us, it means there is nothing left to read.
+ * But if there are missing devices it may not be
+ * safe to do the full stripe write yet.
+ */
+ goto finish;
+ }
+
+ /*
+ * The bioc may be freed once we submit the last bio. Make sure not to
+ * touch it after that.
+ */
+ atomic_set(&rbio->stripes_pending, bios_to_read);
+ INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work);
+ while ((bio = bio_list_pop(&bio_list))) {
+ bio->bi_end_io = raid56_bio_end_io;
+
+ if (trace_raid56_read_partial_enabled()) {
+ struct raid56_bio_trace_info trace_info = { 0 };
+
+ bio_get_trace_info(rbio, bio, &trace_info);
+ trace_raid56_read_partial(rbio, bio, &trace_info);
+ }
+ submit_bio(bio);
+ }
+ /* the actual write will happen once the reads are done */
+ return 0;
+
+cleanup:
+ rbio_orig_end_io(rbio, BLK_STS_IOERR);
+
+ while ((bio = bio_list_pop(&bio_list)))
+ bio_put(bio);
+
+ return -EIO;
+
+finish:
+ validate_rbio_for_rmw(rbio);
+ return 0;
+}
+
+/*
+ * if the upper layers pass in a full stripe, we thank them by only allocating
+ * enough pages to hold the parity, and sending it all down quickly.
+ */
+static int full_stripe_write(struct btrfs_raid_bio *rbio)
+{
+ int ret;
+
+ ret = alloc_rbio_parity_pages(rbio);
+ if (ret)
+ return ret;
+
+ ret = lock_stripe_add(rbio);
+ if (ret == 0)
+ finish_rmw(rbio);
+ return 0;
+}
+
+/*
+ * partial stripe writes get handed over to async helpers.
+ * We're really hoping to merge a few more writes into this
+ * rbio before calculating new parity
+ */
+static int partial_stripe_write(struct btrfs_raid_bio *rbio)
+{
+ int ret;
+
+ ret = lock_stripe_add(rbio);
+ if (ret == 0)
+ start_async_work(rbio, rmw_work);
+ return 0;
+}
+
+/*
+ * sometimes while we were reading from the drive to
+ * recalculate parity, enough new bios come into create
+ * a full stripe. So we do a check here to see if we can
+ * go directly to finish_rmw
+ */
+static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
+{
+ /* head off into rmw land if we don't have a full stripe */
+ if (!rbio_is_full(rbio))
+ return partial_stripe_write(rbio);
+ return full_stripe_write(rbio);
+}
+
+/*
+ * We use plugging call backs to collect full stripes.
+ * Any time we get a partial stripe write while plugged
+ * we collect it into a list. When the unplug comes down,
+ * we sort the list by logical block number and merge
+ * everything we can into the same rbios
+ */
+struct btrfs_plug_cb {
+ struct blk_plug_cb cb;
+ struct btrfs_fs_info *info;
+ struct list_head rbio_list;
+ struct work_struct work;
+};
+
+/*
+ * rbios on the plug list are sorted for easier merging.
+ */
+static int plug_cmp(void *priv, const struct list_head *a,
+ const struct list_head *b)
+{
+ const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
+ plug_list);
+ const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
+ plug_list);
+ u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
+ u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
+
+ if (a_sector < b_sector)
+ return -1;
+ if (a_sector > b_sector)
+ return 1;
+ return 0;
+}
+
+static void run_plug(struct btrfs_plug_cb *plug)
+{
+ struct btrfs_raid_bio *cur;
+ struct btrfs_raid_bio *last = NULL;
+
+ /*
+ * sort our plug list then try to merge
+ * everything we can in hopes of creating full
+ * stripes.
+ */
+ list_sort(NULL, &plug->rbio_list, plug_cmp);
+ while (!list_empty(&plug->rbio_list)) {
+ cur = list_entry(plug->rbio_list.next,
+ struct btrfs_raid_bio, plug_list);
+ list_del_init(&cur->plug_list);
+
+ if (rbio_is_full(cur)) {
+ int ret;
+
+ /* we have a full stripe, send it down */
+ ret = full_stripe_write(cur);
+ BUG_ON(ret);
+ continue;
+ }
+ if (last) {
+ if (rbio_can_merge(last, cur)) {
+ merge_rbio(last, cur);
+ __free_raid_bio(cur);
+ continue;
+
+ }
+ __raid56_parity_write(last);
+ }
+ last = cur;
+ }
+ if (last) {
+ __raid56_parity_write(last);
+ }
+ kfree(plug);
+}
+
+/*
+ * if the unplug comes from schedule, we have to push the
+ * work off to a helper thread
+ */
+static void unplug_work(struct work_struct *work)
+{
+ struct btrfs_plug_cb *plug;
+ plug = container_of(work, struct btrfs_plug_cb, work);
+ run_plug(plug);
+}
+
+static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
+{
+ struct btrfs_plug_cb *plug;
+ plug = container_of(cb, struct btrfs_plug_cb, cb);
+
+ if (from_schedule) {
+ INIT_WORK(&plug->work, unplug_work);
+ queue_work(plug->info->rmw_workers, &plug->work);
+ return;
+ }
+ run_plug(plug);
+}
+
+/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
+static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
+{
+ const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+ const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ const u64 full_stripe_start = rbio->bioc->raid_map[0];
+ const u32 orig_len = orig_bio->bi_iter.bi_size;
+ const u32 sectorsize = fs_info->sectorsize;
+ u64 cur_logical;
+
+ ASSERT(orig_logical >= full_stripe_start &&
+ orig_logical + orig_len <= full_stripe_start +
+ rbio->nr_data * BTRFS_STRIPE_LEN);
+
+ bio_list_add(&rbio->bio_list, orig_bio);
+ rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
+
+ /* Update the dbitmap. */
+ for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
+ cur_logical += sectorsize) {
+ int bit = ((u32)(cur_logical - full_stripe_start) >>
+ fs_info->sectorsize_bits) % rbio->stripe_nsectors;
+
+ set_bit(bit, &rbio->dbitmap);
+ }
+}
+
+/*
+ * our main entry point for writes from the rest of the FS.
+ */
+void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
+{
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
+ struct btrfs_raid_bio *rbio;
+ struct btrfs_plug_cb *plug = NULL;
+ struct blk_plug_cb *cb;
+ int ret = 0;
+
+ rbio = alloc_rbio(fs_info, bioc);
+ if (IS_ERR(rbio)) {
+ ret = PTR_ERR(rbio);
+ goto fail;
+ }
+ rbio->operation = BTRFS_RBIO_WRITE;
+ rbio_add_bio(rbio, bio);
+
+ /*
+ * don't plug on full rbios, just get them out the door
+ * as quickly as we can
+ */
+ if (rbio_is_full(rbio)) {
+ ret = full_stripe_write(rbio);
+ if (ret) {
+ __free_raid_bio(rbio);
+ goto fail;
+ }
+ return;
+ }
+
+ cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
+ if (cb) {
+ plug = container_of(cb, struct btrfs_plug_cb, cb);
+ if (!plug->info) {
+ plug->info = fs_info;
+ INIT_LIST_HEAD(&plug->rbio_list);
+ }
+ list_add_tail(&rbio->plug_list, &plug->rbio_list);
+ } else {
+ ret = __raid56_parity_write(rbio);
+ if (ret) {
+ __free_raid_bio(rbio);
+ goto fail;
+ }
+ }
+
+ return;
+
+fail:
+ bio->bi_status = errno_to_blk_status(ret);
+ bio_endio(bio);
+}
+
+/*
+ * all parity reconstruction happens here. We've read in everything
+ * we can find from the drives and this does the heavy lifting of
+ * sorting the good from the bad.
+ */
+static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
+{
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ int sectornr, stripe;
+ void **pointers;
+ void **unmap_array;
+ int faila = -1, failb = -1;
+ blk_status_t err;
+ int i;
+
+ /*
+ * This array stores the pointer for each sector, thus it has the extra
+ * pgoff value added from each sector
+ */
+ pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
+ if (!pointers) {
+ err = BLK_STS_RESOURCE;
+ goto cleanup_io;
+ }
+
+ /*
+ * Store copy of pointers that does not get reordered during
+ * reconstruction so that kunmap_local works.
+ */
+ unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
+ if (!unmap_array) {
+ err = BLK_STS_RESOURCE;
+ goto cleanup_pointers;
+ }
+
+ faila = rbio->faila;
+ failb = rbio->failb;
+
+ if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+ rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
+ spin_lock_irq(&rbio->bio_list_lock);
+ set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+ spin_unlock_irq(&rbio->bio_list_lock);
+ }
+
+ index_rbio_pages(rbio);
+
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
+ struct sector_ptr *sector;
+
+ /*
+ * Now we just use bitmap to mark the horizontal stripes in
+ * which we have data when doing parity scrub.
+ */
+ if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
+ !test_bit(sectornr, &rbio->dbitmap))
+ continue;
+
+ /*
+ * Setup our array of pointers with sectors from each stripe
+ *
+ * NOTE: store a duplicate array of pointers to preserve the
+ * pointer order
+ */
+ for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
+ /*
+ * If we're rebuilding a read, we have to use
+ * pages from the bio list
+ */
+ if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+ rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
+ (stripe == faila || stripe == failb)) {
+ sector = sector_in_rbio(rbio, stripe, sectornr, 0);
+ } else {
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
+ }
+ ASSERT(sector->page);
+ pointers[stripe] = kmap_local_page(sector->page) +
+ sector->pgoff;
+ unmap_array[stripe] = pointers[stripe];
+ }
+
+ /* All raid6 handling here */
+ if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
+ /* Single failure, rebuild from parity raid5 style */
+ if (failb < 0) {
+ if (faila == rbio->nr_data) {
+ /*
+ * Just the P stripe has failed, without
+ * a bad data or Q stripe.
+ * TODO, we should redo the xor here.
+ */
+ err = BLK_STS_IOERR;
+ goto cleanup;
+ }
+ /*
+ * a single failure in raid6 is rebuilt
+ * in the pstripe code below
+ */
+ goto pstripe;
+ }
+
+ /* make sure our ps and qs are in order */
+ if (faila > failb)
+ swap(faila, failb);
+
+ /* if the q stripe is failed, do a pstripe reconstruction
+ * from the xors.
+ * If both the q stripe and the P stripe are failed, we're
+ * here due to a crc mismatch and we can't give them the
+ * data they want
+ */
+ if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
+ if (rbio->bioc->raid_map[faila] ==
+ RAID5_P_STRIPE) {
+ err = BLK_STS_IOERR;
+ goto cleanup;
+ }
+ /*
+ * otherwise we have one bad data stripe and
+ * a good P stripe. raid5!
+ */
+ goto pstripe;
+ }
+
+ if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
+ raid6_datap_recov(rbio->real_stripes,
+ sectorsize, faila, pointers);
+ } else {
+ raid6_2data_recov(rbio->real_stripes,
+ sectorsize, faila, failb,
+ pointers);
+ }
+ } else {
+ void *p;
+
+ /* rebuild from P stripe here (raid5 or raid6) */
+ BUG_ON(failb != -1);
+pstripe:
+ /* Copy parity block into failed block to start with */
+ memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);
+
+ /* rearrange the pointer array */
+ p = pointers[faila];
+ for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
+ pointers[stripe] = pointers[stripe + 1];
+ pointers[rbio->nr_data - 1] = p;
+
+ /* xor in the rest */
+ run_xor(pointers, rbio->nr_data - 1, sectorsize);
+ }
+ /* if we're doing this rebuild as part of an rmw, go through
+ * and set all of our private rbio pages in the
+ * failed stripes as uptodate. This way finish_rmw will
+ * know they can be trusted. If this was a read reconstruction,
+ * other endio functions will fiddle the uptodate bits
+ */
+ if (rbio->operation == BTRFS_RBIO_WRITE) {
+ for (i = 0; i < rbio->stripe_nsectors; i++) {
+ if (faila != -1) {
+ sector = rbio_stripe_sector(rbio, faila, i);
+ sector->uptodate = 1;
+ }
+ if (failb != -1) {
+ sector = rbio_stripe_sector(rbio, failb, i);
+ sector->uptodate = 1;
+ }
+ }
+ }
+ for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--)
+ kunmap_local(unmap_array[stripe]);
+ }
+
+ err = BLK_STS_OK;
+cleanup:
+ kfree(unmap_array);
+cleanup_pointers:
+ kfree(pointers);
+
+cleanup_io:
+ /*
+ * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
+ * valid rbio which is consistent with ondisk content, thus such a
+ * valid rbio can be cached to avoid further disk reads.
+ */
+ if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+ rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
+ /*
+ * - In case of two failures, where rbio->failb != -1:
+ *
+ * Do not cache this rbio since the above read reconstruction
+ * (raid6_datap_recov() or raid6_2data_recov()) may have
+ * changed some content of stripes which are not identical to
+ * on-disk content any more, otherwise, a later write/recover
+ * may steal stripe_pages from this rbio and end up with
+ * corruptions or rebuild failures.
+ *
+ * - In case of single failure, where rbio->failb == -1:
+ *
+ * Cache this rbio iff the above read reconstruction is
+ * executed without problems.
+ */
+ if (err == BLK_STS_OK && rbio->failb < 0)
+ cache_rbio_pages(rbio);
+ else
+ clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+
+ rbio_orig_end_io(rbio, err);
+ } else if (err == BLK_STS_OK) {
+ rbio->faila = -1;
+ rbio->failb = -1;
+
+ if (rbio->operation == BTRFS_RBIO_WRITE)
+ finish_rmw(rbio);
+ else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
+ finish_parity_scrub(rbio, 0);
+ else
+ BUG();
+ } else {
+ rbio_orig_end_io(rbio, err);
+ }
+}
+
+/*
+ * This is called only for stripes we've read from disk to reconstruct the
+ * parity.
+ */
+static void raid_recover_end_io_work(struct work_struct *work)
+{
+ struct btrfs_raid_bio *rbio =
+ container_of(work, struct btrfs_raid_bio, end_io_work);
+
+ if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
+ rbio_orig_end_io(rbio, BLK_STS_IOERR);
+ else
+ __raid_recover_end_io(rbio);
+}
+
+/*
+ * reads everything we need off the disk to reconstruct
+ * the parity. endio handlers trigger final reconstruction
+ * when the IO is done.
+ *
+ * This is used both for reads from the higher layers and for
+ * parity construction required to finish a rmw cycle.
+ */
+static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
+{
+ int bios_to_read = 0;
+ struct bio_list bio_list;
+ int ret;
+ int total_sector_nr;
+ struct bio *bio;
+
+ bio_list_init(&bio_list);
+
+ ret = alloc_rbio_pages(rbio);
+ if (ret)
+ goto cleanup;
+
+ atomic_set(&rbio->error, 0);
+
+ /*
+ * Read everything that hasn't failed. However this time we will
+ * not trust any cached sector.
+ * As we may read out some stale data but higher layer is not reading
+ * that stale part.
+ *
+ * So here we always re-read everything in recovery path.
+ */
+ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
+ total_sector_nr++) {
+ int stripe = total_sector_nr / rbio->stripe_nsectors;
+ int sectornr = total_sector_nr % rbio->stripe_nsectors;
+ struct sector_ptr *sector;
+
+ if (rbio->faila == stripe || rbio->failb == stripe) {
+ atomic_inc(&rbio->error);
+ /* Skip the current stripe. */
+ ASSERT(sectornr == 0);
+ total_sector_nr += rbio->stripe_nsectors - 1;
+ continue;
+ }
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
+ ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
+ sectornr, REQ_OP_READ);
+ if (ret < 0)
+ goto cleanup;
+ }
+
+ bios_to_read = bio_list_size(&bio_list);
+ if (!bios_to_read) {
+ /*
+ * we might have no bios to read just because the pages
+ * were up to date, or we might have no bios to read because
+ * the devices were gone.
+ */
+ if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) {
+ __raid_recover_end_io(rbio);
+ return 0;
+ } else {
+ goto cleanup;
+ }
+ }
+
+ /*
+ * The bioc may be freed once we submit the last bio. Make sure not to
+ * touch it after that.
+ */
+ atomic_set(&rbio->stripes_pending, bios_to_read);
+ INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work);
+ while ((bio = bio_list_pop(&bio_list))) {
+ bio->bi_end_io = raid56_bio_end_io;
+
+ if (trace_raid56_scrub_read_recover_enabled()) {
+ struct raid56_bio_trace_info trace_info = { 0 };
+
+ bio_get_trace_info(rbio, bio, &trace_info);
+ trace_raid56_scrub_read_recover(rbio, bio, &trace_info);
+ }
+ submit_bio(bio);
+ }
+
+ return 0;
+
+cleanup:
+ if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+ rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
+ rbio_orig_end_io(rbio, BLK_STS_IOERR);
+
+ while ((bio = bio_list_pop(&bio_list)))
+ bio_put(bio);
+
+ return -EIO;
+}
+
+/*
+ * the main entry point for reads from the higher layers. This
+ * is really only called when the normal read path had a failure,
+ * so we assume the bio they send down corresponds to a failed part
+ * of the drive.
+ */
+void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
+ int mirror_num)
+{
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
+ struct btrfs_raid_bio *rbio;
+
+ rbio = alloc_rbio(fs_info, bioc);
+ if (IS_ERR(rbio)) {
+ bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
+ goto out_end_bio;
+ }
+
+ rbio->operation = BTRFS_RBIO_READ_REBUILD;
+ rbio_add_bio(rbio, bio);
+
+ rbio->faila = find_logical_bio_stripe(rbio, bio);
+ if (rbio->faila == -1) {
+ btrfs_warn(fs_info,
+"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
+ __func__, bio->bi_iter.bi_sector << 9,
+ (u64)bio->bi_iter.bi_size, bioc->map_type);
+ __free_raid_bio(rbio);
+ bio->bi_status = BLK_STS_IOERR;
+ goto out_end_bio;
+ }
+
+ /*
+ * Loop retry:
+ * for 'mirror == 2', reconstruct from all other stripes.
+ * for 'mirror_num > 2', select a stripe to fail on every retry.
+ */
+ if (mirror_num > 2) {
+ /*
+ * 'mirror == 3' is to fail the p stripe and
+ * reconstruct from the q stripe. 'mirror > 3' is to
+ * fail a data stripe and reconstruct from p+q stripe.
+ */
+ rbio->failb = rbio->real_stripes - (mirror_num - 1);
+ ASSERT(rbio->failb > 0);
+ if (rbio->failb <= rbio->faila)
+ rbio->failb--;
+ }
+
+ if (lock_stripe_add(rbio))
+ return;
+
+ /*
+ * This adds our rbio to the list of rbios that will be handled after
+ * the current lock owner is done.
+ */
+ __raid56_parity_recover(rbio);
+ return;
+
+out_end_bio:
+ bio_endio(bio);
+}
+
+static void rmw_work(struct work_struct *work)
+{
+ struct btrfs_raid_bio *rbio;
+
+ rbio = container_of(work, struct btrfs_raid_bio, work);
+ raid56_rmw_stripe(rbio);
+}
+
+static void read_rebuild_work(struct work_struct *work)
+{
+ struct btrfs_raid_bio *rbio;
+
+ rbio = container_of(work, struct btrfs_raid_bio, work);
+ __raid56_parity_recover(rbio);
+}
+
+/*
+ * The following code is used to scrub/replace the parity stripe
+ *
+ * Caller must have already increased bio_counter for getting @bioc.
+ *
+ * Note: We need make sure all the pages that add into the scrub/replace
+ * raid bio are correct and not be changed during the scrub/replace. That
+ * is those pages just hold metadata or file data with checksum.
+ */
+
+struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
+ struct btrfs_io_context *bioc,
+ struct btrfs_device *scrub_dev,
+ unsigned long *dbitmap, int stripe_nsectors)
+{
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
+ struct btrfs_raid_bio *rbio;
+ int i;
+
+ rbio = alloc_rbio(fs_info, bioc);
+ if (IS_ERR(rbio))
+ return NULL;
+ bio_list_add(&rbio->bio_list, bio);
+ /*
+ * This is a special bio which is used to hold the completion handler
+ * and make the scrub rbio is similar to the other types
+ */
+ ASSERT(!bio->bi_iter.bi_size);
+ rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
+
+ /*
+ * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
+ * to the end position, so this search can start from the first parity
+ * stripe.
+ */
+ for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
+ if (bioc->stripes[i].dev == scrub_dev) {
+ rbio->scrubp = i;
+ break;
+ }
+ }
+ ASSERT(i < rbio->real_stripes);
+
+ bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
+ return rbio;
+}
+
+/* Used for both parity scrub and missing. */
+void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
+ unsigned int pgoff, u64 logical)
+{
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ int stripe_offset;
+ int index;
+
+ ASSERT(logical >= rbio->bioc->raid_map[0]);
+ ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] +
+ BTRFS_STRIPE_LEN * rbio->nr_data);
+ stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
+ index = stripe_offset / sectorsize;
+ rbio->bio_sectors[index].page = page;
+ rbio->bio_sectors[index].pgoff = pgoff;
+}
+
+/*
+ * We just scrub the parity that we have correct data on the same horizontal,
+ * so we needn't allocate all pages for all the stripes.
+ */
+static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
+{
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ int total_sector_nr;
+
+ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
+ total_sector_nr++) {
+ struct page *page;
+ int sectornr = total_sector_nr % rbio->stripe_nsectors;
+ int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT;
+
+ if (!test_bit(sectornr, &rbio->dbitmap))
+ continue;
+ if (rbio->stripe_pages[index])
+ continue;
+ page = alloc_page(GFP_NOFS);
+ if (!page)
+ return -ENOMEM;
+ rbio->stripe_pages[index] = page;
+ }
+ index_stripe_sectors(rbio);
+ return 0;
+}
+
+static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
+ int need_check)
+{
+ struct btrfs_io_context *bioc = rbio->bioc;
+ const u32 sectorsize = bioc->fs_info->sectorsize;
+ void **pointers = rbio->finish_pointers;
+ unsigned long *pbitmap = &rbio->finish_pbitmap;
+ int nr_data = rbio->nr_data;
+ int stripe;
+ int sectornr;
+ bool has_qstripe;
+ struct sector_ptr p_sector = { 0 };
+ struct sector_ptr q_sector = { 0 };
+ struct bio_list bio_list;
+ struct bio *bio;
+ int is_replace = 0;
+ int ret;
+
+ bio_list_init(&bio_list);
+
+ if (rbio->real_stripes - rbio->nr_data == 1)
+ has_qstripe = false;
+ else if (rbio->real_stripes - rbio->nr_data == 2)
+ has_qstripe = true;
+ else
+ BUG();
+
+ if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
+ is_replace = 1;
+ bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
+ }
+
+ /*
+ * Because the higher layers(scrubber) are unlikely to
+ * use this area of the disk again soon, so don't cache
+ * it.
+ */
+ clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+
+ if (!need_check)
+ goto writeback;
+
+ p_sector.page = alloc_page(GFP_NOFS);
+ if (!p_sector.page)
+ goto cleanup;
+ p_sector.pgoff = 0;
+ p_sector.uptodate = 1;
+
+ if (has_qstripe) {
+ /* RAID6, allocate and map temp space for the Q stripe */
+ q_sector.page = alloc_page(GFP_NOFS);
+ if (!q_sector.page) {
+ __free_page(p_sector.page);
+ p_sector.page = NULL;
+ goto cleanup;
+ }
+ q_sector.pgoff = 0;
+ q_sector.uptodate = 1;
+ pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
+ }
+
+ atomic_set(&rbio->error, 0);
+
+ /* Map the parity stripe just once */
+ pointers[nr_data] = kmap_local_page(p_sector.page);
+
+ for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
+ struct sector_ptr *sector;
+ void *parity;
+
+ /* first collect one page from each data stripe */
+ for (stripe = 0; stripe < nr_data; stripe++) {
+ sector = sector_in_rbio(rbio, stripe, sectornr, 0);
+ pointers[stripe] = kmap_local_page(sector->page) +
+ sector->pgoff;
+ }
+
+ if (has_qstripe) {
+ /* RAID6, call the library function to fill in our P/Q */
+ raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
+ pointers);
+ } else {
+ /* raid5 */
+ memcpy(pointers[nr_data], pointers[0], sectorsize);
+ run_xor(pointers + 1, nr_data - 1, sectorsize);
+ }
+
+ /* Check scrubbing parity and repair it */
+ sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
+ parity = kmap_local_page(sector->page) + sector->pgoff;
+ if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
+ memcpy(parity, pointers[rbio->scrubp], sectorsize);
+ else
+ /* Parity is right, needn't writeback */
+ bitmap_clear(&rbio->dbitmap, sectornr, 1);
+ kunmap_local(parity);
+
+ for (stripe = nr_data - 1; stripe >= 0; stripe--)
+ kunmap_local(pointers[stripe]);
+ }
+
+ kunmap_local(pointers[nr_data]);
+ __free_page(p_sector.page);
+ p_sector.page = NULL;
+ if (q_sector.page) {
+ kunmap_local(pointers[rbio->real_stripes - 1]);
+ __free_page(q_sector.page);
+ q_sector.page = NULL;
+ }
+
+writeback:
+ /*
+ * time to start writing. Make bios for everything from the
+ * higher layers (the bio_list in our rbio) and our p/q. Ignore
+ * everything else.
+ */
+ for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
+ struct sector_ptr *sector;
+
+ sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
+ ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
+ sectornr, REQ_OP_WRITE);
+ if (ret)
+ goto cleanup;
+ }
+
+ if (!is_replace)
+ goto submit_write;
+
+ for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
+ struct sector_ptr *sector;
+
+ sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
+ ret = rbio_add_io_sector(rbio, &bio_list, sector,
+ bioc->tgtdev_map[rbio->scrubp],
+ sectornr, REQ_OP_WRITE);
+ if (ret)
+ goto cleanup;
+ }
+
+submit_write:
+ nr_data = bio_list_size(&bio_list);
+ if (!nr_data) {
+ /* Every parity is right */
+ rbio_orig_end_io(rbio, BLK_STS_OK);
+ return;
+ }
+
+ atomic_set(&rbio->stripes_pending, nr_data);
+
+ while ((bio = bio_list_pop(&bio_list))) {
+ bio->bi_end_io = raid_write_end_io;
+
+ if (trace_raid56_scrub_write_stripe_enabled()) {
+ struct raid56_bio_trace_info trace_info = { 0 };
+
+ bio_get_trace_info(rbio, bio, &trace_info);
+ trace_raid56_scrub_write_stripe(rbio, bio, &trace_info);
+ }
+ submit_bio(bio);
+ }
+ return;
+
+cleanup:
+ rbio_orig_end_io(rbio, BLK_STS_IOERR);
+
+ while ((bio = bio_list_pop(&bio_list)))
+ bio_put(bio);
+}
+
+static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
+{
+ if (stripe >= 0 && stripe < rbio->nr_data)
+ return 1;
+ return 0;
+}
+
+/*
+ * While we're doing the parity check and repair, we could have errors
+ * in reading pages off the disk. This checks for errors and if we're
+ * not able to read the page it'll trigger parity reconstruction. The
+ * parity scrub will be finished after we've reconstructed the failed
+ * stripes
+ */
+static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
+{
+ if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
+ goto cleanup;
+
+ if (rbio->faila >= 0 || rbio->failb >= 0) {
+ int dfail = 0, failp = -1;
+
+ if (is_data_stripe(rbio, rbio->faila))
+ dfail++;
+ else if (is_parity_stripe(rbio->faila))
+ failp = rbio->faila;
+
+ if (is_data_stripe(rbio, rbio->failb))
+ dfail++;
+ else if (is_parity_stripe(rbio->failb))
+ failp = rbio->failb;
+
+ /*
+ * Because we can not use a scrubbing parity to repair
+ * the data, so the capability of the repair is declined.
+ * (In the case of RAID5, we can not repair anything)
+ */
+ if (dfail > rbio->bioc->max_errors - 1)
+ goto cleanup;
+
+ /*
+ * If all data is good, only parity is correctly, just
+ * repair the parity.
+ */
+ if (dfail == 0) {
+ finish_parity_scrub(rbio, 0);
+ return;
+ }
+
+ /*
+ * Here means we got one corrupted data stripe and one
+ * corrupted parity on RAID6, if the corrupted parity
+ * is scrubbing parity, luckily, use the other one to repair
+ * the data, or we can not repair the data stripe.
+ */
+ if (failp != rbio->scrubp)
+ goto cleanup;
+
+ __raid_recover_end_io(rbio);
+ } else {
+ finish_parity_scrub(rbio, 1);
+ }
+ return;
+
+cleanup:
+ rbio_orig_end_io(rbio, BLK_STS_IOERR);
+}
+
+/*
+ * end io for the read phase of the rmw cycle. All the bios here are physical
+ * stripe bios we've read from the disk so we can recalculate the parity of the
+ * stripe.
+ *
+ * This will usually kick off finish_rmw once all the bios are read in, but it
+ * may trigger parity reconstruction if we had any errors along the way
+ */
+static void raid56_parity_scrub_end_io_work(struct work_struct *work)
+{
+ struct btrfs_raid_bio *rbio =
+ container_of(work, struct btrfs_raid_bio, end_io_work);
+
+ /*
+ * This will normally call finish_rmw to start our write, but if there
+ * are any failed stripes we'll reconstruct from parity first
+ */
+ validate_rbio_for_parity_scrub(rbio);
+}
+
+static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
+{
+ int bios_to_read = 0;
+ struct bio_list bio_list;
+ int ret;
+ int total_sector_nr;
+ struct bio *bio;
+
+ bio_list_init(&bio_list);
+
+ ret = alloc_rbio_essential_pages(rbio);
+ if (ret)
+ goto cleanup;
+
+ atomic_set(&rbio->error, 0);
+ /* Build a list of bios to read all the missing parts. */
+ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
+ total_sector_nr++) {
+ int sectornr = total_sector_nr % rbio->stripe_nsectors;
+ int stripe = total_sector_nr / rbio->stripe_nsectors;
+ struct sector_ptr *sector;
+
+ /* No data in the vertical stripe, no need to read. */
+ if (!test_bit(sectornr, &rbio->dbitmap))
+ continue;
+
+ /*
+ * We want to find all the sectors missing from the rbio and
+ * read them from the disk. If sector_in_rbio() finds a sector
+ * in the bio list we don't need to read it off the stripe.
+ */
+ sector = sector_in_rbio(rbio, stripe, sectornr, 1);
+ if (sector)
+ continue;
+
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
+ /*
+ * The bio cache may have handed us an uptodate sector. If so,
+ * use it.
+ */
+ if (sector->uptodate)
+ continue;
+
+ ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
+ sectornr, REQ_OP_READ);
+ if (ret)
+ goto cleanup;
+ }
+
+ bios_to_read = bio_list_size(&bio_list);
+ if (!bios_to_read) {
+ /*
+ * this can happen if others have merged with
+ * us, it means there is nothing left to read.
+ * But if there are missing devices it may not be
+ * safe to do the full stripe write yet.
+ */
+ goto finish;
+ }
+
+ /*
+ * The bioc may be freed once we submit the last bio. Make sure not to
+ * touch it after that.
+ */
+ atomic_set(&rbio->stripes_pending, bios_to_read);
+ INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work);
+ while ((bio = bio_list_pop(&bio_list))) {
+ bio->bi_end_io = raid56_bio_end_io;
+
+ if (trace_raid56_scrub_read_enabled()) {
+ struct raid56_bio_trace_info trace_info = { 0 };
+
+ bio_get_trace_info(rbio, bio, &trace_info);
+ trace_raid56_scrub_read(rbio, bio, &trace_info);
+ }
+ submit_bio(bio);
+ }
+ /* the actual write will happen once the reads are done */
+ return;
+
+cleanup:
+ rbio_orig_end_io(rbio, BLK_STS_IOERR);
+
+ while ((bio = bio_list_pop(&bio_list)))
+ bio_put(bio);
+
+ return;
+
+finish:
+ validate_rbio_for_parity_scrub(rbio);
+}
+
+static void scrub_parity_work(struct work_struct *work)
+{
+ struct btrfs_raid_bio *rbio;
+
+ rbio = container_of(work, struct btrfs_raid_bio, work);
+ raid56_parity_scrub_stripe(rbio);
+}
+
+void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
+{
+ if (!lock_stripe_add(rbio))
+ start_async_work(rbio, scrub_parity_work);
+}
+
+/* The following code is used for dev replace of a missing RAID 5/6 device. */
+
+struct btrfs_raid_bio *
+raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc)
+{
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
+ struct btrfs_raid_bio *rbio;
+
+ rbio = alloc_rbio(fs_info, bioc);
+ if (IS_ERR(rbio))
+ return NULL;
+
+ rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
+ bio_list_add(&rbio->bio_list, bio);
+ /*
+ * This is a special bio which is used to hold the completion handler
+ * and make the scrub rbio is similar to the other types
+ */
+ ASSERT(!bio->bi_iter.bi_size);
+
+ rbio->faila = find_logical_bio_stripe(rbio, bio);
+ if (rbio->faila == -1) {
+ btrfs_warn_rl(fs_info,
+ "can not determine the failed stripe number for full stripe %llu",
+ bioc->raid_map[0]);
+ __free_raid_bio(rbio);
+ return NULL;
+ }
+
+ return rbio;
+}
+
+void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
+{
+ if (!lock_stripe_add(rbio))
+ start_async_work(rbio, read_rebuild_work);
+}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
new file mode 100644
index 000000000..91d5c0ada
--- /dev/null
+++ b/fs/btrfs/raid56.h
@@ -0,0 +1,186 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2012 Fusion-io All rights reserved.
+ * Copyright (C) 2012 Intel Corp. All rights reserved.
+ */
+
+#ifndef BTRFS_RAID56_H
+#define BTRFS_RAID56_H
+
+#include <linux/workqueue.h>
+#include "volumes.h"
+
+enum btrfs_rbio_ops {
+ BTRFS_RBIO_WRITE,
+ BTRFS_RBIO_READ_REBUILD,
+ BTRFS_RBIO_PARITY_SCRUB,
+ BTRFS_RBIO_REBUILD_MISSING,
+};
+
+struct btrfs_raid_bio {
+ struct btrfs_io_context *bioc;
+
+ /*
+ * While we're doing RMW on a stripe we put it into a hash table so we
+ * can lock the stripe and merge more rbios into it.
+ */
+ struct list_head hash_list;
+
+ /* LRU list for the stripe cache */
+ struct list_head stripe_cache;
+
+ /* For scheduling work in the helper threads */
+ struct work_struct work;
+
+ /*
+ * bio_list and bio_list_lock are used to add more bios into the stripe
+ * in hopes of avoiding the full RMW
+ */
+ struct bio_list bio_list;
+ spinlock_t bio_list_lock;
+
+ /*
+ * Also protected by the bio_list_lock, the plug list is used by the
+ * plugging code to collect partial bios while plugged. The stripe
+ * locking code also uses it to hand off the stripe lock to the next
+ * pending IO.
+ */
+ struct list_head plug_list;
+
+ /* Flags that tell us if it is safe to merge with this bio. */
+ unsigned long flags;
+
+ /*
+ * Set if we're doing a parity rebuild for a read from higher up, which
+ * is handled differently from a parity rebuild as part of RMW.
+ */
+ enum btrfs_rbio_ops operation;
+
+ /* How many pages there are for the full stripe including P/Q */
+ u16 nr_pages;
+
+ /* How many sectors there are for the full stripe including P/Q */
+ u16 nr_sectors;
+
+ /* Number of data stripes (no p/q) */
+ u8 nr_data;
+
+ /* Numer of all stripes (including P/Q) */
+ u8 real_stripes;
+
+ /* How many pages there are for each stripe */
+ u8 stripe_npages;
+
+ /* How many sectors there are for each stripe */
+ u8 stripe_nsectors;
+
+ /* First bad stripe, -1 means no corruption */
+ s8 faila;
+
+ /* Second bad stripe (for RAID6 use) */
+ s8 failb;
+
+ /* Stripe number that we're scrubbing */
+ u8 scrubp;
+
+ /*
+ * Size of all the bios in the bio_list. This helps us decide if the
+ * rbio maps to a full stripe or not.
+ */
+ int bio_list_bytes;
+
+ refcount_t refs;
+
+ atomic_t stripes_pending;
+
+ atomic_t error;
+
+ struct work_struct end_io_work;
+
+ /* Bitmap to record which horizontal stripe has data */
+ unsigned long dbitmap;
+
+ /* Allocated with stripe_nsectors-many bits for finish_*() calls */
+ unsigned long finish_pbitmap;
+
+ /*
+ * These are two arrays of pointers. We allocate the rbio big enough
+ * to hold them both and setup their locations when the rbio is
+ * allocated.
+ */
+
+ /*
+ * Pointers to pages that we allocated for reading/writing stripes
+ * directly from the disk (including P/Q).
+ */
+ struct page **stripe_pages;
+
+ /* Pointers to the sectors in the bio_list, for faster lookup */
+ struct sector_ptr *bio_sectors;
+
+ /*
+ * For subpage support, we need to map each sector to above
+ * stripe_pages.
+ */
+ struct sector_ptr *stripe_sectors;
+
+ /* Allocated with real_stripes-many pointers for finish_*() calls */
+ void **finish_pointers;
+};
+
+/*
+ * For trace event usage only. Records useful debug info for each bio submitted
+ * by RAID56 to each physical device.
+ *
+ * No matter signed or not, (-1) is always the one indicating we can not grab
+ * the proper stripe number.
+ */
+struct raid56_bio_trace_info {
+ u64 devid;
+
+ /* The offset inside the stripe. (<= STRIPE_LEN) */
+ u32 offset;
+
+ /*
+ * Stripe number.
+ * 0 is the first data stripe, and nr_data for P stripe,
+ * nr_data + 1 for Q stripe.
+ * >= real_stripes for
+ */
+ u8 stripe_nr;
+};
+
+static inline int nr_data_stripes(const struct map_lookup *map)
+{
+ return map->num_stripes - btrfs_nr_parity_stripes(map->type);
+}
+
+#define RAID5_P_STRIPE ((u64)-2)
+#define RAID6_Q_STRIPE ((u64)-1)
+
+#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \
+ ((x) == RAID6_Q_STRIPE))
+
+struct btrfs_device;
+
+void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
+ int mirror_num);
+void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc);
+
+void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
+ unsigned int pgoff, u64 logical);
+
+struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
+ struct btrfs_io_context *bioc,
+ struct btrfs_device *scrub_dev,
+ unsigned long *dbitmap, int stripe_nsectors);
+void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
+
+struct btrfs_raid_bio *
+raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc);
+void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio);
+
+int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
+void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
+
+#endif
diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h
new file mode 100644
index 000000000..5c2b66d15
--- /dev/null
+++ b/fs/btrfs/rcu-string.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2012 Red Hat. All rights reserved.
+ */
+
+#ifndef BTRFS_RCU_STRING_H
+#define BTRFS_RCU_STRING_H
+
+struct rcu_string {
+ struct rcu_head rcu;
+ char str[];
+};
+
+static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask)
+{
+ size_t len = strlen(src) + 1;
+ struct rcu_string *ret = kzalloc(sizeof(struct rcu_string) +
+ (len * sizeof(char)), mask);
+ if (!ret)
+ return ret;
+ /* Warn if the source got unexpectedly truncated. */
+ if (WARN_ON(strscpy(ret->str, src, len) < 0)) {
+ kfree(ret);
+ return NULL;
+ }
+ return ret;
+}
+
+static inline void rcu_string_free(struct rcu_string *str)
+{
+ if (str)
+ kfree_rcu(str, rcu);
+}
+
+#define printk_in_rcu(fmt, ...) do { \
+ rcu_read_lock(); \
+ printk(fmt, __VA_ARGS__); \
+ rcu_read_unlock(); \
+} while (0)
+
+#define printk_ratelimited_in_rcu(fmt, ...) do { \
+ rcu_read_lock(); \
+ printk_ratelimited(fmt, __VA_ARGS__); \
+ rcu_read_unlock(); \
+} while (0)
+
+#define rcu_str_deref(rcu_str) ({ \
+ struct rcu_string *__str = rcu_dereference(rcu_str); \
+ __str->str; \
+})
+
+#endif
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
new file mode 100644
index 000000000..8083fe866
--- /dev/null
+++ b/fs/btrfs/ref-verify.c
@@ -0,0 +1,1025 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2014 Facebook. All rights reserved.
+ */
+
+#include <linux/sched.h>
+#include <linux/stacktrace.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "locking.h"
+#include "delayed-ref.h"
+#include "ref-verify.h"
+
+/*
+ * Used to keep track the roots and number of refs each root has for a given
+ * bytenr. This just tracks the number of direct references, no shared
+ * references.
+ */
+struct root_entry {
+ u64 root_objectid;
+ u64 num_refs;
+ struct rb_node node;
+};
+
+/*
+ * These are meant to represent what should exist in the extent tree, these can
+ * be used to verify the extent tree is consistent as these should all match
+ * what the extent tree says.
+ */
+struct ref_entry {
+ u64 root_objectid;
+ u64 parent;
+ u64 owner;
+ u64 offset;
+ u64 num_refs;
+ struct rb_node node;
+};
+
+#define MAX_TRACE 16
+
+/*
+ * Whenever we add/remove a reference we record the action. The action maps
+ * back to the delayed ref action. We hold the ref we are changing in the
+ * action so we can account for the history properly, and we record the root we
+ * were called with since it could be different from ref_root. We also store
+ * stack traces because that's how I roll.
+ */
+struct ref_action {
+ int action;
+ u64 root;
+ struct ref_entry ref;
+ struct list_head list;
+ unsigned long trace[MAX_TRACE];
+ unsigned int trace_len;
+};
+
+/*
+ * One of these for every block we reference, it holds the roots and references
+ * to it as well as all of the ref actions that have occurred to it. We never
+ * free it until we unmount the file system in order to make sure re-allocations
+ * are happening properly.
+ */
+struct block_entry {
+ u64 bytenr;
+ u64 len;
+ u64 num_refs;
+ int metadata;
+ int from_disk;
+ struct rb_root roots;
+ struct rb_root refs;
+ struct rb_node node;
+ struct list_head actions;
+};
+
+static struct block_entry *insert_block_entry(struct rb_root *root,
+ struct block_entry *be)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent_node = NULL;
+ struct block_entry *entry;
+
+ while (*p) {
+ parent_node = *p;
+ entry = rb_entry(parent_node, struct block_entry, node);
+ if (entry->bytenr > be->bytenr)
+ p = &(*p)->rb_left;
+ else if (entry->bytenr < be->bytenr)
+ p = &(*p)->rb_right;
+ else
+ return entry;
+ }
+
+ rb_link_node(&be->node, parent_node, p);
+ rb_insert_color(&be->node, root);
+ return NULL;
+}
+
+static struct block_entry *lookup_block_entry(struct rb_root *root, u64 bytenr)
+{
+ struct rb_node *n;
+ struct block_entry *entry = NULL;
+
+ n = root->rb_node;
+ while (n) {
+ entry = rb_entry(n, struct block_entry, node);
+ if (entry->bytenr < bytenr)
+ n = n->rb_right;
+ else if (entry->bytenr > bytenr)
+ n = n->rb_left;
+ else
+ return entry;
+ }
+ return NULL;
+}
+
+static struct root_entry *insert_root_entry(struct rb_root *root,
+ struct root_entry *re)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent_node = NULL;
+ struct root_entry *entry;
+
+ while (*p) {
+ parent_node = *p;
+ entry = rb_entry(parent_node, struct root_entry, node);
+ if (entry->root_objectid > re->root_objectid)
+ p = &(*p)->rb_left;
+ else if (entry->root_objectid < re->root_objectid)
+ p = &(*p)->rb_right;
+ else
+ return entry;
+ }
+
+ rb_link_node(&re->node, parent_node, p);
+ rb_insert_color(&re->node, root);
+ return NULL;
+
+}
+
+static int comp_refs(struct ref_entry *ref1, struct ref_entry *ref2)
+{
+ if (ref1->root_objectid < ref2->root_objectid)
+ return -1;
+ if (ref1->root_objectid > ref2->root_objectid)
+ return 1;
+ if (ref1->parent < ref2->parent)
+ return -1;
+ if (ref1->parent > ref2->parent)
+ return 1;
+ if (ref1->owner < ref2->owner)
+ return -1;
+ if (ref1->owner > ref2->owner)
+ return 1;
+ if (ref1->offset < ref2->offset)
+ return -1;
+ if (ref1->offset > ref2->offset)
+ return 1;
+ return 0;
+}
+
+static struct ref_entry *insert_ref_entry(struct rb_root *root,
+ struct ref_entry *ref)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent_node = NULL;
+ struct ref_entry *entry;
+ int cmp;
+
+ while (*p) {
+ parent_node = *p;
+ entry = rb_entry(parent_node, struct ref_entry, node);
+ cmp = comp_refs(entry, ref);
+ if (cmp > 0)
+ p = &(*p)->rb_left;
+ else if (cmp < 0)
+ p = &(*p)->rb_right;
+ else
+ return entry;
+ }
+
+ rb_link_node(&ref->node, parent_node, p);
+ rb_insert_color(&ref->node, root);
+ return NULL;
+
+}
+
+static struct root_entry *lookup_root_entry(struct rb_root *root, u64 objectid)
+{
+ struct rb_node *n;
+ struct root_entry *entry = NULL;
+
+ n = root->rb_node;
+ while (n) {
+ entry = rb_entry(n, struct root_entry, node);
+ if (entry->root_objectid < objectid)
+ n = n->rb_right;
+ else if (entry->root_objectid > objectid)
+ n = n->rb_left;
+ else
+ return entry;
+ }
+ return NULL;
+}
+
+#ifdef CONFIG_STACKTRACE
+static void __save_stack_trace(struct ref_action *ra)
+{
+ ra->trace_len = stack_trace_save(ra->trace, MAX_TRACE, 2);
+}
+
+static void __print_stack_trace(struct btrfs_fs_info *fs_info,
+ struct ref_action *ra)
+{
+ if (ra->trace_len == 0) {
+ btrfs_err(fs_info, " ref-verify: no stacktrace");
+ return;
+ }
+ stack_trace_print(ra->trace, ra->trace_len, 2);
+}
+#else
+static inline void __save_stack_trace(struct ref_action *ra)
+{
+}
+
+static inline void __print_stack_trace(struct btrfs_fs_info *fs_info,
+ struct ref_action *ra)
+{
+ btrfs_err(fs_info, " ref-verify: no stacktrace support");
+}
+#endif
+
+static void free_block_entry(struct block_entry *be)
+{
+ struct root_entry *re;
+ struct ref_entry *ref;
+ struct ref_action *ra;
+ struct rb_node *n;
+
+ while ((n = rb_first(&be->roots))) {
+ re = rb_entry(n, struct root_entry, node);
+ rb_erase(&re->node, &be->roots);
+ kfree(re);
+ }
+
+ while((n = rb_first(&be->refs))) {
+ ref = rb_entry(n, struct ref_entry, node);
+ rb_erase(&ref->node, &be->refs);
+ kfree(ref);
+ }
+
+ while (!list_empty(&be->actions)) {
+ ra = list_first_entry(&be->actions, struct ref_action,
+ list);
+ list_del(&ra->list);
+ kfree(ra);
+ }
+ kfree(be);
+}
+
+static struct block_entry *add_block_entry(struct btrfs_fs_info *fs_info,
+ u64 bytenr, u64 len,
+ u64 root_objectid)
+{
+ struct block_entry *be = NULL, *exist;
+ struct root_entry *re = NULL;
+
+ re = kzalloc(sizeof(struct root_entry), GFP_NOFS);
+ be = kzalloc(sizeof(struct block_entry), GFP_NOFS);
+ if (!be || !re) {
+ kfree(re);
+ kfree(be);
+ return ERR_PTR(-ENOMEM);
+ }
+ be->bytenr = bytenr;
+ be->len = len;
+
+ re->root_objectid = root_objectid;
+ re->num_refs = 0;
+
+ spin_lock(&fs_info->ref_verify_lock);
+ exist = insert_block_entry(&fs_info->block_tree, be);
+ if (exist) {
+ if (root_objectid) {
+ struct root_entry *exist_re;
+
+ exist_re = insert_root_entry(&exist->roots, re);
+ if (exist_re)
+ kfree(re);
+ } else {
+ kfree(re);
+ }
+ kfree(be);
+ return exist;
+ }
+
+ be->num_refs = 0;
+ be->metadata = 0;
+ be->from_disk = 0;
+ be->roots = RB_ROOT;
+ be->refs = RB_ROOT;
+ INIT_LIST_HEAD(&be->actions);
+ if (root_objectid)
+ insert_root_entry(&be->roots, re);
+ else
+ kfree(re);
+ return be;
+}
+
+static int add_tree_block(struct btrfs_fs_info *fs_info, u64 ref_root,
+ u64 parent, u64 bytenr, int level)
+{
+ struct block_entry *be;
+ struct root_entry *re;
+ struct ref_entry *ref = NULL, *exist;
+
+ ref = kmalloc(sizeof(struct ref_entry), GFP_NOFS);
+ if (!ref)
+ return -ENOMEM;
+
+ if (parent)
+ ref->root_objectid = 0;
+ else
+ ref->root_objectid = ref_root;
+ ref->parent = parent;
+ ref->owner = level;
+ ref->offset = 0;
+ ref->num_refs = 1;
+
+ be = add_block_entry(fs_info, bytenr, fs_info->nodesize, ref_root);
+ if (IS_ERR(be)) {
+ kfree(ref);
+ return PTR_ERR(be);
+ }
+ be->num_refs++;
+ be->from_disk = 1;
+ be->metadata = 1;
+
+ if (!parent) {
+ ASSERT(ref_root);
+ re = lookup_root_entry(&be->roots, ref_root);
+ ASSERT(re);
+ re->num_refs++;
+ }
+ exist = insert_ref_entry(&be->refs, ref);
+ if (exist) {
+ exist->num_refs++;
+ kfree(ref);
+ }
+ spin_unlock(&fs_info->ref_verify_lock);
+
+ return 0;
+}
+
+static int add_shared_data_ref(struct btrfs_fs_info *fs_info,
+ u64 parent, u32 num_refs, u64 bytenr,
+ u64 num_bytes)
+{
+ struct block_entry *be;
+ struct ref_entry *ref;
+
+ ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS);
+ if (!ref)
+ return -ENOMEM;
+ be = add_block_entry(fs_info, bytenr, num_bytes, 0);
+ if (IS_ERR(be)) {
+ kfree(ref);
+ return PTR_ERR(be);
+ }
+ be->num_refs += num_refs;
+
+ ref->parent = parent;
+ ref->num_refs = num_refs;
+ if (insert_ref_entry(&be->refs, ref)) {
+ spin_unlock(&fs_info->ref_verify_lock);
+ btrfs_err(fs_info, "existing shared ref when reading from disk?");
+ kfree(ref);
+ return -EINVAL;
+ }
+ spin_unlock(&fs_info->ref_verify_lock);
+ return 0;
+}
+
+static int add_extent_data_ref(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *leaf,
+ struct btrfs_extent_data_ref *dref,
+ u64 bytenr, u64 num_bytes)
+{
+ struct block_entry *be;
+ struct ref_entry *ref;
+ struct root_entry *re;
+ u64 ref_root = btrfs_extent_data_ref_root(leaf, dref);
+ u64 owner = btrfs_extent_data_ref_objectid(leaf, dref);
+ u64 offset = btrfs_extent_data_ref_offset(leaf, dref);
+ u32 num_refs = btrfs_extent_data_ref_count(leaf, dref);
+
+ ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS);
+ if (!ref)
+ return -ENOMEM;
+ be = add_block_entry(fs_info, bytenr, num_bytes, ref_root);
+ if (IS_ERR(be)) {
+ kfree(ref);
+ return PTR_ERR(be);
+ }
+ be->num_refs += num_refs;
+
+ ref->parent = 0;
+ ref->owner = owner;
+ ref->root_objectid = ref_root;
+ ref->offset = offset;
+ ref->num_refs = num_refs;
+ if (insert_ref_entry(&be->refs, ref)) {
+ spin_unlock(&fs_info->ref_verify_lock);
+ btrfs_err(fs_info, "existing ref when reading from disk?");
+ kfree(ref);
+ return -EINVAL;
+ }
+
+ re = lookup_root_entry(&be->roots, ref_root);
+ if (!re) {
+ spin_unlock(&fs_info->ref_verify_lock);
+ btrfs_err(fs_info, "missing root in new block entry?");
+ return -EINVAL;
+ }
+ re->num_refs += num_refs;
+ spin_unlock(&fs_info->ref_verify_lock);
+ return 0;
+}
+
+static int process_extent_item(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path, struct btrfs_key *key,
+ int slot, int *tree_block_level)
+{
+ struct btrfs_extent_item *ei;
+ struct btrfs_extent_inline_ref *iref;
+ struct btrfs_extent_data_ref *dref;
+ struct btrfs_shared_data_ref *sref;
+ struct extent_buffer *leaf = path->nodes[0];
+ u32 item_size = btrfs_item_size(leaf, slot);
+ unsigned long end, ptr;
+ u64 offset, flags, count;
+ int type, ret;
+
+ ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+ flags = btrfs_extent_flags(leaf, ei);
+
+ if ((key->type == BTRFS_EXTENT_ITEM_KEY) &&
+ flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ struct btrfs_tree_block_info *info;
+
+ info = (struct btrfs_tree_block_info *)(ei + 1);
+ *tree_block_level = btrfs_tree_block_level(leaf, info);
+ iref = (struct btrfs_extent_inline_ref *)(info + 1);
+ } else {
+ if (key->type == BTRFS_METADATA_ITEM_KEY)
+ *tree_block_level = key->offset;
+ iref = (struct btrfs_extent_inline_ref *)(ei + 1);
+ }
+
+ ptr = (unsigned long)iref;
+ end = (unsigned long)ei + item_size;
+ while (ptr < end) {
+ iref = (struct btrfs_extent_inline_ref *)ptr;
+ type = btrfs_extent_inline_ref_type(leaf, iref);
+ offset = btrfs_extent_inline_ref_offset(leaf, iref);
+ switch (type) {
+ case BTRFS_TREE_BLOCK_REF_KEY:
+ ret = add_tree_block(fs_info, offset, 0, key->objectid,
+ *tree_block_level);
+ break;
+ case BTRFS_SHARED_BLOCK_REF_KEY:
+ ret = add_tree_block(fs_info, 0, offset, key->objectid,
+ *tree_block_level);
+ break;
+ case BTRFS_EXTENT_DATA_REF_KEY:
+ dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+ ret = add_extent_data_ref(fs_info, leaf, dref,
+ key->objectid, key->offset);
+ break;
+ case BTRFS_SHARED_DATA_REF_KEY:
+ sref = (struct btrfs_shared_data_ref *)(iref + 1);
+ count = btrfs_shared_data_ref_count(leaf, sref);
+ ret = add_shared_data_ref(fs_info, offset, count,
+ key->objectid, key->offset);
+ break;
+ default:
+ btrfs_err(fs_info, "invalid key type in iref");
+ ret = -EINVAL;
+ break;
+ }
+ if (ret)
+ break;
+ ptr += btrfs_extent_inline_ref_size(type);
+ }
+ return ret;
+}
+
+static int process_leaf(struct btrfs_root *root,
+ struct btrfs_path *path, u64 *bytenr, u64 *num_bytes,
+ int *tree_block_level)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_extent_data_ref *dref;
+ struct btrfs_shared_data_ref *sref;
+ u32 count;
+ int i = 0, ret = 0;
+ struct btrfs_key key;
+ int nritems = btrfs_header_nritems(leaf);
+
+ for (i = 0; i < nritems; i++) {
+ btrfs_item_key_to_cpu(leaf, &key, i);
+ switch (key.type) {
+ case BTRFS_EXTENT_ITEM_KEY:
+ *num_bytes = key.offset;
+ fallthrough;
+ case BTRFS_METADATA_ITEM_KEY:
+ *bytenr = key.objectid;
+ ret = process_extent_item(fs_info, path, &key, i,
+ tree_block_level);
+ break;
+ case BTRFS_TREE_BLOCK_REF_KEY:
+ ret = add_tree_block(fs_info, key.offset, 0,
+ key.objectid, *tree_block_level);
+ break;
+ case BTRFS_SHARED_BLOCK_REF_KEY:
+ ret = add_tree_block(fs_info, 0, key.offset,
+ key.objectid, *tree_block_level);
+ break;
+ case BTRFS_EXTENT_DATA_REF_KEY:
+ dref = btrfs_item_ptr(leaf, i,
+ struct btrfs_extent_data_ref);
+ ret = add_extent_data_ref(fs_info, leaf, dref, *bytenr,
+ *num_bytes);
+ break;
+ case BTRFS_SHARED_DATA_REF_KEY:
+ sref = btrfs_item_ptr(leaf, i,
+ struct btrfs_shared_data_ref);
+ count = btrfs_shared_data_ref_count(leaf, sref);
+ ret = add_shared_data_ref(fs_info, key.offset, count,
+ *bytenr, *num_bytes);
+ break;
+ default:
+ break;
+ }
+ if (ret)
+ break;
+ }
+ return ret;
+}
+
+/* Walk down to the leaf from the given level */
+static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path,
+ int level, u64 *bytenr, u64 *num_bytes,
+ int *tree_block_level)
+{
+ struct extent_buffer *eb;
+ int ret = 0;
+
+ while (level >= 0) {
+ if (level) {
+ eb = btrfs_read_node_slot(path->nodes[level],
+ path->slots[level]);
+ if (IS_ERR(eb))
+ return PTR_ERR(eb);
+ btrfs_tree_read_lock(eb);
+ path->nodes[level-1] = eb;
+ path->slots[level-1] = 0;
+ path->locks[level-1] = BTRFS_READ_LOCK;
+ } else {
+ ret = process_leaf(root, path, bytenr, num_bytes,
+ tree_block_level);
+ if (ret)
+ break;
+ }
+ level--;
+ }
+ return ret;
+}
+
+/* Walk up to the next node that needs to be processed */
+static int walk_up_tree(struct btrfs_path *path, int *level)
+{
+ int l;
+
+ for (l = 0; l < BTRFS_MAX_LEVEL; l++) {
+ if (!path->nodes[l])
+ continue;
+ if (l) {
+ path->slots[l]++;
+ if (path->slots[l] <
+ btrfs_header_nritems(path->nodes[l])) {
+ *level = l;
+ return 0;
+ }
+ }
+ btrfs_tree_unlock_rw(path->nodes[l], path->locks[l]);
+ free_extent_buffer(path->nodes[l]);
+ path->nodes[l] = NULL;
+ path->slots[l] = 0;
+ path->locks[l] = 0;
+ }
+
+ return 1;
+}
+
+static void dump_ref_action(struct btrfs_fs_info *fs_info,
+ struct ref_action *ra)
+{
+ btrfs_err(fs_info,
+" Ref action %d, root %llu, ref_root %llu, parent %llu, owner %llu, offset %llu, num_refs %llu",
+ ra->action, ra->root, ra->ref.root_objectid, ra->ref.parent,
+ ra->ref.owner, ra->ref.offset, ra->ref.num_refs);
+ __print_stack_trace(fs_info, ra);
+}
+
+/*
+ * Dumps all the information from the block entry to printk, it's going to be
+ * awesome.
+ */
+static void dump_block_entry(struct btrfs_fs_info *fs_info,
+ struct block_entry *be)
+{
+ struct ref_entry *ref;
+ struct root_entry *re;
+ struct ref_action *ra;
+ struct rb_node *n;
+
+ btrfs_err(fs_info,
+"dumping block entry [%llu %llu], num_refs %llu, metadata %d, from disk %d",
+ be->bytenr, be->len, be->num_refs, be->metadata,
+ be->from_disk);
+
+ for (n = rb_first(&be->refs); n; n = rb_next(n)) {
+ ref = rb_entry(n, struct ref_entry, node);
+ btrfs_err(fs_info,
+" ref root %llu, parent %llu, owner %llu, offset %llu, num_refs %llu",
+ ref->root_objectid, ref->parent, ref->owner,
+ ref->offset, ref->num_refs);
+ }
+
+ for (n = rb_first(&be->roots); n; n = rb_next(n)) {
+ re = rb_entry(n, struct root_entry, node);
+ btrfs_err(fs_info, " root entry %llu, num_refs %llu",
+ re->root_objectid, re->num_refs);
+ }
+
+ list_for_each_entry(ra, &be->actions, list)
+ dump_ref_action(fs_info, ra);
+}
+
+/*
+ * btrfs_ref_tree_mod: called when we modify a ref for a bytenr
+ *
+ * This will add an action item to the given bytenr and do sanity checks to make
+ * sure we haven't messed something up. If we are making a new allocation and
+ * this block entry has history we will delete all previous actions as long as
+ * our sanity checks pass as they are no longer needed.
+ */
+int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
+ struct btrfs_ref *generic_ref)
+{
+ struct ref_entry *ref = NULL, *exist;
+ struct ref_action *ra = NULL;
+ struct block_entry *be = NULL;
+ struct root_entry *re = NULL;
+ int action = generic_ref->action;
+ int ret = 0;
+ bool metadata;
+ u64 bytenr = generic_ref->bytenr;
+ u64 num_bytes = generic_ref->len;
+ u64 parent = generic_ref->parent;
+ u64 ref_root = 0;
+ u64 owner = 0;
+ u64 offset = 0;
+
+ if (!btrfs_test_opt(fs_info, REF_VERIFY))
+ return 0;
+
+ if (generic_ref->type == BTRFS_REF_METADATA) {
+ if (!parent)
+ ref_root = generic_ref->tree_ref.owning_root;
+ owner = generic_ref->tree_ref.level;
+ } else if (!parent) {
+ ref_root = generic_ref->data_ref.owning_root;
+ owner = generic_ref->data_ref.ino;
+ offset = generic_ref->data_ref.offset;
+ }
+ metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
+
+ ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS);
+ ra = kmalloc(sizeof(struct ref_action), GFP_NOFS);
+ if (!ra || !ref) {
+ kfree(ref);
+ kfree(ra);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ref->parent = parent;
+ ref->owner = owner;
+ ref->root_objectid = ref_root;
+ ref->offset = offset;
+ ref->num_refs = (action == BTRFS_DROP_DELAYED_REF) ? -1 : 1;
+
+ memcpy(&ra->ref, ref, sizeof(struct ref_entry));
+ /*
+ * Save the extra info from the delayed ref in the ref action to make it
+ * easier to figure out what is happening. The real ref's we add to the
+ * ref tree need to reflect what we save on disk so it matches any
+ * on-disk refs we pre-loaded.
+ */
+ ra->ref.owner = owner;
+ ra->ref.offset = offset;
+ ra->ref.root_objectid = ref_root;
+ __save_stack_trace(ra);
+
+ INIT_LIST_HEAD(&ra->list);
+ ra->action = action;
+ ra->root = generic_ref->real_root;
+
+ /*
+ * This is an allocation, preallocate the block_entry in case we haven't
+ * used it before.
+ */
+ ret = -EINVAL;
+ if (action == BTRFS_ADD_DELAYED_EXTENT) {
+ /*
+ * For subvol_create we'll just pass in whatever the parent root
+ * is and the new root objectid, so let's not treat the passed
+ * in root as if it really has a ref for this bytenr.
+ */
+ be = add_block_entry(fs_info, bytenr, num_bytes, ref_root);
+ if (IS_ERR(be)) {
+ kfree(ref);
+ kfree(ra);
+ ret = PTR_ERR(be);
+ goto out;
+ }
+ be->num_refs++;
+ if (metadata)
+ be->metadata = 1;
+
+ if (be->num_refs != 1) {
+ btrfs_err(fs_info,
+ "re-allocated a block that still has references to it!");
+ dump_block_entry(fs_info, be);
+ dump_ref_action(fs_info, ra);
+ kfree(ref);
+ kfree(ra);
+ goto out_unlock;
+ }
+
+ while (!list_empty(&be->actions)) {
+ struct ref_action *tmp;
+
+ tmp = list_first_entry(&be->actions, struct ref_action,
+ list);
+ list_del(&tmp->list);
+ kfree(tmp);
+ }
+ } else {
+ struct root_entry *tmp;
+
+ if (!parent) {
+ re = kmalloc(sizeof(struct root_entry), GFP_NOFS);
+ if (!re) {
+ kfree(ref);
+ kfree(ra);
+ ret = -ENOMEM;
+ goto out;
+ }
+ /*
+ * This is the root that is modifying us, so it's the
+ * one we want to lookup below when we modify the
+ * re->num_refs.
+ */
+ ref_root = generic_ref->real_root;
+ re->root_objectid = generic_ref->real_root;
+ re->num_refs = 0;
+ }
+
+ spin_lock(&fs_info->ref_verify_lock);
+ be = lookup_block_entry(&fs_info->block_tree, bytenr);
+ if (!be) {
+ btrfs_err(fs_info,
+"trying to do action %d to bytenr %llu num_bytes %llu but there is no existing entry!",
+ action, bytenr, num_bytes);
+ dump_ref_action(fs_info, ra);
+ kfree(ref);
+ kfree(ra);
+ kfree(re);
+ goto out_unlock;
+ } else if (be->num_refs == 0) {
+ btrfs_err(fs_info,
+ "trying to do action %d for a bytenr that has 0 total references",
+ action);
+ dump_block_entry(fs_info, be);
+ dump_ref_action(fs_info, ra);
+ kfree(ref);
+ kfree(ra);
+ kfree(re);
+ goto out_unlock;
+ }
+
+ if (!parent) {
+ tmp = insert_root_entry(&be->roots, re);
+ if (tmp) {
+ kfree(re);
+ re = tmp;
+ }
+ }
+ }
+
+ exist = insert_ref_entry(&be->refs, ref);
+ if (exist) {
+ if (action == BTRFS_DROP_DELAYED_REF) {
+ if (exist->num_refs == 0) {
+ btrfs_err(fs_info,
+"dropping a ref for a existing root that doesn't have a ref on the block");
+ dump_block_entry(fs_info, be);
+ dump_ref_action(fs_info, ra);
+ kfree(ref);
+ kfree(ra);
+ goto out_unlock;
+ }
+ exist->num_refs--;
+ if (exist->num_refs == 0) {
+ rb_erase(&exist->node, &be->refs);
+ kfree(exist);
+ }
+ } else if (!be->metadata) {
+ exist->num_refs++;
+ } else {
+ btrfs_err(fs_info,
+"attempting to add another ref for an existing ref on a tree block");
+ dump_block_entry(fs_info, be);
+ dump_ref_action(fs_info, ra);
+ kfree(ref);
+ kfree(ra);
+ goto out_unlock;
+ }
+ kfree(ref);
+ } else {
+ if (action == BTRFS_DROP_DELAYED_REF) {
+ btrfs_err(fs_info,
+"dropping a ref for a root that doesn't have a ref on the block");
+ dump_block_entry(fs_info, be);
+ dump_ref_action(fs_info, ra);
+ kfree(ref);
+ kfree(ra);
+ goto out_unlock;
+ }
+ }
+
+ if (!parent && !re) {
+ re = lookup_root_entry(&be->roots, ref_root);
+ if (!re) {
+ /*
+ * This shouldn't happen because we will add our re
+ * above when we lookup the be with !parent, but just in
+ * case catch this case so we don't panic because I
+ * didn't think of some other corner case.
+ */
+ btrfs_err(fs_info, "failed to find root %llu for %llu",
+ generic_ref->real_root, be->bytenr);
+ dump_block_entry(fs_info, be);
+ dump_ref_action(fs_info, ra);
+ kfree(ra);
+ goto out_unlock;
+ }
+ }
+ if (action == BTRFS_DROP_DELAYED_REF) {
+ if (re)
+ re->num_refs--;
+ be->num_refs--;
+ } else if (action == BTRFS_ADD_DELAYED_REF) {
+ be->num_refs++;
+ if (re)
+ re->num_refs++;
+ }
+ list_add_tail(&ra->list, &be->actions);
+ ret = 0;
+out_unlock:
+ spin_unlock(&fs_info->ref_verify_lock);
+out:
+ if (ret) {
+ btrfs_free_ref_cache(fs_info);
+ btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
+ }
+ return ret;
+}
+
+/* Free up the ref cache */
+void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info)
+{
+ struct block_entry *be;
+ struct rb_node *n;
+
+ if (!btrfs_test_opt(fs_info, REF_VERIFY))
+ return;
+
+ spin_lock(&fs_info->ref_verify_lock);
+ while ((n = rb_first(&fs_info->block_tree))) {
+ be = rb_entry(n, struct block_entry, node);
+ rb_erase(&be->node, &fs_info->block_tree);
+ free_block_entry(be);
+ cond_resched_lock(&fs_info->ref_verify_lock);
+ }
+ spin_unlock(&fs_info->ref_verify_lock);
+}
+
+void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start,
+ u64 len)
+{
+ struct block_entry *be = NULL, *entry;
+ struct rb_node *n;
+
+ if (!btrfs_test_opt(fs_info, REF_VERIFY))
+ return;
+
+ spin_lock(&fs_info->ref_verify_lock);
+ n = fs_info->block_tree.rb_node;
+ while (n) {
+ entry = rb_entry(n, struct block_entry, node);
+ if (entry->bytenr < start) {
+ n = n->rb_right;
+ } else if (entry->bytenr > start) {
+ n = n->rb_left;
+ } else {
+ be = entry;
+ break;
+ }
+ /* We want to get as close to start as possible */
+ if (be == NULL ||
+ (entry->bytenr < start && be->bytenr > start) ||
+ (entry->bytenr < start && entry->bytenr > be->bytenr))
+ be = entry;
+ }
+
+ /*
+ * Could have an empty block group, maybe have something to check for
+ * this case to verify we were actually empty?
+ */
+ if (!be) {
+ spin_unlock(&fs_info->ref_verify_lock);
+ return;
+ }
+
+ n = &be->node;
+ while (n) {
+ be = rb_entry(n, struct block_entry, node);
+ n = rb_next(n);
+ if (be->bytenr < start && be->bytenr + be->len > start) {
+ btrfs_err(fs_info,
+ "block entry overlaps a block group [%llu,%llu]!",
+ start, len);
+ dump_block_entry(fs_info, be);
+ continue;
+ }
+ if (be->bytenr < start)
+ continue;
+ if (be->bytenr >= start + len)
+ break;
+ if (be->bytenr + be->len > start + len) {
+ btrfs_err(fs_info,
+ "block entry overlaps a block group [%llu,%llu]!",
+ start, len);
+ dump_block_entry(fs_info, be);
+ }
+ rb_erase(&be->node, &fs_info->block_tree);
+ free_block_entry(be);
+ }
+ spin_unlock(&fs_info->ref_verify_lock);
+}
+
+/* Walk down all roots and build the ref tree, meant to be called at mount */
+int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *extent_root;
+ struct btrfs_path *path;
+ struct extent_buffer *eb;
+ int tree_block_level = 0;
+ u64 bytenr = 0, num_bytes = 0;
+ int ret, level;
+
+ if (!btrfs_test_opt(fs_info, REF_VERIFY))
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ extent_root = btrfs_extent_root(fs_info, 0);
+ eb = btrfs_read_lock_root_node(extent_root);
+ level = btrfs_header_level(eb);
+ path->nodes[level] = eb;
+ path->slots[level] = 0;
+ path->locks[level] = BTRFS_READ_LOCK;
+
+ while (1) {
+ /*
+ * We have to keep track of the bytenr/num_bytes we last hit
+ * because we could have run out of space for an inline ref, and
+ * would have had to added a ref key item which may appear on a
+ * different leaf from the original extent item.
+ */
+ ret = walk_down_tree(extent_root, path, level,
+ &bytenr, &num_bytes, &tree_block_level);
+ if (ret)
+ break;
+ ret = walk_up_tree(path, &level);
+ if (ret < 0)
+ break;
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ }
+ if (ret) {
+ btrfs_free_ref_cache(fs_info);
+ btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
+ }
+ btrfs_free_path(path);
+ return ret;
+}
diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h
new file mode 100644
index 000000000..855de3771
--- /dev/null
+++ b/fs/btrfs/ref-verify.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2014 Facebook. All rights reserved.
+ */
+
+#ifndef BTRFS_REF_VERIFY_H
+#define BTRFS_REF_VERIFY_H
+
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info);
+void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info);
+int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
+ struct btrfs_ref *generic_ref);
+void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start,
+ u64 len);
+
+static inline void btrfs_init_ref_verify(struct btrfs_fs_info *fs_info)
+{
+ spin_lock_init(&fs_info->ref_verify_lock);
+ fs_info->block_tree = RB_ROOT;
+}
+#else
+static inline int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
+{
+ return 0;
+}
+
+static inline void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info)
+{
+}
+
+static inline int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
+ struct btrfs_ref *generic_ref)
+{
+ return 0;
+}
+
+static inline void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info,
+ u64 start, u64 len)
+{
+}
+
+static inline void btrfs_init_ref_verify(struct btrfs_fs_info *fs_info)
+{
+}
+
+#endif /* CONFIG_BTRFS_FS_REF_VERIFY */
+
+#endif
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
new file mode 100644
index 000000000..f50586ff8
--- /dev/null
+++ b/fs/btrfs/reflink.c
@@ -0,0 +1,930 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/blkdev.h>
+#include <linux/iversion.h>
+#include "compression.h"
+#include "ctree.h"
+#include "delalloc-space.h"
+#include "disk-io.h"
+#include "reflink.h"
+#include "transaction.h"
+#include "subpage.h"
+
+#define BTRFS_MAX_DEDUPE_LEN SZ_16M
+
+static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ u64 endoff,
+ const u64 destoff,
+ const u64 olen,
+ int no_time_update)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ inode_inc_iversion(inode);
+ if (!no_time_update) {
+ inode->i_mtime = current_time(inode);
+ inode->i_ctime = inode->i_mtime;
+ }
+ /*
+ * We round up to the block size at eof when determining which
+ * extents to clone above, but shouldn't round up the file size.
+ */
+ if (endoff > destoff + olen)
+ endoff = destoff + olen;
+ if (endoff > inode->i_size) {
+ i_size_write(inode, endoff);
+ btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
+ }
+
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ goto out;
+ }
+ ret = btrfs_end_transaction(trans);
+out:
+ return ret;
+}
+
+static int copy_inline_to_page(struct btrfs_inode *inode,
+ const u64 file_offset,
+ char *inline_data,
+ const u64 size,
+ const u64 datal,
+ const u8 comp_type)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ const u32 block_size = fs_info->sectorsize;
+ const u64 range_end = file_offset + block_size - 1;
+ const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0);
+ char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
+ struct extent_changeset *data_reserved = NULL;
+ struct page *page = NULL;
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ int ret;
+
+ ASSERT(IS_ALIGNED(file_offset, block_size));
+
+ /*
+ * We have flushed and locked the ranges of the source and destination
+ * inodes, we also have locked the inodes, so we are safe to do a
+ * reservation here. Also we must not do the reservation while holding
+ * a transaction open, otherwise we would deadlock.
+ */
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset,
+ block_size);
+ if (ret)
+ goto out;
+
+ page = find_or_create_page(mapping, file_offset >> PAGE_SHIFT,
+ btrfs_alloc_write_mask(mapping));
+ if (!page) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ ret = set_page_extent_mapped(page);
+ if (ret < 0)
+ goto out_unlock;
+
+ clear_extent_bit(&inode->io_tree, file_offset, range_end,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+ NULL);
+ ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * After dirtying the page our caller will need to start a transaction,
+ * and if we are low on metadata free space, that can cause flushing of
+ * delalloc for all inodes in order to get metadata space released.
+ * However we are holding the range locked for the whole duration of
+ * the clone/dedupe operation, so we may deadlock if that happens and no
+ * other task releases enough space. So mark this inode as not being
+ * possible to flush to avoid such deadlock. We will clear that flag
+ * when we finish cloning all extents, since a transaction is started
+ * after finding each extent to clone.
+ */
+ set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags);
+
+ if (comp_type == BTRFS_COMPRESS_NONE) {
+ memcpy_to_page(page, offset_in_page(file_offset), data_start,
+ datal);
+ } else {
+ ret = btrfs_decompress(comp_type, data_start, page,
+ offset_in_page(file_offset),
+ inline_size, datal);
+ if (ret)
+ goto out_unlock;
+ flush_dcache_page(page);
+ }
+
+ /*
+ * If our inline data is smaller then the block/page size, then the
+ * remaining of the block/page is equivalent to zeroes. We had something
+ * like the following done:
+ *
+ * $ xfs_io -f -c "pwrite -S 0xab 0 500" file
+ * $ sync # (or fsync)
+ * $ xfs_io -c "falloc 0 4K" file
+ * $ xfs_io -c "pwrite -S 0xcd 4K 4K"
+ *
+ * So what's in the range [500, 4095] corresponds to zeroes.
+ */
+ if (datal < block_size)
+ memzero_page(page, datal, block_size - datal);
+
+ btrfs_page_set_uptodate(fs_info, page, file_offset, block_size);
+ btrfs_page_clear_checked(fs_info, page, file_offset, block_size);
+ btrfs_page_set_dirty(fs_info, page, file_offset, block_size);
+out_unlock:
+ if (page) {
+ unlock_page(page);
+ put_page(page);
+ }
+ if (ret)
+ btrfs_delalloc_release_space(inode, data_reserved, file_offset,
+ block_size, true);
+ btrfs_delalloc_release_extents(inode, block_size);
+out:
+ extent_changeset_free(data_reserved);
+
+ return ret;
+}
+
+/*
+ * Deal with cloning of inline extents. We try to copy the inline extent from
+ * the source inode to destination inode when possible. When not possible we
+ * copy the inline extent's data into the respective page of the inode.
+ */
+static int clone_copy_inline_extent(struct inode *dst,
+ struct btrfs_path *path,
+ struct btrfs_key *new_key,
+ const u64 drop_start,
+ const u64 datal,
+ const u64 size,
+ const u8 comp_type,
+ char *inline_data,
+ struct btrfs_trans_handle **trans_out)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
+ struct btrfs_root *root = BTRFS_I(dst)->root;
+ const u64 aligned_end = ALIGN(new_key->offset + datal,
+ fs_info->sectorsize);
+ struct btrfs_trans_handle *trans = NULL;
+ struct btrfs_drop_extents_args drop_args = { 0 };
+ int ret;
+ struct btrfs_key key;
+
+ if (new_key->offset > 0) {
+ ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ inline_data, size, datal, comp_type);
+ goto out;
+ }
+
+ key.objectid = btrfs_ino(BTRFS_I(dst));
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ goto copy_inline_extent;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
+ key.type == BTRFS_EXTENT_DATA_KEY) {
+ /*
+ * There's an implicit hole at file offset 0, copy the
+ * inline extent's data to the page.
+ */
+ ASSERT(key.offset > 0);
+ goto copy_to_page;
+ }
+ } else if (i_size_read(dst) <= datal) {
+ struct btrfs_file_extent_item *ei;
+
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ /*
+ * If it's an inline extent replace it with the source inline
+ * extent, otherwise copy the source inline extent data into
+ * the respective page at the destination inode.
+ */
+ if (btrfs_file_extent_type(path->nodes[0], ei) ==
+ BTRFS_FILE_EXTENT_INLINE)
+ goto copy_inline_extent;
+
+ goto copy_to_page;
+ }
+
+copy_inline_extent:
+ /*
+ * We have no extent items, or we have an extent at offset 0 which may
+ * or may not be inlined. All these cases are dealt the same way.
+ */
+ if (i_size_read(dst) > datal) {
+ /*
+ * At the destination offset 0 we have either a hole, a regular
+ * extent or an inline extent larger then the one we want to
+ * clone. Deal with all these cases by copying the inline extent
+ * data into the respective page at the destination inode.
+ */
+ goto copy_to_page;
+ }
+
+ /*
+ * Release path before starting a new transaction so we don't hold locks
+ * that would confuse lockdep.
+ */
+ btrfs_release_path(path);
+ /*
+ * If we end up here it means were copy the inline extent into a leaf
+ * of the destination inode. We know we will drop or adjust at most one
+ * extent item in the destination root.
+ *
+ * 1 unit - adjusting old extent (we may have to split it)
+ * 1 unit - add new extent
+ * 1 unit - inode update
+ */
+ trans = btrfs_start_transaction(root, 3);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto out;
+ }
+ drop_args.path = path;
+ drop_args.start = drop_start;
+ drop_args.end = aligned_end;
+ drop_args.drop_cache = true;
+ ret = btrfs_drop_extents(trans, root, BTRFS_I(dst), &drop_args);
+ if (ret)
+ goto out;
+ ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
+ if (ret)
+ goto out;
+
+ write_extent_buffer(path->nodes[0], inline_data,
+ btrfs_item_ptr_offset(path->nodes[0],
+ path->slots[0]),
+ size);
+ btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found);
+ btrfs_set_inode_full_sync(BTRFS_I(dst));
+ ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end);
+out:
+ if (!ret && !trans) {
+ /*
+ * No transaction here means we copied the inline extent into a
+ * page of the destination inode.
+ *
+ * 1 unit to update inode item
+ */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ }
+ }
+ if (ret && trans) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ }
+ if (!ret)
+ *trans_out = trans;
+
+ return ret;
+
+copy_to_page:
+ /*
+ * Release our path because we don't need it anymore and also because
+ * copy_inline_to_page() needs to reserve data and metadata, which may
+ * need to flush delalloc when we are low on available space and
+ * therefore cause a deadlock if writeback of an inline extent needs to
+ * write to the same leaf or an ordered extent completion needs to write
+ * to the same leaf.
+ */
+ btrfs_release_path(path);
+
+ ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ inline_data, size, datal, comp_type);
+ goto out;
+}
+
+/**
+ * btrfs_clone() - clone a range from inode file to another
+ *
+ * @src: Inode to clone from
+ * @inode: Inode to clone to
+ * @off: Offset within source to start clone from
+ * @olen: Original length, passed by user, of range to clone
+ * @olen_aligned: Block-aligned value of olen
+ * @destoff: Offset within @inode to start clone
+ * @no_time_update: Whether to update mtime/ctime on the target inode
+ */
+static int btrfs_clone(struct inode *src, struct inode *inode,
+ const u64 off, const u64 olen, const u64 olen_aligned,
+ const u64 destoff, int no_time_update)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_path *path = NULL;
+ struct extent_buffer *leaf;
+ struct btrfs_trans_handle *trans;
+ char *buf = NULL;
+ struct btrfs_key key;
+ u32 nritems;
+ int slot;
+ int ret;
+ const u64 len = olen_aligned;
+ u64 last_dest_end = destoff;
+ u64 prev_extent_end = off;
+
+ ret = -ENOMEM;
+ buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
+ if (!buf)
+ return ret;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ kvfree(buf);
+ return ret;
+ }
+
+ path->reada = READA_FORWARD;
+ /* Clone data */
+ key.objectid = btrfs_ino(BTRFS_I(src));
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = off;
+
+ while (1) {
+ struct btrfs_file_extent_item *extent;
+ u64 extent_gen;
+ int type;
+ u32 size;
+ struct btrfs_key new_key;
+ u64 disko = 0, diskl = 0;
+ u64 datao = 0, datal = 0;
+ u8 comp;
+ u64 drop_start;
+
+ /* Note the key will change type as we walk through the tree */
+ ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
+ 0, 0);
+ if (ret < 0)
+ goto out;
+ /*
+ * First search, if no extent item that starts at offset off was
+ * found but the previous item is an extent item, it's possible
+ * it might overlap our target range, therefore process it.
+ */
+ if (key.offset == off && ret > 0 && path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0] - 1);
+ if (key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
+
+ nritems = btrfs_header_nritems(path->nodes[0]);
+process_slot:
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
+ if (ret < 0)
+ goto out;
+ if (ret > 0)
+ break;
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ }
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.type > BTRFS_EXTENT_DATA_KEY ||
+ key.objectid != btrfs_ino(BTRFS_I(src)))
+ break;
+
+ ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
+
+ extent = btrfs_item_ptr(leaf, slot,
+ struct btrfs_file_extent_item);
+ extent_gen = btrfs_file_extent_generation(leaf, extent);
+ comp = btrfs_file_extent_compression(leaf, extent);
+ type = btrfs_file_extent_type(leaf, extent);
+ if (type == BTRFS_FILE_EXTENT_REG ||
+ type == BTRFS_FILE_EXTENT_PREALLOC) {
+ disko = btrfs_file_extent_disk_bytenr(leaf, extent);
+ diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
+ datao = btrfs_file_extent_offset(leaf, extent);
+ datal = btrfs_file_extent_num_bytes(leaf, extent);
+ } else if (type == BTRFS_FILE_EXTENT_INLINE) {
+ /* Take upper bound, may be compressed */
+ datal = btrfs_file_extent_ram_bytes(leaf, extent);
+ }
+
+ /*
+ * The first search might have left us at an extent item that
+ * ends before our target range's start, can happen if we have
+ * holes and NO_HOLES feature enabled.
+ *
+ * Subsequent searches may leave us on a file range we have
+ * processed before - this happens due to a race with ordered
+ * extent completion for a file range that is outside our source
+ * range, but that range was part of a file extent item that
+ * also covered a leading part of our source range.
+ */
+ if (key.offset + datal <= prev_extent_end) {
+ path->slots[0]++;
+ goto process_slot;
+ } else if (key.offset >= off + len) {
+ break;
+ }
+
+ prev_extent_end = key.offset + datal;
+ size = btrfs_item_size(leaf, slot);
+ read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
+ size);
+
+ btrfs_release_path(path);
+
+ memcpy(&new_key, &key, sizeof(new_key));
+ new_key.objectid = btrfs_ino(BTRFS_I(inode));
+ if (off <= key.offset)
+ new_key.offset = key.offset + destoff - off;
+ else
+ new_key.offset = destoff;
+
+ /*
+ * Deal with a hole that doesn't have an extent item that
+ * represents it (NO_HOLES feature enabled).
+ * This hole is either in the middle of the cloning range or at
+ * the beginning (fully overlaps it or partially overlaps it).
+ */
+ if (new_key.offset != last_dest_end)
+ drop_start = last_dest_end;
+ else
+ drop_start = new_key.offset;
+
+ if (type == BTRFS_FILE_EXTENT_REG ||
+ type == BTRFS_FILE_EXTENT_PREALLOC) {
+ struct btrfs_replace_extent_info clone_info;
+
+ /*
+ * a | --- range to clone ---| b
+ * | ------------- extent ------------- |
+ */
+
+ /* Subtract range b */
+ if (key.offset + datal > off + len)
+ datal = off + len - key.offset;
+
+ /* Subtract range a */
+ if (off > key.offset) {
+ datao += off - key.offset;
+ datal -= off - key.offset;
+ }
+
+ clone_info.disk_offset = disko;
+ clone_info.disk_len = diskl;
+ clone_info.data_offset = datao;
+ clone_info.data_len = datal;
+ clone_info.file_offset = new_key.offset;
+ clone_info.extent_buf = buf;
+ clone_info.is_new_extent = false;
+ clone_info.update_times = !no_time_update;
+ ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
+ drop_start, new_key.offset + datal - 1,
+ &clone_info, &trans);
+ if (ret)
+ goto out;
+ } else {
+ ASSERT(type == BTRFS_FILE_EXTENT_INLINE);
+ /*
+ * Inline extents always have to start at file offset 0
+ * and can never be bigger then the sector size. We can
+ * never clone only parts of an inline extent, since all
+ * reflink operations must start at a sector size aligned
+ * offset, and the length must be aligned too or end at
+ * the i_size (which implies the whole inlined data).
+ */
+ ASSERT(key.offset == 0);
+ ASSERT(datal <= fs_info->sectorsize);
+ if (WARN_ON(type != BTRFS_FILE_EXTENT_INLINE) ||
+ WARN_ON(key.offset != 0) ||
+ WARN_ON(datal > fs_info->sectorsize)) {
+ ret = -EUCLEAN;
+ goto out;
+ }
+
+ ret = clone_copy_inline_extent(inode, path, &new_key,
+ drop_start, datal, size,
+ comp, buf, &trans);
+ if (ret)
+ goto out;
+ }
+
+ btrfs_release_path(path);
+
+ /*
+ * Whenever we share an extent we update the last_reflink_trans
+ * of each inode to the current transaction. This is needed to
+ * make sure fsync does not log multiple checksum items with
+ * overlapping ranges (because some extent items might refer
+ * only to sections of the original extent). For the destination
+ * inode we do this regardless of the generation of the extents
+ * or even if they are inline extents or explicit holes, to make
+ * sure a full fsync does not skip them. For the source inode,
+ * we only need to update last_reflink_trans in case it's a new
+ * extent that is not a hole or an inline extent, to deal with
+ * the checksums problem on fsync.
+ */
+ if (extent_gen == trans->transid && disko > 0)
+ BTRFS_I(src)->last_reflink_trans = trans->transid;
+
+ BTRFS_I(inode)->last_reflink_trans = trans->transid;
+
+ last_dest_end = ALIGN(new_key.offset + datal,
+ fs_info->sectorsize);
+ ret = clone_finish_inode_update(trans, inode, last_dest_end,
+ destoff, olen, no_time_update);
+ if (ret)
+ goto out;
+ if (new_key.offset + datal >= destoff + len)
+ break;
+
+ btrfs_release_path(path);
+ key.offset = prev_extent_end;
+
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ goto out;
+ }
+
+ cond_resched();
+ }
+ ret = 0;
+
+ if (last_dest_end < destoff + len) {
+ /*
+ * We have an implicit hole that fully or partially overlaps our
+ * cloning range at its end. This means that we either have the
+ * NO_HOLES feature enabled or the implicit hole happened due to
+ * mixing buffered and direct IO writes against this file.
+ */
+ btrfs_release_path(path);
+
+ /*
+ * When using NO_HOLES and we are cloning a range that covers
+ * only a hole (no extents) into a range beyond the current
+ * i_size, punching a hole in the target range will not create
+ * an extent map defining a hole, because the range starts at or
+ * beyond current i_size. If the file previously had an i_size
+ * greater than the new i_size set by this clone operation, we
+ * need to make sure the next fsync is a full fsync, so that it
+ * detects and logs a hole covering a range from the current
+ * i_size to the new i_size. If the clone range covers extents,
+ * besides a hole, then we know the full sync flag was already
+ * set by previous calls to btrfs_replace_file_extents() that
+ * replaced file extent items.
+ */
+ if (last_dest_end >= i_size_read(inode))
+ btrfs_set_inode_full_sync(BTRFS_I(inode));
+
+ ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
+ last_dest_end, destoff + len - 1, NULL, &trans);
+ if (ret)
+ goto out;
+
+ ret = clone_finish_inode_update(trans, inode, destoff + len,
+ destoff, olen, no_time_update);
+ }
+
+out:
+ btrfs_free_path(path);
+ kvfree(buf);
+ clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags);
+
+ return ret;
+}
+
+static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
+ struct inode *inode2, u64 loff2, u64 len)
+{
+ unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1, NULL);
+ unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1, NULL);
+}
+
+static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
+ struct inode *inode2, u64 loff2, u64 len)
+{
+ u64 range1_end = loff1 + len - 1;
+ u64 range2_end = loff2 + len - 1;
+
+ if (inode1 < inode2) {
+ swap(inode1, inode2);
+ swap(loff1, loff2);
+ swap(range1_end, range2_end);
+ } else if (inode1 == inode2 && loff2 < loff1) {
+ swap(loff1, loff2);
+ swap(range1_end, range2_end);
+ }
+
+ lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end, NULL);
+ lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end, NULL);
+
+ btrfs_assert_inode_range_clean(BTRFS_I(inode1), loff1, range1_end);
+ btrfs_assert_inode_range_clean(BTRFS_I(inode2), loff2, range2_end);
+}
+
+static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2)
+{
+ if (inode1 < inode2)
+ swap(inode1, inode2);
+ down_write(&BTRFS_I(inode1)->i_mmap_lock);
+ down_write_nested(&BTRFS_I(inode2)->i_mmap_lock, SINGLE_DEPTH_NESTING);
+}
+
+static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2)
+{
+ up_write(&BTRFS_I(inode1)->i_mmap_lock);
+ up_write(&BTRFS_I(inode2)->i_mmap_lock);
+}
+
+static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
+ struct inode *dst, u64 dst_loff)
+{
+ struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info;
+ const u64 bs = fs_info->sb->s_blocksize;
+ int ret;
+
+ /*
+ * Lock destination range to serialize with concurrent readahead() and
+ * source range to serialize with relocation.
+ */
+ btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
+ ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
+ btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
+
+ btrfs_btree_balance_dirty(fs_info);
+
+ return ret;
+}
+
+static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
+ struct inode *dst, u64 dst_loff)
+{
+ int ret = 0;
+ u64 i, tail_len, chunk_count;
+ struct btrfs_root *root_dst = BTRFS_I(dst)->root;
+
+ spin_lock(&root_dst->root_item_lock);
+ if (root_dst->send_in_progress) {
+ btrfs_warn_rl(root_dst->fs_info,
+"cannot deduplicate to root %llu while send operations are using it (%d in progress)",
+ root_dst->root_key.objectid,
+ root_dst->send_in_progress);
+ spin_unlock(&root_dst->root_item_lock);
+ return -EAGAIN;
+ }
+ root_dst->dedupe_in_progress++;
+ spin_unlock(&root_dst->root_item_lock);
+
+ tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
+ chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
+
+ for (i = 0; i < chunk_count; i++) {
+ ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
+ dst, dst_loff);
+ if (ret)
+ goto out;
+
+ loff += BTRFS_MAX_DEDUPE_LEN;
+ dst_loff += BTRFS_MAX_DEDUPE_LEN;
+ }
+
+ if (tail_len > 0)
+ ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff);
+out:
+ spin_lock(&root_dst->root_item_lock);
+ root_dst->dedupe_in_progress--;
+ spin_unlock(&root_dst->root_item_lock);
+
+ return ret;
+}
+
+static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
+ u64 off, u64 olen, u64 destoff)
+{
+ struct inode *inode = file_inode(file);
+ struct inode *src = file_inode(file_src);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ int ret;
+ int wb_ret;
+ u64 len = olen;
+ u64 bs = fs_info->sb->s_blocksize;
+
+ /*
+ * VFS's generic_remap_file_range_prep() protects us from cloning the
+ * eof block into the middle of a file, which would result in corruption
+ * if the file size is not blocksize aligned. So we don't need to check
+ * for that case here.
+ */
+ if (off + len == src->i_size)
+ len = ALIGN(src->i_size, bs) - off;
+
+ if (destoff > inode->i_size) {
+ const u64 wb_start = ALIGN_DOWN(inode->i_size, bs);
+
+ ret = btrfs_cont_expand(BTRFS_I(inode), inode->i_size, destoff);
+ if (ret)
+ return ret;
+ /*
+ * We may have truncated the last block if the inode's size is
+ * not sector size aligned, so we need to wait for writeback to
+ * complete before proceeding further, otherwise we can race
+ * with cloning and attempt to increment a reference to an
+ * extent that no longer exists (writeback completed right after
+ * we found the previous extent covering eof and before we
+ * attempted to increment its reference count).
+ */
+ ret = btrfs_wait_ordered_range(inode, wb_start,
+ destoff - wb_start);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Lock destination range to serialize with concurrent readahead() and
+ * source range to serialize with relocation.
+ */
+ btrfs_double_extent_lock(src, off, inode, destoff, len);
+ ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
+ btrfs_double_extent_unlock(src, off, inode, destoff, len);
+
+ /*
+ * We may have copied an inline extent into a page of the destination
+ * range, so wait for writeback to complete before truncating pages
+ * from the page cache. This is a rare case.
+ */
+ wb_ret = btrfs_wait_ordered_range(inode, destoff, len);
+ ret = ret ? ret : wb_ret;
+ /*
+ * Truncate page cache pages so that future reads will see the cloned
+ * data immediately and not the previous data.
+ */
+ truncate_inode_pages_range(&inode->i_data,
+ round_down(destoff, PAGE_SIZE),
+ round_up(destoff + len, PAGE_SIZE) - 1);
+
+ btrfs_btree_balance_dirty(fs_info);
+
+ return ret;
+}
+
+static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *len, unsigned int remap_flags)
+{
+ struct inode *inode_in = file_inode(file_in);
+ struct inode *inode_out = file_inode(file_out);
+ u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize;
+ u64 wb_len;
+ int ret;
+
+ if (!(remap_flags & REMAP_FILE_DEDUP)) {
+ struct btrfs_root *root_out = BTRFS_I(inode_out)->root;
+
+ if (btrfs_root_readonly(root_out))
+ return -EROFS;
+
+ ASSERT(inode_in->i_sb == inode_out->i_sb);
+ }
+
+ /* Don't make the dst file partly checksummed */
+ if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
+ (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
+ return -EINVAL;
+ }
+
+ /*
+ * Now that the inodes are locked, we need to start writeback ourselves
+ * and can not rely on the writeback from the VFS's generic helper
+ * generic_remap_file_range_prep() because:
+ *
+ * 1) For compression we must call filemap_fdatawrite_range() range
+ * twice (btrfs_fdatawrite_range() does it for us), and the generic
+ * helper only calls it once;
+ *
+ * 2) filemap_fdatawrite_range(), called by the generic helper only
+ * waits for the writeback to complete, i.e. for IO to be done, and
+ * not for the ordered extents to complete. We need to wait for them
+ * to complete so that new file extent items are in the fs tree.
+ */
+ if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
+ wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs);
+ else
+ wb_len = ALIGN(*len, bs);
+
+ /*
+ * Workaround to make sure NOCOW buffered write reach disk as NOCOW.
+ *
+ * Btrfs' back references do not have a block level granularity, they
+ * work at the whole extent level.
+ * NOCOW buffered write without data space reserved may not be able
+ * to fall back to CoW due to lack of data space, thus could cause
+ * data loss.
+ *
+ * Here we take a shortcut by flushing the whole inode, so that all
+ * nocow write should reach disk as nocow before we increase the
+ * reference of the extent. We could do better by only flushing NOCOW
+ * data, but that needs extra accounting.
+ *
+ * Also we don't need to check ASYNC_EXTENT, as async extent will be
+ * CoWed anyway, not affecting nocow part.
+ */
+ ret = filemap_flush(inode_in->i_mapping);
+ if (ret < 0)
+ return ret;
+
+ ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
+ wb_len);
+ if (ret < 0)
+ return ret;
+ ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs),
+ wb_len);
+ if (ret < 0)
+ return ret;
+
+ return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
+ len, remap_flags);
+}
+
+static bool file_sync_write(const struct file *file)
+{
+ if (file->f_flags & (__O_SYNC | O_DSYNC))
+ return true;
+ if (IS_SYNC(file_inode(file)))
+ return true;
+
+ return false;
+}
+
+loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
+ struct file *dst_file, loff_t destoff, loff_t len,
+ unsigned int remap_flags)
+{
+ struct inode *src_inode = file_inode(src_file);
+ struct inode *dst_inode = file_inode(dst_file);
+ bool same_inode = dst_inode == src_inode;
+ int ret;
+
+ if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
+ return -EINVAL;
+
+ if (same_inode) {
+ btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP);
+ } else {
+ lock_two_nondirectories(src_inode, dst_inode);
+ btrfs_double_mmap_lock(src_inode, dst_inode);
+ }
+
+ ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
+ &len, remap_flags);
+ if (ret < 0 || len == 0)
+ goto out_unlock;
+
+ if (remap_flags & REMAP_FILE_DEDUP)
+ ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff);
+ else
+ ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
+
+out_unlock:
+ if (same_inode) {
+ btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP);
+ } else {
+ btrfs_double_mmap_unlock(src_inode, dst_inode);
+ unlock_two_nondirectories(src_inode, dst_inode);
+ }
+
+ /*
+ * If either the source or the destination file was opened with O_SYNC,
+ * O_DSYNC or has the S_SYNC attribute, fsync both the destination and
+ * source files/ranges, so that after a successful return (0) followed
+ * by a power failure results in the reflinked data to be readable from
+ * both files/ranges.
+ */
+ if (ret == 0 && len > 0 &&
+ (file_sync_write(src_file) || file_sync_write(dst_file))) {
+ ret = btrfs_sync_file(src_file, off, off + len - 1, 0);
+ if (ret == 0)
+ ret = btrfs_sync_file(dst_file, destoff,
+ destoff + len - 1, 0);
+ }
+
+ return ret < 0 ? ret : len;
+}
diff --git a/fs/btrfs/reflink.h b/fs/btrfs/reflink.h
new file mode 100644
index 000000000..ecb309b4d
--- /dev/null
+++ b/fs/btrfs/reflink.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_REFLINK_H
+#define BTRFS_REFLINK_H
+
+#include <linux/fs.h>
+
+loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t len, unsigned int remap_flags);
+
+#endif /* BTRFS_REFLINK_H */
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
new file mode 100644
index 000000000..d2bdcc2cc
--- /dev/null
+++ b/fs/btrfs/relocation.c
@@ -0,0 +1,4532 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2009 Oracle. All rights reserved.
+ */
+
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/error-injection.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "volumes.h"
+#include "locking.h"
+#include "btrfs_inode.h"
+#include "async-thread.h"
+#include "free-space-cache.h"
+#include "qgroup.h"
+#include "print-tree.h"
+#include "delalloc-space.h"
+#include "block-group.h"
+#include "backref.h"
+#include "misc.h"
+#include "subpage.h"
+#include "zoned.h"
+#include "inode-item.h"
+
+/*
+ * Relocation overview
+ *
+ * [What does relocation do]
+ *
+ * The objective of relocation is to relocate all extents of the target block
+ * group to other block groups.
+ * This is utilized by resize (shrink only), profile converting, compacting
+ * space, or balance routine to spread chunks over devices.
+ *
+ * Before | After
+ * ------------------------------------------------------------------
+ * BG A: 10 data extents | BG A: deleted
+ * BG B: 2 data extents | BG B: 10 data extents (2 old + 8 relocated)
+ * BG C: 1 extents | BG C: 3 data extents (1 old + 2 relocated)
+ *
+ * [How does relocation work]
+ *
+ * 1. Mark the target block group read-only
+ * New extents won't be allocated from the target block group.
+ *
+ * 2.1 Record each extent in the target block group
+ * To build a proper map of extents to be relocated.
+ *
+ * 2.2 Build data reloc tree and reloc trees
+ * Data reloc tree will contain an inode, recording all newly relocated
+ * data extents.
+ * There will be only one data reloc tree for one data block group.
+ *
+ * Reloc tree will be a special snapshot of its source tree, containing
+ * relocated tree blocks.
+ * Each tree referring to a tree block in target block group will get its
+ * reloc tree built.
+ *
+ * 2.3 Swap source tree with its corresponding reloc tree
+ * Each involved tree only refers to new extents after swap.
+ *
+ * 3. Cleanup reloc trees and data reloc tree.
+ * As old extents in the target block group are still referenced by reloc
+ * trees, we need to clean them up before really freeing the target block
+ * group.
+ *
+ * The main complexity is in steps 2.2 and 2.3.
+ *
+ * The entry point of relocation is relocate_block_group() function.
+ */
+
+#define RELOCATION_RESERVED_NODES 256
+/*
+ * map address of tree root to tree
+ */
+struct mapping_node {
+ struct {
+ struct rb_node rb_node;
+ u64 bytenr;
+ }; /* Use rb_simle_node for search/insert */
+ void *data;
+};
+
+struct mapping_tree {
+ struct rb_root rb_root;
+ spinlock_t lock;
+};
+
+/*
+ * present a tree block to process
+ */
+struct tree_block {
+ struct {
+ struct rb_node rb_node;
+ u64 bytenr;
+ }; /* Use rb_simple_node for search/insert */
+ u64 owner;
+ struct btrfs_key key;
+ unsigned int level:8;
+ unsigned int key_ready:1;
+};
+
+#define MAX_EXTENTS 128
+
+struct file_extent_cluster {
+ u64 start;
+ u64 end;
+ u64 boundary[MAX_EXTENTS];
+ unsigned int nr;
+};
+
+struct reloc_control {
+ /* block group to relocate */
+ struct btrfs_block_group *block_group;
+ /* extent tree */
+ struct btrfs_root *extent_root;
+ /* inode for moving data */
+ struct inode *data_inode;
+
+ struct btrfs_block_rsv *block_rsv;
+
+ struct btrfs_backref_cache backref_cache;
+
+ struct file_extent_cluster cluster;
+ /* tree blocks have been processed */
+ struct extent_io_tree processed_blocks;
+ /* map start of tree root to corresponding reloc tree */
+ struct mapping_tree reloc_root_tree;
+ /* list of reloc trees */
+ struct list_head reloc_roots;
+ /* list of subvolume trees that get relocated */
+ struct list_head dirty_subvol_roots;
+ /* size of metadata reservation for merging reloc trees */
+ u64 merging_rsv_size;
+ /* size of relocated tree nodes */
+ u64 nodes_relocated;
+ /* reserved size for block group relocation*/
+ u64 reserved_bytes;
+
+ u64 search_start;
+ u64 extents_found;
+
+ unsigned int stage:8;
+ unsigned int create_reloc_tree:1;
+ unsigned int merge_reloc_tree:1;
+ unsigned int found_file_extent:1;
+};
+
+/* stages of data relocation */
+#define MOVE_DATA_EXTENTS 0
+#define UPDATE_DATA_PTRS 1
+
+static void mark_block_processed(struct reloc_control *rc,
+ struct btrfs_backref_node *node)
+{
+ u32 blocksize;
+
+ if (node->level == 0 ||
+ in_range(node->bytenr, rc->block_group->start,
+ rc->block_group->length)) {
+ blocksize = rc->extent_root->fs_info->nodesize;
+ set_extent_bits(&rc->processed_blocks, node->bytenr,
+ node->bytenr + blocksize - 1, EXTENT_DIRTY);
+ }
+ node->processed = 1;
+}
+
+
+static void mapping_tree_init(struct mapping_tree *tree)
+{
+ tree->rb_root = RB_ROOT;
+ spin_lock_init(&tree->lock);
+}
+
+/*
+ * walk up backref nodes until reach node presents tree root
+ */
+static struct btrfs_backref_node *walk_up_backref(
+ struct btrfs_backref_node *node,
+ struct btrfs_backref_edge *edges[], int *index)
+{
+ struct btrfs_backref_edge *edge;
+ int idx = *index;
+
+ while (!list_empty(&node->upper)) {
+ edge = list_entry(node->upper.next,
+ struct btrfs_backref_edge, list[LOWER]);
+ edges[idx++] = edge;
+ node = edge->node[UPPER];
+ }
+ BUG_ON(node->detached);
+ *index = idx;
+ return node;
+}
+
+/*
+ * walk down backref nodes to find start of next reference path
+ */
+static struct btrfs_backref_node *walk_down_backref(
+ struct btrfs_backref_edge *edges[], int *index)
+{
+ struct btrfs_backref_edge *edge;
+ struct btrfs_backref_node *lower;
+ int idx = *index;
+
+ while (idx > 0) {
+ edge = edges[idx - 1];
+ lower = edge->node[LOWER];
+ if (list_is_last(&edge->list[LOWER], &lower->upper)) {
+ idx--;
+ continue;
+ }
+ edge = list_entry(edge->list[LOWER].next,
+ struct btrfs_backref_edge, list[LOWER]);
+ edges[idx - 1] = edge;
+ *index = idx;
+ return edge->node[UPPER];
+ }
+ *index = 0;
+ return NULL;
+}
+
+static void update_backref_node(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_node *node, u64 bytenr)
+{
+ struct rb_node *rb_node;
+ rb_erase(&node->rb_node, &cache->rb_root);
+ node->bytenr = bytenr;
+ rb_node = rb_simple_insert(&cache->rb_root, node->bytenr, &node->rb_node);
+ if (rb_node)
+ btrfs_backref_panic(cache->fs_info, bytenr, -EEXIST);
+}
+
+/*
+ * update backref cache after a transaction commit
+ */
+static int update_backref_cache(struct btrfs_trans_handle *trans,
+ struct btrfs_backref_cache *cache)
+{
+ struct btrfs_backref_node *node;
+ int level = 0;
+
+ if (cache->last_trans == 0) {
+ cache->last_trans = trans->transid;
+ return 0;
+ }
+
+ if (cache->last_trans == trans->transid)
+ return 0;
+
+ /*
+ * detached nodes are used to avoid unnecessary backref
+ * lookup. transaction commit changes the extent tree.
+ * so the detached nodes are no longer useful.
+ */
+ while (!list_empty(&cache->detached)) {
+ node = list_entry(cache->detached.next,
+ struct btrfs_backref_node, list);
+ btrfs_backref_cleanup_node(cache, node);
+ }
+
+ while (!list_empty(&cache->changed)) {
+ node = list_entry(cache->changed.next,
+ struct btrfs_backref_node, list);
+ list_del_init(&node->list);
+ BUG_ON(node->pending);
+ update_backref_node(cache, node, node->new_bytenr);
+ }
+
+ /*
+ * some nodes can be left in the pending list if there were
+ * errors during processing the pending nodes.
+ */
+ for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
+ list_for_each_entry(node, &cache->pending[level], list) {
+ BUG_ON(!node->pending);
+ if (node->bytenr == node->new_bytenr)
+ continue;
+ update_backref_node(cache, node, node->new_bytenr);
+ }
+ }
+
+ cache->last_trans = 0;
+ return 1;
+}
+
+static bool reloc_root_is_dead(struct btrfs_root *root)
+{
+ /*
+ * Pair with set_bit/clear_bit in clean_dirty_subvols and
+ * btrfs_update_reloc_root. We need to see the updated bit before
+ * trying to access reloc_root
+ */
+ smp_rmb();
+ if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
+ return true;
+ return false;
+}
+
+/*
+ * Check if this subvolume tree has valid reloc tree.
+ *
+ * Reloc tree after swap is considered dead, thus not considered as valid.
+ * This is enough for most callers, as they don't distinguish dead reloc root
+ * from no reloc root. But btrfs_should_ignore_reloc_root() below is a
+ * special case.
+ */
+static bool have_reloc_root(struct btrfs_root *root)
+{
+ if (reloc_root_is_dead(root))
+ return false;
+ if (!root->reloc_root)
+ return false;
+ return true;
+}
+
+int btrfs_should_ignore_reloc_root(struct btrfs_root *root)
+{
+ struct btrfs_root *reloc_root;
+
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
+ return 0;
+
+ /* This root has been merged with its reloc tree, we can ignore it */
+ if (reloc_root_is_dead(root))
+ return 1;
+
+ reloc_root = root->reloc_root;
+ if (!reloc_root)
+ return 0;
+
+ if (btrfs_header_generation(reloc_root->commit_root) ==
+ root->fs_info->running_transaction->transid)
+ return 0;
+ /*
+ * if there is reloc tree and it was created in previous
+ * transaction backref lookup can find the reloc tree,
+ * so backref node for the fs tree root is useless for
+ * relocation.
+ */
+ return 1;
+}
+
+/*
+ * find reloc tree by address of tree root
+ */
+struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+ struct reloc_control *rc = fs_info->reloc_ctl;
+ struct rb_node *rb_node;
+ struct mapping_node *node;
+ struct btrfs_root *root = NULL;
+
+ ASSERT(rc);
+ spin_lock(&rc->reloc_root_tree.lock);
+ rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr);
+ if (rb_node) {
+ node = rb_entry(rb_node, struct mapping_node, rb_node);
+ root = node->data;
+ }
+ spin_unlock(&rc->reloc_root_tree.lock);
+ return btrfs_grab_root(root);
+}
+
+/*
+ * For useless nodes, do two major clean ups:
+ *
+ * - Cleanup the children edges and nodes
+ * If child node is also orphan (no parent) during cleanup, then the child
+ * node will also be cleaned up.
+ *
+ * - Freeing up leaves (level 0), keeps nodes detached
+ * For nodes, the node is still cached as "detached"
+ *
+ * Return false if @node is not in the @useless_nodes list.
+ * Return true if @node is in the @useless_nodes list.
+ */
+static bool handle_useless_nodes(struct reloc_control *rc,
+ struct btrfs_backref_node *node)
+{
+ struct btrfs_backref_cache *cache = &rc->backref_cache;
+ struct list_head *useless_node = &cache->useless_node;
+ bool ret = false;
+
+ while (!list_empty(useless_node)) {
+ struct btrfs_backref_node *cur;
+
+ cur = list_first_entry(useless_node, struct btrfs_backref_node,
+ list);
+ list_del_init(&cur->list);
+
+ /* Only tree root nodes can be added to @useless_nodes */
+ ASSERT(list_empty(&cur->upper));
+
+ if (cur == node)
+ ret = true;
+
+ /* The node is the lowest node */
+ if (cur->lowest) {
+ list_del_init(&cur->lower);
+ cur->lowest = 0;
+ }
+
+ /* Cleanup the lower edges */
+ while (!list_empty(&cur->lower)) {
+ struct btrfs_backref_edge *edge;
+ struct btrfs_backref_node *lower;
+
+ edge = list_entry(cur->lower.next,
+ struct btrfs_backref_edge, list[UPPER]);
+ list_del(&edge->list[UPPER]);
+ list_del(&edge->list[LOWER]);
+ lower = edge->node[LOWER];
+ btrfs_backref_free_edge(cache, edge);
+
+ /* Child node is also orphan, queue for cleanup */
+ if (list_empty(&lower->upper))
+ list_add(&lower->list, useless_node);
+ }
+ /* Mark this block processed for relocation */
+ mark_block_processed(rc, cur);
+
+ /*
+ * Backref nodes for tree leaves are deleted from the cache.
+ * Backref nodes for upper level tree blocks are left in the
+ * cache to avoid unnecessary backref lookup.
+ */
+ if (cur->level > 0) {
+ list_add(&cur->list, &cache->detached);
+ cur->detached = 1;
+ } else {
+ rb_erase(&cur->rb_node, &cache->rb_root);
+ btrfs_backref_free_node(cache, cur);
+ }
+ }
+ return ret;
+}
+
+/*
+ * Build backref tree for a given tree block. Root of the backref tree
+ * corresponds the tree block, leaves of the backref tree correspond roots of
+ * b-trees that reference the tree block.
+ *
+ * The basic idea of this function is check backrefs of a given block to find
+ * upper level blocks that reference the block, and then check backrefs of
+ * these upper level blocks recursively. The recursion stops when tree root is
+ * reached or backrefs for the block is cached.
+ *
+ * NOTE: if we find that backrefs for a block are cached, we know backrefs for
+ * all upper level blocks that directly/indirectly reference the block are also
+ * cached.
+ */
+static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
+ struct reloc_control *rc, struct btrfs_key *node_key,
+ int level, u64 bytenr)
+{
+ struct btrfs_backref_iter *iter;
+ struct btrfs_backref_cache *cache = &rc->backref_cache;
+ /* For searching parent of TREE_BLOCK_REF */
+ struct btrfs_path *path;
+ struct btrfs_backref_node *cur;
+ struct btrfs_backref_node *node = NULL;
+ struct btrfs_backref_edge *edge;
+ int ret;
+ int err = 0;
+
+ iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info, GFP_NOFS);
+ if (!iter)
+ return ERR_PTR(-ENOMEM);
+ path = btrfs_alloc_path();
+ if (!path) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ node = btrfs_backref_alloc_node(cache, bytenr, level);
+ if (!node) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ node->lowest = 1;
+ cur = node;
+
+ /* Breadth-first search to build backref cache */
+ do {
+ ret = btrfs_backref_add_tree_node(cache, path, iter, node_key,
+ cur);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ edge = list_first_entry_or_null(&cache->pending_edge,
+ struct btrfs_backref_edge, list[UPPER]);
+ /*
+ * The pending list isn't empty, take the first block to
+ * process
+ */
+ if (edge) {
+ list_del_init(&edge->list[UPPER]);
+ cur = edge->node[UPPER];
+ }
+ } while (edge);
+
+ /* Finish the upper linkage of newly added edges/nodes */
+ ret = btrfs_backref_finish_upper_links(cache, node);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+
+ if (handle_useless_nodes(rc, node))
+ node = NULL;
+out:
+ btrfs_backref_iter_free(iter);
+ btrfs_free_path(path);
+ if (err) {
+ btrfs_backref_error_cleanup(cache, node);
+ return ERR_PTR(err);
+ }
+ ASSERT(!node || !node->detached);
+ ASSERT(list_empty(&cache->useless_node) &&
+ list_empty(&cache->pending_edge));
+ return node;
+}
+
+/*
+ * helper to add backref node for the newly created snapshot.
+ * the backref node is created by cloning backref node that
+ * corresponds to root of source tree
+ */
+static int clone_backref_node(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct btrfs_root *src,
+ struct btrfs_root *dest)
+{
+ struct btrfs_root *reloc_root = src->reloc_root;
+ struct btrfs_backref_cache *cache = &rc->backref_cache;
+ struct btrfs_backref_node *node = NULL;
+ struct btrfs_backref_node *new_node;
+ struct btrfs_backref_edge *edge;
+ struct btrfs_backref_edge *new_edge;
+ struct rb_node *rb_node;
+
+ if (cache->last_trans > 0)
+ update_backref_cache(trans, cache);
+
+ rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start);
+ if (rb_node) {
+ node = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
+ if (node->detached)
+ node = NULL;
+ else
+ BUG_ON(node->new_bytenr != reloc_root->node->start);
+ }
+
+ if (!node) {
+ rb_node = rb_simple_search(&cache->rb_root,
+ reloc_root->commit_root->start);
+ if (rb_node) {
+ node = rb_entry(rb_node, struct btrfs_backref_node,
+ rb_node);
+ BUG_ON(node->detached);
+ }
+ }
+
+ if (!node)
+ return 0;
+
+ new_node = btrfs_backref_alloc_node(cache, dest->node->start,
+ node->level);
+ if (!new_node)
+ return -ENOMEM;
+
+ new_node->lowest = node->lowest;
+ new_node->checked = 1;
+ new_node->root = btrfs_grab_root(dest);
+ ASSERT(new_node->root);
+
+ if (!node->lowest) {
+ list_for_each_entry(edge, &node->lower, list[UPPER]) {
+ new_edge = btrfs_backref_alloc_edge(cache);
+ if (!new_edge)
+ goto fail;
+
+ btrfs_backref_link_edge(new_edge, edge->node[LOWER],
+ new_node, LINK_UPPER);
+ }
+ } else {
+ list_add_tail(&new_node->lower, &cache->leaves);
+ }
+
+ rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr,
+ &new_node->rb_node);
+ if (rb_node)
+ btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST);
+
+ if (!new_node->lowest) {
+ list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
+ list_add_tail(&new_edge->list[LOWER],
+ &new_edge->node[LOWER]->upper);
+ }
+ }
+ return 0;
+fail:
+ while (!list_empty(&new_node->lower)) {
+ new_edge = list_entry(new_node->lower.next,
+ struct btrfs_backref_edge, list[UPPER]);
+ list_del(&new_edge->list[UPPER]);
+ btrfs_backref_free_edge(cache, new_edge);
+ }
+ btrfs_backref_free_node(cache, new_node);
+ return -ENOMEM;
+}
+
+/*
+ * helper to add 'address of tree root -> reloc tree' mapping
+ */
+static int __must_check __add_reloc_root(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct rb_node *rb_node;
+ struct mapping_node *node;
+ struct reloc_control *rc = fs_info->reloc_ctl;
+
+ node = kmalloc(sizeof(*node), GFP_NOFS);
+ if (!node)
+ return -ENOMEM;
+
+ node->bytenr = root->commit_root->start;
+ node->data = root;
+
+ spin_lock(&rc->reloc_root_tree.lock);
+ rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
+ node->bytenr, &node->rb_node);
+ spin_unlock(&rc->reloc_root_tree.lock);
+ if (rb_node) {
+ btrfs_err(fs_info,
+ "Duplicate root found for start=%llu while inserting into relocation tree",
+ node->bytenr);
+ return -EEXIST;
+ }
+
+ list_add_tail(&root->root_list, &rc->reloc_roots);
+ return 0;
+}
+
+/*
+ * helper to delete the 'address of tree root -> reloc tree'
+ * mapping
+ */
+static void __del_reloc_root(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct rb_node *rb_node;
+ struct mapping_node *node = NULL;
+ struct reloc_control *rc = fs_info->reloc_ctl;
+ bool put_ref = false;
+
+ if (rc && root->node) {
+ spin_lock(&rc->reloc_root_tree.lock);
+ rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
+ root->commit_root->start);
+ if (rb_node) {
+ node = rb_entry(rb_node, struct mapping_node, rb_node);
+ rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
+ RB_CLEAR_NODE(&node->rb_node);
+ }
+ spin_unlock(&rc->reloc_root_tree.lock);
+ ASSERT(!node || (struct btrfs_root *)node->data == root);
+ }
+
+ /*
+ * We only put the reloc root here if it's on the list. There's a lot
+ * of places where the pattern is to splice the rc->reloc_roots, process
+ * the reloc roots, and then add the reloc root back onto
+ * rc->reloc_roots. If we call __del_reloc_root while it's off of the
+ * list we don't want the reference being dropped, because the guy
+ * messing with the list is in charge of the reference.
+ */
+ spin_lock(&fs_info->trans_lock);
+ if (!list_empty(&root->root_list)) {
+ put_ref = true;
+ list_del_init(&root->root_list);
+ }
+ spin_unlock(&fs_info->trans_lock);
+ if (put_ref)
+ btrfs_put_root(root);
+ kfree(node);
+}
+
+/*
+ * helper to update the 'address of tree root -> reloc tree'
+ * mapping
+ */
+static int __update_reloc_root(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct rb_node *rb_node;
+ struct mapping_node *node = NULL;
+ struct reloc_control *rc = fs_info->reloc_ctl;
+
+ spin_lock(&rc->reloc_root_tree.lock);
+ rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
+ root->commit_root->start);
+ if (rb_node) {
+ node = rb_entry(rb_node, struct mapping_node, rb_node);
+ rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
+ }
+ spin_unlock(&rc->reloc_root_tree.lock);
+
+ if (!node)
+ return 0;
+ BUG_ON((struct btrfs_root *)node->data != root);
+
+ spin_lock(&rc->reloc_root_tree.lock);
+ node->bytenr = root->node->start;
+ rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
+ node->bytenr, &node->rb_node);
+ spin_unlock(&rc->reloc_root_tree.lock);
+ if (rb_node)
+ btrfs_backref_panic(fs_info, node->bytenr, -EEXIST);
+ return 0;
+}
+
+static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 objectid)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_root *reloc_root;
+ struct extent_buffer *eb;
+ struct btrfs_root_item *root_item;
+ struct btrfs_key root_key;
+ int ret = 0;
+ bool must_abort = false;
+
+ root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
+ if (!root_item)
+ return ERR_PTR(-ENOMEM);
+
+ root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
+ root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root_key.offset = objectid;
+
+ if (root->root_key.objectid == objectid) {
+ u64 commit_root_gen;
+
+ /* called by btrfs_init_reloc_root */
+ ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
+ BTRFS_TREE_RELOC_OBJECTID);
+ if (ret)
+ goto fail;
+
+ /*
+ * Set the last_snapshot field to the generation of the commit
+ * root - like this ctree.c:btrfs_block_can_be_shared() behaves
+ * correctly (returns true) when the relocation root is created
+ * either inside the critical section of a transaction commit
+ * (through transaction.c:qgroup_account_snapshot()) and when
+ * it's created before the transaction commit is started.
+ */
+ commit_root_gen = btrfs_header_generation(root->commit_root);
+ btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
+ } else {
+ /*
+ * called by btrfs_reloc_post_snapshot_hook.
+ * the source tree is a reloc tree, all tree blocks
+ * modified after it was created have RELOC flag
+ * set in their headers. so it's OK to not update
+ * the 'last_snapshot'.
+ */
+ ret = btrfs_copy_root(trans, root, root->node, &eb,
+ BTRFS_TREE_RELOC_OBJECTID);
+ if (ret)
+ goto fail;
+ }
+
+ /*
+ * We have changed references at this point, we must abort the
+ * transaction if anything fails.
+ */
+ must_abort = true;
+
+ memcpy(root_item, &root->root_item, sizeof(*root_item));
+ btrfs_set_root_bytenr(root_item, eb->start);
+ btrfs_set_root_level(root_item, btrfs_header_level(eb));
+ btrfs_set_root_generation(root_item, trans->transid);
+
+ if (root->root_key.objectid == objectid) {
+ btrfs_set_root_refs(root_item, 0);
+ memset(&root_item->drop_progress, 0,
+ sizeof(struct btrfs_disk_key));
+ btrfs_set_root_drop_level(root_item, 0);
+ }
+
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+
+ ret = btrfs_insert_root(trans, fs_info->tree_root,
+ &root_key, root_item);
+ if (ret)
+ goto fail;
+
+ kfree(root_item);
+
+ reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
+ if (IS_ERR(reloc_root)) {
+ ret = PTR_ERR(reloc_root);
+ goto abort;
+ }
+ set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
+ reloc_root->last_trans = trans->transid;
+ return reloc_root;
+fail:
+ kfree(root_item);
+abort:
+ if (must_abort)
+ btrfs_abort_transaction(trans, ret);
+ return ERR_PTR(ret);
+}
+
+/*
+ * create reloc tree for a given fs tree. reloc tree is just a
+ * snapshot of the fs tree with special root objectid.
+ *
+ * The reloc_root comes out of here with two references, one for
+ * root->reloc_root, and another for being on the rc->reloc_roots list.
+ */
+int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_root *reloc_root;
+ struct reloc_control *rc = fs_info->reloc_ctl;
+ struct btrfs_block_rsv *rsv;
+ int clear_rsv = 0;
+ int ret;
+
+ if (!rc)
+ return 0;
+
+ /*
+ * The subvolume has reloc tree but the swap is finished, no need to
+ * create/update the dead reloc tree
+ */
+ if (reloc_root_is_dead(root))
+ return 0;
+
+ /*
+ * This is subtle but important. We do not do
+ * record_root_in_transaction for reloc roots, instead we record their
+ * corresponding fs root, and then here we update the last trans for the
+ * reloc root. This means that we have to do this for the entire life
+ * of the reloc root, regardless of which stage of the relocation we are
+ * in.
+ */
+ if (root->reloc_root) {
+ reloc_root = root->reloc_root;
+ reloc_root->last_trans = trans->transid;
+ return 0;
+ }
+
+ /*
+ * We are merging reloc roots, we do not need new reloc trees. Also
+ * reloc trees never need their own reloc tree.
+ */
+ if (!rc->create_reloc_tree ||
+ root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+ return 0;
+
+ if (!trans->reloc_reserved) {
+ rsv = trans->block_rsv;
+ trans->block_rsv = rc->block_rsv;
+ clear_rsv = 1;
+ }
+ reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
+ if (clear_rsv)
+ trans->block_rsv = rsv;
+ if (IS_ERR(reloc_root))
+ return PTR_ERR(reloc_root);
+
+ ret = __add_reloc_root(reloc_root);
+ ASSERT(ret != -EEXIST);
+ if (ret) {
+ /* Pairs with create_reloc_root */
+ btrfs_put_root(reloc_root);
+ return ret;
+ }
+ root->reloc_root = btrfs_grab_root(reloc_root);
+ return 0;
+}
+
+/*
+ * update root item of reloc tree
+ */
+int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_root *reloc_root;
+ struct btrfs_root_item *root_item;
+ int ret;
+
+ if (!have_reloc_root(root))
+ return 0;
+
+ reloc_root = root->reloc_root;
+ root_item = &reloc_root->root_item;
+
+ /*
+ * We are probably ok here, but __del_reloc_root() will drop its ref of
+ * the root. We have the ref for root->reloc_root, but just in case
+ * hold it while we update the reloc root.
+ */
+ btrfs_grab_root(reloc_root);
+
+ /* root->reloc_root will stay until current relocation finished */
+ if (fs_info->reloc_ctl->merge_reloc_tree &&
+ btrfs_root_refs(root_item) == 0) {
+ set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
+ /*
+ * Mark the tree as dead before we change reloc_root so
+ * have_reloc_root will not touch it from now on.
+ */
+ smp_wmb();
+ __del_reloc_root(reloc_root);
+ }
+
+ if (reloc_root->commit_root != reloc_root->node) {
+ __update_reloc_root(reloc_root);
+ btrfs_set_root_node(root_item, reloc_root->node);
+ free_extent_buffer(reloc_root->commit_root);
+ reloc_root->commit_root = btrfs_root_node(reloc_root);
+ }
+
+ ret = btrfs_update_root(trans, fs_info->tree_root,
+ &reloc_root->root_key, root_item);
+ btrfs_put_root(reloc_root);
+ return ret;
+}
+
+/*
+ * helper to find first cached inode with inode number >= objectid
+ * in a subvolume
+ */
+static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
+{
+ struct rb_node *node;
+ struct rb_node *prev;
+ struct btrfs_inode *entry;
+ struct inode *inode;
+
+ spin_lock(&root->inode_lock);
+again:
+ node = root->inode_tree.rb_node;
+ prev = NULL;
+ while (node) {
+ prev = node;
+ entry = rb_entry(node, struct btrfs_inode, rb_node);
+
+ if (objectid < btrfs_ino(entry))
+ node = node->rb_left;
+ else if (objectid > btrfs_ino(entry))
+ node = node->rb_right;
+ else
+ break;
+ }
+ if (!node) {
+ while (prev) {
+ entry = rb_entry(prev, struct btrfs_inode, rb_node);
+ if (objectid <= btrfs_ino(entry)) {
+ node = prev;
+ break;
+ }
+ prev = rb_next(prev);
+ }
+ }
+ while (node) {
+ entry = rb_entry(node, struct btrfs_inode, rb_node);
+ inode = igrab(&entry->vfs_inode);
+ if (inode) {
+ spin_unlock(&root->inode_lock);
+ return inode;
+ }
+
+ objectid = btrfs_ino(entry) + 1;
+ if (cond_resched_lock(&root->inode_lock))
+ goto again;
+
+ node = rb_next(node);
+ }
+ spin_unlock(&root->inode_lock);
+ return NULL;
+}
+
+/*
+ * get new location of data
+ */
+static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
+ u64 bytenr, u64 num_bytes)
+{
+ struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
+ struct btrfs_path *path;
+ struct btrfs_file_extent_item *fi;
+ struct extent_buffer *leaf;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ bytenr -= BTRFS_I(reloc_inode)->index_cnt;
+ ret = btrfs_lookup_file_extent(NULL, root, path,
+ btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
+ btrfs_file_extent_compression(leaf, fi) ||
+ btrfs_file_extent_encryption(leaf, fi) ||
+ btrfs_file_extent_other_encoding(leaf, fi));
+
+ if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * update file extent items in the tree leaf to point to
+ * the new locations.
+ */
+static noinline_for_stack
+int replace_file_extents(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct btrfs_root *root,
+ struct extent_buffer *leaf)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *fi;
+ struct inode *inode = NULL;
+ u64 parent;
+ u64 bytenr;
+ u64 new_bytenr = 0;
+ u64 num_bytes;
+ u64 end;
+ u32 nritems;
+ u32 i;
+ int ret = 0;
+ int first = 1;
+ int dirty = 0;
+
+ if (rc->stage != UPDATE_DATA_PTRS)
+ return 0;
+
+ /* reloc trees always use full backref */
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+ parent = leaf->start;
+ else
+ parent = 0;
+
+ nritems = btrfs_header_nritems(leaf);
+ for (i = 0; i < nritems; i++) {
+ struct btrfs_ref ref = { 0 };
+
+ cond_resched();
+ btrfs_item_key_to_cpu(leaf, &key, i);
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+ fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(leaf, fi) ==
+ BTRFS_FILE_EXTENT_INLINE)
+ continue;
+ bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ if (bytenr == 0)
+ continue;
+ if (!in_range(bytenr, rc->block_group->start,
+ rc->block_group->length))
+ continue;
+
+ /*
+ * if we are modifying block in fs tree, wait for read_folio
+ * to complete and drop the extent cache
+ */
+ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
+ if (first) {
+ inode = find_next_inode(root, key.objectid);
+ first = 0;
+ } else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) {
+ btrfs_add_delayed_iput(inode);
+ inode = find_next_inode(root, key.objectid);
+ }
+ if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) {
+ end = key.offset +
+ btrfs_file_extent_num_bytes(leaf, fi);
+ WARN_ON(!IS_ALIGNED(key.offset,
+ fs_info->sectorsize));
+ WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
+ end--;
+ ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
+ key.offset, end);
+ if (!ret)
+ continue;
+
+ btrfs_drop_extent_map_range(BTRFS_I(inode),
+ key.offset, end, true);
+ unlock_extent(&BTRFS_I(inode)->io_tree,
+ key.offset, end, NULL);
+ }
+ }
+
+ ret = get_new_location(rc->data_inode, &new_bytenr,
+ bytenr, num_bytes);
+ if (ret) {
+ /*
+ * Don't have to abort since we've not changed anything
+ * in the file extent yet.
+ */
+ break;
+ }
+
+ btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
+ dirty = 1;
+
+ key.offset -= btrfs_file_extent_offset(leaf, fi);
+ btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
+ num_bytes, parent);
+ btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
+ key.objectid, key.offset,
+ root->root_key.objectid, false);
+ ret = btrfs_inc_extent_ref(trans, &ref);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+
+ btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
+ num_bytes, parent);
+ btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
+ key.objectid, key.offset,
+ root->root_key.objectid, false);
+ ret = btrfs_free_extent(trans, &ref);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+ }
+ if (dirty)
+ btrfs_mark_buffer_dirty(leaf);
+ if (inode)
+ btrfs_add_delayed_iput(inode);
+ return ret;
+}
+
+static noinline_for_stack
+int memcmp_node_keys(struct extent_buffer *eb, int slot,
+ struct btrfs_path *path, int level)
+{
+ struct btrfs_disk_key key1;
+ struct btrfs_disk_key key2;
+ btrfs_node_key(eb, &key1, slot);
+ btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
+ return memcmp(&key1, &key2, sizeof(key1));
+}
+
+/*
+ * try to replace tree blocks in fs tree with the new blocks
+ * in reloc tree. tree blocks haven't been modified since the
+ * reloc tree was create can be replaced.
+ *
+ * if a block was replaced, level of the block + 1 is returned.
+ * if no block got replaced, 0 is returned. if there are other
+ * errors, a negative error number is returned.
+ */
+static noinline_for_stack
+int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
+ struct btrfs_root *dest, struct btrfs_root *src,
+ struct btrfs_path *path, struct btrfs_key *next_key,
+ int lowest_level, int max_level)
+{
+ struct btrfs_fs_info *fs_info = dest->fs_info;
+ struct extent_buffer *eb;
+ struct extent_buffer *parent;
+ struct btrfs_ref ref = { 0 };
+ struct btrfs_key key;
+ u64 old_bytenr;
+ u64 new_bytenr;
+ u64 old_ptr_gen;
+ u64 new_ptr_gen;
+ u64 last_snapshot;
+ u32 blocksize;
+ int cow = 0;
+ int level;
+ int ret;
+ int slot;
+
+ ASSERT(src->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
+ ASSERT(dest->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+
+ last_snapshot = btrfs_root_last_snapshot(&src->root_item);
+again:
+ slot = path->slots[lowest_level];
+ btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
+
+ eb = btrfs_lock_root_node(dest);
+ level = btrfs_header_level(eb);
+
+ if (level < lowest_level) {
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+ return 0;
+ }
+
+ if (cow) {
+ ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb,
+ BTRFS_NESTING_COW);
+ if (ret) {
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+ return ret;
+ }
+ }
+
+ if (next_key) {
+ next_key->objectid = (u64)-1;
+ next_key->type = (u8)-1;
+ next_key->offset = (u64)-1;
+ }
+
+ parent = eb;
+ while (1) {
+ level = btrfs_header_level(parent);
+ ASSERT(level >= lowest_level);
+
+ ret = btrfs_bin_search(parent, &key, &slot);
+ if (ret < 0)
+ break;
+ if (ret && slot > 0)
+ slot--;
+
+ if (next_key && slot + 1 < btrfs_header_nritems(parent))
+ btrfs_node_key_to_cpu(parent, next_key, slot + 1);
+
+ old_bytenr = btrfs_node_blockptr(parent, slot);
+ blocksize = fs_info->nodesize;
+ old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
+
+ if (level <= max_level) {
+ eb = path->nodes[level];
+ new_bytenr = btrfs_node_blockptr(eb,
+ path->slots[level]);
+ new_ptr_gen = btrfs_node_ptr_generation(eb,
+ path->slots[level]);
+ } else {
+ new_bytenr = 0;
+ new_ptr_gen = 0;
+ }
+
+ if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
+ ret = level;
+ break;
+ }
+
+ if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
+ memcmp_node_keys(parent, slot, path, level)) {
+ if (level <= lowest_level) {
+ ret = 0;
+ break;
+ }
+
+ eb = btrfs_read_node_slot(parent, slot);
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
+ break;
+ }
+ btrfs_tree_lock(eb);
+ if (cow) {
+ ret = btrfs_cow_block(trans, dest, eb, parent,
+ slot, &eb,
+ BTRFS_NESTING_COW);
+ if (ret) {
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+ break;
+ }
+ }
+
+ btrfs_tree_unlock(parent);
+ free_extent_buffer(parent);
+
+ parent = eb;
+ continue;
+ }
+
+ if (!cow) {
+ btrfs_tree_unlock(parent);
+ free_extent_buffer(parent);
+ cow = 1;
+ goto again;
+ }
+
+ btrfs_node_key_to_cpu(path->nodes[level], &key,
+ path->slots[level]);
+ btrfs_release_path(path);
+
+ path->lowest_level = level;
+ set_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
+ ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
+ clear_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
+ path->lowest_level = 0;
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+ break;
+ }
+
+ /*
+ * Info qgroup to trace both subtrees.
+ *
+ * We must trace both trees.
+ * 1) Tree reloc subtree
+ * If not traced, we will leak data numbers
+ * 2) Fs subtree
+ * If not traced, we will double count old data
+ *
+ * We don't scan the subtree right now, but only record
+ * the swapped tree blocks.
+ * The real subtree rescan is delayed until we have new
+ * CoW on the subtree root node before transaction commit.
+ */
+ ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
+ rc->block_group, parent, slot,
+ path->nodes[level], path->slots[level],
+ last_snapshot);
+ if (ret < 0)
+ break;
+ /*
+ * swap blocks in fs tree and reloc tree.
+ */
+ btrfs_set_node_blockptr(parent, slot, new_bytenr);
+ btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
+ btrfs_mark_buffer_dirty(parent);
+
+ btrfs_set_node_blockptr(path->nodes[level],
+ path->slots[level], old_bytenr);
+ btrfs_set_node_ptr_generation(path->nodes[level],
+ path->slots[level], old_ptr_gen);
+ btrfs_mark_buffer_dirty(path->nodes[level]);
+
+ btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
+ blocksize, path->nodes[level]->start);
+ btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
+ 0, true);
+ ret = btrfs_inc_extent_ref(trans, &ref);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+ btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
+ blocksize, 0);
+ btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0,
+ true);
+ ret = btrfs_inc_extent_ref(trans, &ref);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+
+ btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
+ blocksize, path->nodes[level]->start);
+ btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
+ 0, true);
+ ret = btrfs_free_extent(trans, &ref);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+
+ btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
+ blocksize, 0);
+ btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid,
+ 0, true);
+ ret = btrfs_free_extent(trans, &ref);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+
+ btrfs_unlock_up_safe(path, 0);
+
+ ret = level;
+ break;
+ }
+ btrfs_tree_unlock(parent);
+ free_extent_buffer(parent);
+ return ret;
+}
+
+/*
+ * helper to find next relocated block in reloc tree
+ */
+static noinline_for_stack
+int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
+ int *level)
+{
+ struct extent_buffer *eb;
+ int i;
+ u64 last_snapshot;
+ u32 nritems;
+
+ last_snapshot = btrfs_root_last_snapshot(&root->root_item);
+
+ for (i = 0; i < *level; i++) {
+ free_extent_buffer(path->nodes[i]);
+ path->nodes[i] = NULL;
+ }
+
+ for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
+ eb = path->nodes[i];
+ nritems = btrfs_header_nritems(eb);
+ while (path->slots[i] + 1 < nritems) {
+ path->slots[i]++;
+ if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
+ last_snapshot)
+ continue;
+
+ *level = i;
+ return 0;
+ }
+ free_extent_buffer(path->nodes[i]);
+ path->nodes[i] = NULL;
+ }
+ return 1;
+}
+
+/*
+ * walk down reloc tree to find relocated block of lowest level
+ */
+static noinline_for_stack
+int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
+ int *level)
+{
+ struct extent_buffer *eb = NULL;
+ int i;
+ u64 ptr_gen = 0;
+ u64 last_snapshot;
+ u32 nritems;
+
+ last_snapshot = btrfs_root_last_snapshot(&root->root_item);
+
+ for (i = *level; i > 0; i--) {
+ eb = path->nodes[i];
+ nritems = btrfs_header_nritems(eb);
+ while (path->slots[i] < nritems) {
+ ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
+ if (ptr_gen > last_snapshot)
+ break;
+ path->slots[i]++;
+ }
+ if (path->slots[i] >= nritems) {
+ if (i == *level)
+ break;
+ *level = i + 1;
+ return 0;
+ }
+ if (i == 1) {
+ *level = i;
+ return 0;
+ }
+
+ eb = btrfs_read_node_slot(eb, path->slots[i]);
+ if (IS_ERR(eb))
+ return PTR_ERR(eb);
+ BUG_ON(btrfs_header_level(eb) != i - 1);
+ path->nodes[i - 1] = eb;
+ path->slots[i - 1] = 0;
+ }
+ return 1;
+}
+
+/*
+ * invalidate extent cache for file extents whose key in range of
+ * [min_key, max_key)
+ */
+static int invalidate_extent_cache(struct btrfs_root *root,
+ struct btrfs_key *min_key,
+ struct btrfs_key *max_key)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct inode *inode = NULL;
+ u64 objectid;
+ u64 start, end;
+ u64 ino;
+
+ objectid = min_key->objectid;
+ while (1) {
+ cond_resched();
+ iput(inode);
+
+ if (objectid > max_key->objectid)
+ break;
+
+ inode = find_next_inode(root, objectid);
+ if (!inode)
+ break;
+ ino = btrfs_ino(BTRFS_I(inode));
+
+ if (ino > max_key->objectid) {
+ iput(inode);
+ break;
+ }
+
+ objectid = ino + 1;
+ if (!S_ISREG(inode->i_mode))
+ continue;
+
+ if (unlikely(min_key->objectid == ino)) {
+ if (min_key->type > BTRFS_EXTENT_DATA_KEY)
+ continue;
+ if (min_key->type < BTRFS_EXTENT_DATA_KEY)
+ start = 0;
+ else {
+ start = min_key->offset;
+ WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
+ }
+ } else {
+ start = 0;
+ }
+
+ if (unlikely(max_key->objectid == ino)) {
+ if (max_key->type < BTRFS_EXTENT_DATA_KEY)
+ continue;
+ if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
+ end = (u64)-1;
+ } else {
+ if (max_key->offset == 0)
+ continue;
+ end = max_key->offset;
+ WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
+ end--;
+ }
+ } else {
+ end = (u64)-1;
+ }
+
+ /* the lock_extent waits for read_folio to complete */
+ lock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL);
+ btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, true);
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL);
+ }
+ return 0;
+}
+
+static int find_next_key(struct btrfs_path *path, int level,
+ struct btrfs_key *key)
+
+{
+ while (level < BTRFS_MAX_LEVEL) {
+ if (!path->nodes[level])
+ break;
+ if (path->slots[level] + 1 <
+ btrfs_header_nritems(path->nodes[level])) {
+ btrfs_node_key_to_cpu(path->nodes[level], key,
+ path->slots[level] + 1);
+ return 0;
+ }
+ level++;
+ }
+ return 1;
+}
+
+/*
+ * Insert current subvolume into reloc_control::dirty_subvol_roots
+ */
+static int insert_dirty_subvol(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct btrfs_root *root)
+{
+ struct btrfs_root *reloc_root = root->reloc_root;
+ struct btrfs_root_item *reloc_root_item;
+ int ret;
+
+ /* @root must be a subvolume tree root with a valid reloc tree */
+ ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+ ASSERT(reloc_root);
+
+ reloc_root_item = &reloc_root->root_item;
+ memset(&reloc_root_item->drop_progress, 0,
+ sizeof(reloc_root_item->drop_progress));
+ btrfs_set_root_drop_level(reloc_root_item, 0);
+ btrfs_set_root_refs(reloc_root_item, 0);
+ ret = btrfs_update_reloc_root(trans, root);
+ if (ret)
+ return ret;
+
+ if (list_empty(&root->reloc_dirty_list)) {
+ btrfs_grab_root(root);
+ list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
+ }
+
+ return 0;
+}
+
+static int clean_dirty_subvols(struct reloc_control *rc)
+{
+ struct btrfs_root *root;
+ struct btrfs_root *next;
+ int ret = 0;
+ int ret2;
+
+ list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
+ reloc_dirty_list) {
+ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
+ /* Merged subvolume, cleanup its reloc root */
+ struct btrfs_root *reloc_root = root->reloc_root;
+
+ list_del_init(&root->reloc_dirty_list);
+ root->reloc_root = NULL;
+ /*
+ * Need barrier to ensure clear_bit() only happens after
+ * root->reloc_root = NULL. Pairs with have_reloc_root.
+ */
+ smp_wmb();
+ clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
+ if (reloc_root) {
+ /*
+ * btrfs_drop_snapshot drops our ref we hold for
+ * ->reloc_root. If it fails however we must
+ * drop the ref ourselves.
+ */
+ ret2 = btrfs_drop_snapshot(reloc_root, 0, 1);
+ if (ret2 < 0) {
+ btrfs_put_root(reloc_root);
+ if (!ret)
+ ret = ret2;
+ }
+ }
+ btrfs_put_root(root);
+ } else {
+ /* Orphan reloc tree, just clean it up */
+ ret2 = btrfs_drop_snapshot(root, 0, 1);
+ if (ret2 < 0) {
+ btrfs_put_root(root);
+ if (!ret)
+ ret = ret2;
+ }
+ }
+ }
+ return ret;
+}
+
+/*
+ * merge the relocated tree blocks in reloc tree with corresponding
+ * fs tree.
+ */
+static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
+ struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
+ struct btrfs_key key;
+ struct btrfs_key next_key;
+ struct btrfs_trans_handle *trans = NULL;
+ struct btrfs_root *reloc_root;
+ struct btrfs_root_item *root_item;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ int reserve_level;
+ int level;
+ int max_level;
+ int replaced = 0;
+ int ret = 0;
+ u32 min_reserved;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = READA_FORWARD;
+
+ reloc_root = root->reloc_root;
+ root_item = &reloc_root->root_item;
+
+ if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
+ level = btrfs_root_level(root_item);
+ atomic_inc(&reloc_root->node->refs);
+ path->nodes[level] = reloc_root->node;
+ path->slots[level] = 0;
+ } else {
+ btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
+
+ level = btrfs_root_drop_level(root_item);
+ BUG_ON(level == 0);
+ path->lowest_level = level;
+ ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
+ path->lowest_level = 0;
+ if (ret < 0) {
+ btrfs_free_path(path);
+ return ret;
+ }
+
+ btrfs_node_key_to_cpu(path->nodes[level], &next_key,
+ path->slots[level]);
+ WARN_ON(memcmp(&key, &next_key, sizeof(key)));
+
+ btrfs_unlock_up_safe(path, 0);
+ }
+
+ /*
+ * In merge_reloc_root(), we modify the upper level pointer to swap the
+ * tree blocks between reloc tree and subvolume tree. Thus for tree
+ * block COW, we COW at most from level 1 to root level for each tree.
+ *
+ * Thus the needed metadata size is at most root_level * nodesize,
+ * and * 2 since we have two trees to COW.
+ */
+ reserve_level = max_t(int, 1, btrfs_root_level(root_item));
+ min_reserved = fs_info->nodesize * reserve_level * 2;
+ memset(&next_key, 0, sizeof(next_key));
+
+ while (1) {
+ ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv,
+ min_reserved,
+ BTRFS_RESERVE_FLUSH_LIMIT);
+ if (ret)
+ goto out;
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto out;
+ }
+
+ /*
+ * At this point we no longer have a reloc_control, so we can't
+ * depend on btrfs_init_reloc_root to update our last_trans.
+ *
+ * But that's ok, we started the trans handle on our
+ * corresponding fs_root, which means it's been added to the
+ * dirty list. At commit time we'll still call
+ * btrfs_update_reloc_root() and update our root item
+ * appropriately.
+ */
+ reloc_root->last_trans = trans->transid;
+ trans->block_rsv = rc->block_rsv;
+
+ replaced = 0;
+ max_level = level;
+
+ ret = walk_down_reloc_tree(reloc_root, path, &level);
+ if (ret < 0)
+ goto out;
+ if (ret > 0)
+ break;
+
+ if (!find_next_key(path, level, &key) &&
+ btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
+ ret = 0;
+ } else {
+ ret = replace_path(trans, rc, root, reloc_root, path,
+ &next_key, level, max_level);
+ }
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ level = ret;
+ btrfs_node_key_to_cpu(path->nodes[level], &key,
+ path->slots[level]);
+ replaced = 1;
+ }
+
+ ret = walk_up_reloc_tree(reloc_root, path, &level);
+ if (ret > 0)
+ break;
+
+ BUG_ON(level == 0);
+ /*
+ * save the merging progress in the drop_progress.
+ * this is OK since root refs == 1 in this case.
+ */
+ btrfs_node_key(path->nodes[level], &root_item->drop_progress,
+ path->slots[level]);
+ btrfs_set_root_drop_level(root_item, level);
+
+ btrfs_end_transaction_throttle(trans);
+ trans = NULL;
+
+ btrfs_btree_balance_dirty(fs_info);
+
+ if (replaced && rc->stage == UPDATE_DATA_PTRS)
+ invalidate_extent_cache(root, &key, &next_key);
+ }
+
+ /*
+ * handle the case only one block in the fs tree need to be
+ * relocated and the block is tree root.
+ */
+ leaf = btrfs_lock_root_node(root);
+ ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf,
+ BTRFS_NESTING_COW);
+ btrfs_tree_unlock(leaf);
+ free_extent_buffer(leaf);
+out:
+ btrfs_free_path(path);
+
+ if (ret == 0) {
+ ret = insert_dirty_subvol(trans, rc, root);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ }
+
+ if (trans)
+ btrfs_end_transaction_throttle(trans);
+
+ btrfs_btree_balance_dirty(fs_info);
+
+ if (replaced && rc->stage == UPDATE_DATA_PTRS)
+ invalidate_extent_cache(root, &key, &next_key);
+
+ return ret;
+}
+
+static noinline_for_stack
+int prepare_to_merge(struct reloc_control *rc, int err)
+{
+ struct btrfs_root *root = rc->extent_root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_root *reloc_root;
+ struct btrfs_trans_handle *trans;
+ LIST_HEAD(reloc_roots);
+ u64 num_bytes = 0;
+ int ret;
+
+ mutex_lock(&fs_info->reloc_mutex);
+ rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
+ rc->merging_rsv_size += rc->nodes_relocated * 2;
+ mutex_unlock(&fs_info->reloc_mutex);
+
+again:
+ if (!err) {
+ num_bytes = rc->merging_rsv_size;
+ ret = btrfs_block_rsv_add(fs_info, rc->block_rsv, num_bytes,
+ BTRFS_RESERVE_FLUSH_ALL);
+ if (ret)
+ err = ret;
+ }
+
+ trans = btrfs_join_transaction(rc->extent_root);
+ if (IS_ERR(trans)) {
+ if (!err)
+ btrfs_block_rsv_release(fs_info, rc->block_rsv,
+ num_bytes, NULL);
+ return PTR_ERR(trans);
+ }
+
+ if (!err) {
+ if (num_bytes != rc->merging_rsv_size) {
+ btrfs_end_transaction(trans);
+ btrfs_block_rsv_release(fs_info, rc->block_rsv,
+ num_bytes, NULL);
+ goto again;
+ }
+ }
+
+ rc->merge_reloc_tree = 1;
+
+ while (!list_empty(&rc->reloc_roots)) {
+ reloc_root = list_entry(rc->reloc_roots.next,
+ struct btrfs_root, root_list);
+ list_del_init(&reloc_root->root_list);
+
+ root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
+ false);
+ if (IS_ERR(root)) {
+ /*
+ * Even if we have an error we need this reloc root
+ * back on our list so we can clean up properly.
+ */
+ list_add(&reloc_root->root_list, &reloc_roots);
+ btrfs_abort_transaction(trans, (int)PTR_ERR(root));
+ if (!err)
+ err = PTR_ERR(root);
+ break;
+ }
+
+ if (unlikely(root->reloc_root != reloc_root)) {
+ if (root->reloc_root) {
+ btrfs_err(fs_info,
+"reloc tree mismatch, root %lld has reloc root key (%lld %u %llu) gen %llu, expect reloc root key (%lld %u %llu) gen %llu",
+ root->root_key.objectid,
+ root->reloc_root->root_key.objectid,
+ root->reloc_root->root_key.type,
+ root->reloc_root->root_key.offset,
+ btrfs_root_generation(
+ &root->reloc_root->root_item),
+ reloc_root->root_key.objectid,
+ reloc_root->root_key.type,
+ reloc_root->root_key.offset,
+ btrfs_root_generation(
+ &reloc_root->root_item));
+ } else {
+ btrfs_err(fs_info,
+"reloc tree mismatch, root %lld has no reloc root, expect reloc root key (%lld %u %llu) gen %llu",
+ root->root_key.objectid,
+ reloc_root->root_key.objectid,
+ reloc_root->root_key.type,
+ reloc_root->root_key.offset,
+ btrfs_root_generation(
+ &reloc_root->root_item));
+ }
+ list_add(&reloc_root->root_list, &reloc_roots);
+ btrfs_put_root(root);
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ if (!err)
+ err = -EUCLEAN;
+ break;
+ }
+
+ /*
+ * set reference count to 1, so btrfs_recover_relocation
+ * knows it should resumes merging
+ */
+ if (!err)
+ btrfs_set_root_refs(&reloc_root->root_item, 1);
+ ret = btrfs_update_reloc_root(trans, root);
+
+ /*
+ * Even if we have an error we need this reloc root back on our
+ * list so we can clean up properly.
+ */
+ list_add(&reloc_root->root_list, &reloc_roots);
+ btrfs_put_root(root);
+
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ if (!err)
+ err = ret;
+ break;
+ }
+ }
+
+ list_splice(&reloc_roots, &rc->reloc_roots);
+
+ if (!err)
+ err = btrfs_commit_transaction(trans);
+ else
+ btrfs_end_transaction(trans);
+ return err;
+}
+
+static noinline_for_stack
+void free_reloc_roots(struct list_head *list)
+{
+ struct btrfs_root *reloc_root, *tmp;
+
+ list_for_each_entry_safe(reloc_root, tmp, list, root_list)
+ __del_reloc_root(reloc_root);
+}
+
+static noinline_for_stack
+void merge_reloc_roots(struct reloc_control *rc)
+{
+ struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
+ struct btrfs_root *root;
+ struct btrfs_root *reloc_root;
+ LIST_HEAD(reloc_roots);
+ int found = 0;
+ int ret = 0;
+again:
+ root = rc->extent_root;
+
+ /*
+ * this serializes us with btrfs_record_root_in_transaction,
+ * we have to make sure nobody is in the middle of
+ * adding their roots to the list while we are
+ * doing this splice
+ */
+ mutex_lock(&fs_info->reloc_mutex);
+ list_splice_init(&rc->reloc_roots, &reloc_roots);
+ mutex_unlock(&fs_info->reloc_mutex);
+
+ while (!list_empty(&reloc_roots)) {
+ found = 1;
+ reloc_root = list_entry(reloc_roots.next,
+ struct btrfs_root, root_list);
+
+ root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
+ false);
+ if (btrfs_root_refs(&reloc_root->root_item) > 0) {
+ if (WARN_ON(IS_ERR(root))) {
+ /*
+ * For recovery we read the fs roots on mount,
+ * and if we didn't find the root then we marked
+ * the reloc root as a garbage root. For normal
+ * relocation obviously the root should exist in
+ * memory. However there's no reason we can't
+ * handle the error properly here just in case.
+ */
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ if (WARN_ON(root->reloc_root != reloc_root)) {
+ /*
+ * This can happen if on-disk metadata has some
+ * corruption, e.g. bad reloc tree key offset.
+ */
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = merge_reloc_root(rc, root);
+ btrfs_put_root(root);
+ if (ret) {
+ if (list_empty(&reloc_root->root_list))
+ list_add_tail(&reloc_root->root_list,
+ &reloc_roots);
+ goto out;
+ }
+ } else {
+ if (!IS_ERR(root)) {
+ if (root->reloc_root == reloc_root) {
+ root->reloc_root = NULL;
+ btrfs_put_root(reloc_root);
+ }
+ clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE,
+ &root->state);
+ btrfs_put_root(root);
+ }
+
+ list_del_init(&reloc_root->root_list);
+ /* Don't forget to queue this reloc root for cleanup */
+ list_add_tail(&reloc_root->reloc_dirty_list,
+ &rc->dirty_subvol_roots);
+ }
+ }
+
+ if (found) {
+ found = 0;
+ goto again;
+ }
+out:
+ if (ret) {
+ btrfs_handle_fs_error(fs_info, ret, NULL);
+ free_reloc_roots(&reloc_roots);
+
+ /* new reloc root may be added */
+ mutex_lock(&fs_info->reloc_mutex);
+ list_splice_init(&rc->reloc_roots, &reloc_roots);
+ mutex_unlock(&fs_info->reloc_mutex);
+ free_reloc_roots(&reloc_roots);
+ }
+
+ /*
+ * We used to have
+ *
+ * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
+ *
+ * here, but it's wrong. If we fail to start the transaction in
+ * prepare_to_merge() we will have only 0 ref reloc roots, none of which
+ * have actually been removed from the reloc_root_tree rb tree. This is
+ * fine because we're bailing here, and we hold a reference on the root
+ * for the list that holds it, so these roots will be cleaned up when we
+ * do the reloc_dirty_list afterwards. Meanwhile the root->reloc_root
+ * will be cleaned up on unmount.
+ *
+ * The remaining nodes will be cleaned up by free_reloc_control.
+ */
+}
+
+static void free_block_list(struct rb_root *blocks)
+{
+ struct tree_block *block;
+ struct rb_node *rb_node;
+ while ((rb_node = rb_first(blocks))) {
+ block = rb_entry(rb_node, struct tree_block, rb_node);
+ rb_erase(rb_node, blocks);
+ kfree(block);
+ }
+}
+
+static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
+ struct btrfs_root *reloc_root)
+{
+ struct btrfs_fs_info *fs_info = reloc_root->fs_info;
+ struct btrfs_root *root;
+ int ret;
+
+ if (reloc_root->last_trans == trans->transid)
+ return 0;
+
+ root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false);
+
+ /*
+ * This should succeed, since we can't have a reloc root without having
+ * already looked up the actual root and created the reloc root for this
+ * root.
+ *
+ * However if there's some sort of corruption where we have a ref to a
+ * reloc root without a corresponding root this could return ENOENT.
+ */
+ if (IS_ERR(root)) {
+ ASSERT(0);
+ return PTR_ERR(root);
+ }
+ if (root->reloc_root != reloc_root) {
+ ASSERT(0);
+ btrfs_err(fs_info,
+ "root %llu has two reloc roots associated with it",
+ reloc_root->root_key.offset);
+ btrfs_put_root(root);
+ return -EUCLEAN;
+ }
+ ret = btrfs_record_root_in_trans(trans, root);
+ btrfs_put_root(root);
+
+ return ret;
+}
+
+static noinline_for_stack
+struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct btrfs_backref_node *node,
+ struct btrfs_backref_edge *edges[])
+{
+ struct btrfs_backref_node *next;
+ struct btrfs_root *root;
+ int index = 0;
+ int ret;
+
+ next = node;
+ while (1) {
+ cond_resched();
+ next = walk_up_backref(next, edges, &index);
+ root = next->root;
+
+ /*
+ * If there is no root, then our references for this block are
+ * incomplete, as we should be able to walk all the way up to a
+ * block that is owned by a root.
+ *
+ * This path is only for SHAREABLE roots, so if we come upon a
+ * non-SHAREABLE root then we have backrefs that resolve
+ * improperly.
+ *
+ * Both of these cases indicate file system corruption, or a bug
+ * in the backref walking code.
+ */
+ if (!root) {
+ ASSERT(0);
+ btrfs_err(trans->fs_info,
+ "bytenr %llu doesn't have a backref path ending in a root",
+ node->bytenr);
+ return ERR_PTR(-EUCLEAN);
+ }
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
+ ASSERT(0);
+ btrfs_err(trans->fs_info,
+ "bytenr %llu has multiple refs with one ending in a non-shareable root",
+ node->bytenr);
+ return ERR_PTR(-EUCLEAN);
+ }
+
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ ret = record_reloc_root_in_trans(trans, root);
+ if (ret)
+ return ERR_PTR(ret);
+ break;
+ }
+
+ ret = btrfs_record_root_in_trans(trans, root);
+ if (ret)
+ return ERR_PTR(ret);
+ root = root->reloc_root;
+
+ /*
+ * We could have raced with another thread which failed, so
+ * root->reloc_root may not be set, return ENOENT in this case.
+ */
+ if (!root)
+ return ERR_PTR(-ENOENT);
+
+ if (next->new_bytenr != root->node->start) {
+ /*
+ * We just created the reloc root, so we shouldn't have
+ * ->new_bytenr set and this shouldn't be in the changed
+ * list. If it is then we have multiple roots pointing
+ * at the same bytenr which indicates corruption, or
+ * we've made a mistake in the backref walking code.
+ */
+ ASSERT(next->new_bytenr == 0);
+ ASSERT(list_empty(&next->list));
+ if (next->new_bytenr || !list_empty(&next->list)) {
+ btrfs_err(trans->fs_info,
+ "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu",
+ node->bytenr, next->bytenr);
+ return ERR_PTR(-EUCLEAN);
+ }
+
+ next->new_bytenr = root->node->start;
+ btrfs_put_root(next->root);
+ next->root = btrfs_grab_root(root);
+ ASSERT(next->root);
+ list_add_tail(&next->list,
+ &rc->backref_cache.changed);
+ mark_block_processed(rc, next);
+ break;
+ }
+
+ WARN_ON(1);
+ root = NULL;
+ next = walk_down_backref(edges, &index);
+ if (!next || next->level <= node->level)
+ break;
+ }
+ if (!root) {
+ /*
+ * This can happen if there's fs corruption or if there's a bug
+ * in the backref lookup code.
+ */
+ ASSERT(0);
+ return ERR_PTR(-ENOENT);
+ }
+
+ next = node;
+ /* setup backref node path for btrfs_reloc_cow_block */
+ while (1) {
+ rc->backref_cache.path[next->level] = next;
+ if (--index < 0)
+ break;
+ next = edges[index]->node[UPPER];
+ }
+ return root;
+}
+
+/*
+ * Select a tree root for relocation.
+ *
+ * Return NULL if the block is not shareable. We should use do_relocation() in
+ * this case.
+ *
+ * Return a tree root pointer if the block is shareable.
+ * Return -ENOENT if the block is root of reloc tree.
+ */
+static noinline_for_stack
+struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
+{
+ struct btrfs_backref_node *next;
+ struct btrfs_root *root;
+ struct btrfs_root *fs_root = NULL;
+ struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+ int index = 0;
+
+ next = node;
+ while (1) {
+ cond_resched();
+ next = walk_up_backref(next, edges, &index);
+ root = next->root;
+
+ /*
+ * This can occur if we have incomplete extent refs leading all
+ * the way up a particular path, in this case return -EUCLEAN.
+ */
+ if (!root)
+ return ERR_PTR(-EUCLEAN);
+
+ /* No other choice for non-shareable tree */
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
+ return root;
+
+ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
+ fs_root = root;
+
+ if (next != node)
+ return NULL;
+
+ next = walk_down_backref(edges, &index);
+ if (!next || next->level <= node->level)
+ break;
+ }
+
+ if (!fs_root)
+ return ERR_PTR(-ENOENT);
+ return fs_root;
+}
+
+static noinline_for_stack
+u64 calcu_metadata_size(struct reloc_control *rc,
+ struct btrfs_backref_node *node, int reserve)
+{
+ struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
+ struct btrfs_backref_node *next = node;
+ struct btrfs_backref_edge *edge;
+ struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+ u64 num_bytes = 0;
+ int index = 0;
+
+ BUG_ON(reserve && node->processed);
+
+ while (next) {
+ cond_resched();
+ while (1) {
+ if (next->processed && (reserve || next != node))
+ break;
+
+ num_bytes += fs_info->nodesize;
+
+ if (list_empty(&next->upper))
+ break;
+
+ edge = list_entry(next->upper.next,
+ struct btrfs_backref_edge, list[LOWER]);
+ edges[index++] = edge;
+ next = edge->node[UPPER];
+ }
+ next = walk_down_backref(edges, &index);
+ }
+ return num_bytes;
+}
+
+static int reserve_metadata_space(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct btrfs_backref_node *node)
+{
+ struct btrfs_root *root = rc->extent_root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 num_bytes;
+ int ret;
+ u64 tmp;
+
+ num_bytes = calcu_metadata_size(rc, node, 1) * 2;
+
+ trans->block_rsv = rc->block_rsv;
+ rc->reserved_bytes += num_bytes;
+
+ /*
+ * We are under a transaction here so we can only do limited flushing.
+ * If we get an enospc just kick back -EAGAIN so we know to drop the
+ * transaction and try to refill when we can flush all the things.
+ */
+ ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes,
+ BTRFS_RESERVE_FLUSH_LIMIT);
+ if (ret) {
+ tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
+ while (tmp <= rc->reserved_bytes)
+ tmp <<= 1;
+ /*
+ * only one thread can access block_rsv at this point,
+ * so we don't need hold lock to protect block_rsv.
+ * we expand more reservation size here to allow enough
+ * space for relocation and we will return earlier in
+ * enospc case.
+ */
+ rc->block_rsv->size = tmp + fs_info->nodesize *
+ RELOCATION_RESERVED_NODES;
+ return -EAGAIN;
+ }
+
+ return 0;
+}
+
+/*
+ * relocate a block tree, and then update pointers in upper level
+ * blocks that reference the block to point to the new location.
+ *
+ * if called by link_to_upper, the block has already been relocated.
+ * in that case this function just updates pointers.
+ */
+static int do_relocation(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct btrfs_backref_node *node,
+ struct btrfs_key *key,
+ struct btrfs_path *path, int lowest)
+{
+ struct btrfs_backref_node *upper;
+ struct btrfs_backref_edge *edge;
+ struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+ struct btrfs_root *root;
+ struct extent_buffer *eb;
+ u32 blocksize;
+ u64 bytenr;
+ int slot;
+ int ret = 0;
+
+ /*
+ * If we are lowest then this is the first time we're processing this
+ * block, and thus shouldn't have an eb associated with it yet.
+ */
+ ASSERT(!lowest || !node->eb);
+
+ path->lowest_level = node->level + 1;
+ rc->backref_cache.path[node->level] = node;
+ list_for_each_entry(edge, &node->upper, list[LOWER]) {
+ struct btrfs_ref ref = { 0 };
+
+ cond_resched();
+
+ upper = edge->node[UPPER];
+ root = select_reloc_root(trans, rc, upper, edges);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ goto next;
+ }
+
+ if (upper->eb && !upper->locked) {
+ if (!lowest) {
+ ret = btrfs_bin_search(upper->eb, key, &slot);
+ if (ret < 0)
+ goto next;
+ BUG_ON(ret);
+ bytenr = btrfs_node_blockptr(upper->eb, slot);
+ if (node->eb->start == bytenr)
+ goto next;
+ }
+ btrfs_backref_drop_node_buffer(upper);
+ }
+
+ if (!upper->eb) {
+ ret = btrfs_search_slot(trans, root, key, path, 0, 1);
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+
+ btrfs_release_path(path);
+ break;
+ }
+
+ if (!upper->eb) {
+ upper->eb = path->nodes[upper->level];
+ path->nodes[upper->level] = NULL;
+ } else {
+ BUG_ON(upper->eb != path->nodes[upper->level]);
+ }
+
+ upper->locked = 1;
+ path->locks[upper->level] = 0;
+
+ slot = path->slots[upper->level];
+ btrfs_release_path(path);
+ } else {
+ ret = btrfs_bin_search(upper->eb, key, &slot);
+ if (ret < 0)
+ goto next;
+ BUG_ON(ret);
+ }
+
+ bytenr = btrfs_node_blockptr(upper->eb, slot);
+ if (lowest) {
+ if (bytenr != node->bytenr) {
+ btrfs_err(root->fs_info,
+ "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu",
+ bytenr, node->bytenr, slot,
+ upper->eb->start);
+ ret = -EIO;
+ goto next;
+ }
+ } else {
+ if (node->eb->start == bytenr)
+ goto next;
+ }
+
+ blocksize = root->fs_info->nodesize;
+ eb = btrfs_read_node_slot(upper->eb, slot);
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
+ goto next;
+ }
+ btrfs_tree_lock(eb);
+
+ if (!node->eb) {
+ ret = btrfs_cow_block(trans, root, eb, upper->eb,
+ slot, &eb, BTRFS_NESTING_COW);
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+ if (ret < 0)
+ goto next;
+ /*
+ * We've just COWed this block, it should have updated
+ * the correct backref node entry.
+ */
+ ASSERT(node->eb == eb);
+ } else {
+ btrfs_set_node_blockptr(upper->eb, slot,
+ node->eb->start);
+ btrfs_set_node_ptr_generation(upper->eb, slot,
+ trans->transid);
+ btrfs_mark_buffer_dirty(upper->eb);
+
+ btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
+ node->eb->start, blocksize,
+ upper->eb->start);
+ btrfs_init_tree_ref(&ref, node->level,
+ btrfs_header_owner(upper->eb),
+ root->root_key.objectid, false);
+ ret = btrfs_inc_extent_ref(trans, &ref);
+ if (!ret)
+ ret = btrfs_drop_subtree(trans, root, eb,
+ upper->eb);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ }
+next:
+ if (!upper->pending)
+ btrfs_backref_drop_node_buffer(upper);
+ else
+ btrfs_backref_unlock_node_buffer(upper);
+ if (ret)
+ break;
+ }
+
+ if (!ret && node->pending) {
+ btrfs_backref_drop_node_buffer(node);
+ list_move_tail(&node->list, &rc->backref_cache.changed);
+ node->pending = 0;
+ }
+
+ path->lowest_level = 0;
+
+ /*
+ * We should have allocated all of our space in the block rsv and thus
+ * shouldn't ENOSPC.
+ */
+ ASSERT(ret != -ENOSPC);
+ return ret;
+}
+
+static int link_to_upper(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct btrfs_backref_node *node,
+ struct btrfs_path *path)
+{
+ struct btrfs_key key;
+
+ btrfs_node_key_to_cpu(node->eb, &key, 0);
+ return do_relocation(trans, rc, node, &key, path, 0);
+}
+
+static int finish_pending_nodes(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct btrfs_path *path, int err)
+{
+ LIST_HEAD(list);
+ struct btrfs_backref_cache *cache = &rc->backref_cache;
+ struct btrfs_backref_node *node;
+ int level;
+ int ret;
+
+ for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
+ while (!list_empty(&cache->pending[level])) {
+ node = list_entry(cache->pending[level].next,
+ struct btrfs_backref_node, list);
+ list_move_tail(&node->list, &list);
+ BUG_ON(!node->pending);
+
+ if (!err) {
+ ret = link_to_upper(trans, rc, node, path);
+ if (ret < 0)
+ err = ret;
+ }
+ }
+ list_splice_init(&list, &cache->pending[level]);
+ }
+ return err;
+}
+
+/*
+ * mark a block and all blocks directly/indirectly reference the block
+ * as processed.
+ */
+static void update_processed_blocks(struct reloc_control *rc,
+ struct btrfs_backref_node *node)
+{
+ struct btrfs_backref_node *next = node;
+ struct btrfs_backref_edge *edge;
+ struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+ int index = 0;
+
+ while (next) {
+ cond_resched();
+ while (1) {
+ if (next->processed)
+ break;
+
+ mark_block_processed(rc, next);
+
+ if (list_empty(&next->upper))
+ break;
+
+ edge = list_entry(next->upper.next,
+ struct btrfs_backref_edge, list[LOWER]);
+ edges[index++] = edge;
+ next = edge->node[UPPER];
+ }
+ next = walk_down_backref(edges, &index);
+ }
+}
+
+static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
+{
+ u32 blocksize = rc->extent_root->fs_info->nodesize;
+
+ if (test_range_bit(&rc->processed_blocks, bytenr,
+ bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
+ return 1;
+ return 0;
+}
+
+static int get_tree_block_key(struct btrfs_fs_info *fs_info,
+ struct tree_block *block)
+{
+ struct extent_buffer *eb;
+
+ eb = read_tree_block(fs_info, block->bytenr, block->owner,
+ block->key.offset, block->level, NULL);
+ if (IS_ERR(eb))
+ return PTR_ERR(eb);
+ if (!extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ return -EIO;
+ }
+ if (block->level == 0)
+ btrfs_item_key_to_cpu(eb, &block->key, 0);
+ else
+ btrfs_node_key_to_cpu(eb, &block->key, 0);
+ free_extent_buffer(eb);
+ block->key_ready = 1;
+ return 0;
+}
+
+/*
+ * helper function to relocate a tree block
+ */
+static int relocate_tree_block(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct btrfs_backref_node *node,
+ struct btrfs_key *key,
+ struct btrfs_path *path)
+{
+ struct btrfs_root *root;
+ int ret = 0;
+
+ if (!node)
+ return 0;
+
+ /*
+ * If we fail here we want to drop our backref_node because we are going
+ * to start over and regenerate the tree for it.
+ */
+ ret = reserve_metadata_space(trans, rc, node);
+ if (ret)
+ goto out;
+
+ BUG_ON(node->processed);
+ root = select_one_root(node);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+
+ /* See explanation in select_one_root for the -EUCLEAN case. */
+ ASSERT(ret == -ENOENT);
+ if (ret == -ENOENT) {
+ ret = 0;
+ update_processed_blocks(rc, node);
+ }
+ goto out;
+ }
+
+ if (root) {
+ if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
+ /*
+ * This block was the root block of a root, and this is
+ * the first time we're processing the block and thus it
+ * should not have had the ->new_bytenr modified and
+ * should have not been included on the changed list.
+ *
+ * However in the case of corruption we could have
+ * multiple refs pointing to the same block improperly,
+ * and thus we would trip over these checks. ASSERT()
+ * for the developer case, because it could indicate a
+ * bug in the backref code, however error out for a
+ * normal user in the case of corruption.
+ */
+ ASSERT(node->new_bytenr == 0);
+ ASSERT(list_empty(&node->list));
+ if (node->new_bytenr || !list_empty(&node->list)) {
+ btrfs_err(root->fs_info,
+ "bytenr %llu has improper references to it",
+ node->bytenr);
+ ret = -EUCLEAN;
+ goto out;
+ }
+ ret = btrfs_record_root_in_trans(trans, root);
+ if (ret)
+ goto out;
+ /*
+ * Another thread could have failed, need to check if we
+ * have reloc_root actually set.
+ */
+ if (!root->reloc_root) {
+ ret = -ENOENT;
+ goto out;
+ }
+ root = root->reloc_root;
+ node->new_bytenr = root->node->start;
+ btrfs_put_root(node->root);
+ node->root = btrfs_grab_root(root);
+ ASSERT(node->root);
+ list_add_tail(&node->list, &rc->backref_cache.changed);
+ } else {
+ path->lowest_level = node->level;
+ if (root == root->fs_info->chunk_root)
+ btrfs_reserve_chunk_metadata(trans, false);
+ ret = btrfs_search_slot(trans, root, key, path, 0, 1);
+ btrfs_release_path(path);
+ if (root == root->fs_info->chunk_root)
+ btrfs_trans_release_chunk_metadata(trans);
+ if (ret > 0)
+ ret = 0;
+ }
+ if (!ret)
+ update_processed_blocks(rc, node);
+ } else {
+ ret = do_relocation(trans, rc, node, key, path, 1);
+ }
+out:
+ if (ret || node->level == 0 || node->cowonly)
+ btrfs_backref_cleanup_node(&rc->backref_cache, node);
+ return ret;
+}
+
+/*
+ * relocate a list of blocks
+ */
+static noinline_for_stack
+int relocate_tree_blocks(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc, struct rb_root *blocks)
+{
+ struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
+ struct btrfs_backref_node *node;
+ struct btrfs_path *path;
+ struct tree_block *block;
+ struct tree_block *next;
+ int ret;
+ int err = 0;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ err = -ENOMEM;
+ goto out_free_blocks;
+ }
+
+ /* Kick in readahead for tree blocks with missing keys */
+ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
+ if (!block->key_ready)
+ btrfs_readahead_tree_block(fs_info, block->bytenr,
+ block->owner, 0,
+ block->level);
+ }
+
+ /* Get first keys */
+ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
+ if (!block->key_ready) {
+ err = get_tree_block_key(fs_info, block);
+ if (err)
+ goto out_free_path;
+ }
+ }
+
+ /* Do tree relocation */
+ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
+ node = build_backref_tree(rc, &block->key,
+ block->level, block->bytenr);
+ if (IS_ERR(node)) {
+ err = PTR_ERR(node);
+ goto out;
+ }
+
+ ret = relocate_tree_block(trans, rc, node, &block->key,
+ path);
+ if (ret < 0) {
+ err = ret;
+ break;
+ }
+ }
+out:
+ err = finish_pending_nodes(trans, rc, path, err);
+
+out_free_path:
+ btrfs_free_path(path);
+out_free_blocks:
+ free_block_list(blocks);
+ return err;
+}
+
+static noinline_for_stack int prealloc_file_extent_cluster(
+ struct btrfs_inode *inode,
+ struct file_extent_cluster *cluster)
+{
+ u64 alloc_hint = 0;
+ u64 start;
+ u64 end;
+ u64 offset = inode->index_cnt;
+ u64 num_bytes;
+ int nr;
+ int ret = 0;
+ u64 i_size = i_size_read(&inode->vfs_inode);
+ u64 prealloc_start = cluster->start - offset;
+ u64 prealloc_end = cluster->end - offset;
+ u64 cur_offset = prealloc_start;
+
+ /*
+ * For subpage case, previous i_size may not be aligned to PAGE_SIZE.
+ * This means the range [i_size, PAGE_END + 1) is filled with zeros by
+ * btrfs_do_readpage() call of previously relocated file cluster.
+ *
+ * If the current cluster starts in the above range, btrfs_do_readpage()
+ * will skip the read, and relocate_one_page() will later writeback
+ * the padding zeros as new data, causing data corruption.
+ *
+ * Here we have to manually invalidate the range (i_size, PAGE_END + 1).
+ */
+ if (!IS_ALIGNED(i_size, PAGE_SIZE)) {
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ const u32 sectorsize = fs_info->sectorsize;
+ struct page *page;
+
+ ASSERT(sectorsize < PAGE_SIZE);
+ ASSERT(IS_ALIGNED(i_size, sectorsize));
+
+ /*
+ * Subpage can't handle page with DIRTY but without UPTODATE
+ * bit as it can lead to the following deadlock:
+ *
+ * btrfs_read_folio()
+ * | Page already *locked*
+ * |- btrfs_lock_and_flush_ordered_range()
+ * |- btrfs_start_ordered_extent()
+ * |- extent_write_cache_pages()
+ * |- lock_page()
+ * We try to lock the page we already hold.
+ *
+ * Here we just writeback the whole data reloc inode, so that
+ * we will be ensured to have no dirty range in the page, and
+ * are safe to clear the uptodate bits.
+ *
+ * This shouldn't cause too much overhead, as we need to write
+ * the data back anyway.
+ */
+ ret = filemap_write_and_wait(mapping);
+ if (ret < 0)
+ return ret;
+
+ clear_extent_bits(&inode->io_tree, i_size,
+ round_up(i_size, PAGE_SIZE) - 1,
+ EXTENT_UPTODATE);
+ page = find_lock_page(mapping, i_size >> PAGE_SHIFT);
+ /*
+ * If page is freed we don't need to do anything then, as we
+ * will re-read the whole page anyway.
+ */
+ if (page) {
+ btrfs_subpage_clear_uptodate(fs_info, page, i_size,
+ round_up(i_size, PAGE_SIZE) - i_size);
+ unlock_page(page);
+ put_page(page);
+ }
+ }
+
+ BUG_ON(cluster->start != cluster->boundary[0]);
+ ret = btrfs_alloc_data_chunk_ondemand(inode,
+ prealloc_end + 1 - prealloc_start);
+ if (ret)
+ return ret;
+
+ btrfs_inode_lock(&inode->vfs_inode, 0);
+ for (nr = 0; nr < cluster->nr; nr++) {
+ start = cluster->boundary[nr] - offset;
+ if (nr + 1 < cluster->nr)
+ end = cluster->boundary[nr + 1] - 1 - offset;
+ else
+ end = cluster->end - offset;
+
+ lock_extent(&inode->io_tree, start, end, NULL);
+ num_bytes = end + 1 - start;
+ ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start,
+ num_bytes, num_bytes,
+ end + 1, &alloc_hint);
+ cur_offset = end + 1;
+ unlock_extent(&inode->io_tree, start, end, NULL);
+ if (ret)
+ break;
+ }
+ btrfs_inode_unlock(&inode->vfs_inode, 0);
+
+ if (cur_offset < prealloc_end)
+ btrfs_free_reserved_data_space_noquota(inode->root->fs_info,
+ prealloc_end + 1 - cur_offset);
+ return ret;
+}
+
+static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inode,
+ u64 start, u64 end, u64 block_start)
+{
+ struct extent_map *em;
+ int ret = 0;
+
+ em = alloc_extent_map();
+ if (!em)
+ return -ENOMEM;
+
+ em->start = start;
+ em->len = end + 1 - start;
+ em->block_len = em->len;
+ em->block_start = block_start;
+ set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+ lock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL);
+ ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false);
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL);
+ free_extent_map(em);
+
+ return ret;
+}
+
+/*
+ * Allow error injection to test balance/relocation cancellation
+ */
+noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
+{
+ return atomic_read(&fs_info->balance_cancel_req) ||
+ atomic_read(&fs_info->reloc_cancel_req) ||
+ fatal_signal_pending(current);
+}
+ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);
+
+static u64 get_cluster_boundary_end(struct file_extent_cluster *cluster,
+ int cluster_nr)
+{
+ /* Last extent, use cluster end directly */
+ if (cluster_nr >= cluster->nr - 1)
+ return cluster->end;
+
+ /* Use next boundary start*/
+ return cluster->boundary[cluster_nr + 1] - 1;
+}
+
+static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
+ struct file_extent_cluster *cluster,
+ int *cluster_nr, unsigned long page_index)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ u64 offset = BTRFS_I(inode)->index_cnt;
+ const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT;
+ gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+ struct page *page;
+ u64 page_start;
+ u64 page_end;
+ u64 cur;
+ int ret;
+
+ ASSERT(page_index <= last_index);
+ page = find_lock_page(inode->i_mapping, page_index);
+ if (!page) {
+ page_cache_sync_readahead(inode->i_mapping, ra, NULL,
+ page_index, last_index + 1 - page_index);
+ page = find_or_create_page(inode->i_mapping, page_index, mask);
+ if (!page)
+ return -ENOMEM;
+ }
+
+ if (PageReadahead(page))
+ page_cache_async_readahead(inode->i_mapping, ra, NULL,
+ page_folio(page), page_index,
+ last_index + 1 - page_index);
+
+ if (!PageUptodate(page)) {
+ btrfs_read_folio(NULL, page_folio(page));
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ ret = -EIO;
+ goto release_page;
+ }
+ }
+
+ /*
+ * We could have lost page private when we dropped the lock to read the
+ * page above, make sure we set_page_extent_mapped here so we have any
+ * of the subpage blocksize stuff we need in place.
+ */
+ ret = set_page_extent_mapped(page);
+ if (ret < 0)
+ goto release_page;
+
+ page_start = page_offset(page);
+ page_end = page_start + PAGE_SIZE - 1;
+
+ /*
+ * Start from the cluster, as for subpage case, the cluster can start
+ * inside the page.
+ */
+ cur = max(page_start, cluster->boundary[*cluster_nr] - offset);
+ while (cur <= page_end) {
+ u64 extent_start = cluster->boundary[*cluster_nr] - offset;
+ u64 extent_end = get_cluster_boundary_end(cluster,
+ *cluster_nr) - offset;
+ u64 clamped_start = max(page_start, extent_start);
+ u64 clamped_end = min(page_end, extent_end);
+ u32 clamped_len = clamped_end + 1 - clamped_start;
+
+ /* Reserve metadata for this range */
+ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
+ clamped_len, clamped_len,
+ false);
+ if (ret)
+ goto release_page;
+
+ /* Mark the range delalloc and dirty for later writeback */
+ lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, NULL);
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start,
+ clamped_end, 0, NULL);
+ if (ret) {
+ clear_extent_bits(&BTRFS_I(inode)->io_tree,
+ clamped_start, clamped_end,
+ EXTENT_LOCKED | EXTENT_BOUNDARY);
+ btrfs_delalloc_release_metadata(BTRFS_I(inode),
+ clamped_len, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode),
+ clamped_len);
+ goto release_page;
+ }
+ btrfs_page_set_dirty(fs_info, page, clamped_start, clamped_len);
+
+ /*
+ * Set the boundary if it's inside the page.
+ * Data relocation requires the destination extents to have the
+ * same size as the source.
+ * EXTENT_BOUNDARY bit prevents current extent from being merged
+ * with previous extent.
+ */
+ if (in_range(cluster->boundary[*cluster_nr] - offset,
+ page_start, PAGE_SIZE)) {
+ u64 boundary_start = cluster->boundary[*cluster_nr] -
+ offset;
+ u64 boundary_end = boundary_start +
+ fs_info->sectorsize - 1;
+
+ set_extent_bits(&BTRFS_I(inode)->io_tree,
+ boundary_start, boundary_end,
+ EXTENT_BOUNDARY);
+ }
+ unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, NULL);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len);
+ cur += clamped_len;
+
+ /* Crossed extent end, go to next extent */
+ if (cur >= extent_end) {
+ (*cluster_nr)++;
+ /* Just finished the last extent of the cluster, exit. */
+ if (*cluster_nr >= cluster->nr)
+ break;
+ }
+ }
+ unlock_page(page);
+ put_page(page);
+
+ balance_dirty_pages_ratelimited(inode->i_mapping);
+ btrfs_throttle(fs_info);
+ if (btrfs_should_cancel_balance(fs_info))
+ ret = -ECANCELED;
+ return ret;
+
+release_page:
+ unlock_page(page);
+ put_page(page);
+ return ret;
+}
+
+static int relocate_file_extent_cluster(struct inode *inode,
+ struct file_extent_cluster *cluster)
+{
+ u64 offset = BTRFS_I(inode)->index_cnt;
+ unsigned long index;
+ unsigned long last_index;
+ struct file_ra_state *ra;
+ int cluster_nr = 0;
+ int ret = 0;
+
+ if (!cluster->nr)
+ return 0;
+
+ ra = kzalloc(sizeof(*ra), GFP_NOFS);
+ if (!ra)
+ return -ENOMEM;
+
+ ret = prealloc_file_extent_cluster(BTRFS_I(inode), cluster);
+ if (ret)
+ goto out;
+
+ file_ra_state_init(ra, inode->i_mapping);
+
+ ret = setup_relocation_extent_mapping(inode, cluster->start - offset,
+ cluster->end - offset, cluster->start);
+ if (ret)
+ goto out;
+
+ last_index = (cluster->end - offset) >> PAGE_SHIFT;
+ for (index = (cluster->start - offset) >> PAGE_SHIFT;
+ index <= last_index && !ret; index++)
+ ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index);
+ if (ret == 0)
+ WARN_ON(cluster_nr != cluster->nr);
+out:
+ kfree(ra);
+ return ret;
+}
+
+static noinline_for_stack
+int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
+ struct file_extent_cluster *cluster)
+{
+ int ret;
+
+ if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
+ ret = relocate_file_extent_cluster(inode, cluster);
+ if (ret)
+ return ret;
+ cluster->nr = 0;
+ }
+
+ if (!cluster->nr)
+ cluster->start = extent_key->objectid;
+ else
+ BUG_ON(cluster->nr >= MAX_EXTENTS);
+ cluster->end = extent_key->objectid + extent_key->offset - 1;
+ cluster->boundary[cluster->nr] = extent_key->objectid;
+ cluster->nr++;
+
+ if (cluster->nr >= MAX_EXTENTS) {
+ ret = relocate_file_extent_cluster(inode, cluster);
+ if (ret)
+ return ret;
+ cluster->nr = 0;
+ }
+ return 0;
+}
+
+/*
+ * helper to add a tree block to the list.
+ * the major work is getting the generation and level of the block
+ */
+static int add_tree_block(struct reloc_control *rc,
+ struct btrfs_key *extent_key,
+ struct btrfs_path *path,
+ struct rb_root *blocks)
+{
+ struct extent_buffer *eb;
+ struct btrfs_extent_item *ei;
+ struct btrfs_tree_block_info *bi;
+ struct tree_block *block;
+ struct rb_node *rb_node;
+ u32 item_size;
+ int level = -1;
+ u64 generation;
+ u64 owner = 0;
+
+ eb = path->nodes[0];
+ item_size = btrfs_item_size(eb, path->slots[0]);
+
+ if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
+ item_size >= sizeof(*ei) + sizeof(*bi)) {
+ unsigned long ptr = 0, end;
+
+ ei = btrfs_item_ptr(eb, path->slots[0],
+ struct btrfs_extent_item);
+ end = (unsigned long)ei + item_size;
+ if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
+ bi = (struct btrfs_tree_block_info *)(ei + 1);
+ level = btrfs_tree_block_level(eb, bi);
+ ptr = (unsigned long)(bi + 1);
+ } else {
+ level = (int)extent_key->offset;
+ ptr = (unsigned long)(ei + 1);
+ }
+ generation = btrfs_extent_generation(eb, ei);
+
+ /*
+ * We're reading random blocks without knowing their owner ahead
+ * of time. This is ok most of the time, as all reloc roots and
+ * fs roots have the same lock type. However normal trees do
+ * not, and the only way to know ahead of time is to read the
+ * inline ref offset. We know it's an fs root if
+ *
+ * 1. There's more than one ref.
+ * 2. There's a SHARED_DATA_REF_KEY set.
+ * 3. FULL_BACKREF is set on the flags.
+ *
+ * Otherwise it's safe to assume that the ref offset == the
+ * owner of this block, so we can use that when calling
+ * read_tree_block.
+ */
+ if (btrfs_extent_refs(eb, ei) == 1 &&
+ !(btrfs_extent_flags(eb, ei) &
+ BTRFS_BLOCK_FLAG_FULL_BACKREF) &&
+ ptr < end) {
+ struct btrfs_extent_inline_ref *iref;
+ int type;
+
+ iref = (struct btrfs_extent_inline_ref *)ptr;
+ type = btrfs_get_extent_inline_ref_type(eb, iref,
+ BTRFS_REF_TYPE_BLOCK);
+ if (type == BTRFS_REF_TYPE_INVALID)
+ return -EINVAL;
+ if (type == BTRFS_TREE_BLOCK_REF_KEY)
+ owner = btrfs_extent_inline_ref_offset(eb, iref);
+ }
+ } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) {
+ btrfs_print_v0_err(eb->fs_info);
+ btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL);
+ return -EINVAL;
+ } else {
+ BUG();
+ }
+
+ btrfs_release_path(path);
+
+ BUG_ON(level == -1);
+
+ block = kmalloc(sizeof(*block), GFP_NOFS);
+ if (!block)
+ return -ENOMEM;
+
+ block->bytenr = extent_key->objectid;
+ block->key.objectid = rc->extent_root->fs_info->nodesize;
+ block->key.offset = generation;
+ block->level = level;
+ block->key_ready = 0;
+ block->owner = owner;
+
+ rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node);
+ if (rb_node)
+ btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr,
+ -EEXIST);
+
+ return 0;
+}
+
+/*
+ * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
+ */
+static int __add_tree_block(struct reloc_control *rc,
+ u64 bytenr, u32 blocksize,
+ struct rb_root *blocks)
+{
+ struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret;
+ bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
+
+ if (tree_block_processed(bytenr, rc))
+ return 0;
+
+ if (rb_simple_search(blocks, bytenr))
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+again:
+ key.objectid = bytenr;
+ if (skinny) {
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ key.offset = (u64)-1;
+ } else {
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = blocksize;
+ }
+
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+ ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ if (ret > 0 && skinny) {
+ if (path->slots[0]) {
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0]);
+ if (key.objectid == bytenr &&
+ (key.type == BTRFS_METADATA_ITEM_KEY ||
+ (key.type == BTRFS_EXTENT_ITEM_KEY &&
+ key.offset == blocksize)))
+ ret = 0;
+ }
+
+ if (ret) {
+ skinny = false;
+ btrfs_release_path(path);
+ goto again;
+ }
+ }
+ if (ret) {
+ ASSERT(ret == 1);
+ btrfs_print_leaf(path->nodes[0]);
+ btrfs_err(fs_info,
+ "tree block extent item (%llu) is not found in extent tree",
+ bytenr);
+ WARN_ON(1);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = add_tree_block(rc, &key, path, blocks);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group *block_group,
+ struct inode *inode,
+ u64 ino)
+{
+ struct btrfs_root *root = fs_info->tree_root;
+ struct btrfs_trans_handle *trans;
+ int ret = 0;
+
+ if (inode)
+ goto truncate;
+
+ inode = btrfs_iget(fs_info->sb, ino, root);
+ if (IS_ERR(inode))
+ return -ENOENT;
+
+truncate:
+ ret = btrfs_check_trunc_cache_free_space(fs_info,
+ &fs_info->global_block_rsv);
+ if (ret)
+ goto out;
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ ret = btrfs_truncate_free_space_cache(trans, block_group, inode);
+
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty(fs_info);
+out:
+ iput(inode);
+ return ret;
+}
+
+/*
+ * Locate the free space cache EXTENT_DATA in root tree leaf and delete the
+ * cache inode, to avoid free space cache data extent blocking data relocation.
+ */
+static int delete_v1_space_cache(struct extent_buffer *leaf,
+ struct btrfs_block_group *block_group,
+ u64 data_bytenr)
+{
+ u64 space_cache_ino;
+ struct btrfs_file_extent_item *ei;
+ struct btrfs_key key;
+ bool found = false;
+ int i;
+ int ret;
+
+ if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID)
+ return 0;
+
+ for (i = 0; i < btrfs_header_nritems(leaf); i++) {
+ u8 type;
+
+ btrfs_item_key_to_cpu(leaf, &key, i);
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+ ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+ type = btrfs_file_extent_type(leaf, ei);
+
+ if ((type == BTRFS_FILE_EXTENT_REG ||
+ type == BTRFS_FILE_EXTENT_PREALLOC) &&
+ btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) {
+ found = true;
+ space_cache_ino = key.objectid;
+ break;
+ }
+ }
+ if (!found)
+ return -ENOENT;
+ ret = delete_block_group_cache(leaf->fs_info, block_group, NULL,
+ space_cache_ino);
+ return ret;
+}
+
+/*
+ * helper to find all tree blocks that reference a given data extent
+ */
+static noinline_for_stack
+int add_data_references(struct reloc_control *rc,
+ struct btrfs_key *extent_key,
+ struct btrfs_path *path,
+ struct rb_root *blocks)
+{
+ struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
+ struct ulist *leaves = NULL;
+ struct ulist_iterator leaf_uiter;
+ struct ulist_node *ref_node = NULL;
+ const u32 blocksize = fs_info->nodesize;
+ int ret = 0;
+
+ btrfs_release_path(path);
+ ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid,
+ 0, &leaves, NULL, true);
+ if (ret < 0)
+ return ret;
+
+ ULIST_ITER_INIT(&leaf_uiter);
+ while ((ref_node = ulist_next(leaves, &leaf_uiter))) {
+ struct extent_buffer *eb;
+
+ eb = read_tree_block(fs_info, ref_node->val, 0, 0, 0, NULL);
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
+ break;
+ }
+ ret = delete_v1_space_cache(eb, rc->block_group,
+ extent_key->objectid);
+ free_extent_buffer(eb);
+ if (ret < 0)
+ break;
+ ret = __add_tree_block(rc, ref_node->val, blocksize, blocks);
+ if (ret < 0)
+ break;
+ }
+ if (ret < 0)
+ free_block_list(blocks);
+ ulist_free(leaves);
+ return ret;
+}
+
+/*
+ * helper to find next unprocessed extent
+ */
+static noinline_for_stack
+int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
+ struct btrfs_key *extent_key)
+{
+ struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ u64 start, end, last;
+ int ret;
+
+ last = rc->block_group->start + rc->block_group->length;
+ while (1) {
+ cond_resched();
+ if (rc->search_start >= last) {
+ ret = 1;
+ break;
+ }
+
+ key.objectid = rc->search_start;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = 0;
+
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+ ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
+ 0, 0);
+ if (ret < 0)
+ break;
+next:
+ leaf = path->nodes[0];
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(rc->extent_root, path);
+ if (ret != 0)
+ break;
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid >= last) {
+ ret = 1;
+ break;
+ }
+
+ if (key.type != BTRFS_EXTENT_ITEM_KEY &&
+ key.type != BTRFS_METADATA_ITEM_KEY) {
+ path->slots[0]++;
+ goto next;
+ }
+
+ if (key.type == BTRFS_EXTENT_ITEM_KEY &&
+ key.objectid + key.offset <= rc->search_start) {
+ path->slots[0]++;
+ goto next;
+ }
+
+ if (key.type == BTRFS_METADATA_ITEM_KEY &&
+ key.objectid + fs_info->nodesize <=
+ rc->search_start) {
+ path->slots[0]++;
+ goto next;
+ }
+
+ ret = find_first_extent_bit(&rc->processed_blocks,
+ key.objectid, &start, &end,
+ EXTENT_DIRTY, NULL);
+
+ if (ret == 0 && start <= key.objectid) {
+ btrfs_release_path(path);
+ rc->search_start = end + 1;
+ } else {
+ if (key.type == BTRFS_EXTENT_ITEM_KEY)
+ rc->search_start = key.objectid + key.offset;
+ else
+ rc->search_start = key.objectid +
+ fs_info->nodesize;
+ memcpy(extent_key, &key, sizeof(key));
+ return 0;
+ }
+ }
+ btrfs_release_path(path);
+ return ret;
+}
+
+static void set_reloc_control(struct reloc_control *rc)
+{
+ struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
+
+ mutex_lock(&fs_info->reloc_mutex);
+ fs_info->reloc_ctl = rc;
+ mutex_unlock(&fs_info->reloc_mutex);
+}
+
+static void unset_reloc_control(struct reloc_control *rc)
+{
+ struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
+
+ mutex_lock(&fs_info->reloc_mutex);
+ fs_info->reloc_ctl = NULL;
+ mutex_unlock(&fs_info->reloc_mutex);
+}
+
+static noinline_for_stack
+int prepare_to_relocate(struct reloc_control *rc)
+{
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info,
+ BTRFS_BLOCK_RSV_TEMP);
+ if (!rc->block_rsv)
+ return -ENOMEM;
+
+ memset(&rc->cluster, 0, sizeof(rc->cluster));
+ rc->search_start = rc->block_group->start;
+ rc->extents_found = 0;
+ rc->nodes_relocated = 0;
+ rc->merging_rsv_size = 0;
+ rc->reserved_bytes = 0;
+ rc->block_rsv->size = rc->extent_root->fs_info->nodesize *
+ RELOCATION_RESERVED_NODES;
+ ret = btrfs_block_rsv_refill(rc->extent_root->fs_info,
+ rc->block_rsv, rc->block_rsv->size,
+ BTRFS_RESERVE_FLUSH_ALL);
+ if (ret)
+ return ret;
+
+ rc->create_reloc_tree = 1;
+ set_reloc_control(rc);
+
+ trans = btrfs_join_transaction(rc->extent_root);
+ if (IS_ERR(trans)) {
+ unset_reloc_control(rc);
+ /*
+ * extent tree is not a ref_cow tree and has no reloc_root to
+ * cleanup. And callers are responsible to free the above
+ * block rsv.
+ */
+ return PTR_ERR(trans);
+ }
+
+ ret = btrfs_commit_transaction(trans);
+ if (ret)
+ unset_reloc_control(rc);
+
+ return ret;
+}
+
+static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
+{
+ struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
+ struct rb_root blocks = RB_ROOT;
+ struct btrfs_key key;
+ struct btrfs_trans_handle *trans = NULL;
+ struct btrfs_path *path;
+ struct btrfs_extent_item *ei;
+ u64 flags;
+ int ret;
+ int err = 0;
+ int progress = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = READA_FORWARD;
+
+ ret = prepare_to_relocate(rc);
+ if (ret) {
+ err = ret;
+ goto out_free;
+ }
+
+ while (1) {
+ rc->reserved_bytes = 0;
+ ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv,
+ rc->block_rsv->size,
+ BTRFS_RESERVE_FLUSH_ALL);
+ if (ret) {
+ err = ret;
+ break;
+ }
+ progress++;
+ trans = btrfs_start_transaction(rc->extent_root, 0);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ trans = NULL;
+ break;
+ }
+restart:
+ if (update_backref_cache(trans, &rc->backref_cache)) {
+ btrfs_end_transaction(trans);
+ trans = NULL;
+ continue;
+ }
+
+ ret = find_next_extent(rc, path, &key);
+ if (ret < 0)
+ err = ret;
+ if (ret != 0)
+ break;
+
+ rc->extents_found++;
+
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_extent_item);
+ flags = btrfs_extent_flags(path->nodes[0], ei);
+
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ ret = add_tree_block(rc, &key, path, &blocks);
+ } else if (rc->stage == UPDATE_DATA_PTRS &&
+ (flags & BTRFS_EXTENT_FLAG_DATA)) {
+ ret = add_data_references(rc, &key, path, &blocks);
+ } else {
+ btrfs_release_path(path);
+ ret = 0;
+ }
+ if (ret < 0) {
+ err = ret;
+ break;
+ }
+
+ if (!RB_EMPTY_ROOT(&blocks)) {
+ ret = relocate_tree_blocks(trans, rc, &blocks);
+ if (ret < 0) {
+ if (ret != -EAGAIN) {
+ err = ret;
+ break;
+ }
+ rc->extents_found--;
+ rc->search_start = key.objectid;
+ }
+ }
+
+ btrfs_end_transaction_throttle(trans);
+ btrfs_btree_balance_dirty(fs_info);
+ trans = NULL;
+
+ if (rc->stage == MOVE_DATA_EXTENTS &&
+ (flags & BTRFS_EXTENT_FLAG_DATA)) {
+ rc->found_file_extent = 1;
+ ret = relocate_data_extent(rc->data_inode,
+ &key, &rc->cluster);
+ if (ret < 0) {
+ err = ret;
+ break;
+ }
+ }
+ if (btrfs_should_cancel_balance(fs_info)) {
+ err = -ECANCELED;
+ break;
+ }
+ }
+ if (trans && progress && err == -ENOSPC) {
+ ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags);
+ if (ret == 1) {
+ err = 0;
+ progress = 0;
+ goto restart;
+ }
+ }
+
+ btrfs_release_path(path);
+ clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY);
+
+ if (trans) {
+ btrfs_end_transaction_throttle(trans);
+ btrfs_btree_balance_dirty(fs_info);
+ }
+
+ if (!err) {
+ ret = relocate_file_extent_cluster(rc->data_inode,
+ &rc->cluster);
+ if (ret < 0)
+ err = ret;
+ }
+
+ rc->create_reloc_tree = 0;
+ set_reloc_control(rc);
+
+ btrfs_backref_release_cache(&rc->backref_cache);
+ btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
+
+ /*
+ * Even in the case when the relocation is cancelled, we should all go
+ * through prepare_to_merge() and merge_reloc_roots().
+ *
+ * For error (including cancelled balance), prepare_to_merge() will
+ * mark all reloc trees orphan, then queue them for cleanup in
+ * merge_reloc_roots()
+ */
+ err = prepare_to_merge(rc, err);
+
+ merge_reloc_roots(rc);
+
+ rc->merge_reloc_tree = 0;
+ unset_reloc_control(rc);
+ btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
+
+ /* get rid of pinned extents */
+ trans = btrfs_join_transaction(rc->extent_root);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_free;
+ }
+ ret = btrfs_commit_transaction(trans);
+ if (ret && !err)
+ err = ret;
+out_free:
+ ret = clean_dirty_subvols(rc);
+ if (ret < 0 && !err)
+ err = ret;
+ btrfs_free_block_rsv(fs_info, rc->block_rsv);
+ btrfs_free_path(path);
+ return err;
+}
+
+static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 objectid)
+{
+ struct btrfs_path *path;
+ struct btrfs_inode_item *item;
+ struct extent_buffer *leaf;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_insert_empty_inode(trans, root, path, objectid);
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
+ memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
+ btrfs_set_inode_generation(leaf, item, 1);
+ btrfs_set_inode_size(leaf, item, 0);
+ btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
+ btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
+ BTRFS_INODE_PREALLOC);
+ btrfs_mark_buffer_dirty(leaf);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static void delete_orphan_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 objectid)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret = 0;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ key.objectid = objectid;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = btrfs_del_item(trans, root, path);
+out:
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ btrfs_free_path(path);
+}
+
+/*
+ * helper to create inode for data relocation.
+ * the inode is in data relocation tree and its link count is 0
+ */
+static noinline_for_stack
+struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group *group)
+{
+ struct inode *inode = NULL;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root;
+ u64 objectid;
+ int err = 0;
+
+ root = btrfs_grab_root(fs_info->data_reloc_root);
+ trans = btrfs_start_transaction(root, 6);
+ if (IS_ERR(trans)) {
+ btrfs_put_root(root);
+ return ERR_CAST(trans);
+ }
+
+ err = btrfs_get_free_objectid(root, &objectid);
+ if (err)
+ goto out;
+
+ err = __insert_orphan_inode(trans, root, objectid);
+ if (err)
+ goto out;
+
+ inode = btrfs_iget(fs_info->sb, objectid, root);
+ if (IS_ERR(inode)) {
+ delete_orphan_inode(trans, root, objectid);
+ err = PTR_ERR(inode);
+ inode = NULL;
+ goto out;
+ }
+ BTRFS_I(inode)->index_cnt = group->start;
+
+ err = btrfs_orphan_add(trans, BTRFS_I(inode));
+out:
+ btrfs_put_root(root);
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty(fs_info);
+ if (err) {
+ iput(inode);
+ inode = ERR_PTR(err);
+ }
+ return inode;
+}
+
+/*
+ * Mark start of chunk relocation that is cancellable. Check if the cancellation
+ * has been requested meanwhile and don't start in that case.
+ *
+ * Return:
+ * 0 success
+ * -EINPROGRESS operation is already in progress, that's probably a bug
+ * -ECANCELED cancellation request was set before the operation started
+ */
+static int reloc_chunk_start(struct btrfs_fs_info *fs_info)
+{
+ if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) {
+ /* This should not happen */
+ btrfs_err(fs_info, "reloc already running, cannot start");
+ return -EINPROGRESS;
+ }
+
+ if (atomic_read(&fs_info->reloc_cancel_req) > 0) {
+ btrfs_info(fs_info, "chunk relocation canceled on start");
+ /*
+ * On cancel, clear all requests but let the caller mark
+ * the end after cleanup operations.
+ */
+ atomic_set(&fs_info->reloc_cancel_req, 0);
+ return -ECANCELED;
+ }
+ return 0;
+}
+
+/*
+ * Mark end of chunk relocation that is cancellable and wake any waiters.
+ */
+static void reloc_chunk_end(struct btrfs_fs_info *fs_info)
+{
+ /* Requested after start, clear bit first so any waiters can continue */
+ if (atomic_read(&fs_info->reloc_cancel_req) > 0)
+ btrfs_info(fs_info, "chunk relocation canceled during operation");
+ clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags);
+ atomic_set(&fs_info->reloc_cancel_req, 0);
+}
+
+static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
+{
+ struct reloc_control *rc;
+
+ rc = kzalloc(sizeof(*rc), GFP_NOFS);
+ if (!rc)
+ return NULL;
+
+ INIT_LIST_HEAD(&rc->reloc_roots);
+ INIT_LIST_HEAD(&rc->dirty_subvol_roots);
+ btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1);
+ mapping_tree_init(&rc->reloc_root_tree);
+ extent_io_tree_init(fs_info, &rc->processed_blocks,
+ IO_TREE_RELOC_BLOCKS, NULL);
+ return rc;
+}
+
+static void free_reloc_control(struct reloc_control *rc)
+{
+ struct mapping_node *node, *tmp;
+
+ free_reloc_roots(&rc->reloc_roots);
+ rbtree_postorder_for_each_entry_safe(node, tmp,
+ &rc->reloc_root_tree.rb_root, rb_node)
+ kfree(node);
+
+ kfree(rc);
+}
+
+/*
+ * Print the block group being relocated
+ */
+static void describe_relocation(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group *block_group)
+{
+ char buf[128] = {'\0'};
+
+ btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf));
+
+ btrfs_info(fs_info,
+ "relocating block group %llu flags %s",
+ block_group->start, buf);
+}
+
+static const char *stage_to_string(int stage)
+{
+ if (stage == MOVE_DATA_EXTENTS)
+ return "move data extents";
+ if (stage == UPDATE_DATA_PTRS)
+ return "update data pointers";
+ return "unknown";
+}
+
+/*
+ * function to relocate all extents in a block group.
+ */
+int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
+{
+ struct btrfs_block_group *bg;
+ struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start);
+ struct reloc_control *rc;
+ struct inode *inode;
+ struct btrfs_path *path;
+ int ret;
+ int rw = 0;
+ int err = 0;
+
+ /*
+ * This only gets set if we had a half-deleted snapshot on mount. We
+ * cannot allow relocation to start while we're still trying to clean up
+ * these pending deletions.
+ */
+ ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE);
+ if (ret)
+ return ret;
+
+ /* We may have been woken up by close_ctree, so bail if we're closing. */
+ if (btrfs_fs_closing(fs_info))
+ return -EINTR;
+
+ bg = btrfs_lookup_block_group(fs_info, group_start);
+ if (!bg)
+ return -ENOENT;
+
+ /*
+ * Relocation of a data block group creates ordered extents. Without
+ * sb_start_write(), we can freeze the filesystem while unfinished
+ * ordered extents are left. Such ordered extents can cause a deadlock
+ * e.g. when syncfs() is waiting for their completion but they can't
+ * finish because they block when joining a transaction, due to the
+ * fact that the freeze locks are being held in write mode.
+ */
+ if (bg->flags & BTRFS_BLOCK_GROUP_DATA)
+ ASSERT(sb_write_started(fs_info->sb));
+
+ if (btrfs_pinned_by_swapfile(fs_info, bg)) {
+ btrfs_put_block_group(bg);
+ return -ETXTBSY;
+ }
+
+ rc = alloc_reloc_control(fs_info);
+ if (!rc) {
+ btrfs_put_block_group(bg);
+ return -ENOMEM;
+ }
+
+ ret = reloc_chunk_start(fs_info);
+ if (ret < 0) {
+ err = ret;
+ goto out_put_bg;
+ }
+
+ rc->extent_root = extent_root;
+ rc->block_group = bg;
+
+ ret = btrfs_inc_block_group_ro(rc->block_group, true);
+ if (ret) {
+ err = ret;
+ goto out;
+ }
+ rw = 1;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ inode = lookup_free_space_inode(rc->block_group, path);
+ btrfs_free_path(path);
+
+ if (!IS_ERR(inode))
+ ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
+ else
+ ret = PTR_ERR(inode);
+
+ if (ret && ret != -ENOENT) {
+ err = ret;
+ goto out;
+ }
+
+ rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
+ if (IS_ERR(rc->data_inode)) {
+ err = PTR_ERR(rc->data_inode);
+ rc->data_inode = NULL;
+ goto out;
+ }
+
+ describe_relocation(fs_info, rc->block_group);
+
+ btrfs_wait_block_group_reservations(rc->block_group);
+ btrfs_wait_nocow_writers(rc->block_group);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX,
+ rc->block_group->start,
+ rc->block_group->length);
+
+ ret = btrfs_zone_finish(rc->block_group);
+ WARN_ON(ret && ret != -EAGAIN);
+
+ while (1) {
+ int finishes_stage;
+
+ mutex_lock(&fs_info->cleaner_mutex);
+ ret = relocate_block_group(rc);
+ mutex_unlock(&fs_info->cleaner_mutex);
+ if (ret < 0)
+ err = ret;
+
+ finishes_stage = rc->stage;
+ /*
+ * We may have gotten ENOSPC after we already dirtied some
+ * extents. If writeout happens while we're relocating a
+ * different block group we could end up hitting the
+ * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in
+ * btrfs_reloc_cow_block. Make sure we write everything out
+ * properly so we don't trip over this problem, and then break
+ * out of the loop if we hit an error.
+ */
+ if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
+ ret = btrfs_wait_ordered_range(rc->data_inode, 0,
+ (u64)-1);
+ if (ret)
+ err = ret;
+ invalidate_mapping_pages(rc->data_inode->i_mapping,
+ 0, -1);
+ rc->stage = UPDATE_DATA_PTRS;
+ }
+
+ if (err < 0)
+ goto out;
+
+ if (rc->extents_found == 0)
+ break;
+
+ btrfs_info(fs_info, "found %llu extents, stage: %s",
+ rc->extents_found, stage_to_string(finishes_stage));
+ }
+
+ WARN_ON(rc->block_group->pinned > 0);
+ WARN_ON(rc->block_group->reserved > 0);
+ WARN_ON(rc->block_group->used > 0);
+out:
+ if (err && rw)
+ btrfs_dec_block_group_ro(rc->block_group);
+ iput(rc->data_inode);
+out_put_bg:
+ btrfs_put_block_group(bg);
+ reloc_chunk_end(fs_info);
+ free_reloc_control(rc);
+ return err;
+}
+
+static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_trans_handle *trans;
+ int ret, err;
+
+ trans = btrfs_start_transaction(fs_info->tree_root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ memset(&root->root_item.drop_progress, 0,
+ sizeof(root->root_item.drop_progress));
+ btrfs_set_root_drop_level(&root->root_item, 0);
+ btrfs_set_root_refs(&root->root_item, 0);
+ ret = btrfs_update_root(trans, fs_info->tree_root,
+ &root->root_key, &root->root_item);
+
+ err = btrfs_end_transaction(trans);
+ if (err)
+ return err;
+ return ret;
+}
+
+/*
+ * recover relocation interrupted by system crash.
+ *
+ * this function resumes merging reloc trees with corresponding fs trees.
+ * this is important for keeping the sharing of tree blocks
+ */
+int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
+{
+ LIST_HEAD(reloc_roots);
+ struct btrfs_key key;
+ struct btrfs_root *fs_root;
+ struct btrfs_root *reloc_root;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct reloc_control *rc = NULL;
+ struct btrfs_trans_handle *trans;
+ int ret;
+ int err = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = READA_BACK;
+
+ key.objectid = BTRFS_TREE_RELOC_OBJECTID;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
+ path, 0, 0);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ if (ret > 0) {
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+ }
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ btrfs_release_path(path);
+
+ if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
+ key.type != BTRFS_ROOT_ITEM_KEY)
+ break;
+
+ reloc_root = btrfs_read_tree_root(fs_info->tree_root, &key);
+ if (IS_ERR(reloc_root)) {
+ err = PTR_ERR(reloc_root);
+ goto out;
+ }
+
+ set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
+ list_add(&reloc_root->root_list, &reloc_roots);
+
+ if (btrfs_root_refs(&reloc_root->root_item) > 0) {
+ fs_root = btrfs_get_fs_root(fs_info,
+ reloc_root->root_key.offset, false);
+ if (IS_ERR(fs_root)) {
+ ret = PTR_ERR(fs_root);
+ if (ret != -ENOENT) {
+ err = ret;
+ goto out;
+ }
+ ret = mark_garbage_root(reloc_root);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ } else {
+ btrfs_put_root(fs_root);
+ }
+ }
+
+ if (key.offset == 0)
+ break;
+
+ key.offset--;
+ }
+ btrfs_release_path(path);
+
+ if (list_empty(&reloc_roots))
+ goto out;
+
+ rc = alloc_reloc_control(fs_info);
+ if (!rc) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ ret = reloc_chunk_start(fs_info);
+ if (ret < 0) {
+ err = ret;
+ goto out_end;
+ }
+
+ rc->extent_root = btrfs_extent_root(fs_info, 0);
+
+ set_reloc_control(rc);
+
+ trans = btrfs_join_transaction(rc->extent_root);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_unset;
+ }
+
+ rc->merge_reloc_tree = 1;
+
+ while (!list_empty(&reloc_roots)) {
+ reloc_root = list_entry(reloc_roots.next,
+ struct btrfs_root, root_list);
+ list_del(&reloc_root->root_list);
+
+ if (btrfs_root_refs(&reloc_root->root_item) == 0) {
+ list_add_tail(&reloc_root->root_list,
+ &rc->reloc_roots);
+ continue;
+ }
+
+ fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
+ false);
+ if (IS_ERR(fs_root)) {
+ err = PTR_ERR(fs_root);
+ list_add_tail(&reloc_root->root_list, &reloc_roots);
+ btrfs_end_transaction(trans);
+ goto out_unset;
+ }
+
+ err = __add_reloc_root(reloc_root);
+ ASSERT(err != -EEXIST);
+ if (err) {
+ list_add_tail(&reloc_root->root_list, &reloc_roots);
+ btrfs_put_root(fs_root);
+ btrfs_end_transaction(trans);
+ goto out_unset;
+ }
+ fs_root->reloc_root = btrfs_grab_root(reloc_root);
+ btrfs_put_root(fs_root);
+ }
+
+ err = btrfs_commit_transaction(trans);
+ if (err)
+ goto out_unset;
+
+ merge_reloc_roots(rc);
+
+ unset_reloc_control(rc);
+
+ trans = btrfs_join_transaction(rc->extent_root);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_clean;
+ }
+ err = btrfs_commit_transaction(trans);
+out_clean:
+ ret = clean_dirty_subvols(rc);
+ if (ret < 0 && !err)
+ err = ret;
+out_unset:
+ unset_reloc_control(rc);
+out_end:
+ reloc_chunk_end(fs_info);
+ free_reloc_control(rc);
+out:
+ free_reloc_roots(&reloc_roots);
+
+ btrfs_free_path(path);
+
+ if (err == 0) {
+ /* cleanup orphan inode in data relocation tree */
+ fs_root = btrfs_grab_root(fs_info->data_reloc_root);
+ ASSERT(fs_root);
+ err = btrfs_orphan_cleanup(fs_root);
+ btrfs_put_root(fs_root);
+ }
+ return err;
+}
+
+/*
+ * helper to add ordered checksum for data relocation.
+ *
+ * cloning checksum properly handles the nodatasum extents.
+ * it also saves CPU time to re-calculate the checksum.
+ */
+int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_root *csum_root;
+ struct btrfs_ordered_sum *sums;
+ struct btrfs_ordered_extent *ordered;
+ int ret;
+ u64 disk_bytenr;
+ u64 new_bytenr;
+ LIST_HEAD(list);
+
+ ordered = btrfs_lookup_ordered_extent(inode, file_pos);
+ BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len);
+
+ disk_bytenr = file_pos + inode->index_cnt;
+ csum_root = btrfs_csum_root(fs_info, disk_bytenr);
+ ret = btrfs_lookup_csums_range(csum_root, disk_bytenr,
+ disk_bytenr + len - 1, &list, 0, false);
+ if (ret)
+ goto out;
+
+ while (!list_empty(&list)) {
+ sums = list_entry(list.next, struct btrfs_ordered_sum, list);
+ list_del_init(&sums->list);
+
+ /*
+ * We need to offset the new_bytenr based on where the csum is.
+ * We need to do this because we will read in entire prealloc
+ * extents but we may have written to say the middle of the
+ * prealloc extent, so we need to make sure the csum goes with
+ * the right disk offset.
+ *
+ * We can do this because the data reloc inode refers strictly
+ * to the on disk bytes, so we don't have to worry about
+ * disk_len vs real len like with real inodes since it's all
+ * disk length.
+ */
+ new_bytenr = ordered->disk_bytenr + sums->bytenr - disk_bytenr;
+ sums->bytenr = new_bytenr;
+
+ btrfs_add_ordered_sum(ordered, sums);
+ }
+out:
+ btrfs_put_ordered_extent(ordered);
+ return ret;
+}
+
+int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *buf,
+ struct extent_buffer *cow)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct reloc_control *rc;
+ struct btrfs_backref_node *node;
+ int first_cow = 0;
+ int level;
+ int ret = 0;
+
+ rc = fs_info->reloc_ctl;
+ if (!rc)
+ return 0;
+
+ BUG_ON(rc->stage == UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root));
+
+ level = btrfs_header_level(buf);
+ if (btrfs_header_generation(buf) <=
+ btrfs_root_last_snapshot(&root->root_item))
+ first_cow = 1;
+
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
+ rc->create_reloc_tree) {
+ WARN_ON(!first_cow && level == 0);
+
+ node = rc->backref_cache.path[level];
+ BUG_ON(node->bytenr != buf->start &&
+ node->new_bytenr != buf->start);
+
+ btrfs_backref_drop_node_buffer(node);
+ atomic_inc(&cow->refs);
+ node->eb = cow;
+ node->new_bytenr = cow->start;
+
+ if (!node->pending) {
+ list_move_tail(&node->list,
+ &rc->backref_cache.pending[level]);
+ node->pending = 1;
+ }
+
+ if (first_cow)
+ mark_block_processed(rc, node);
+
+ if (first_cow && level > 0)
+ rc->nodes_relocated += buf->len;
+ }
+
+ if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
+ ret = replace_file_extents(trans, rc, root, cow);
+ return ret;
+}
+
+/*
+ * called before creating snapshot. it calculates metadata reservation
+ * required for relocating tree blocks in the snapshot
+ */
+void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
+ u64 *bytes_to_reserve)
+{
+ struct btrfs_root *root = pending->root;
+ struct reloc_control *rc = root->fs_info->reloc_ctl;
+
+ if (!rc || !have_reloc_root(root))
+ return;
+
+ if (!rc->merge_reloc_tree)
+ return;
+
+ root = root->reloc_root;
+ BUG_ON(btrfs_root_refs(&root->root_item) == 0);
+ /*
+ * relocation is in the stage of merging trees. the space
+ * used by merging a reloc tree is twice the size of
+ * relocated tree nodes in the worst case. half for cowing
+ * the reloc tree, half for cowing the fs tree. the space
+ * used by cowing the reloc tree will be freed after the
+ * tree is dropped. if we create snapshot, cowing the fs
+ * tree may use more space than it frees. so we need
+ * reserve extra space.
+ */
+ *bytes_to_reserve += rc->nodes_relocated;
+}
+
+/*
+ * called after snapshot is created. migrate block reservation
+ * and create reloc root for the newly created snapshot
+ *
+ * This is similar to btrfs_init_reloc_root(), we come out of here with two
+ * references held on the reloc_root, one for root->reloc_root and one for
+ * rc->reloc_roots.
+ */
+int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending)
+{
+ struct btrfs_root *root = pending->root;
+ struct btrfs_root *reloc_root;
+ struct btrfs_root *new_root;
+ struct reloc_control *rc = root->fs_info->reloc_ctl;
+ int ret;
+
+ if (!rc || !have_reloc_root(root))
+ return 0;
+
+ rc = root->fs_info->reloc_ctl;
+ rc->merging_rsv_size += rc->nodes_relocated;
+
+ if (rc->merge_reloc_tree) {
+ ret = btrfs_block_rsv_migrate(&pending->block_rsv,
+ rc->block_rsv,
+ rc->nodes_relocated, true);
+ if (ret)
+ return ret;
+ }
+
+ new_root = pending->snap;
+ reloc_root = create_reloc_root(trans, root->reloc_root,
+ new_root->root_key.objectid);
+ if (IS_ERR(reloc_root))
+ return PTR_ERR(reloc_root);
+
+ ret = __add_reloc_root(reloc_root);
+ ASSERT(ret != -EEXIST);
+ if (ret) {
+ /* Pairs with create_reloc_root */
+ btrfs_put_root(reloc_root);
+ return ret;
+ }
+ new_root->reloc_root = btrfs_grab_root(reloc_root);
+
+ if (rc->create_reloc_tree)
+ ret = clone_backref_node(trans, rc, root, reloc_root);
+ return ret;
+}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
new file mode 100644
index 000000000..7d783f094
--- /dev/null
+++ b/fs/btrfs/root-tree.c
@@ -0,0 +1,544 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ */
+
+#include <linux/err.h>
+#include <linux/uuid.h>
+#include "ctree.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "print-tree.h"
+#include "qgroup.h"
+#include "space-info.h"
+
+/*
+ * Read a root item from the tree. In case we detect a root item smaller then
+ * sizeof(root_item), we know it's an old version of the root structure and
+ * initialize all new fields to zero. The same happens if we detect mismatching
+ * generation numbers as then we know the root was once mounted with an older
+ * kernel that was not aware of the root item structure change.
+ */
+static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
+ struct btrfs_root_item *item)
+{
+ u32 len;
+ int need_reset = 0;
+
+ len = btrfs_item_size(eb, slot);
+ read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot),
+ min_t(u32, len, sizeof(*item)));
+ if (len < sizeof(*item))
+ need_reset = 1;
+ if (!need_reset && btrfs_root_generation(item)
+ != btrfs_root_generation_v2(item)) {
+ if (btrfs_root_generation_v2(item) != 0) {
+ btrfs_warn(eb->fs_info,
+ "mismatching generation and generation_v2 found in root item. This root was probably mounted with an older kernel. Resetting all new fields.");
+ }
+ need_reset = 1;
+ }
+ if (need_reset) {
+ /* Clear all members from generation_v2 onwards. */
+ memset_startat(item, 0, generation_v2);
+ generate_random_guid(item->uuid);
+ }
+}
+
+/*
+ * btrfs_find_root - lookup the root by the key.
+ * root: the root of the root tree
+ * search_key: the key to search
+ * path: the path we search
+ * root_item: the root item of the tree we look for
+ * root_key: the root key of the tree we look for
+ *
+ * If ->offset of 'search_key' is -1ULL, it means we are not sure the offset
+ * of the search key, just lookup the root with the highest offset for a
+ * given objectid.
+ *
+ * If we find something return 0, otherwise > 0, < 0 on error.
+ */
+int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key,
+ struct btrfs_path *path, struct btrfs_root_item *root_item,
+ struct btrfs_key *root_key)
+{
+ struct btrfs_key found_key;
+ struct extent_buffer *l;
+ int ret;
+ int slot;
+
+ ret = btrfs_search_slot(NULL, root, search_key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ if (search_key->offset != -1ULL) { /* the search key is exact */
+ if (ret > 0)
+ goto out;
+ } else {
+ BUG_ON(ret == 0); /* Logical error */
+ if (path->slots[0] == 0)
+ goto out;
+ path->slots[0]--;
+ ret = 0;
+ }
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+
+ btrfs_item_key_to_cpu(l, &found_key, slot);
+ if (found_key.objectid != search_key->objectid ||
+ found_key.type != BTRFS_ROOT_ITEM_KEY) {
+ ret = 1;
+ goto out;
+ }
+
+ if (root_item)
+ btrfs_read_root_item(l, slot, root_item);
+ if (root_key)
+ memcpy(root_key, &found_key, sizeof(found_key));
+out:
+ btrfs_release_path(path);
+ return ret;
+}
+
+void btrfs_set_root_node(struct btrfs_root_item *item,
+ struct extent_buffer *node)
+{
+ btrfs_set_root_bytenr(item, node->start);
+ btrfs_set_root_level(item, btrfs_header_level(node));
+ btrfs_set_root_generation(item, btrfs_header_generation(node));
+}
+
+/*
+ * copy the data in 'item' into the btree
+ */
+int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_key *key, struct btrfs_root_item
+ *item)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_path *path;
+ struct extent_buffer *l;
+ int ret;
+ int slot;
+ unsigned long ptr;
+ u32 old_len;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(trans, root, key, path, 0, 1);
+ if (ret < 0)
+ goto out;
+
+ if (ret > 0) {
+ btrfs_crit(fs_info,
+ "unable to find root key (%llu %u %llu) in tree %llu",
+ key->objectid, key->type, key->offset,
+ root->root_key.objectid);
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+ ptr = btrfs_item_ptr_offset(l, slot);
+ old_len = btrfs_item_size(l, slot);
+
+ /*
+ * If this is the first time we update the root item which originated
+ * from an older kernel, we need to enlarge the item size to make room
+ * for the added fields.
+ */
+ if (old_len < sizeof(*item)) {
+ btrfs_release_path(path);
+ ret = btrfs_search_slot(trans, root, key, path,
+ -1, 1);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ ret = btrfs_del_item(trans, root, path);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ btrfs_release_path(path);
+ ret = btrfs_insert_empty_item(trans, root, path,
+ key, sizeof(*item));
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ l = path->nodes[0];
+ slot = path->slots[0];
+ ptr = btrfs_item_ptr_offset(l, slot);
+ }
+
+ /*
+ * Update generation_v2 so at the next mount we know the new root
+ * fields are valid.
+ */
+ btrfs_set_root_generation_v2(item, btrfs_root_generation(item));
+
+ write_extent_buffer(l, item, ptr, sizeof(*item));
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ const struct btrfs_key *key, struct btrfs_root_item *item)
+{
+ /*
+ * Make sure generation v1 and v2 match. See update_root for details.
+ */
+ btrfs_set_root_generation_v2(item, btrfs_root_generation(item));
+ return btrfs_insert_item(trans, root, key, item, sizeof(*item));
+}
+
+int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct extent_buffer *leaf;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_root *root;
+ int err = 0;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = BTRFS_ORPHAN_OBJECTID;
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
+ key.offset = 0;
+
+ while (1) {
+ u64 root_objectid;
+
+ ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
+ if (ret < 0) {
+ err = ret;
+ break;
+ }
+
+ leaf = path->nodes[0];
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(tree_root, path);
+ if (ret < 0)
+ err = ret;
+ if (ret != 0)
+ break;
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ btrfs_release_path(path);
+
+ if (key.objectid != BTRFS_ORPHAN_OBJECTID ||
+ key.type != BTRFS_ORPHAN_ITEM_KEY)
+ break;
+
+ root_objectid = key.offset;
+ key.offset++;
+
+ root = btrfs_get_fs_root(fs_info, root_objectid, false);
+ err = PTR_ERR_OR_ZERO(root);
+ if (err && err != -ENOENT) {
+ break;
+ } else if (err == -ENOENT) {
+ struct btrfs_trans_handle *trans;
+
+ btrfs_release_path(path);
+
+ trans = btrfs_join_transaction(tree_root);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ btrfs_handle_fs_error(fs_info, err,
+ "Failed to start trans to delete orphan item");
+ break;
+ }
+ err = btrfs_del_orphan_item(trans, tree_root,
+ root_objectid);
+ btrfs_end_transaction(trans);
+ if (err) {
+ btrfs_handle_fs_error(fs_info, err,
+ "Failed to delete root orphan item");
+ break;
+ }
+ continue;
+ }
+
+ WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state));
+ if (btrfs_root_refs(&root->root_item) == 0) {
+ struct btrfs_key drop_key;
+
+ btrfs_disk_key_to_cpu(&drop_key, &root->root_item.drop_progress);
+ /*
+ * If we have a non-zero drop_progress then we know we
+ * made it partly through deleting this snapshot, and
+ * thus we need to make sure we block any balance from
+ * happening until this snapshot is completely dropped.
+ */
+ if (drop_key.objectid != 0 || drop_key.type != 0 ||
+ drop_key.offset != 0) {
+ set_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
+ set_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
+ }
+
+ set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
+ btrfs_add_dead_root(root);
+ }
+ btrfs_put_root(root);
+ }
+
+ btrfs_free_path(path);
+ return err;
+}
+
+/* drop the root item for 'key' from the tree root */
+int btrfs_del_root(struct btrfs_trans_handle *trans,
+ const struct btrfs_key *key)
+{
+ struct btrfs_root *root = trans->fs_info->tree_root;
+ struct btrfs_path *path;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ ret = btrfs_search_slot(trans, root, key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+
+ BUG_ON(ret != 0);
+
+ ret = btrfs_del_item(trans, root, path);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
+ u64 ref_id, u64 dirid, u64 *sequence,
+ const struct fscrypt_str *name)
+{
+ struct btrfs_root *tree_root = trans->fs_info->tree_root;
+ struct btrfs_path *path;
+ struct btrfs_root_ref *ref;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ unsigned long ptr;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = root_id;
+ key.type = BTRFS_ROOT_BACKREF_KEY;
+ key.offset = ref_id;
+again:
+ ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
+ if (ret < 0) {
+ goto out;
+ } else if (ret == 0) {
+ leaf = path->nodes[0];
+ ref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_root_ref);
+ ptr = (unsigned long)(ref + 1);
+ if ((btrfs_root_ref_dirid(leaf, ref) != dirid) ||
+ (btrfs_root_ref_name_len(leaf, ref) != name->len) ||
+ memcmp_extent_buffer(leaf, name->name, ptr, name->len)) {
+ ret = -ENOENT;
+ goto out;
+ }
+ *sequence = btrfs_root_ref_sequence(leaf, ref);
+
+ ret = btrfs_del_item(trans, tree_root, path);
+ if (ret)
+ goto out;
+ } else {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ if (key.type == BTRFS_ROOT_BACKREF_KEY) {
+ btrfs_release_path(path);
+ key.objectid = ref_id;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = root_id;
+ goto again;
+ }
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY
+ * or BTRFS_ROOT_BACKREF_KEY.
+ *
+ * The dirid, sequence, name and name_len refer to the directory entry
+ * that is referencing the root.
+ *
+ * For a forward ref, the root_id is the id of the tree referencing
+ * the root and ref_id is the id of the subvol or snapshot.
+ *
+ * For a back ref the root_id is the id of the subvol or snapshot and
+ * ref_id is the id of the tree referencing it.
+ *
+ * Will return 0, -ENOMEM, or anything from the CoW path
+ */
+int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
+ u64 ref_id, u64 dirid, u64 sequence,
+ const struct fscrypt_str *name)
+{
+ struct btrfs_root *tree_root = trans->fs_info->tree_root;
+ struct btrfs_key key;
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_root_ref *ref;
+ struct extent_buffer *leaf;
+ unsigned long ptr;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = root_id;
+ key.type = BTRFS_ROOT_BACKREF_KEY;
+ key.offset = ref_id;
+again:
+ ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
+ sizeof(*ref) + name->len);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_free_path(path);
+ return ret;
+ }
+
+ leaf = path->nodes[0];
+ ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
+ btrfs_set_root_ref_dirid(leaf, ref, dirid);
+ btrfs_set_root_ref_sequence(leaf, ref, sequence);
+ btrfs_set_root_ref_name_len(leaf, ref, name->len);
+ ptr = (unsigned long)(ref + 1);
+ write_extent_buffer(leaf, name->name, ptr, name->len);
+ btrfs_mark_buffer_dirty(leaf);
+
+ if (key.type == BTRFS_ROOT_BACKREF_KEY) {
+ btrfs_release_path(path);
+ key.objectid = ref_id;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = root_id;
+ goto again;
+ }
+
+ btrfs_free_path(path);
+ return 0;
+}
+
+/*
+ * Old btrfs forgets to init root_item->flags and root_item->byte_limit
+ * for subvolumes. To work around this problem, we steal a bit from
+ * root_item->inode_item->flags, and use it to indicate if those fields
+ * have been properly initialized.
+ */
+void btrfs_check_and_init_root_item(struct btrfs_root_item *root_item)
+{
+ u64 inode_flags = btrfs_stack_inode_flags(&root_item->inode);
+
+ if (!(inode_flags & BTRFS_INODE_ROOT_ITEM_INIT)) {
+ inode_flags |= BTRFS_INODE_ROOT_ITEM_INIT;
+ btrfs_set_stack_inode_flags(&root_item->inode, inode_flags);
+ btrfs_set_root_flags(root_item, 0);
+ btrfs_set_root_limit(root_item, 0);
+ }
+}
+
+void btrfs_update_root_times(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_root_item *item = &root->root_item;
+ struct timespec64 ct;
+
+ ktime_get_real_ts64(&ct);
+ spin_lock(&root->root_item_lock);
+ btrfs_set_root_ctransid(item, trans->transid);
+ btrfs_set_stack_timespec_sec(&item->ctime, ct.tv_sec);
+ btrfs_set_stack_timespec_nsec(&item->ctime, ct.tv_nsec);
+ spin_unlock(&root->root_item_lock);
+}
+
+/*
+ * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
+ * root: the root of the parent directory
+ * rsv: block reservation
+ * items: the number of items that we need do reservation
+ * use_global_rsv: allow fallback to the global block reservation
+ *
+ * This function is used to reserve the space for snapshot/subvolume
+ * creation and deletion. Those operations are different with the
+ * common file/directory operations, they change two fs/file trees
+ * and root tree, the number of items that the qgroup reserves is
+ * different with the free space reservation. So we can not use
+ * the space reservation mechanism in start_transaction().
+ */
+int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv, int items,
+ bool use_global_rsv)
+{
+ u64 qgroup_num_bytes = 0;
+ u64 num_bytes;
+ int ret;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+
+ if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
+ /* One for parent inode, two for dir entries */
+ qgroup_num_bytes = 3 * fs_info->nodesize;
+ ret = btrfs_qgroup_reserve_meta_prealloc(root,
+ qgroup_num_bytes, true,
+ false);
+ if (ret)
+ return ret;
+ }
+
+ num_bytes = btrfs_calc_insert_metadata_size(fs_info, items);
+ rsv->space_info = btrfs_find_space_info(fs_info,
+ BTRFS_BLOCK_GROUP_METADATA);
+ ret = btrfs_block_rsv_add(fs_info, rsv, num_bytes,
+ BTRFS_RESERVE_FLUSH_ALL);
+
+ if (ret == -ENOSPC && use_global_rsv)
+ ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true);
+
+ if (ret && qgroup_num_bytes)
+ btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
+
+ if (!ret) {
+ spin_lock(&rsv->lock);
+ rsv->qgroup_rsv_reserved += qgroup_num_bytes;
+ spin_unlock(&rsv->lock);
+ }
+ return ret;
+}
+
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 qgroup_to_release;
+
+ btrfs_block_rsv_release(fs_info, rsv, (u64)-1, &qgroup_to_release);
+ btrfs_qgroup_convert_reserved_meta(root, qgroup_to_release);
+}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
new file mode 100644
index 000000000..1672d4846
--- /dev/null
+++ b/fs/btrfs/scrub.c
@@ -0,0 +1,4558 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2011, 2012 STRATO. All rights reserved.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/ratelimit.h>
+#include <linux/sched/mm.h>
+#include <crypto/hash.h>
+#include "ctree.h"
+#include "discard.h"
+#include "volumes.h"
+#include "disk-io.h"
+#include "ordered-data.h"
+#include "transaction.h"
+#include "backref.h"
+#include "extent_io.h"
+#include "dev-replace.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+#include "raid56.h"
+#include "block-group.h"
+#include "zoned.h"
+
+/*
+ * This is only the first step towards a full-features scrub. It reads all
+ * extent and super block and verifies the checksums. In case a bad checksum
+ * is found or the extent cannot be read, good data will be written back if
+ * any can be found.
+ *
+ * Future enhancements:
+ * - In case an unrepairable extent is encountered, track which files are
+ * affected and report them
+ * - track and record media errors, throw out bad devices
+ * - add a mode to also read unallocated space
+ */
+
+struct scrub_block;
+struct scrub_ctx;
+
+/*
+ * The following three values only influence the performance.
+ *
+ * The last one configures the number of parallel and outstanding I/O
+ * operations. The first one configures an upper limit for the number
+ * of (dynamically allocated) pages that are added to a bio.
+ */
+#define SCRUB_SECTORS_PER_BIO 32 /* 128KiB per bio for 4KiB pages */
+#define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for 4KiB pages */
+
+/*
+ * The following value times PAGE_SIZE needs to be large enough to match the
+ * largest node/leaf/sector size that shall be supported.
+ */
+#define SCRUB_MAX_SECTORS_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
+
+#define SCRUB_MAX_PAGES (DIV_ROUND_UP(BTRFS_MAX_METADATA_BLOCKSIZE, PAGE_SIZE))
+
+struct scrub_recover {
+ refcount_t refs;
+ struct btrfs_io_context *bioc;
+ u64 map_length;
+};
+
+struct scrub_sector {
+ struct scrub_block *sblock;
+ struct list_head list;
+ u64 flags; /* extent flags */
+ u64 generation;
+ /* Offset in bytes to @sblock. */
+ u32 offset;
+ atomic_t refs;
+ unsigned int have_csum:1;
+ unsigned int io_error:1;
+ u8 csum[BTRFS_CSUM_SIZE];
+
+ struct scrub_recover *recover;
+};
+
+struct scrub_bio {
+ int index;
+ struct scrub_ctx *sctx;
+ struct btrfs_device *dev;
+ struct bio *bio;
+ blk_status_t status;
+ u64 logical;
+ u64 physical;
+ struct scrub_sector *sectors[SCRUB_SECTORS_PER_BIO];
+ int sector_count;
+ int next_free;
+ struct work_struct work;
+};
+
+struct scrub_block {
+ /*
+ * Each page will have its page::private used to record the logical
+ * bytenr.
+ */
+ struct page *pages[SCRUB_MAX_PAGES];
+ struct scrub_sector *sectors[SCRUB_MAX_SECTORS_PER_BLOCK];
+ struct btrfs_device *dev;
+ /* Logical bytenr of the sblock */
+ u64 logical;
+ u64 physical;
+ u64 physical_for_dev_replace;
+ /* Length of sblock in bytes */
+ u32 len;
+ int sector_count;
+ int mirror_num;
+
+ atomic_t outstanding_sectors;
+ refcount_t refs; /* free mem on transition to zero */
+ struct scrub_ctx *sctx;
+ struct scrub_parity *sparity;
+ struct {
+ unsigned int header_error:1;
+ unsigned int checksum_error:1;
+ unsigned int no_io_error_seen:1;
+ unsigned int generation_error:1; /* also sets header_error */
+
+ /* The following is for the data used to check parity */
+ /* It is for the data with checksum */
+ unsigned int data_corrected:1;
+ };
+ struct work_struct work;
+};
+
+/* Used for the chunks with parity stripe such RAID5/6 */
+struct scrub_parity {
+ struct scrub_ctx *sctx;
+
+ struct btrfs_device *scrub_dev;
+
+ u64 logic_start;
+
+ u64 logic_end;
+
+ int nsectors;
+
+ u32 stripe_len;
+
+ refcount_t refs;
+
+ struct list_head sectors_list;
+
+ /* Work of parity check and repair */
+ struct work_struct work;
+
+ /* Mark the parity blocks which have data */
+ unsigned long dbitmap;
+
+ /*
+ * Mark the parity blocks which have data, but errors happen when
+ * read data or check data
+ */
+ unsigned long ebitmap;
+};
+
+struct scrub_ctx {
+ struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
+ struct btrfs_fs_info *fs_info;
+ int first_free;
+ int curr;
+ atomic_t bios_in_flight;
+ atomic_t workers_pending;
+ spinlock_t list_lock;
+ wait_queue_head_t list_wait;
+ struct list_head csum_list;
+ atomic_t cancel_req;
+ int readonly;
+ int sectors_per_bio;
+
+ /* State of IO submission throttling affecting the associated device */
+ ktime_t throttle_deadline;
+ u64 throttle_sent;
+
+ int is_dev_replace;
+ u64 write_pointer;
+
+ struct scrub_bio *wr_curr_bio;
+ struct mutex wr_lock;
+ struct btrfs_device *wr_tgtdev;
+ bool flush_all_writes;
+
+ /*
+ * statistics
+ */
+ struct btrfs_scrub_progress stat;
+ spinlock_t stat_lock;
+
+ /*
+ * Use a ref counter to avoid use-after-free issues. Scrub workers
+ * decrement bios_in_flight and workers_pending and then do a wakeup
+ * on the list_wait wait queue. We must ensure the main scrub task
+ * doesn't free the scrub context before or while the workers are
+ * doing the wakeup() call.
+ */
+ refcount_t refs;
+};
+
+struct scrub_warning {
+ struct btrfs_path *path;
+ u64 extent_item_size;
+ const char *errstr;
+ u64 physical;
+ u64 logical;
+ struct btrfs_device *dev;
+};
+
+struct full_stripe_lock {
+ struct rb_node node;
+ u64 logical;
+ u64 refs;
+ struct mutex mutex;
+};
+
+#ifndef CONFIG_64BIT
+/* This structure is for archtectures whose (void *) is smaller than u64 */
+struct scrub_page_private {
+ u64 logical;
+};
+#endif
+
+static int attach_scrub_page_private(struct page *page, u64 logical)
+{
+#ifdef CONFIG_64BIT
+ attach_page_private(page, (void *)logical);
+ return 0;
+#else
+ struct scrub_page_private *spp;
+
+ spp = kmalloc(sizeof(*spp), GFP_KERNEL);
+ if (!spp)
+ return -ENOMEM;
+ spp->logical = logical;
+ attach_page_private(page, (void *)spp);
+ return 0;
+#endif
+}
+
+static void detach_scrub_page_private(struct page *page)
+{
+#ifdef CONFIG_64BIT
+ detach_page_private(page);
+ return;
+#else
+ struct scrub_page_private *spp;
+
+ spp = detach_page_private(page);
+ kfree(spp);
+ return;
+#endif
+}
+
+static struct scrub_block *alloc_scrub_block(struct scrub_ctx *sctx,
+ struct btrfs_device *dev,
+ u64 logical, u64 physical,
+ u64 physical_for_dev_replace,
+ int mirror_num)
+{
+ struct scrub_block *sblock;
+
+ sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
+ if (!sblock)
+ return NULL;
+ refcount_set(&sblock->refs, 1);
+ sblock->sctx = sctx;
+ sblock->logical = logical;
+ sblock->physical = physical;
+ sblock->physical_for_dev_replace = physical_for_dev_replace;
+ sblock->dev = dev;
+ sblock->mirror_num = mirror_num;
+ sblock->no_io_error_seen = 1;
+ /*
+ * Scrub_block::pages will be allocated at alloc_scrub_sector() when
+ * the corresponding page is not allocated.
+ */
+ return sblock;
+}
+
+/*
+ * Allocate a new scrub sector and attach it to @sblock.
+ *
+ * Will also allocate new pages for @sblock if needed.
+ */
+static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock,
+ u64 logical, gfp_t gfp)
+{
+ const pgoff_t page_index = (logical - sblock->logical) >> PAGE_SHIFT;
+ struct scrub_sector *ssector;
+
+ /* We must never have scrub_block exceed U32_MAX in size. */
+ ASSERT(logical - sblock->logical < U32_MAX);
+
+ ssector = kzalloc(sizeof(*ssector), gfp);
+ if (!ssector)
+ return NULL;
+
+ /* Allocate a new page if the slot is not allocated */
+ if (!sblock->pages[page_index]) {
+ int ret;
+
+ sblock->pages[page_index] = alloc_page(gfp);
+ if (!sblock->pages[page_index]) {
+ kfree(ssector);
+ return NULL;
+ }
+ ret = attach_scrub_page_private(sblock->pages[page_index],
+ sblock->logical + (page_index << PAGE_SHIFT));
+ if (ret < 0) {
+ kfree(ssector);
+ __free_page(sblock->pages[page_index]);
+ sblock->pages[page_index] = NULL;
+ return NULL;
+ }
+ }
+
+ atomic_set(&ssector->refs, 1);
+ ssector->sblock = sblock;
+ /* The sector to be added should not be used */
+ ASSERT(sblock->sectors[sblock->sector_count] == NULL);
+ ssector->offset = logical - sblock->logical;
+
+ /* The sector count must be smaller than the limit */
+ ASSERT(sblock->sector_count < SCRUB_MAX_SECTORS_PER_BLOCK);
+
+ sblock->sectors[sblock->sector_count] = ssector;
+ sblock->sector_count++;
+ sblock->len += sblock->sctx->fs_info->sectorsize;
+
+ return ssector;
+}
+
+static struct page *scrub_sector_get_page(struct scrub_sector *ssector)
+{
+ struct scrub_block *sblock = ssector->sblock;
+ pgoff_t index;
+ /*
+ * When calling this function, ssector must be alreaday attached to the
+ * parent sblock.
+ */
+ ASSERT(sblock);
+
+ /* The range should be inside the sblock range */
+ ASSERT(ssector->offset < sblock->len);
+
+ index = ssector->offset >> PAGE_SHIFT;
+ ASSERT(index < SCRUB_MAX_PAGES);
+ ASSERT(sblock->pages[index]);
+ ASSERT(PagePrivate(sblock->pages[index]));
+ return sblock->pages[index];
+}
+
+static unsigned int scrub_sector_get_page_offset(struct scrub_sector *ssector)
+{
+ struct scrub_block *sblock = ssector->sblock;
+
+ /*
+ * When calling this function, ssector must be already attached to the
+ * parent sblock.
+ */
+ ASSERT(sblock);
+
+ /* The range should be inside the sblock range */
+ ASSERT(ssector->offset < sblock->len);
+
+ return offset_in_page(ssector->offset);
+}
+
+static char *scrub_sector_get_kaddr(struct scrub_sector *ssector)
+{
+ return page_address(scrub_sector_get_page(ssector)) +
+ scrub_sector_get_page_offset(ssector);
+}
+
+static int bio_add_scrub_sector(struct bio *bio, struct scrub_sector *ssector,
+ unsigned int len)
+{
+ return bio_add_page(bio, scrub_sector_get_page(ssector), len,
+ scrub_sector_get_page_offset(ssector));
+}
+
+static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
+ struct scrub_block *sblocks_for_recheck[]);
+static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
+ struct scrub_block *sblock,
+ int retry_failed_mirror);
+static void scrub_recheck_block_checksum(struct scrub_block *sblock);
+static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
+ struct scrub_block *sblock_good);
+static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
+ struct scrub_block *sblock_good,
+ int sector_num, int force_write);
+static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
+static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock,
+ int sector_num);
+static int scrub_checksum_data(struct scrub_block *sblock);
+static int scrub_checksum_tree_block(struct scrub_block *sblock);
+static int scrub_checksum_super(struct scrub_block *sblock);
+static void scrub_block_put(struct scrub_block *sblock);
+static void scrub_sector_get(struct scrub_sector *sector);
+static void scrub_sector_put(struct scrub_sector *sector);
+static void scrub_parity_get(struct scrub_parity *sparity);
+static void scrub_parity_put(struct scrub_parity *sparity);
+static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
+ u64 physical, struct btrfs_device *dev, u64 flags,
+ u64 gen, int mirror_num, u8 *csum,
+ u64 physical_for_dev_replace);
+static void scrub_bio_end_io(struct bio *bio);
+static void scrub_bio_end_io_worker(struct work_struct *work);
+static void scrub_block_complete(struct scrub_block *sblock);
+static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
+ u64 extent_logical, u32 extent_len,
+ u64 *extent_physical,
+ struct btrfs_device **extent_dev,
+ int *extent_mirror_num);
+static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
+ struct scrub_sector *sector);
+static void scrub_wr_submit(struct scrub_ctx *sctx);
+static void scrub_wr_bio_end_io(struct bio *bio);
+static void scrub_wr_bio_end_io_worker(struct work_struct *work);
+static void scrub_put_ctx(struct scrub_ctx *sctx);
+
+static inline int scrub_is_page_on_raid56(struct scrub_sector *sector)
+{
+ return sector->recover &&
+ (sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
+}
+
+static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
+{
+ refcount_inc(&sctx->refs);
+ atomic_inc(&sctx->bios_in_flight);
+}
+
+static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
+{
+ atomic_dec(&sctx->bios_in_flight);
+ wake_up(&sctx->list_wait);
+ scrub_put_ctx(sctx);
+}
+
+static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
+{
+ while (atomic_read(&fs_info->scrub_pause_req)) {
+ mutex_unlock(&fs_info->scrub_lock);
+ wait_event(fs_info->scrub_pause_wait,
+ atomic_read(&fs_info->scrub_pause_req) == 0);
+ mutex_lock(&fs_info->scrub_lock);
+ }
+}
+
+static void scrub_pause_on(struct btrfs_fs_info *fs_info)
+{
+ atomic_inc(&fs_info->scrubs_paused);
+ wake_up(&fs_info->scrub_pause_wait);
+}
+
+static void scrub_pause_off(struct btrfs_fs_info *fs_info)
+{
+ mutex_lock(&fs_info->scrub_lock);
+ __scrub_blocked_if_needed(fs_info);
+ atomic_dec(&fs_info->scrubs_paused);
+ mutex_unlock(&fs_info->scrub_lock);
+
+ wake_up(&fs_info->scrub_pause_wait);
+}
+
+static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
+{
+ scrub_pause_on(fs_info);
+ scrub_pause_off(fs_info);
+}
+
+/*
+ * Insert new full stripe lock into full stripe locks tree
+ *
+ * Return pointer to existing or newly inserted full_stripe_lock structure if
+ * everything works well.
+ * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
+ *
+ * NOTE: caller must hold full_stripe_locks_root->lock before calling this
+ * function
+ */
+static struct full_stripe_lock *insert_full_stripe_lock(
+ struct btrfs_full_stripe_locks_tree *locks_root,
+ u64 fstripe_logical)
+{
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ struct full_stripe_lock *entry;
+ struct full_stripe_lock *ret;
+
+ lockdep_assert_held(&locks_root->lock);
+
+ p = &locks_root->root.rb_node;
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct full_stripe_lock, node);
+ if (fstripe_logical < entry->logical) {
+ p = &(*p)->rb_left;
+ } else if (fstripe_logical > entry->logical) {
+ p = &(*p)->rb_right;
+ } else {
+ entry->refs++;
+ return entry;
+ }
+ }
+
+ /*
+ * Insert new lock.
+ */
+ ret = kmalloc(sizeof(*ret), GFP_KERNEL);
+ if (!ret)
+ return ERR_PTR(-ENOMEM);
+ ret->logical = fstripe_logical;
+ ret->refs = 1;
+ mutex_init(&ret->mutex);
+
+ rb_link_node(&ret->node, parent, p);
+ rb_insert_color(&ret->node, &locks_root->root);
+ return ret;
+}
+
+/*
+ * Search for a full stripe lock of a block group
+ *
+ * Return pointer to existing full stripe lock if found
+ * Return NULL if not found
+ */
+static struct full_stripe_lock *search_full_stripe_lock(
+ struct btrfs_full_stripe_locks_tree *locks_root,
+ u64 fstripe_logical)
+{
+ struct rb_node *node;
+ struct full_stripe_lock *entry;
+
+ lockdep_assert_held(&locks_root->lock);
+
+ node = locks_root->root.rb_node;
+ while (node) {
+ entry = rb_entry(node, struct full_stripe_lock, node);
+ if (fstripe_logical < entry->logical)
+ node = node->rb_left;
+ else if (fstripe_logical > entry->logical)
+ node = node->rb_right;
+ else
+ return entry;
+ }
+ return NULL;
+}
+
+/*
+ * Helper to get full stripe logical from a normal bytenr.
+ *
+ * Caller must ensure @cache is a RAID56 block group.
+ */
+static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
+{
+ u64 ret;
+
+ /*
+ * Due to chunk item size limit, full stripe length should not be
+ * larger than U32_MAX. Just a sanity check here.
+ */
+ WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
+
+ /*
+ * round_down() can only handle power of 2, while RAID56 full
+ * stripe length can be 64KiB * n, so we need to manually round down.
+ */
+ ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
+ cache->full_stripe_len + cache->start;
+ return ret;
+}
+
+/*
+ * Lock a full stripe to avoid concurrency of recovery and read
+ *
+ * It's only used for profiles with parities (RAID5/6), for other profiles it
+ * does nothing.
+ *
+ * Return 0 if we locked full stripe covering @bytenr, with a mutex held.
+ * So caller must call unlock_full_stripe() at the same context.
+ *
+ * Return <0 if encounters error.
+ */
+static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
+ bool *locked_ret)
+{
+ struct btrfs_block_group *bg_cache;
+ struct btrfs_full_stripe_locks_tree *locks_root;
+ struct full_stripe_lock *existing;
+ u64 fstripe_start;
+ int ret = 0;
+
+ *locked_ret = false;
+ bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
+ if (!bg_cache) {
+ ASSERT(0);
+ return -ENOENT;
+ }
+
+ /* Profiles not based on parity don't need full stripe lock */
+ if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
+ goto out;
+ locks_root = &bg_cache->full_stripe_locks_root;
+
+ fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
+
+ /* Now insert the full stripe lock */
+ mutex_lock(&locks_root->lock);
+ existing = insert_full_stripe_lock(locks_root, fstripe_start);
+ mutex_unlock(&locks_root->lock);
+ if (IS_ERR(existing)) {
+ ret = PTR_ERR(existing);
+ goto out;
+ }
+ mutex_lock(&existing->mutex);
+ *locked_ret = true;
+out:
+ btrfs_put_block_group(bg_cache);
+ return ret;
+}
+
+/*
+ * Unlock a full stripe.
+ *
+ * NOTE: Caller must ensure it's the same context calling corresponding
+ * lock_full_stripe().
+ *
+ * Return 0 if we unlock full stripe without problem.
+ * Return <0 for error
+ */
+static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
+ bool locked)
+{
+ struct btrfs_block_group *bg_cache;
+ struct btrfs_full_stripe_locks_tree *locks_root;
+ struct full_stripe_lock *fstripe_lock;
+ u64 fstripe_start;
+ bool freeit = false;
+ int ret = 0;
+
+ /* If we didn't acquire full stripe lock, no need to continue */
+ if (!locked)
+ return 0;
+
+ bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
+ if (!bg_cache) {
+ ASSERT(0);
+ return -ENOENT;
+ }
+ if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
+ goto out;
+
+ locks_root = &bg_cache->full_stripe_locks_root;
+ fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
+
+ mutex_lock(&locks_root->lock);
+ fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
+ /* Unpaired unlock_full_stripe() detected */
+ if (!fstripe_lock) {
+ WARN_ON(1);
+ ret = -ENOENT;
+ mutex_unlock(&locks_root->lock);
+ goto out;
+ }
+
+ if (fstripe_lock->refs == 0) {
+ WARN_ON(1);
+ btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
+ fstripe_lock->logical);
+ } else {
+ fstripe_lock->refs--;
+ }
+
+ if (fstripe_lock->refs == 0) {
+ rb_erase(&fstripe_lock->node, &locks_root->root);
+ freeit = true;
+ }
+ mutex_unlock(&locks_root->lock);
+
+ mutex_unlock(&fstripe_lock->mutex);
+ if (freeit)
+ kfree(fstripe_lock);
+out:
+ btrfs_put_block_group(bg_cache);
+ return ret;
+}
+
+static void scrub_free_csums(struct scrub_ctx *sctx)
+{
+ while (!list_empty(&sctx->csum_list)) {
+ struct btrfs_ordered_sum *sum;
+ sum = list_first_entry(&sctx->csum_list,
+ struct btrfs_ordered_sum, list);
+ list_del(&sum->list);
+ kfree(sum);
+ }
+}
+
+static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
+{
+ int i;
+
+ if (!sctx)
+ return;
+
+ /* this can happen when scrub is cancelled */
+ if (sctx->curr != -1) {
+ struct scrub_bio *sbio = sctx->bios[sctx->curr];
+
+ for (i = 0; i < sbio->sector_count; i++)
+ scrub_block_put(sbio->sectors[i]->sblock);
+ bio_put(sbio->bio);
+ }
+
+ for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
+ struct scrub_bio *sbio = sctx->bios[i];
+
+ if (!sbio)
+ break;
+ kfree(sbio);
+ }
+
+ kfree(sctx->wr_curr_bio);
+ scrub_free_csums(sctx);
+ kfree(sctx);
+}
+
+static void scrub_put_ctx(struct scrub_ctx *sctx)
+{
+ if (refcount_dec_and_test(&sctx->refs))
+ scrub_free_ctx(sctx);
+}
+
+static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
+ struct btrfs_fs_info *fs_info, int is_dev_replace)
+{
+ struct scrub_ctx *sctx;
+ int i;
+
+ sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
+ if (!sctx)
+ goto nomem;
+ refcount_set(&sctx->refs, 1);
+ sctx->is_dev_replace = is_dev_replace;
+ sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO;
+ sctx->curr = -1;
+ sctx->fs_info = fs_info;
+ INIT_LIST_HEAD(&sctx->csum_list);
+ for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
+ struct scrub_bio *sbio;
+
+ sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
+ if (!sbio)
+ goto nomem;
+ sctx->bios[i] = sbio;
+
+ sbio->index = i;
+ sbio->sctx = sctx;
+ sbio->sector_count = 0;
+ INIT_WORK(&sbio->work, scrub_bio_end_io_worker);
+
+ if (i != SCRUB_BIOS_PER_SCTX - 1)
+ sctx->bios[i]->next_free = i + 1;
+ else
+ sctx->bios[i]->next_free = -1;
+ }
+ sctx->first_free = 0;
+ atomic_set(&sctx->bios_in_flight, 0);
+ atomic_set(&sctx->workers_pending, 0);
+ atomic_set(&sctx->cancel_req, 0);
+
+ spin_lock_init(&sctx->list_lock);
+ spin_lock_init(&sctx->stat_lock);
+ init_waitqueue_head(&sctx->list_wait);
+ sctx->throttle_deadline = 0;
+
+ WARN_ON(sctx->wr_curr_bio != NULL);
+ mutex_init(&sctx->wr_lock);
+ sctx->wr_curr_bio = NULL;
+ if (is_dev_replace) {
+ WARN_ON(!fs_info->dev_replace.tgtdev);
+ sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
+ sctx->flush_all_writes = false;
+ }
+
+ return sctx;
+
+nomem:
+ scrub_free_ctx(sctx);
+ return ERR_PTR(-ENOMEM);
+}
+
+static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
+ void *warn_ctx)
+{
+ u32 nlink;
+ int ret;
+ int i;
+ unsigned nofs_flag;
+ struct extent_buffer *eb;
+ struct btrfs_inode_item *inode_item;
+ struct scrub_warning *swarn = warn_ctx;
+ struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
+ struct inode_fs_paths *ipath = NULL;
+ struct btrfs_root *local_root;
+ struct btrfs_key key;
+
+ local_root = btrfs_get_fs_root(fs_info, root, true);
+ if (IS_ERR(local_root)) {
+ ret = PTR_ERR(local_root);
+ goto err;
+ }
+
+ /*
+ * this makes the path point to (inum INODE_ITEM ioff)
+ */
+ key.objectid = inum;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
+ if (ret) {
+ btrfs_put_root(local_root);
+ btrfs_release_path(swarn->path);
+ goto err;
+ }
+
+ eb = swarn->path->nodes[0];
+ inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
+ struct btrfs_inode_item);
+ nlink = btrfs_inode_nlink(eb, inode_item);
+ btrfs_release_path(swarn->path);
+
+ /*
+ * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub
+ * uses GFP_NOFS in this context, so we keep it consistent but it does
+ * not seem to be strictly necessary.
+ */
+ nofs_flag = memalloc_nofs_save();
+ ipath = init_ipath(4096, local_root, swarn->path);
+ memalloc_nofs_restore(nofs_flag);
+ if (IS_ERR(ipath)) {
+ btrfs_put_root(local_root);
+ ret = PTR_ERR(ipath);
+ ipath = NULL;
+ goto err;
+ }
+ ret = paths_from_inode(inum, ipath);
+
+ if (ret < 0)
+ goto err;
+
+ /*
+ * we deliberately ignore the bit ipath might have been too small to
+ * hold all of the paths here
+ */
+ for (i = 0; i < ipath->fspath->elem_cnt; ++i)
+ btrfs_warn_in_rcu(fs_info,
+"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
+ swarn->errstr, swarn->logical,
+ rcu_str_deref(swarn->dev->name),
+ swarn->physical,
+ root, inum, offset,
+ fs_info->sectorsize, nlink,
+ (char *)(unsigned long)ipath->fspath->val[i]);
+
+ btrfs_put_root(local_root);
+ free_ipath(ipath);
+ return 0;
+
+err:
+ btrfs_warn_in_rcu(fs_info,
+ "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
+ swarn->errstr, swarn->logical,
+ rcu_str_deref(swarn->dev->name),
+ swarn->physical,
+ root, inum, offset, ret);
+
+ free_ipath(ipath);
+ return 0;
+}
+
+static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
+{
+ struct btrfs_device *dev;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_path *path;
+ struct btrfs_key found_key;
+ struct extent_buffer *eb;
+ struct btrfs_extent_item *ei;
+ struct scrub_warning swarn;
+ unsigned long ptr = 0;
+ u64 extent_item_pos;
+ u64 flags = 0;
+ u64 ref_root;
+ u32 item_size;
+ u8 ref_level = 0;
+ int ret;
+
+ WARN_ON(sblock->sector_count < 1);
+ dev = sblock->dev;
+ fs_info = sblock->sctx->fs_info;
+
+ /* Super block error, no need to search extent tree. */
+ if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
+ btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu",
+ errstr, rcu_str_deref(dev->name),
+ sblock->physical);
+ return;
+ }
+ path = btrfs_alloc_path();
+ if (!path)
+ return;
+
+ swarn.physical = sblock->physical;
+ swarn.logical = sblock->logical;
+ swarn.errstr = errstr;
+ swarn.dev = NULL;
+
+ ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
+ &flags);
+ if (ret < 0)
+ goto out;
+
+ extent_item_pos = swarn.logical - found_key.objectid;
+ swarn.extent_item_size = found_key.offset;
+
+ eb = path->nodes[0];
+ ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
+ item_size = btrfs_item_size(eb, path->slots[0]);
+
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ do {
+ ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
+ item_size, &ref_root,
+ &ref_level);
+ btrfs_warn_in_rcu(fs_info,
+"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
+ errstr, swarn.logical,
+ rcu_str_deref(dev->name),
+ swarn.physical,
+ ref_level ? "node" : "leaf",
+ ret < 0 ? -1 : ref_level,
+ ret < 0 ? -1 : ref_root);
+ } while (ret != 1);
+ btrfs_release_path(path);
+ } else {
+ btrfs_release_path(path);
+ swarn.path = path;
+ swarn.dev = dev;
+ iterate_extent_inodes(fs_info, found_key.objectid,
+ extent_item_pos, 1,
+ scrub_print_warning_inode, &swarn, false);
+ }
+
+out:
+ btrfs_free_path(path);
+}
+
+static inline void scrub_get_recover(struct scrub_recover *recover)
+{
+ refcount_inc(&recover->refs);
+}
+
+static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
+ struct scrub_recover *recover)
+{
+ if (refcount_dec_and_test(&recover->refs)) {
+ btrfs_bio_counter_dec(fs_info);
+ btrfs_put_bioc(recover->bioc);
+ kfree(recover);
+ }
+}
+
+/*
+ * scrub_handle_errored_block gets called when either verification of the
+ * sectors failed or the bio failed to read, e.g. with EIO. In the latter
+ * case, this function handles all sectors in the bio, even though only one
+ * may be bad.
+ * The goal of this function is to repair the errored block by using the
+ * contents of one of the mirrors.
+ */
+static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
+{
+ struct scrub_ctx *sctx = sblock_to_check->sctx;
+ struct btrfs_device *dev = sblock_to_check->dev;
+ struct btrfs_fs_info *fs_info;
+ u64 logical;
+ unsigned int failed_mirror_index;
+ unsigned int is_metadata;
+ unsigned int have_csum;
+ /* One scrub_block for each mirror */
+ struct scrub_block *sblocks_for_recheck[BTRFS_MAX_MIRRORS] = { 0 };
+ struct scrub_block *sblock_bad;
+ int ret;
+ int mirror_index;
+ int sector_num;
+ int success;
+ bool full_stripe_locked;
+ unsigned int nofs_flag;
+ static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ BUG_ON(sblock_to_check->sector_count < 1);
+ fs_info = sctx->fs_info;
+ if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
+ /*
+ * If we find an error in a super block, we just report it.
+ * They will get written with the next transaction commit
+ * anyway
+ */
+ scrub_print_warning("super block error", sblock_to_check);
+ spin_lock(&sctx->stat_lock);
+ ++sctx->stat.super_errors;
+ spin_unlock(&sctx->stat_lock);
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
+ return 0;
+ }
+ logical = sblock_to_check->logical;
+ ASSERT(sblock_to_check->mirror_num);
+ failed_mirror_index = sblock_to_check->mirror_num - 1;
+ is_metadata = !(sblock_to_check->sectors[0]->flags &
+ BTRFS_EXTENT_FLAG_DATA);
+ have_csum = sblock_to_check->sectors[0]->have_csum;
+
+ if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
+ return 0;
+
+ /*
+ * We must use GFP_NOFS because the scrub task might be waiting for a
+ * worker task executing this function and in turn a transaction commit
+ * might be waiting the scrub task to pause (which needs to wait for all
+ * the worker tasks to complete before pausing).
+ * We do allocations in the workers through insert_full_stripe_lock()
+ * and scrub_add_sector_to_wr_bio(), which happens down the call chain of
+ * this function.
+ */
+ nofs_flag = memalloc_nofs_save();
+ /*
+ * For RAID5/6, race can happen for a different device scrub thread.
+ * For data corruption, Parity and Data threads will both try
+ * to recovery the data.
+ * Race can lead to doubly added csum error, or even unrecoverable
+ * error.
+ */
+ ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
+ if (ret < 0) {
+ memalloc_nofs_restore(nofs_flag);
+ spin_lock(&sctx->stat_lock);
+ if (ret == -ENOMEM)
+ sctx->stat.malloc_errors++;
+ sctx->stat.read_errors++;
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
+ return ret;
+ }
+
+ /*
+ * read all mirrors one after the other. This includes to
+ * re-read the extent or metadata block that failed (that was
+ * the cause that this fixup code is called) another time,
+ * sector by sector this time in order to know which sectors
+ * caused I/O errors and which ones are good (for all mirrors).
+ * It is the goal to handle the situation when more than one
+ * mirror contains I/O errors, but the errors do not
+ * overlap, i.e. the data can be repaired by selecting the
+ * sectors from those mirrors without I/O error on the
+ * particular sectors. One example (with blocks >= 2 * sectorsize)
+ * would be that mirror #1 has an I/O error on the first sector,
+ * the second sector is good, and mirror #2 has an I/O error on
+ * the second sector, but the first sector is good.
+ * Then the first sector of the first mirror can be repaired by
+ * taking the first sector of the second mirror, and the
+ * second sector of the second mirror can be repaired by
+ * copying the contents of the 2nd sector of the 1st mirror.
+ * One more note: if the sectors of one mirror contain I/O
+ * errors, the checksum cannot be verified. In order to get
+ * the best data for repairing, the first attempt is to find
+ * a mirror without I/O errors and with a validated checksum.
+ * Only if this is not possible, the sectors are picked from
+ * mirrors with I/O errors without considering the checksum.
+ * If the latter is the case, at the end, the checksum of the
+ * repaired area is verified in order to correctly maintain
+ * the statistics.
+ */
+ for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) {
+ /*
+ * Note: the two members refs and outstanding_sectors are not
+ * used in the blocks that are used for the recheck procedure.
+ *
+ * But alloc_scrub_block() will initialize sblock::ref anyway,
+ * so we can use scrub_block_put() to clean them up.
+ *
+ * And here we don't setup the physical/dev for the sblock yet,
+ * they will be correctly initialized in scrub_setup_recheck_block().
+ */
+ sblocks_for_recheck[mirror_index] = alloc_scrub_block(sctx, NULL,
+ logical, 0, 0, mirror_index);
+ if (!sblocks_for_recheck[mirror_index]) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ sctx->stat.read_errors++;
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
+ goto out;
+ }
+ }
+
+ /* Setup the context, map the logical blocks and alloc the sectors */
+ ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
+ if (ret) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.read_errors++;
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
+ goto out;
+ }
+ BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
+ sblock_bad = sblocks_for_recheck[failed_mirror_index];
+
+ /* build and submit the bios for the failed mirror, check checksums */
+ scrub_recheck_block(fs_info, sblock_bad, 1);
+
+ if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
+ sblock_bad->no_io_error_seen) {
+ /*
+ * The error disappeared after reading sector by sector, or
+ * the area was part of a huge bio and other parts of the
+ * bio caused I/O errors, or the block layer merged several
+ * read requests into one and the error is caused by a
+ * different bio (usually one of the two latter cases is
+ * the cause)
+ */
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.unverified_errors++;
+ sblock_to_check->data_corrected = 1;
+ spin_unlock(&sctx->stat_lock);
+
+ if (sctx->is_dev_replace)
+ scrub_write_block_to_dev_replace(sblock_bad);
+ goto out;
+ }
+
+ if (!sblock_bad->no_io_error_seen) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.read_errors++;
+ spin_unlock(&sctx->stat_lock);
+ if (__ratelimit(&rs))
+ scrub_print_warning("i/o error", sblock_to_check);
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
+ } else if (sblock_bad->checksum_error) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.csum_errors++;
+ spin_unlock(&sctx->stat_lock);
+ if (__ratelimit(&rs))
+ scrub_print_warning("checksum error", sblock_to_check);
+ btrfs_dev_stat_inc_and_print(dev,
+ BTRFS_DEV_STAT_CORRUPTION_ERRS);
+ } else if (sblock_bad->header_error) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.verify_errors++;
+ spin_unlock(&sctx->stat_lock);
+ if (__ratelimit(&rs))
+ scrub_print_warning("checksum/header error",
+ sblock_to_check);
+ if (sblock_bad->generation_error)
+ btrfs_dev_stat_inc_and_print(dev,
+ BTRFS_DEV_STAT_GENERATION_ERRS);
+ else
+ btrfs_dev_stat_inc_and_print(dev,
+ BTRFS_DEV_STAT_CORRUPTION_ERRS);
+ }
+
+ if (sctx->readonly) {
+ ASSERT(!sctx->is_dev_replace);
+ goto out;
+ }
+
+ /*
+ * now build and submit the bios for the other mirrors, check
+ * checksums.
+ * First try to pick the mirror which is completely without I/O
+ * errors and also does not have a checksum error.
+ * If one is found, and if a checksum is present, the full block
+ * that is known to contain an error is rewritten. Afterwards
+ * the block is known to be corrected.
+ * If a mirror is found which is completely correct, and no
+ * checksum is present, only those sectors are rewritten that had
+ * an I/O error in the block to be repaired, since it cannot be
+ * determined, which copy of the other sectors is better (and it
+ * could happen otherwise that a correct sector would be
+ * overwritten by a bad one).
+ */
+ for (mirror_index = 0; ;mirror_index++) {
+ struct scrub_block *sblock_other;
+
+ if (mirror_index == failed_mirror_index)
+ continue;
+
+ /* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
+ if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
+ if (mirror_index >= BTRFS_MAX_MIRRORS)
+ break;
+ if (!sblocks_for_recheck[mirror_index]->sector_count)
+ break;
+
+ sblock_other = sblocks_for_recheck[mirror_index];
+ } else {
+ struct scrub_recover *r = sblock_bad->sectors[0]->recover;
+ int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
+
+ if (mirror_index >= max_allowed)
+ break;
+ if (!sblocks_for_recheck[1]->sector_count)
+ break;
+
+ ASSERT(failed_mirror_index == 0);
+ sblock_other = sblocks_for_recheck[1];
+ sblock_other->mirror_num = 1 + mirror_index;
+ }
+
+ /* build and submit the bios, check checksums */
+ scrub_recheck_block(fs_info, sblock_other, 0);
+
+ if (!sblock_other->header_error &&
+ !sblock_other->checksum_error &&
+ sblock_other->no_io_error_seen) {
+ if (sctx->is_dev_replace) {
+ scrub_write_block_to_dev_replace(sblock_other);
+ goto corrected_error;
+ } else {
+ ret = scrub_repair_block_from_good_copy(
+ sblock_bad, sblock_other);
+ if (!ret)
+ goto corrected_error;
+ }
+ }
+ }
+
+ if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
+ goto did_not_correct_error;
+
+ /*
+ * In case of I/O errors in the area that is supposed to be
+ * repaired, continue by picking good copies of those sectors.
+ * Select the good sectors from mirrors to rewrite bad sectors from
+ * the area to fix. Afterwards verify the checksum of the block
+ * that is supposed to be repaired. This verification step is
+ * only done for the purpose of statistic counting and for the
+ * final scrub report, whether errors remain.
+ * A perfect algorithm could make use of the checksum and try
+ * all possible combinations of sectors from the different mirrors
+ * until the checksum verification succeeds. For example, when
+ * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector
+ * of mirror #2 is readable but the final checksum test fails,
+ * then the 2nd sector of mirror #3 could be tried, whether now
+ * the final checksum succeeds. But this would be a rare
+ * exception and is therefore not implemented. At least it is
+ * avoided that the good copy is overwritten.
+ * A more useful improvement would be to pick the sectors
+ * without I/O error based on sector sizes (512 bytes on legacy
+ * disks) instead of on sectorsize. Then maybe 512 byte of one
+ * mirror could be repaired by taking 512 byte of a different
+ * mirror, even if other 512 byte sectors in the same sectorsize
+ * area are unreadable.
+ */
+ success = 1;
+ for (sector_num = 0; sector_num < sblock_bad->sector_count;
+ sector_num++) {
+ struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
+ struct scrub_block *sblock_other = NULL;
+
+ /* Skip no-io-error sectors in scrub */
+ if (!sector_bad->io_error && !sctx->is_dev_replace)
+ continue;
+
+ if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
+ /*
+ * In case of dev replace, if raid56 rebuild process
+ * didn't work out correct data, then copy the content
+ * in sblock_bad to make sure target device is identical
+ * to source device, instead of writing garbage data in
+ * sblock_for_recheck array to target device.
+ */
+ sblock_other = NULL;
+ } else if (sector_bad->io_error) {
+ /* Try to find no-io-error sector in mirrors */
+ for (mirror_index = 0;
+ mirror_index < BTRFS_MAX_MIRRORS &&
+ sblocks_for_recheck[mirror_index]->sector_count > 0;
+ mirror_index++) {
+ if (!sblocks_for_recheck[mirror_index]->
+ sectors[sector_num]->io_error) {
+ sblock_other = sblocks_for_recheck[mirror_index];
+ break;
+ }
+ }
+ if (!sblock_other)
+ success = 0;
+ }
+
+ if (sctx->is_dev_replace) {
+ /*
+ * Did not find a mirror to fetch the sector from.
+ * scrub_write_sector_to_dev_replace() handles this
+ * case (sector->io_error), by filling the block with
+ * zeros before submitting the write request
+ */
+ if (!sblock_other)
+ sblock_other = sblock_bad;
+
+ if (scrub_write_sector_to_dev_replace(sblock_other,
+ sector_num) != 0) {
+ atomic64_inc(
+ &fs_info->dev_replace.num_write_errors);
+ success = 0;
+ }
+ } else if (sblock_other) {
+ ret = scrub_repair_sector_from_good_copy(sblock_bad,
+ sblock_other,
+ sector_num, 0);
+ if (0 == ret)
+ sector_bad->io_error = 0;
+ else
+ success = 0;
+ }
+ }
+
+ if (success && !sctx->is_dev_replace) {
+ if (is_metadata || have_csum) {
+ /*
+ * need to verify the checksum now that all
+ * sectors on disk are repaired (the write
+ * request for data to be repaired is on its way).
+ * Just be lazy and use scrub_recheck_block()
+ * which re-reads the data before the checksum
+ * is verified, but most likely the data comes out
+ * of the page cache.
+ */
+ scrub_recheck_block(fs_info, sblock_bad, 1);
+ if (!sblock_bad->header_error &&
+ !sblock_bad->checksum_error &&
+ sblock_bad->no_io_error_seen)
+ goto corrected_error;
+ else
+ goto did_not_correct_error;
+ } else {
+corrected_error:
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.corrected_errors++;
+ sblock_to_check->data_corrected = 1;
+ spin_unlock(&sctx->stat_lock);
+ btrfs_err_rl_in_rcu(fs_info,
+ "fixed up error at logical %llu on dev %s",
+ logical, rcu_str_deref(dev->name));
+ }
+ } else {
+did_not_correct_error:
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
+ btrfs_err_rl_in_rcu(fs_info,
+ "unable to fixup (regular) error at logical %llu on dev %s",
+ logical, rcu_str_deref(dev->name));
+ }
+
+out:
+ for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) {
+ struct scrub_block *sblock = sblocks_for_recheck[mirror_index];
+ struct scrub_recover *recover;
+ int sector_index;
+
+ /* Not allocated, continue checking the next mirror */
+ if (!sblock)
+ continue;
+
+ for (sector_index = 0; sector_index < sblock->sector_count;
+ sector_index++) {
+ /*
+ * Here we just cleanup the recover, each sector will be
+ * properly cleaned up by later scrub_block_put()
+ */
+ recover = sblock->sectors[sector_index]->recover;
+ if (recover) {
+ scrub_put_recover(fs_info, recover);
+ sblock->sectors[sector_index]->recover = NULL;
+ }
+ }
+ scrub_block_put(sblock);
+ }
+
+ ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
+ memalloc_nofs_restore(nofs_flag);
+ if (ret < 0)
+ return ret;
+ return 0;
+}
+
+static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
+{
+ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
+ return 2;
+ else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
+ return 3;
+ else
+ return (int)bioc->num_stripes;
+}
+
+static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
+ u64 *raid_map,
+ int nstripes, int mirror,
+ int *stripe_index,
+ u64 *stripe_offset)
+{
+ int i;
+
+ if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ /* RAID5/6 */
+ for (i = 0; i < nstripes; i++) {
+ if (raid_map[i] == RAID6_Q_STRIPE ||
+ raid_map[i] == RAID5_P_STRIPE)
+ continue;
+
+ if (logical >= raid_map[i] &&
+ logical < raid_map[i] + BTRFS_STRIPE_LEN)
+ break;
+ }
+
+ *stripe_index = i;
+ *stripe_offset = logical - raid_map[i];
+ } else {
+ /* The other RAID type */
+ *stripe_index = mirror;
+ *stripe_offset = 0;
+ }
+}
+
+static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
+ struct scrub_block *sblocks_for_recheck[])
+{
+ struct scrub_ctx *sctx = original_sblock->sctx;
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ u64 logical = original_sblock->logical;
+ u64 length = original_sblock->sector_count << fs_info->sectorsize_bits;
+ u64 generation = original_sblock->sectors[0]->generation;
+ u64 flags = original_sblock->sectors[0]->flags;
+ u64 have_csum = original_sblock->sectors[0]->have_csum;
+ struct scrub_recover *recover;
+ struct btrfs_io_context *bioc;
+ u64 sublen;
+ u64 mapped_length;
+ u64 stripe_offset;
+ int stripe_index;
+ int sector_index = 0;
+ int mirror_index;
+ int nmirrors;
+ int ret;
+
+ while (length > 0) {
+ sublen = min_t(u64, length, fs_info->sectorsize);
+ mapped_length = sublen;
+ bioc = NULL;
+
+ /*
+ * With a length of sectorsize, each returned stripe represents
+ * one mirror
+ */
+ btrfs_bio_counter_inc_blocked(fs_info);
+ ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
+ logical, &mapped_length, &bioc);
+ if (ret || !bioc || mapped_length < sublen) {
+ btrfs_put_bioc(bioc);
+ btrfs_bio_counter_dec(fs_info);
+ return -EIO;
+ }
+
+ recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
+ if (!recover) {
+ btrfs_put_bioc(bioc);
+ btrfs_bio_counter_dec(fs_info);
+ return -ENOMEM;
+ }
+
+ refcount_set(&recover->refs, 1);
+ recover->bioc = bioc;
+ recover->map_length = mapped_length;
+
+ ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK);
+
+ nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
+
+ for (mirror_index = 0; mirror_index < nmirrors;
+ mirror_index++) {
+ struct scrub_block *sblock;
+ struct scrub_sector *sector;
+
+ sblock = sblocks_for_recheck[mirror_index];
+ sblock->sctx = sctx;
+
+ sector = alloc_scrub_sector(sblock, logical, GFP_NOFS);
+ if (!sector) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ scrub_put_recover(fs_info, recover);
+ return -ENOMEM;
+ }
+ sector->flags = flags;
+ sector->generation = generation;
+ sector->have_csum = have_csum;
+ if (have_csum)
+ memcpy(sector->csum,
+ original_sblock->sectors[0]->csum,
+ sctx->fs_info->csum_size);
+
+ scrub_stripe_index_and_offset(logical,
+ bioc->map_type,
+ bioc->raid_map,
+ bioc->num_stripes -
+ bioc->num_tgtdevs,
+ mirror_index,
+ &stripe_index,
+ &stripe_offset);
+ /*
+ * We're at the first sector, also populate @sblock
+ * physical and dev.
+ */
+ if (sector_index == 0) {
+ sblock->physical =
+ bioc->stripes[stripe_index].physical +
+ stripe_offset;
+ sblock->dev = bioc->stripes[stripe_index].dev;
+ sblock->physical_for_dev_replace =
+ original_sblock->physical_for_dev_replace;
+ }
+
+ BUG_ON(sector_index >= original_sblock->sector_count);
+ scrub_get_recover(recover);
+ sector->recover = recover;
+ }
+ scrub_put_recover(fs_info, recover);
+ length -= sublen;
+ logical += sublen;
+ sector_index++;
+ }
+
+ return 0;
+}
+
+static void scrub_bio_wait_endio(struct bio *bio)
+{
+ complete(bio->bi_private);
+}
+
+static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
+ struct bio *bio,
+ struct scrub_sector *sector)
+{
+ DECLARE_COMPLETION_ONSTACK(done);
+
+ bio->bi_iter.bi_sector = (sector->offset + sector->sblock->logical) >>
+ SECTOR_SHIFT;
+ bio->bi_private = &done;
+ bio->bi_end_io = scrub_bio_wait_endio;
+ raid56_parity_recover(bio, sector->recover->bioc, sector->sblock->mirror_num);
+
+ wait_for_completion_io(&done);
+ return blk_status_to_errno(bio->bi_status);
+}
+
+static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
+ struct scrub_block *sblock)
+{
+ struct scrub_sector *first_sector = sblock->sectors[0];
+ struct bio *bio;
+ int i;
+
+ /* All sectors in sblock belong to the same stripe on the same device. */
+ ASSERT(sblock->dev);
+ if (!sblock->dev->bdev)
+ goto out;
+
+ bio = bio_alloc(sblock->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
+
+ for (i = 0; i < sblock->sector_count; i++) {
+ struct scrub_sector *sector = sblock->sectors[i];
+
+ bio_add_scrub_sector(bio, sector, fs_info->sectorsize);
+ }
+
+ if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) {
+ bio_put(bio);
+ goto out;
+ }
+
+ bio_put(bio);
+
+ scrub_recheck_block_checksum(sblock);
+
+ return;
+out:
+ for (i = 0; i < sblock->sector_count; i++)
+ sblock->sectors[i]->io_error = 1;
+
+ sblock->no_io_error_seen = 0;
+}
+
+/*
+ * This function will check the on disk data for checksum errors, header errors
+ * and read I/O errors. If any I/O errors happen, the exact sectors which are
+ * errored are marked as being bad. The goal is to enable scrub to take those
+ * sectors that are not errored from all the mirrors so that the sectors that
+ * are errored in the just handled mirror can be repaired.
+ */
+static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
+ struct scrub_block *sblock,
+ int retry_failed_mirror)
+{
+ int i;
+
+ sblock->no_io_error_seen = 1;
+
+ /* short cut for raid56 */
+ if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0]))
+ return scrub_recheck_block_on_raid56(fs_info, sblock);
+
+ for (i = 0; i < sblock->sector_count; i++) {
+ struct scrub_sector *sector = sblock->sectors[i];
+ struct bio bio;
+ struct bio_vec bvec;
+
+ if (sblock->dev->bdev == NULL) {
+ sector->io_error = 1;
+ sblock->no_io_error_seen = 0;
+ continue;
+ }
+
+ bio_init(&bio, sblock->dev->bdev, &bvec, 1, REQ_OP_READ);
+ bio_add_scrub_sector(&bio, sector, fs_info->sectorsize);
+ bio.bi_iter.bi_sector = (sblock->physical + sector->offset) >>
+ SECTOR_SHIFT;
+
+ btrfsic_check_bio(&bio);
+ if (submit_bio_wait(&bio)) {
+ sector->io_error = 1;
+ sblock->no_io_error_seen = 0;
+ }
+
+ bio_uninit(&bio);
+ }
+
+ if (sblock->no_io_error_seen)
+ scrub_recheck_block_checksum(sblock);
+}
+
+static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector)
+{
+ struct btrfs_fs_devices *fs_devices = sector->sblock->dev->fs_devices;
+ int ret;
+
+ ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
+ return !ret;
+}
+
+static void scrub_recheck_block_checksum(struct scrub_block *sblock)
+{
+ sblock->header_error = 0;
+ sblock->checksum_error = 0;
+ sblock->generation_error = 0;
+
+ if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA)
+ scrub_checksum_data(sblock);
+ else
+ scrub_checksum_tree_block(sblock);
+}
+
+static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
+ struct scrub_block *sblock_good)
+{
+ int i;
+ int ret = 0;
+
+ for (i = 0; i < sblock_bad->sector_count; i++) {
+ int ret_sub;
+
+ ret_sub = scrub_repair_sector_from_good_copy(sblock_bad,
+ sblock_good, i, 1);
+ if (ret_sub)
+ ret = ret_sub;
+ }
+
+ return ret;
+}
+
+static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
+ struct scrub_block *sblock_good,
+ int sector_num, int force_write)
+{
+ struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
+ struct scrub_sector *sector_good = sblock_good->sectors[sector_num];
+ struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
+ const u32 sectorsize = fs_info->sectorsize;
+
+ if (force_write || sblock_bad->header_error ||
+ sblock_bad->checksum_error || sector_bad->io_error) {
+ struct bio bio;
+ struct bio_vec bvec;
+ int ret;
+
+ if (!sblock_bad->dev->bdev) {
+ btrfs_warn_rl(fs_info,
+ "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
+ return -EIO;
+ }
+
+ bio_init(&bio, sblock_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE);
+ bio.bi_iter.bi_sector = (sblock_bad->physical +
+ sector_bad->offset) >> SECTOR_SHIFT;
+ ret = bio_add_scrub_sector(&bio, sector_good, sectorsize);
+
+ btrfsic_check_bio(&bio);
+ ret = submit_bio_wait(&bio);
+ bio_uninit(&bio);
+
+ if (ret) {
+ btrfs_dev_stat_inc_and_print(sblock_bad->dev,
+ BTRFS_DEV_STAT_WRITE_ERRS);
+ atomic64_inc(&fs_info->dev_replace.num_write_errors);
+ return -EIO;
+ }
+ }
+
+ return 0;
+}
+
+static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
+{
+ struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
+ int i;
+
+ /*
+ * This block is used for the check of the parity on the source device,
+ * so the data needn't be written into the destination device.
+ */
+ if (sblock->sparity)
+ return;
+
+ for (i = 0; i < sblock->sector_count; i++) {
+ int ret;
+
+ ret = scrub_write_sector_to_dev_replace(sblock, i);
+ if (ret)
+ atomic64_inc(&fs_info->dev_replace.num_write_errors);
+ }
+}
+
+static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num)
+{
+ const u32 sectorsize = sblock->sctx->fs_info->sectorsize;
+ struct scrub_sector *sector = sblock->sectors[sector_num];
+
+ if (sector->io_error)
+ memset(scrub_sector_get_kaddr(sector), 0, sectorsize);
+
+ return scrub_add_sector_to_wr_bio(sblock->sctx, sector);
+}
+
+static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
+{
+ int ret = 0;
+ u64 length;
+
+ if (!btrfs_is_zoned(sctx->fs_info))
+ return 0;
+
+ if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
+ return 0;
+
+ if (sctx->write_pointer < physical) {
+ length = physical - sctx->write_pointer;
+
+ ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
+ sctx->write_pointer, length);
+ if (!ret)
+ sctx->write_pointer = physical;
+ }
+ return ret;
+}
+
+static void scrub_block_get(struct scrub_block *sblock)
+{
+ refcount_inc(&sblock->refs);
+}
+
+static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
+ struct scrub_sector *sector)
+{
+ struct scrub_block *sblock = sector->sblock;
+ struct scrub_bio *sbio;
+ int ret;
+ const u32 sectorsize = sctx->fs_info->sectorsize;
+
+ mutex_lock(&sctx->wr_lock);
+again:
+ if (!sctx->wr_curr_bio) {
+ sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
+ GFP_KERNEL);
+ if (!sctx->wr_curr_bio) {
+ mutex_unlock(&sctx->wr_lock);
+ return -ENOMEM;
+ }
+ sctx->wr_curr_bio->sctx = sctx;
+ sctx->wr_curr_bio->sector_count = 0;
+ }
+ sbio = sctx->wr_curr_bio;
+ if (sbio->sector_count == 0) {
+ ret = fill_writer_pointer_gap(sctx, sector->offset +
+ sblock->physical_for_dev_replace);
+ if (ret) {
+ mutex_unlock(&sctx->wr_lock);
+ return ret;
+ }
+
+ sbio->physical = sblock->physical_for_dev_replace + sector->offset;
+ sbio->logical = sblock->logical + sector->offset;
+ sbio->dev = sctx->wr_tgtdev;
+ if (!sbio->bio) {
+ sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
+ REQ_OP_WRITE, GFP_NOFS);
+ }
+ sbio->bio->bi_private = sbio;
+ sbio->bio->bi_end_io = scrub_wr_bio_end_io;
+ sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
+ sbio->status = 0;
+ } else if (sbio->physical + sbio->sector_count * sectorsize !=
+ sblock->physical_for_dev_replace + sector->offset ||
+ sbio->logical + sbio->sector_count * sectorsize !=
+ sblock->logical + sector->offset) {
+ scrub_wr_submit(sctx);
+ goto again;
+ }
+
+ ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize);
+ if (ret != sectorsize) {
+ if (sbio->sector_count < 1) {
+ bio_put(sbio->bio);
+ sbio->bio = NULL;
+ mutex_unlock(&sctx->wr_lock);
+ return -EIO;
+ }
+ scrub_wr_submit(sctx);
+ goto again;
+ }
+
+ sbio->sectors[sbio->sector_count] = sector;
+ scrub_sector_get(sector);
+ /*
+ * Since ssector no longer holds a page, but uses sblock::pages, we
+ * have to ensure the sblock had not been freed before our write bio
+ * finished.
+ */
+ scrub_block_get(sector->sblock);
+
+ sbio->sector_count++;
+ if (sbio->sector_count == sctx->sectors_per_bio)
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_lock);
+
+ return 0;
+}
+
+static void scrub_wr_submit(struct scrub_ctx *sctx)
+{
+ struct scrub_bio *sbio;
+
+ if (!sctx->wr_curr_bio)
+ return;
+
+ sbio = sctx->wr_curr_bio;
+ sctx->wr_curr_bio = NULL;
+ scrub_pending_bio_inc(sctx);
+ /* process all writes in a single worker thread. Then the block layer
+ * orders the requests before sending them to the driver which
+ * doubled the write performance on spinning disks when measured
+ * with Linux 3.5 */
+ btrfsic_check_bio(sbio->bio);
+ submit_bio(sbio->bio);
+
+ if (btrfs_is_zoned(sctx->fs_info))
+ sctx->write_pointer = sbio->physical + sbio->sector_count *
+ sctx->fs_info->sectorsize;
+}
+
+static void scrub_wr_bio_end_io(struct bio *bio)
+{
+ struct scrub_bio *sbio = bio->bi_private;
+ struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
+
+ sbio->status = bio->bi_status;
+ sbio->bio = bio;
+
+ INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker);
+ queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
+}
+
+static void scrub_wr_bio_end_io_worker(struct work_struct *work)
+{
+ struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
+ struct scrub_ctx *sctx = sbio->sctx;
+ int i;
+
+ ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
+ if (sbio->status) {
+ struct btrfs_dev_replace *dev_replace =
+ &sbio->sctx->fs_info->dev_replace;
+
+ for (i = 0; i < sbio->sector_count; i++) {
+ struct scrub_sector *sector = sbio->sectors[i];
+
+ sector->io_error = 1;
+ atomic64_inc(&dev_replace->num_write_errors);
+ }
+ }
+
+ /*
+ * In scrub_add_sector_to_wr_bio() we grab extra ref for sblock, now in
+ * endio we should put the sblock.
+ */
+ for (i = 0; i < sbio->sector_count; i++) {
+ scrub_block_put(sbio->sectors[i]->sblock);
+ scrub_sector_put(sbio->sectors[i]);
+ }
+
+ bio_put(sbio->bio);
+ kfree(sbio);
+ scrub_pending_bio_dec(sctx);
+}
+
+static int scrub_checksum(struct scrub_block *sblock)
+{
+ u64 flags;
+ int ret;
+
+ /*
+ * No need to initialize these stats currently,
+ * because this function only use return value
+ * instead of these stats value.
+ *
+ * Todo:
+ * always use stats
+ */
+ sblock->header_error = 0;
+ sblock->generation_error = 0;
+ sblock->checksum_error = 0;
+
+ WARN_ON(sblock->sector_count < 1);
+ flags = sblock->sectors[0]->flags;
+ ret = 0;
+ if (flags & BTRFS_EXTENT_FLAG_DATA)
+ ret = scrub_checksum_data(sblock);
+ else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+ ret = scrub_checksum_tree_block(sblock);
+ else if (flags & BTRFS_EXTENT_FLAG_SUPER)
+ ret = scrub_checksum_super(sblock);
+ else
+ WARN_ON(1);
+ if (ret)
+ scrub_handle_errored_block(sblock);
+
+ return ret;
+}
+
+static int scrub_checksum_data(struct scrub_block *sblock)
+{
+ struct scrub_ctx *sctx = sblock->sctx;
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
+ u8 csum[BTRFS_CSUM_SIZE];
+ struct scrub_sector *sector;
+ char *kaddr;
+
+ BUG_ON(sblock->sector_count < 1);
+ sector = sblock->sectors[0];
+ if (!sector->have_csum)
+ return 0;
+
+ kaddr = scrub_sector_get_kaddr(sector);
+
+ shash->tfm = fs_info->csum_shash;
+ crypto_shash_init(shash);
+
+ crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
+
+ if (memcmp(csum, sector->csum, fs_info->csum_size))
+ sblock->checksum_error = 1;
+ return sblock->checksum_error;
+}
+
+static int scrub_checksum_tree_block(struct scrub_block *sblock)
+{
+ struct scrub_ctx *sctx = sblock->sctx;
+ struct btrfs_header *h;
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
+ u8 calculated_csum[BTRFS_CSUM_SIZE];
+ u8 on_disk_csum[BTRFS_CSUM_SIZE];
+ /*
+ * This is done in sectorsize steps even for metadata as there's a
+ * constraint for nodesize to be aligned to sectorsize. This will need
+ * to change so we don't misuse data and metadata units like that.
+ */
+ const u32 sectorsize = sctx->fs_info->sectorsize;
+ const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
+ int i;
+ struct scrub_sector *sector;
+ char *kaddr;
+
+ BUG_ON(sblock->sector_count < 1);
+
+ /* Each member in sectors is just one sector */
+ ASSERT(sblock->sector_count == num_sectors);
+
+ sector = sblock->sectors[0];
+ kaddr = scrub_sector_get_kaddr(sector);
+ h = (struct btrfs_header *)kaddr;
+ memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
+
+ /*
+ * we don't use the getter functions here, as we
+ * a) don't have an extent buffer and
+ * b) the page is already kmapped
+ */
+ if (sblock->logical != btrfs_stack_header_bytenr(h)) {
+ sblock->header_error = 1;
+ btrfs_warn_rl(fs_info,
+ "tree block %llu mirror %u has bad bytenr, has %llu want %llu",
+ sblock->logical, sblock->mirror_num,
+ btrfs_stack_header_bytenr(h),
+ sblock->logical);
+ goto out;
+ }
+
+ if (!scrub_check_fsid(h->fsid, sector)) {
+ sblock->header_error = 1;
+ btrfs_warn_rl(fs_info,
+ "tree block %llu mirror %u has bad fsid, has %pU want %pU",
+ sblock->logical, sblock->mirror_num,
+ h->fsid, sblock->dev->fs_devices->fsid);
+ goto out;
+ }
+
+ if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) {
+ sblock->header_error = 1;
+ btrfs_warn_rl(fs_info,
+ "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU",
+ sblock->logical, sblock->mirror_num,
+ h->chunk_tree_uuid, fs_info->chunk_tree_uuid);
+ goto out;
+ }
+
+ shash->tfm = fs_info->csum_shash;
+ crypto_shash_init(shash);
+ crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
+ sectorsize - BTRFS_CSUM_SIZE);
+
+ for (i = 1; i < num_sectors; i++) {
+ kaddr = scrub_sector_get_kaddr(sblock->sectors[i]);
+ crypto_shash_update(shash, kaddr, sectorsize);
+ }
+
+ crypto_shash_final(shash, calculated_csum);
+ if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) {
+ sblock->checksum_error = 1;
+ btrfs_warn_rl(fs_info,
+ "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT,
+ sblock->logical, sblock->mirror_num,
+ CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum),
+ CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum));
+ goto out;
+ }
+
+ if (sector->generation != btrfs_stack_header_generation(h)) {
+ sblock->header_error = 1;
+ sblock->generation_error = 1;
+ btrfs_warn_rl(fs_info,
+ "tree block %llu mirror %u has bad generation, has %llu want %llu",
+ sblock->logical, sblock->mirror_num,
+ btrfs_stack_header_generation(h),
+ sector->generation);
+ }
+
+out:
+ return sblock->header_error || sblock->checksum_error;
+}
+
+static int scrub_checksum_super(struct scrub_block *sblock)
+{
+ struct btrfs_super_block *s;
+ struct scrub_ctx *sctx = sblock->sctx;
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
+ u8 calculated_csum[BTRFS_CSUM_SIZE];
+ struct scrub_sector *sector;
+ char *kaddr;
+ int fail_gen = 0;
+ int fail_cor = 0;
+
+ BUG_ON(sblock->sector_count < 1);
+ sector = sblock->sectors[0];
+ kaddr = scrub_sector_get_kaddr(sector);
+ s = (struct btrfs_super_block *)kaddr;
+
+ if (sblock->logical != btrfs_super_bytenr(s))
+ ++fail_cor;
+
+ if (sector->generation != btrfs_super_generation(s))
+ ++fail_gen;
+
+ if (!scrub_check_fsid(s->fsid, sector))
+ ++fail_cor;
+
+ shash->tfm = fs_info->csum_shash;
+ crypto_shash_init(shash);
+ crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
+ BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
+
+ if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
+ ++fail_cor;
+
+ return fail_cor + fail_gen;
+}
+
+static void scrub_block_put(struct scrub_block *sblock)
+{
+ if (refcount_dec_and_test(&sblock->refs)) {
+ int i;
+
+ if (sblock->sparity)
+ scrub_parity_put(sblock->sparity);
+
+ for (i = 0; i < sblock->sector_count; i++)
+ scrub_sector_put(sblock->sectors[i]);
+ for (i = 0; i < DIV_ROUND_UP(sblock->len, PAGE_SIZE); i++) {
+ if (sblock->pages[i]) {
+ detach_scrub_page_private(sblock->pages[i]);
+ __free_page(sblock->pages[i]);
+ }
+ }
+ kfree(sblock);
+ }
+}
+
+static void scrub_sector_get(struct scrub_sector *sector)
+{
+ atomic_inc(&sector->refs);
+}
+
+static void scrub_sector_put(struct scrub_sector *sector)
+{
+ if (atomic_dec_and_test(&sector->refs))
+ kfree(sector);
+}
+
+/*
+ * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
+ * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max.
+ */
+static void scrub_throttle(struct scrub_ctx *sctx)
+{
+ const int time_slice = 1000;
+ struct scrub_bio *sbio;
+ struct btrfs_device *device;
+ s64 delta;
+ ktime_t now;
+ u32 div;
+ u64 bwlimit;
+
+ sbio = sctx->bios[sctx->curr];
+ device = sbio->dev;
+ bwlimit = READ_ONCE(device->scrub_speed_max);
+ if (bwlimit == 0)
+ return;
+
+ /*
+ * Slice is divided into intervals when the IO is submitted, adjust by
+ * bwlimit and maximum of 64 intervals.
+ */
+ div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
+ div = min_t(u32, 64, div);
+
+ /* Start new epoch, set deadline */
+ now = ktime_get();
+ if (sctx->throttle_deadline == 0) {
+ sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
+ sctx->throttle_sent = 0;
+ }
+
+ /* Still in the time to send? */
+ if (ktime_before(now, sctx->throttle_deadline)) {
+ /* If current bio is within the limit, send it */
+ sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
+ if (sctx->throttle_sent <= div_u64(bwlimit, div))
+ return;
+
+ /* We're over the limit, sleep until the rest of the slice */
+ delta = ktime_ms_delta(sctx->throttle_deadline, now);
+ } else {
+ /* New request after deadline, start new epoch */
+ delta = 0;
+ }
+
+ if (delta) {
+ long timeout;
+
+ timeout = div_u64(delta * HZ, 1000);
+ schedule_timeout_interruptible(timeout);
+ }
+
+ /* Next call will start the deadline period */
+ sctx->throttle_deadline = 0;
+}
+
+static void scrub_submit(struct scrub_ctx *sctx)
+{
+ struct scrub_bio *sbio;
+
+ if (sctx->curr == -1)
+ return;
+
+ scrub_throttle(sctx);
+
+ sbio = sctx->bios[sctx->curr];
+ sctx->curr = -1;
+ scrub_pending_bio_inc(sctx);
+ btrfsic_check_bio(sbio->bio);
+ submit_bio(sbio->bio);
+}
+
+static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx,
+ struct scrub_sector *sector)
+{
+ struct scrub_block *sblock = sector->sblock;
+ struct scrub_bio *sbio;
+ const u32 sectorsize = sctx->fs_info->sectorsize;
+ int ret;
+
+again:
+ /*
+ * grab a fresh bio or wait for one to become available
+ */
+ while (sctx->curr == -1) {
+ spin_lock(&sctx->list_lock);
+ sctx->curr = sctx->first_free;
+ if (sctx->curr != -1) {
+ sctx->first_free = sctx->bios[sctx->curr]->next_free;
+ sctx->bios[sctx->curr]->next_free = -1;
+ sctx->bios[sctx->curr]->sector_count = 0;
+ spin_unlock(&sctx->list_lock);
+ } else {
+ spin_unlock(&sctx->list_lock);
+ wait_event(sctx->list_wait, sctx->first_free != -1);
+ }
+ }
+ sbio = sctx->bios[sctx->curr];
+ if (sbio->sector_count == 0) {
+ sbio->physical = sblock->physical + sector->offset;
+ sbio->logical = sblock->logical + sector->offset;
+ sbio->dev = sblock->dev;
+ if (!sbio->bio) {
+ sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
+ REQ_OP_READ, GFP_NOFS);
+ }
+ sbio->bio->bi_private = sbio;
+ sbio->bio->bi_end_io = scrub_bio_end_io;
+ sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
+ sbio->status = 0;
+ } else if (sbio->physical + sbio->sector_count * sectorsize !=
+ sblock->physical + sector->offset ||
+ sbio->logical + sbio->sector_count * sectorsize !=
+ sblock->logical + sector->offset ||
+ sbio->dev != sblock->dev) {
+ scrub_submit(sctx);
+ goto again;
+ }
+
+ sbio->sectors[sbio->sector_count] = sector;
+ ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize);
+ if (ret != sectorsize) {
+ if (sbio->sector_count < 1) {
+ bio_put(sbio->bio);
+ sbio->bio = NULL;
+ return -EIO;
+ }
+ scrub_submit(sctx);
+ goto again;
+ }
+
+ scrub_block_get(sblock); /* one for the page added to the bio */
+ atomic_inc(&sblock->outstanding_sectors);
+ sbio->sector_count++;
+ if (sbio->sector_count == sctx->sectors_per_bio)
+ scrub_submit(sctx);
+
+ return 0;
+}
+
+static void scrub_missing_raid56_end_io(struct bio *bio)
+{
+ struct scrub_block *sblock = bio->bi_private;
+ struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
+
+ btrfs_bio_counter_dec(fs_info);
+ if (bio->bi_status)
+ sblock->no_io_error_seen = 0;
+
+ bio_put(bio);
+
+ queue_work(fs_info->scrub_workers, &sblock->work);
+}
+
+static void scrub_missing_raid56_worker(struct work_struct *work)
+{
+ struct scrub_block *sblock = container_of(work, struct scrub_block, work);
+ struct scrub_ctx *sctx = sblock->sctx;
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ u64 logical;
+ struct btrfs_device *dev;
+
+ logical = sblock->logical;
+ dev = sblock->dev;
+
+ if (sblock->no_io_error_seen)
+ scrub_recheck_block_checksum(sblock);
+
+ if (!sblock->no_io_error_seen) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.read_errors++;
+ spin_unlock(&sctx->stat_lock);
+ btrfs_err_rl_in_rcu(fs_info,
+ "IO error rebuilding logical %llu for dev %s",
+ logical, rcu_str_deref(dev->name));
+ } else if (sblock->header_error || sblock->checksum_error) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
+ btrfs_err_rl_in_rcu(fs_info,
+ "failed to rebuild valid logical %llu for dev %s",
+ logical, rcu_str_deref(dev->name));
+ } else {
+ scrub_write_block_to_dev_replace(sblock);
+ }
+
+ if (sctx->is_dev_replace && sctx->flush_all_writes) {
+ mutex_lock(&sctx->wr_lock);
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_lock);
+ }
+
+ scrub_block_put(sblock);
+ scrub_pending_bio_dec(sctx);
+}
+
+static void scrub_missing_raid56_pages(struct scrub_block *sblock)
+{
+ struct scrub_ctx *sctx = sblock->sctx;
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ u64 length = sblock->sector_count << fs_info->sectorsize_bits;
+ u64 logical = sblock->logical;
+ struct btrfs_io_context *bioc = NULL;
+ struct bio *bio;
+ struct btrfs_raid_bio *rbio;
+ int ret;
+ int i;
+
+ btrfs_bio_counter_inc_blocked(fs_info);
+ ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
+ &length, &bioc);
+ if (ret || !bioc || !bioc->raid_map)
+ goto bioc_out;
+
+ if (WARN_ON(!sctx->is_dev_replace ||
+ !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
+ /*
+ * We shouldn't be scrubbing a missing device. Even for dev
+ * replace, we should only get here for RAID 5/6. We either
+ * managed to mount something with no mirrors remaining or
+ * there's a bug in scrub_find_good_copy()/btrfs_map_block().
+ */
+ goto bioc_out;
+ }
+
+ bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
+ bio->bi_iter.bi_sector = logical >> 9;
+ bio->bi_private = sblock;
+ bio->bi_end_io = scrub_missing_raid56_end_io;
+
+ rbio = raid56_alloc_missing_rbio(bio, bioc);
+ if (!rbio)
+ goto rbio_out;
+
+ for (i = 0; i < sblock->sector_count; i++) {
+ struct scrub_sector *sector = sblock->sectors[i];
+
+ raid56_add_scrub_pages(rbio, scrub_sector_get_page(sector),
+ scrub_sector_get_page_offset(sector),
+ sector->offset + sector->sblock->logical);
+ }
+
+ INIT_WORK(&sblock->work, scrub_missing_raid56_worker);
+ scrub_block_get(sblock);
+ scrub_pending_bio_inc(sctx);
+ raid56_submit_missing_rbio(rbio);
+ btrfs_put_bioc(bioc);
+ return;
+
+rbio_out:
+ bio_put(bio);
+bioc_out:
+ btrfs_bio_counter_dec(fs_info);
+ btrfs_put_bioc(bioc);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+}
+
+static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
+ u64 physical, struct btrfs_device *dev, u64 flags,
+ u64 gen, int mirror_num, u8 *csum,
+ u64 physical_for_dev_replace)
+{
+ struct scrub_block *sblock;
+ const u32 sectorsize = sctx->fs_info->sectorsize;
+ int index;
+
+ sblock = alloc_scrub_block(sctx, dev, logical, physical,
+ physical_for_dev_replace, mirror_num);
+ if (!sblock) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ return -ENOMEM;
+ }
+
+ for (index = 0; len > 0; index++) {
+ struct scrub_sector *sector;
+ /*
+ * Here we will allocate one page for one sector to scrub.
+ * This is fine if PAGE_SIZE == sectorsize, but will cost
+ * more memory for PAGE_SIZE > sectorsize case.
+ */
+ u32 l = min(sectorsize, len);
+
+ sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL);
+ if (!sector) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ scrub_block_put(sblock);
+ return -ENOMEM;
+ }
+ sector->flags = flags;
+ sector->generation = gen;
+ if (csum) {
+ sector->have_csum = 1;
+ memcpy(sector->csum, csum, sctx->fs_info->csum_size);
+ } else {
+ sector->have_csum = 0;
+ }
+ len -= l;
+ logical += l;
+ physical += l;
+ physical_for_dev_replace += l;
+ }
+
+ WARN_ON(sblock->sector_count == 0);
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
+ /*
+ * This case should only be hit for RAID 5/6 device replace. See
+ * the comment in scrub_missing_raid56_pages() for details.
+ */
+ scrub_missing_raid56_pages(sblock);
+ } else {
+ for (index = 0; index < sblock->sector_count; index++) {
+ struct scrub_sector *sector = sblock->sectors[index];
+ int ret;
+
+ ret = scrub_add_sector_to_rd_bio(sctx, sector);
+ if (ret) {
+ scrub_block_put(sblock);
+ return ret;
+ }
+ }
+
+ if (flags & BTRFS_EXTENT_FLAG_SUPER)
+ scrub_submit(sctx);
+ }
+
+ /* last one frees, either here or in bio completion for last page */
+ scrub_block_put(sblock);
+ return 0;
+}
+
+static void scrub_bio_end_io(struct bio *bio)
+{
+ struct scrub_bio *sbio = bio->bi_private;
+ struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
+
+ sbio->status = bio->bi_status;
+ sbio->bio = bio;
+
+ queue_work(fs_info->scrub_workers, &sbio->work);
+}
+
+static void scrub_bio_end_io_worker(struct work_struct *work)
+{
+ struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
+ struct scrub_ctx *sctx = sbio->sctx;
+ int i;
+
+ ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
+ if (sbio->status) {
+ for (i = 0; i < sbio->sector_count; i++) {
+ struct scrub_sector *sector = sbio->sectors[i];
+
+ sector->io_error = 1;
+ sector->sblock->no_io_error_seen = 0;
+ }
+ }
+
+ /* Now complete the scrub_block items that have all pages completed */
+ for (i = 0; i < sbio->sector_count; i++) {
+ struct scrub_sector *sector = sbio->sectors[i];
+ struct scrub_block *sblock = sector->sblock;
+
+ if (atomic_dec_and_test(&sblock->outstanding_sectors))
+ scrub_block_complete(sblock);
+ scrub_block_put(sblock);
+ }
+
+ bio_put(sbio->bio);
+ sbio->bio = NULL;
+ spin_lock(&sctx->list_lock);
+ sbio->next_free = sctx->first_free;
+ sctx->first_free = sbio->index;
+ spin_unlock(&sctx->list_lock);
+
+ if (sctx->is_dev_replace && sctx->flush_all_writes) {
+ mutex_lock(&sctx->wr_lock);
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_lock);
+ }
+
+ scrub_pending_bio_dec(sctx);
+}
+
+static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
+ unsigned long *bitmap,
+ u64 start, u32 len)
+{
+ u64 offset;
+ u32 nsectors;
+ u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits;
+
+ if (len >= sparity->stripe_len) {
+ bitmap_set(bitmap, 0, sparity->nsectors);
+ return;
+ }
+
+ start -= sparity->logic_start;
+ start = div64_u64_rem(start, sparity->stripe_len, &offset);
+ offset = offset >> sectorsize_bits;
+ nsectors = len >> sectorsize_bits;
+
+ if (offset + nsectors <= sparity->nsectors) {
+ bitmap_set(bitmap, offset, nsectors);
+ return;
+ }
+
+ bitmap_set(bitmap, offset, sparity->nsectors - offset);
+ bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
+}
+
+static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
+ u64 start, u32 len)
+{
+ __scrub_mark_bitmap(sparity, &sparity->ebitmap, start, len);
+}
+
+static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
+ u64 start, u32 len)
+{
+ __scrub_mark_bitmap(sparity, &sparity->dbitmap, start, len);
+}
+
+static void scrub_block_complete(struct scrub_block *sblock)
+{
+ int corrupted = 0;
+
+ if (!sblock->no_io_error_seen) {
+ corrupted = 1;
+ scrub_handle_errored_block(sblock);
+ } else {
+ /*
+ * if has checksum error, write via repair mechanism in
+ * dev replace case, otherwise write here in dev replace
+ * case.
+ */
+ corrupted = scrub_checksum(sblock);
+ if (!corrupted && sblock->sctx->is_dev_replace)
+ scrub_write_block_to_dev_replace(sblock);
+ }
+
+ if (sblock->sparity && corrupted && !sblock->data_corrected) {
+ u64 start = sblock->logical;
+ u64 end = sblock->logical +
+ sblock->sectors[sblock->sector_count - 1]->offset +
+ sblock->sctx->fs_info->sectorsize;
+
+ ASSERT(end - start <= U32_MAX);
+ scrub_parity_mark_sectors_error(sblock->sparity,
+ start, end - start);
+ }
+}
+
+static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum)
+{
+ sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits;
+ list_del(&sum->list);
+ kfree(sum);
+}
+
+/*
+ * Find the desired csum for range [logical, logical + sectorsize), and store
+ * the csum into @csum.
+ *
+ * The search source is sctx->csum_list, which is a pre-populated list
+ * storing bytenr ordered csum ranges. We're responsible to cleanup any range
+ * that is before @logical.
+ *
+ * Return 0 if there is no csum for the range.
+ * Return 1 if there is csum for the range and copied to @csum.
+ */
+static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
+{
+ bool found = false;
+
+ while (!list_empty(&sctx->csum_list)) {
+ struct btrfs_ordered_sum *sum = NULL;
+ unsigned long index;
+ unsigned long num_sectors;
+
+ sum = list_first_entry(&sctx->csum_list,
+ struct btrfs_ordered_sum, list);
+ /* The current csum range is beyond our range, no csum found */
+ if (sum->bytenr > logical)
+ break;
+
+ /*
+ * The current sum is before our bytenr, since scrub is always
+ * done in bytenr order, the csum will never be used anymore,
+ * clean it up so that later calls won't bother with the range,
+ * and continue search the next range.
+ */
+ if (sum->bytenr + sum->len <= logical) {
+ drop_csum_range(sctx, sum);
+ continue;
+ }
+
+ /* Now the csum range covers our bytenr, copy the csum */
+ found = true;
+ index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits;
+ num_sectors = sum->len >> sctx->fs_info->sectorsize_bits;
+
+ memcpy(csum, sum->sums + index * sctx->fs_info->csum_size,
+ sctx->fs_info->csum_size);
+
+ /* Cleanup the range if we're at the end of the csum range */
+ if (index == num_sectors - 1)
+ drop_csum_range(sctx, sum);
+ break;
+ }
+ if (!found)
+ return 0;
+ return 1;
+}
+
+/* scrub extent tries to collect up to 64 kB for each bio */
+static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
+ u64 logical, u32 len,
+ u64 physical, struct btrfs_device *dev, u64 flags,
+ u64 gen, int mirror_num)
+{
+ struct btrfs_device *src_dev = dev;
+ u64 src_physical = physical;
+ int src_mirror = mirror_num;
+ int ret;
+ u8 csum[BTRFS_CSUM_SIZE];
+ u32 blocksize;
+
+ if (flags & BTRFS_EXTENT_FLAG_DATA) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+ blocksize = map->stripe_len;
+ else
+ blocksize = sctx->fs_info->sectorsize;
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.data_extents_scrubbed++;
+ sctx->stat.data_bytes_scrubbed += len;
+ spin_unlock(&sctx->stat_lock);
+ } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+ blocksize = map->stripe_len;
+ else
+ blocksize = sctx->fs_info->nodesize;
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.tree_extents_scrubbed++;
+ sctx->stat.tree_bytes_scrubbed += len;
+ spin_unlock(&sctx->stat_lock);
+ } else {
+ blocksize = sctx->fs_info->sectorsize;
+ WARN_ON(1);
+ }
+
+ /*
+ * For dev-replace case, we can have @dev being a missing device.
+ * Regular scrub will avoid its execution on missing device at all,
+ * as that would trigger tons of read error.
+ *
+ * Reading from missing device will cause read error counts to
+ * increase unnecessarily.
+ * So here we change the read source to a good mirror.
+ */
+ if (sctx->is_dev_replace && !dev->bdev)
+ scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical,
+ &src_dev, &src_mirror);
+ while (len) {
+ u32 l = min(len, blocksize);
+ int have_csum = 0;
+
+ if (flags & BTRFS_EXTENT_FLAG_DATA) {
+ /* push csums to sbio */
+ have_csum = scrub_find_csum(sctx, logical, csum);
+ if (have_csum == 0)
+ ++sctx->stat.no_csum;
+ }
+ ret = scrub_sectors(sctx, logical, l, src_physical, src_dev,
+ flags, gen, src_mirror,
+ have_csum ? csum : NULL, physical);
+ if (ret)
+ return ret;
+ len -= l;
+ logical += l;
+ physical += l;
+ src_physical += l;
+ }
+ return 0;
+}
+
+static int scrub_sectors_for_parity(struct scrub_parity *sparity,
+ u64 logical, u32 len,
+ u64 physical, struct btrfs_device *dev,
+ u64 flags, u64 gen, int mirror_num, u8 *csum)
+{
+ struct scrub_ctx *sctx = sparity->sctx;
+ struct scrub_block *sblock;
+ const u32 sectorsize = sctx->fs_info->sectorsize;
+ int index;
+
+ ASSERT(IS_ALIGNED(len, sectorsize));
+
+ sblock = alloc_scrub_block(sctx, dev, logical, physical, physical, mirror_num);
+ if (!sblock) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ return -ENOMEM;
+ }
+
+ sblock->sparity = sparity;
+ scrub_parity_get(sparity);
+
+ for (index = 0; len > 0; index++) {
+ struct scrub_sector *sector;
+
+ sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL);
+ if (!sector) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ scrub_block_put(sblock);
+ return -ENOMEM;
+ }
+ sblock->sectors[index] = sector;
+ /* For scrub parity */
+ scrub_sector_get(sector);
+ list_add_tail(&sector->list, &sparity->sectors_list);
+ sector->flags = flags;
+ sector->generation = gen;
+ if (csum) {
+ sector->have_csum = 1;
+ memcpy(sector->csum, csum, sctx->fs_info->csum_size);
+ } else {
+ sector->have_csum = 0;
+ }
+
+ /* Iterate over the stripe range in sectorsize steps */
+ len -= sectorsize;
+ logical += sectorsize;
+ physical += sectorsize;
+ }
+
+ WARN_ON(sblock->sector_count == 0);
+ for (index = 0; index < sblock->sector_count; index++) {
+ struct scrub_sector *sector = sblock->sectors[index];
+ int ret;
+
+ ret = scrub_add_sector_to_rd_bio(sctx, sector);
+ if (ret) {
+ scrub_block_put(sblock);
+ return ret;
+ }
+ }
+
+ /* Last one frees, either here or in bio completion for last sector */
+ scrub_block_put(sblock);
+ return 0;
+}
+
+static int scrub_extent_for_parity(struct scrub_parity *sparity,
+ u64 logical, u32 len,
+ u64 physical, struct btrfs_device *dev,
+ u64 flags, u64 gen, int mirror_num)
+{
+ struct scrub_ctx *sctx = sparity->sctx;
+ int ret;
+ u8 csum[BTRFS_CSUM_SIZE];
+ u32 blocksize;
+
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
+ scrub_parity_mark_sectors_error(sparity, logical, len);
+ return 0;
+ }
+
+ if (flags & BTRFS_EXTENT_FLAG_DATA) {
+ blocksize = sparity->stripe_len;
+ } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ blocksize = sparity->stripe_len;
+ } else {
+ blocksize = sctx->fs_info->sectorsize;
+ WARN_ON(1);
+ }
+
+ while (len) {
+ u32 l = min(len, blocksize);
+ int have_csum = 0;
+
+ if (flags & BTRFS_EXTENT_FLAG_DATA) {
+ /* push csums to sbio */
+ have_csum = scrub_find_csum(sctx, logical, csum);
+ if (have_csum == 0)
+ goto skip;
+ }
+ ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev,
+ flags, gen, mirror_num,
+ have_csum ? csum : NULL);
+ if (ret)
+ return ret;
+skip:
+ len -= l;
+ logical += l;
+ physical += l;
+ }
+ return 0;
+}
+
+/*
+ * Given a physical address, this will calculate it's
+ * logical offset. if this is a parity stripe, it will return
+ * the most left data stripe's logical offset.
+ *
+ * return 0 if it is a data stripe, 1 means parity stripe.
+ */
+static int get_raid56_logic_offset(u64 physical, int num,
+ struct map_lookup *map, u64 *offset,
+ u64 *stripe_start)
+{
+ int i;
+ int j = 0;
+ u64 stripe_nr;
+ u64 last_offset;
+ u32 stripe_index;
+ u32 rot;
+ const int data_stripes = nr_data_stripes(map);
+
+ last_offset = (physical - map->stripes[num].physical) * data_stripes;
+ if (stripe_start)
+ *stripe_start = last_offset;
+
+ *offset = last_offset;
+ for (i = 0; i < data_stripes; i++) {
+ *offset = last_offset + i * map->stripe_len;
+
+ stripe_nr = div64_u64(*offset, map->stripe_len);
+ stripe_nr = div_u64(stripe_nr, data_stripes);
+
+ /* Work out the disk rotation on this stripe-set */
+ stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
+ /* calculate which stripe this data locates */
+ rot += i;
+ stripe_index = rot % map->num_stripes;
+ if (stripe_index == num)
+ return 0;
+ if (stripe_index < num)
+ j++;
+ }
+ *offset = last_offset + j * map->stripe_len;
+ return 1;
+}
+
+static void scrub_free_parity(struct scrub_parity *sparity)
+{
+ struct scrub_ctx *sctx = sparity->sctx;
+ struct scrub_sector *curr, *next;
+ int nbits;
+
+ nbits = bitmap_weight(&sparity->ebitmap, sparity->nsectors);
+ if (nbits) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.read_errors += nbits;
+ sctx->stat.uncorrectable_errors += nbits;
+ spin_unlock(&sctx->stat_lock);
+ }
+
+ list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) {
+ list_del_init(&curr->list);
+ scrub_sector_put(curr);
+ }
+
+ kfree(sparity);
+}
+
+static void scrub_parity_bio_endio_worker(struct work_struct *work)
+{
+ struct scrub_parity *sparity = container_of(work, struct scrub_parity,
+ work);
+ struct scrub_ctx *sctx = sparity->sctx;
+
+ btrfs_bio_counter_dec(sctx->fs_info);
+ scrub_free_parity(sparity);
+ scrub_pending_bio_dec(sctx);
+}
+
+static void scrub_parity_bio_endio(struct bio *bio)
+{
+ struct scrub_parity *sparity = bio->bi_private;
+ struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
+
+ if (bio->bi_status)
+ bitmap_or(&sparity->ebitmap, &sparity->ebitmap,
+ &sparity->dbitmap, sparity->nsectors);
+
+ bio_put(bio);
+
+ INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker);
+ queue_work(fs_info->scrub_parity_workers, &sparity->work);
+}
+
+static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
+{
+ struct scrub_ctx *sctx = sparity->sctx;
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct bio *bio;
+ struct btrfs_raid_bio *rbio;
+ struct btrfs_io_context *bioc = NULL;
+ u64 length;
+ int ret;
+
+ if (!bitmap_andnot(&sparity->dbitmap, &sparity->dbitmap,
+ &sparity->ebitmap, sparity->nsectors))
+ goto out;
+
+ length = sparity->logic_end - sparity->logic_start;
+
+ btrfs_bio_counter_inc_blocked(fs_info);
+ ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
+ &length, &bioc);
+ if (ret || !bioc || !bioc->raid_map)
+ goto bioc_out;
+
+ bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
+ bio->bi_iter.bi_sector = sparity->logic_start >> 9;
+ bio->bi_private = sparity;
+ bio->bi_end_io = scrub_parity_bio_endio;
+
+ rbio = raid56_parity_alloc_scrub_rbio(bio, bioc,
+ sparity->scrub_dev,
+ &sparity->dbitmap,
+ sparity->nsectors);
+ btrfs_put_bioc(bioc);
+ if (!rbio)
+ goto rbio_out;
+
+ scrub_pending_bio_inc(sctx);
+ raid56_parity_submit_scrub_rbio(rbio);
+ return;
+
+rbio_out:
+ bio_put(bio);
+bioc_out:
+ btrfs_bio_counter_dec(fs_info);
+ bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap,
+ sparity->nsectors);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+out:
+ scrub_free_parity(sparity);
+}
+
+static void scrub_parity_get(struct scrub_parity *sparity)
+{
+ refcount_inc(&sparity->refs);
+}
+
+static void scrub_parity_put(struct scrub_parity *sparity)
+{
+ if (!refcount_dec_and_test(&sparity->refs))
+ return;
+
+ scrub_parity_check_and_repair(sparity);
+}
+
+/*
+ * Return 0 if the extent item range covers any byte of the range.
+ * Return <0 if the extent item is before @search_start.
+ * Return >0 if the extent item is after @start_start + @search_len.
+ */
+static int compare_extent_item_range(struct btrfs_path *path,
+ u64 search_start, u64 search_len)
+{
+ struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info;
+ u64 len;
+ struct btrfs_key key;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY ||
+ key.type == BTRFS_METADATA_ITEM_KEY);
+ if (key.type == BTRFS_METADATA_ITEM_KEY)
+ len = fs_info->nodesize;
+ else
+ len = key.offset;
+
+ if (key.objectid + len <= search_start)
+ return -1;
+ if (key.objectid >= search_start + search_len)
+ return 1;
+ return 0;
+}
+
+/*
+ * Locate one extent item which covers any byte in range
+ * [@search_start, @search_start + @search_length)
+ *
+ * If the path is not initialized, we will initialize the search by doing
+ * a btrfs_search_slot().
+ * If the path is already initialized, we will use the path as the initial
+ * slot, to avoid duplicated btrfs_search_slot() calls.
+ *
+ * NOTE: If an extent item starts before @search_start, we will still
+ * return the extent item. This is for data extent crossing stripe boundary.
+ *
+ * Return 0 if we found such extent item, and @path will point to the extent item.
+ * Return >0 if no such extent item can be found, and @path will be released.
+ * Return <0 if hit fatal error, and @path will be released.
+ */
+static int find_first_extent_item(struct btrfs_root *extent_root,
+ struct btrfs_path *path,
+ u64 search_start, u64 search_len)
+{
+ struct btrfs_fs_info *fs_info = extent_root->fs_info;
+ struct btrfs_key key;
+ int ret;
+
+ /* Continue using the existing path */
+ if (path->nodes[0])
+ goto search_forward;
+
+ if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ else
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.objectid = search_start;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ ASSERT(ret > 0);
+ /*
+ * Here we intentionally pass 0 as @min_objectid, as there could be
+ * an extent item starting before @search_start.
+ */
+ ret = btrfs_previous_extent_item(extent_root, path, 0);
+ if (ret < 0)
+ return ret;
+ /*
+ * No matter whether we have found an extent item, the next loop will
+ * properly do every check on the key.
+ */
+search_forward:
+ while (true) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid >= search_start + search_len)
+ break;
+ if (key.type != BTRFS_METADATA_ITEM_KEY &&
+ key.type != BTRFS_EXTENT_ITEM_KEY)
+ goto next;
+
+ ret = compare_extent_item_range(path, search_start, search_len);
+ if (ret == 0)
+ return ret;
+ if (ret > 0)
+ break;
+next:
+ path->slots[0]++;
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(extent_root, path);
+ if (ret) {
+ /* Either no more item or fatal error */
+ btrfs_release_path(path);
+ return ret;
+ }
+ }
+ }
+ btrfs_release_path(path);
+ return 1;
+}
+
+static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
+ u64 *size_ret, u64 *flags_ret, u64 *generation_ret)
+{
+ struct btrfs_key key;
+ struct btrfs_extent_item *ei;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ ASSERT(key.type == BTRFS_METADATA_ITEM_KEY ||
+ key.type == BTRFS_EXTENT_ITEM_KEY);
+ *extent_start_ret = key.objectid;
+ if (key.type == BTRFS_METADATA_ITEM_KEY)
+ *size_ret = path->nodes[0]->fs_info->nodesize;
+ else
+ *size_ret = key.offset;
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item);
+ *flags_ret = btrfs_extent_flags(path->nodes[0], ei);
+ *generation_ret = btrfs_extent_generation(path->nodes[0], ei);
+}
+
+static bool does_range_cross_boundary(u64 extent_start, u64 extent_len,
+ u64 boundary_start, u64 boudary_len)
+{
+ return (extent_start < boundary_start &&
+ extent_start + extent_len > boundary_start) ||
+ (extent_start < boundary_start + boudary_len &&
+ extent_start + extent_len > boundary_start + boudary_len);
+}
+
+static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
+ struct scrub_parity *sparity,
+ struct map_lookup *map,
+ struct btrfs_device *sdev,
+ struct btrfs_path *path,
+ u64 logical)
+{
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
+ struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical);
+ u64 cur_logical = logical;
+ int ret;
+
+ ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
+
+ /* Path must not be populated */
+ ASSERT(!path->nodes[0]);
+
+ while (cur_logical < logical + map->stripe_len) {
+ struct btrfs_io_context *bioc = NULL;
+ struct btrfs_device *extent_dev;
+ u64 extent_start;
+ u64 extent_size;
+ u64 mapped_length;
+ u64 extent_flags;
+ u64 extent_gen;
+ u64 extent_physical;
+ u64 extent_mirror_num;
+
+ ret = find_first_extent_item(extent_root, path, cur_logical,
+ logical + map->stripe_len - cur_logical);
+ /* No more extent item in this data stripe */
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ if (ret < 0)
+ break;
+ get_extent_info(path, &extent_start, &extent_size, &extent_flags,
+ &extent_gen);
+
+ /* Metadata should not cross stripe boundaries */
+ if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
+ does_range_cross_boundary(extent_start, extent_size,
+ logical, map->stripe_len)) {
+ btrfs_err(fs_info,
+ "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
+ extent_start, logical);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
+ cur_logical += extent_size;
+ continue;
+ }
+
+ /* Skip hole range which doesn't have any extent */
+ cur_logical = max(extent_start, cur_logical);
+
+ /* Truncate the range inside this data stripe */
+ extent_size = min(extent_start + extent_size,
+ logical + map->stripe_len) - cur_logical;
+ extent_start = cur_logical;
+ ASSERT(extent_size <= U32_MAX);
+
+ scrub_parity_mark_sectors_data(sparity, extent_start, extent_size);
+
+ mapped_length = extent_size;
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start,
+ &mapped_length, &bioc, 0);
+ if (!ret && (!bioc || mapped_length < extent_size))
+ ret = -EIO;
+ if (ret) {
+ btrfs_put_bioc(bioc);
+ scrub_parity_mark_sectors_error(sparity, extent_start,
+ extent_size);
+ break;
+ }
+ extent_physical = bioc->stripes[0].physical;
+ extent_mirror_num = bioc->mirror_num;
+ extent_dev = bioc->stripes[0].dev;
+ btrfs_put_bioc(bioc);
+
+ ret = btrfs_lookup_csums_range(csum_root, extent_start,
+ extent_start + extent_size - 1,
+ &sctx->csum_list, 1, false);
+ if (ret) {
+ scrub_parity_mark_sectors_error(sparity, extent_start,
+ extent_size);
+ break;
+ }
+
+ ret = scrub_extent_for_parity(sparity, extent_start,
+ extent_size, extent_physical,
+ extent_dev, extent_flags,
+ extent_gen, extent_mirror_num);
+ scrub_free_csums(sctx);
+
+ if (ret) {
+ scrub_parity_mark_sectors_error(sparity, extent_start,
+ extent_size);
+ break;
+ }
+
+ cond_resched();
+ cur_logical += extent_size;
+ }
+ btrfs_release_path(path);
+ return ret;
+}
+
+static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
+ struct map_lookup *map,
+ struct btrfs_device *sdev,
+ u64 logic_start,
+ u64 logic_end)
+{
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct btrfs_path *path;
+ u64 cur_logical;
+ int ret;
+ struct scrub_parity *sparity;
+ int nsectors;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ return -ENOMEM;
+ }
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+
+ ASSERT(map->stripe_len <= U32_MAX);
+ nsectors = map->stripe_len >> fs_info->sectorsize_bits;
+ ASSERT(nsectors <= BITS_PER_LONG);
+ sparity = kzalloc(sizeof(struct scrub_parity), GFP_NOFS);
+ if (!sparity) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ btrfs_free_path(path);
+ return -ENOMEM;
+ }
+
+ ASSERT(map->stripe_len <= U32_MAX);
+ sparity->stripe_len = map->stripe_len;
+ sparity->nsectors = nsectors;
+ sparity->sctx = sctx;
+ sparity->scrub_dev = sdev;
+ sparity->logic_start = logic_start;
+ sparity->logic_end = logic_end;
+ refcount_set(&sparity->refs, 1);
+ INIT_LIST_HEAD(&sparity->sectors_list);
+
+ ret = 0;
+ for (cur_logical = logic_start; cur_logical < logic_end;
+ cur_logical += map->stripe_len) {
+ ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map,
+ sdev, path, cur_logical);
+ if (ret < 0)
+ break;
+ }
+
+ scrub_parity_put(sparity);
+ scrub_submit(sctx);
+ mutex_lock(&sctx->wr_lock);
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_lock);
+
+ btrfs_free_path(path);
+ return ret < 0 ? ret : 0;
+}
+
+static void sync_replace_for_zoned(struct scrub_ctx *sctx)
+{
+ if (!btrfs_is_zoned(sctx->fs_info))
+ return;
+
+ sctx->flush_all_writes = true;
+ scrub_submit(sctx);
+ mutex_lock(&sctx->wr_lock);
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_lock);
+
+ wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
+}
+
+static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
+ u64 physical, u64 physical_end)
+{
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ int ret = 0;
+
+ if (!btrfs_is_zoned(fs_info))
+ return 0;
+
+ wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
+
+ mutex_lock(&sctx->wr_lock);
+ if (sctx->write_pointer < physical_end) {
+ ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
+ physical,
+ sctx->write_pointer);
+ if (ret)
+ btrfs_err(fs_info,
+ "zoned: failed to recover write pointer");
+ }
+ mutex_unlock(&sctx->wr_lock);
+ btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
+
+ return ret;
+}
+
+/*
+ * Scrub one range which can only has simple mirror based profile.
+ * (Including all range in SINGLE/DUP/RAID1/RAID1C*, and each stripe in
+ * RAID0/RAID10).
+ *
+ * Since we may need to handle a subset of block group, we need @logical_start
+ * and @logical_length parameter.
+ */
+static int scrub_simple_mirror(struct scrub_ctx *sctx,
+ struct btrfs_root *extent_root,
+ struct btrfs_root *csum_root,
+ struct btrfs_block_group *bg,
+ struct map_lookup *map,
+ u64 logical_start, u64 logical_length,
+ struct btrfs_device *device,
+ u64 physical, int mirror_num)
+{
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ const u64 logical_end = logical_start + logical_length;
+ /* An artificial limit, inherit from old scrub behavior */
+ const u32 max_length = SZ_64K;
+ struct btrfs_path path = { 0 };
+ u64 cur_logical = logical_start;
+ int ret;
+
+ /* The range must be inside the bg */
+ ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
+
+ path.search_commit_root = 1;
+ path.skip_locking = 1;
+ /* Go through each extent items inside the logical range */
+ while (cur_logical < logical_end) {
+ u64 extent_start;
+ u64 extent_len;
+ u64 extent_flags;
+ u64 extent_gen;
+ u64 scrub_len;
+
+ /* Canceled? */
+ if (atomic_read(&fs_info->scrub_cancel_req) ||
+ atomic_read(&sctx->cancel_req)) {
+ ret = -ECANCELED;
+ break;
+ }
+ /* Paused? */
+ if (atomic_read(&fs_info->scrub_pause_req)) {
+ /* Push queued extents */
+ sctx->flush_all_writes = true;
+ scrub_submit(sctx);
+ mutex_lock(&sctx->wr_lock);
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_lock);
+ wait_event(sctx->list_wait,
+ atomic_read(&sctx->bios_in_flight) == 0);
+ sctx->flush_all_writes = false;
+ scrub_blocked_if_needed(fs_info);
+ }
+ /* Block group removed? */
+ spin_lock(&bg->lock);
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) {
+ spin_unlock(&bg->lock);
+ ret = 0;
+ break;
+ }
+ spin_unlock(&bg->lock);
+
+ ret = find_first_extent_item(extent_root, &path, cur_logical,
+ logical_end - cur_logical);
+ if (ret > 0) {
+ /* No more extent, just update the accounting */
+ sctx->stat.last_physical = physical + logical_length;
+ ret = 0;
+ break;
+ }
+ if (ret < 0)
+ break;
+ get_extent_info(&path, &extent_start, &extent_len,
+ &extent_flags, &extent_gen);
+ /* Skip hole range which doesn't have any extent */
+ cur_logical = max(extent_start, cur_logical);
+
+ /*
+ * Scrub len has three limits:
+ * - Extent size limit
+ * - Scrub range limit
+ * This is especially imporatant for RAID0/RAID10 to reuse
+ * this function
+ * - Max scrub size limit
+ */
+ scrub_len = min(min(extent_start + extent_len,
+ logical_end), cur_logical + max_length) -
+ cur_logical;
+
+ if (extent_flags & BTRFS_EXTENT_FLAG_DATA) {
+ ret = btrfs_lookup_csums_range(csum_root, cur_logical,
+ cur_logical + scrub_len - 1,
+ &sctx->csum_list, 1, false);
+ if (ret)
+ break;
+ }
+ if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
+ does_range_cross_boundary(extent_start, extent_len,
+ logical_start, logical_length)) {
+ btrfs_err(fs_info,
+"scrub: tree block %llu spanning boundaries, ignored. boundary=[%llu, %llu)",
+ extent_start, logical_start, logical_end);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
+ cur_logical += scrub_len;
+ continue;
+ }
+ ret = scrub_extent(sctx, map, cur_logical, scrub_len,
+ cur_logical - logical_start + physical,
+ device, extent_flags, extent_gen,
+ mirror_num);
+ scrub_free_csums(sctx);
+ if (ret)
+ break;
+ if (sctx->is_dev_replace)
+ sync_replace_for_zoned(sctx);
+ cur_logical += scrub_len;
+ /* Don't hold CPU for too long time */
+ cond_resched();
+ }
+ btrfs_release_path(&path);
+ return ret;
+}
+
+/* Calculate the full stripe length for simple stripe based profiles */
+static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
+{
+ ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10));
+
+ return map->num_stripes / map->sub_stripes * map->stripe_len;
+}
+
+/* Get the logical bytenr for the stripe */
+static u64 simple_stripe_get_logical(struct map_lookup *map,
+ struct btrfs_block_group *bg,
+ int stripe_index)
+{
+ ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10));
+ ASSERT(stripe_index < map->num_stripes);
+
+ /*
+ * (stripe_index / sub_stripes) gives how many data stripes we need to
+ * skip.
+ */
+ return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start;
+}
+
+/* Get the mirror number for the stripe */
+static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
+{
+ ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10));
+ ASSERT(stripe_index < map->num_stripes);
+
+ /* For RAID0, it's fixed to 1, for RAID10 it's 0,1,0,1... */
+ return stripe_index % map->sub_stripes + 1;
+}
+
+static int scrub_simple_stripe(struct scrub_ctx *sctx,
+ struct btrfs_root *extent_root,
+ struct btrfs_root *csum_root,
+ struct btrfs_block_group *bg,
+ struct map_lookup *map,
+ struct btrfs_device *device,
+ int stripe_index)
+{
+ const u64 logical_increment = simple_stripe_full_stripe_len(map);
+ const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index);
+ const u64 orig_physical = map->stripes[stripe_index].physical;
+ const int mirror_num = simple_stripe_mirror_num(map, stripe_index);
+ u64 cur_logical = orig_logical;
+ u64 cur_physical = orig_physical;
+ int ret = 0;
+
+ while (cur_logical < bg->start + bg->length) {
+ /*
+ * Inside each stripe, RAID0 is just SINGLE, and RAID10 is
+ * just RAID1, so we can reuse scrub_simple_mirror() to scrub
+ * this stripe.
+ */
+ ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map,
+ cur_logical, map->stripe_len, device,
+ cur_physical, mirror_num);
+ if (ret)
+ return ret;
+ /* Skip to next stripe which belongs to the target device */
+ cur_logical += logical_increment;
+ /* For physical offset, we just go to next stripe */
+ cur_physical += map->stripe_len;
+ }
+ return ret;
+}
+
+static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
+ struct btrfs_block_group *bg,
+ struct extent_map *em,
+ struct btrfs_device *scrub_dev,
+ int stripe_index)
+{
+ struct btrfs_path *path;
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct btrfs_root *root;
+ struct btrfs_root *csum_root;
+ struct blk_plug plug;
+ struct map_lookup *map = em->map_lookup;
+ const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+ const u64 chunk_logical = bg->start;
+ int ret;
+ u64 physical = map->stripes[stripe_index].physical;
+ const u64 dev_stripe_len = btrfs_calc_stripe_length(em);
+ const u64 physical_end = physical + dev_stripe_len;
+ u64 logical;
+ u64 logic_end;
+ /* The logical increment after finishing one stripe */
+ u64 increment;
+ /* Offset inside the chunk */
+ u64 offset;
+ u64 stripe_logical;
+ u64 stripe_end;
+ int stop_loop = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /*
+ * work on commit root. The related disk blocks are static as
+ * long as COW is applied. This means, it is save to rewrite
+ * them to repair disk errors without any race conditions
+ */
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+ path->reada = READA_FORWARD;
+
+ wait_event(sctx->list_wait,
+ atomic_read(&sctx->bios_in_flight) == 0);
+ scrub_blocked_if_needed(fs_info);
+
+ root = btrfs_extent_root(fs_info, bg->start);
+ csum_root = btrfs_csum_root(fs_info, bg->start);
+
+ /*
+ * collect all data csums for the stripe to avoid seeking during
+ * the scrub. This might currently (crc32) end up to be about 1MB
+ */
+ blk_start_plug(&plug);
+
+ if (sctx->is_dev_replace &&
+ btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
+ mutex_lock(&sctx->wr_lock);
+ sctx->write_pointer = physical;
+ mutex_unlock(&sctx->wr_lock);
+ sctx->flush_all_writes = true;
+ }
+
+ /*
+ * There used to be a big double loop to handle all profiles using the
+ * same routine, which grows larger and more gross over time.
+ *
+ * So here we handle each profile differently, so simpler profiles
+ * have simpler scrubbing function.
+ */
+ if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID56_MASK))) {
+ /*
+ * Above check rules out all complex profile, the remaining
+ * profiles are SINGLE|DUP|RAID1|RAID1C*, which is simple
+ * mirrored duplication without stripe.
+ *
+ * Only @physical and @mirror_num needs to calculated using
+ * @stripe_index.
+ */
+ ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
+ bg->start, bg->length, scrub_dev,
+ map->stripes[stripe_index].physical,
+ stripe_index + 1);
+ offset = 0;
+ goto out;
+ }
+ if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
+ ret = scrub_simple_stripe(sctx, root, csum_root, bg, map,
+ scrub_dev, stripe_index);
+ offset = map->stripe_len * (stripe_index / map->sub_stripes);
+ goto out;
+ }
+
+ /* Only RAID56 goes through the old code */
+ ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
+ ret = 0;
+
+ /* Calculate the logical end of the stripe */
+ get_raid56_logic_offset(physical_end, stripe_index,
+ map, &logic_end, NULL);
+ logic_end += chunk_logical;
+
+ /* Initialize @offset in case we need to go to out: label */
+ get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
+ increment = map->stripe_len * nr_data_stripes(map);
+
+ /*
+ * Due to the rotation, for RAID56 it's better to iterate each stripe
+ * using their physical offset.
+ */
+ while (physical < physical_end) {
+ ret = get_raid56_logic_offset(physical, stripe_index, map,
+ &logical, &stripe_logical);
+ logical += chunk_logical;
+ if (ret) {
+ /* it is parity strip */
+ stripe_logical += chunk_logical;
+ stripe_end = stripe_logical + increment;
+ ret = scrub_raid56_parity(sctx, map, scrub_dev,
+ stripe_logical,
+ stripe_end);
+ if (ret)
+ goto out;
+ goto next;
+ }
+
+ /*
+ * Now we're at a data stripe, scrub each extents in the range.
+ *
+ * At this stage, if we ignore the repair part, inside each data
+ * stripe it is no different than SINGLE profile.
+ * We can reuse scrub_simple_mirror() here, as the repair part
+ * is still based on @mirror_num.
+ */
+ ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
+ logical, map->stripe_len,
+ scrub_dev, physical, 1);
+ if (ret < 0)
+ goto out;
+next:
+ logical += increment;
+ physical += map->stripe_len;
+ spin_lock(&sctx->stat_lock);
+ if (stop_loop)
+ sctx->stat.last_physical =
+ map->stripes[stripe_index].physical + dev_stripe_len;
+ else
+ sctx->stat.last_physical = physical;
+ spin_unlock(&sctx->stat_lock);
+ if (stop_loop)
+ break;
+ }
+out:
+ /* push queued extents */
+ scrub_submit(sctx);
+ mutex_lock(&sctx->wr_lock);
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_lock);
+
+ blk_finish_plug(&plug);
+ btrfs_free_path(path);
+
+ if (sctx->is_dev_replace && ret >= 0) {
+ int ret2;
+
+ ret2 = sync_write_pointer_for_zoned(sctx,
+ chunk_logical + offset,
+ map->stripes[stripe_index].physical,
+ physical_end);
+ if (ret2)
+ ret = ret2;
+ }
+
+ return ret < 0 ? ret : 0;
+}
+
+static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
+ struct btrfs_block_group *bg,
+ struct btrfs_device *scrub_dev,
+ u64 dev_offset,
+ u64 dev_extent_len)
+{
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct extent_map_tree *map_tree = &fs_info->mapping_tree;
+ struct map_lookup *map;
+ struct extent_map *em;
+ int i;
+ int ret = 0;
+
+ read_lock(&map_tree->lock);
+ em = lookup_extent_mapping(map_tree, bg->start, bg->length);
+ read_unlock(&map_tree->lock);
+
+ if (!em) {
+ /*
+ * Might have been an unused block group deleted by the cleaner
+ * kthread or relocation.
+ */
+ spin_lock(&bg->lock);
+ if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags))
+ ret = -EINVAL;
+ spin_unlock(&bg->lock);
+
+ return ret;
+ }
+ if (em->start != bg->start)
+ goto out;
+ if (em->len < dev_extent_len)
+ goto out;
+
+ map = em->map_lookup;
+ for (i = 0; i < map->num_stripes; ++i) {
+ if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
+ map->stripes[i].physical == dev_offset) {
+ ret = scrub_stripe(sctx, bg, em, scrub_dev, i);
+ if (ret)
+ goto out;
+ }
+ }
+out:
+ free_extent_map(em);
+
+ return ret;
+}
+
+static int finish_extent_writes_for_zoned(struct btrfs_root *root,
+ struct btrfs_block_group *cache)
+{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct btrfs_trans_handle *trans;
+
+ if (!btrfs_is_zoned(fs_info))
+ return 0;
+
+ btrfs_wait_block_group_reservations(cache);
+ btrfs_wait_nocow_writers(cache);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length);
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ return btrfs_commit_transaction(trans);
+}
+
+static noinline_for_stack
+int scrub_enumerate_chunks(struct scrub_ctx *sctx,
+ struct btrfs_device *scrub_dev, u64 start, u64 end)
+{
+ struct btrfs_dev_extent *dev_extent = NULL;
+ struct btrfs_path *path;
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct btrfs_root *root = fs_info->dev_root;
+ u64 chunk_offset;
+ int ret = 0;
+ int ro_set;
+ int slot;
+ struct extent_buffer *l;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_block_group *cache;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->reada = READA_FORWARD;
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+
+ key.objectid = scrub_dev->devid;
+ key.offset = 0ull;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+
+ while (1) {
+ u64 dev_extent_len;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ break;
+ if (ret > 0) {
+ if (path->slots[0] >=
+ btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ break;
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ } else {
+ ret = 0;
+ }
+ }
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+
+ btrfs_item_key_to_cpu(l, &found_key, slot);
+
+ if (found_key.objectid != scrub_dev->devid)
+ break;
+
+ if (found_key.type != BTRFS_DEV_EXTENT_KEY)
+ break;
+
+ if (found_key.offset >= end)
+ break;
+
+ if (found_key.offset < key.offset)
+ break;
+
+ dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+ dev_extent_len = btrfs_dev_extent_length(l, dev_extent);
+
+ if (found_key.offset + dev_extent_len <= start)
+ goto skip;
+
+ chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
+
+ /*
+ * get a reference on the corresponding block group to prevent
+ * the chunk from going away while we scrub it
+ */
+ cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+
+ /* some chunks are removed but not committed to disk yet,
+ * continue scrubbing */
+ if (!cache)
+ goto skip;
+
+ ASSERT(cache->start <= chunk_offset);
+ /*
+ * We are using the commit root to search for device extents, so
+ * that means we could have found a device extent item from a
+ * block group that was deleted in the current transaction. The
+ * logical start offset of the deleted block group, stored at
+ * @chunk_offset, might be part of the logical address range of
+ * a new block group (which uses different physical extents).
+ * In this case btrfs_lookup_block_group() has returned the new
+ * block group, and its start address is less than @chunk_offset.
+ *
+ * We skip such new block groups, because it's pointless to
+ * process them, as we won't find their extents because we search
+ * for them using the commit root of the extent tree. For a device
+ * replace it's also fine to skip it, we won't miss copying them
+ * to the target device because we have the write duplication
+ * setup through the regular write path (by btrfs_map_block()),
+ * and we have committed a transaction when we started the device
+ * replace, right after setting up the device replace state.
+ */
+ if (cache->start < chunk_offset) {
+ btrfs_put_block_group(cache);
+ goto skip;
+ }
+
+ if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
+ if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) {
+ btrfs_put_block_group(cache);
+ goto skip;
+ }
+ }
+
+ /*
+ * Make sure that while we are scrubbing the corresponding block
+ * group doesn't get its logical address and its device extents
+ * reused for another block group, which can possibly be of a
+ * different type and different profile. We do this to prevent
+ * false error detections and crashes due to bogus attempts to
+ * repair extents.
+ */
+ spin_lock(&cache->lock);
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
+ spin_unlock(&cache->lock);
+ btrfs_put_block_group(cache);
+ goto skip;
+ }
+ btrfs_freeze_block_group(cache);
+ spin_unlock(&cache->lock);
+
+ /*
+ * we need call btrfs_inc_block_group_ro() with scrubs_paused,
+ * to avoid deadlock caused by:
+ * btrfs_inc_block_group_ro()
+ * -> btrfs_wait_for_commit()
+ * -> btrfs_commit_transaction()
+ * -> btrfs_scrub_pause()
+ */
+ scrub_pause_on(fs_info);
+
+ /*
+ * Don't do chunk preallocation for scrub.
+ *
+ * This is especially important for SYSTEM bgs, or we can hit
+ * -EFBIG from btrfs_finish_chunk_alloc() like:
+ * 1. The only SYSTEM bg is marked RO.
+ * Since SYSTEM bg is small, that's pretty common.
+ * 2. New SYSTEM bg will be allocated
+ * Due to regular version will allocate new chunk.
+ * 3. New SYSTEM bg is empty and will get cleaned up
+ * Before cleanup really happens, it's marked RO again.
+ * 4. Empty SYSTEM bg get scrubbed
+ * We go back to 2.
+ *
+ * This can easily boost the amount of SYSTEM chunks if cleaner
+ * thread can't be triggered fast enough, and use up all space
+ * of btrfs_super_block::sys_chunk_array
+ *
+ * While for dev replace, we need to try our best to mark block
+ * group RO, to prevent race between:
+ * - Write duplication
+ * Contains latest data
+ * - Scrub copy
+ * Contains data from commit tree
+ *
+ * If target block group is not marked RO, nocow writes can
+ * be overwritten by scrub copy, causing data corruption.
+ * So for dev-replace, it's not allowed to continue if a block
+ * group is not RO.
+ */
+ ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
+ if (!ret && sctx->is_dev_replace) {
+ ret = finish_extent_writes_for_zoned(root, cache);
+ if (ret) {
+ btrfs_dec_block_group_ro(cache);
+ scrub_pause_off(fs_info);
+ btrfs_put_block_group(cache);
+ break;
+ }
+ }
+
+ if (ret == 0) {
+ ro_set = 1;
+ } else if (ret == -ENOSPC && !sctx->is_dev_replace &&
+ !(cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) {
+ /*
+ * btrfs_inc_block_group_ro return -ENOSPC when it
+ * failed in creating new chunk for metadata.
+ * It is not a problem for scrub, because
+ * metadata are always cowed, and our scrub paused
+ * commit_transactions.
+ *
+ * For RAID56 chunks, we have to mark them read-only
+ * for scrub, as later we would use our own cache
+ * out of RAID56 realm.
+ * Thus we want the RAID56 bg to be marked RO to
+ * prevent RMW from screwing up out cache.
+ */
+ ro_set = 0;
+ } else if (ret == -ETXTBSY) {
+ btrfs_warn(fs_info,
+ "skipping scrub of block group %llu due to active swapfile",
+ cache->start);
+ scrub_pause_off(fs_info);
+ ret = 0;
+ goto skip_unfreeze;
+ } else {
+ btrfs_warn(fs_info,
+ "failed setting block group ro: %d", ret);
+ btrfs_unfreeze_block_group(cache);
+ btrfs_put_block_group(cache);
+ scrub_pause_off(fs_info);
+ break;
+ }
+
+ /*
+ * Now the target block is marked RO, wait for nocow writes to
+ * finish before dev-replace.
+ * COW is fine, as COW never overwrites extents in commit tree.
+ */
+ if (sctx->is_dev_replace) {
+ btrfs_wait_nocow_writers(cache);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
+ cache->length);
+ }
+
+ scrub_pause_off(fs_info);
+ down_write(&dev_replace->rwsem);
+ dev_replace->cursor_right = found_key.offset + dev_extent_len;
+ dev_replace->cursor_left = found_key.offset;
+ dev_replace->item_needs_writeback = 1;
+ up_write(&dev_replace->rwsem);
+
+ ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
+ dev_extent_len);
+
+ /*
+ * flush, submit all pending read and write bios, afterwards
+ * wait for them.
+ * Note that in the dev replace case, a read request causes
+ * write requests that are submitted in the read completion
+ * worker. Therefore in the current situation, it is required
+ * that all write requests are flushed, so that all read and
+ * write requests are really completed when bios_in_flight
+ * changes to 0.
+ */
+ sctx->flush_all_writes = true;
+ scrub_submit(sctx);
+ mutex_lock(&sctx->wr_lock);
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_lock);
+
+ wait_event(sctx->list_wait,
+ atomic_read(&sctx->bios_in_flight) == 0);
+
+ scrub_pause_on(fs_info);
+
+ /*
+ * must be called before we decrease @scrub_paused.
+ * make sure we don't block transaction commit while
+ * we are waiting pending workers finished.
+ */
+ wait_event(sctx->list_wait,
+ atomic_read(&sctx->workers_pending) == 0);
+ sctx->flush_all_writes = false;
+
+ scrub_pause_off(fs_info);
+
+ if (sctx->is_dev_replace &&
+ !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
+ cache, found_key.offset))
+ ro_set = 0;
+
+ down_write(&dev_replace->rwsem);
+ dev_replace->cursor_left = dev_replace->cursor_right;
+ dev_replace->item_needs_writeback = 1;
+ up_write(&dev_replace->rwsem);
+
+ if (ro_set)
+ btrfs_dec_block_group_ro(cache);
+
+ /*
+ * We might have prevented the cleaner kthread from deleting
+ * this block group if it was already unused because we raced
+ * and set it to RO mode first. So add it back to the unused
+ * list, otherwise it might not ever be deleted unless a manual
+ * balance is triggered or it becomes used and unused again.
+ */
+ spin_lock(&cache->lock);
+ if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags) &&
+ !cache->ro && cache->reserved == 0 && cache->used == 0) {
+ spin_unlock(&cache->lock);
+ if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
+ btrfs_discard_queue_work(&fs_info->discard_ctl,
+ cache);
+ else
+ btrfs_mark_bg_unused(cache);
+ } else {
+ spin_unlock(&cache->lock);
+ }
+skip_unfreeze:
+ btrfs_unfreeze_block_group(cache);
+ btrfs_put_block_group(cache);
+ if (ret)
+ break;
+ if (sctx->is_dev_replace &&
+ atomic64_read(&dev_replace->num_write_errors) > 0) {
+ ret = -EIO;
+ break;
+ }
+ if (sctx->stat.malloc_errors > 0) {
+ ret = -ENOMEM;
+ break;
+ }
+skip:
+ key.offset = found_key.offset + dev_extent_len;
+ btrfs_release_path(path);
+ }
+
+ btrfs_free_path(path);
+
+ return ret;
+}
+
+static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
+ struct btrfs_device *scrub_dev)
+{
+ int i;
+ u64 bytenr;
+ u64 gen;
+ int ret;
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+
+ if (BTRFS_FS_ERROR(fs_info))
+ return -EROFS;
+
+ /* Seed devices of a new filesystem has their own generation. */
+ if (scrub_dev->fs_devices != fs_info->fs_devices)
+ gen = scrub_dev->generation;
+ else
+ gen = fs_info->last_trans_committed;
+
+ for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+ bytenr = btrfs_sb_offset(i);
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >
+ scrub_dev->commit_total_bytes)
+ break;
+ if (!btrfs_check_super_location(scrub_dev, bytenr))
+ continue;
+
+ ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
+ scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
+ NULL, bytenr);
+ if (ret)
+ return ret;
+ }
+ wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
+
+ return 0;
+}
+
+static void scrub_workers_put(struct btrfs_fs_info *fs_info)
+{
+ if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
+ &fs_info->scrub_lock)) {
+ struct workqueue_struct *scrub_workers = fs_info->scrub_workers;
+ struct workqueue_struct *scrub_wr_comp =
+ fs_info->scrub_wr_completion_workers;
+ struct workqueue_struct *scrub_parity =
+ fs_info->scrub_parity_workers;
+
+ fs_info->scrub_workers = NULL;
+ fs_info->scrub_wr_completion_workers = NULL;
+ fs_info->scrub_parity_workers = NULL;
+ mutex_unlock(&fs_info->scrub_lock);
+
+ if (scrub_workers)
+ destroy_workqueue(scrub_workers);
+ if (scrub_wr_comp)
+ destroy_workqueue(scrub_wr_comp);
+ if (scrub_parity)
+ destroy_workqueue(scrub_parity);
+ }
+}
+
+/*
+ * get a reference count on fs_info->scrub_workers. start worker if necessary
+ */
+static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
+ int is_dev_replace)
+{
+ struct workqueue_struct *scrub_workers = NULL;
+ struct workqueue_struct *scrub_wr_comp = NULL;
+ struct workqueue_struct *scrub_parity = NULL;
+ unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
+ int max_active = fs_info->thread_pool_size;
+ int ret = -ENOMEM;
+
+ if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
+ return 0;
+
+ scrub_workers = alloc_workqueue("btrfs-scrub", flags,
+ is_dev_replace ? 1 : max_active);
+ if (!scrub_workers)
+ goto fail_scrub_workers;
+
+ scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active);
+ if (!scrub_wr_comp)
+ goto fail_scrub_wr_completion_workers;
+
+ scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active);
+ if (!scrub_parity)
+ goto fail_scrub_parity_workers;
+
+ mutex_lock(&fs_info->scrub_lock);
+ if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
+ ASSERT(fs_info->scrub_workers == NULL &&
+ fs_info->scrub_wr_completion_workers == NULL &&
+ fs_info->scrub_parity_workers == NULL);
+ fs_info->scrub_workers = scrub_workers;
+ fs_info->scrub_wr_completion_workers = scrub_wr_comp;
+ fs_info->scrub_parity_workers = scrub_parity;
+ refcount_set(&fs_info->scrub_workers_refcnt, 1);
+ mutex_unlock(&fs_info->scrub_lock);
+ return 0;
+ }
+ /* Other thread raced in and created the workers for us */
+ refcount_inc(&fs_info->scrub_workers_refcnt);
+ mutex_unlock(&fs_info->scrub_lock);
+
+ ret = 0;
+ destroy_workqueue(scrub_parity);
+fail_scrub_parity_workers:
+ destroy_workqueue(scrub_wr_comp);
+fail_scrub_wr_completion_workers:
+ destroy_workqueue(scrub_workers);
+fail_scrub_workers:
+ return ret;
+}
+
+int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
+ u64 end, struct btrfs_scrub_progress *progress,
+ int readonly, int is_dev_replace)
+{
+ struct btrfs_dev_lookup_args args = { .devid = devid };
+ struct scrub_ctx *sctx;
+ int ret;
+ struct btrfs_device *dev;
+ unsigned int nofs_flag;
+ bool need_commit = false;
+
+ if (btrfs_fs_closing(fs_info))
+ return -EAGAIN;
+
+ /* At mount time we have ensured nodesize is in the range of [4K, 64K]. */
+ ASSERT(fs_info->nodesize <= BTRFS_STRIPE_LEN);
+
+ /*
+ * SCRUB_MAX_SECTORS_PER_BLOCK is calculated using the largest possible
+ * value (max nodesize / min sectorsize), thus nodesize should always
+ * be fine.
+ */
+ ASSERT(fs_info->nodesize <=
+ SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits);
+
+ /* Allocate outside of device_list_mutex */
+ sctx = scrub_setup_ctx(fs_info, is_dev_replace);
+ if (IS_ERR(sctx))
+ return PTR_ERR(sctx);
+
+ ret = scrub_workers_get(fs_info, is_dev_replace);
+ if (ret)
+ goto out_free_ctx;
+
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
+ if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
+ !is_dev_replace)) {
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ ret = -ENODEV;
+ goto out;
+ }
+
+ if (!is_dev_replace && !readonly &&
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ btrfs_err_in_rcu(fs_info,
+ "scrub on devid %llu: filesystem on %s is not writable",
+ devid, rcu_str_deref(dev->name));
+ ret = -EROFS;
+ goto out;
+ }
+
+ mutex_lock(&fs_info->scrub_lock);
+ if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
+ test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
+ mutex_unlock(&fs_info->scrub_lock);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ ret = -EIO;
+ goto out;
+ }
+
+ down_read(&fs_info->dev_replace.rwsem);
+ if (dev->scrub_ctx ||
+ (!is_dev_replace &&
+ btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
+ up_read(&fs_info->dev_replace.rwsem);
+ mutex_unlock(&fs_info->scrub_lock);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ ret = -EINPROGRESS;
+ goto out;
+ }
+ up_read(&fs_info->dev_replace.rwsem);
+
+ sctx->readonly = readonly;
+ dev->scrub_ctx = sctx;
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+ /*
+ * checking @scrub_pause_req here, we can avoid
+ * race between committing transaction and scrubbing.
+ */
+ __scrub_blocked_if_needed(fs_info);
+ atomic_inc(&fs_info->scrubs_running);
+ mutex_unlock(&fs_info->scrub_lock);
+
+ /*
+ * In order to avoid deadlock with reclaim when there is a transaction
+ * trying to pause scrub, make sure we use GFP_NOFS for all the
+ * allocations done at btrfs_scrub_sectors() and scrub_sectors_for_parity()
+ * invoked by our callees. The pausing request is done when the
+ * transaction commit starts, and it blocks the transaction until scrub
+ * is paused (done at specific points at scrub_stripe() or right above
+ * before incrementing fs_info->scrubs_running).
+ */
+ nofs_flag = memalloc_nofs_save();
+ if (!is_dev_replace) {
+ u64 old_super_errors;
+
+ spin_lock(&sctx->stat_lock);
+ old_super_errors = sctx->stat.super_errors;
+ spin_unlock(&sctx->stat_lock);
+
+ btrfs_info(fs_info, "scrub: started on devid %llu", devid);
+ /*
+ * by holding device list mutex, we can
+ * kick off writing super in log tree sync.
+ */
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ ret = scrub_supers(sctx, dev);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+ spin_lock(&sctx->stat_lock);
+ /*
+ * Super block errors found, but we can not commit transaction
+ * at current context, since btrfs_commit_transaction() needs
+ * to pause the current running scrub (hold by ourselves).
+ */
+ if (sctx->stat.super_errors > old_super_errors && !sctx->readonly)
+ need_commit = true;
+ spin_unlock(&sctx->stat_lock);
+ }
+
+ if (!ret)
+ ret = scrub_enumerate_chunks(sctx, dev, start, end);
+ memalloc_nofs_restore(nofs_flag);
+
+ wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
+ atomic_dec(&fs_info->scrubs_running);
+ wake_up(&fs_info->scrub_pause_wait);
+
+ wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
+
+ if (progress)
+ memcpy(progress, &sctx->stat, sizeof(*progress));
+
+ if (!is_dev_replace)
+ btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
+ ret ? "not finished" : "finished", devid, ret);
+
+ mutex_lock(&fs_info->scrub_lock);
+ dev->scrub_ctx = NULL;
+ mutex_unlock(&fs_info->scrub_lock);
+
+ scrub_workers_put(fs_info);
+ scrub_put_ctx(sctx);
+
+ /*
+ * We found some super block errors before, now try to force a
+ * transaction commit, as scrub has finished.
+ */
+ if (need_commit) {
+ struct btrfs_trans_handle *trans;
+
+ trans = btrfs_start_transaction(fs_info->tree_root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ btrfs_err(fs_info,
+ "scrub: failed to start transaction to fix super block errors: %d", ret);
+ return ret;
+ }
+ ret = btrfs_commit_transaction(trans);
+ if (ret < 0)
+ btrfs_err(fs_info,
+ "scrub: failed to commit transaction to fix super block errors: %d", ret);
+ }
+ return ret;
+out:
+ scrub_workers_put(fs_info);
+out_free_ctx:
+ scrub_free_ctx(sctx);
+
+ return ret;
+}
+
+void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
+{
+ mutex_lock(&fs_info->scrub_lock);
+ atomic_inc(&fs_info->scrub_pause_req);
+ while (atomic_read(&fs_info->scrubs_paused) !=
+ atomic_read(&fs_info->scrubs_running)) {
+ mutex_unlock(&fs_info->scrub_lock);
+ wait_event(fs_info->scrub_pause_wait,
+ atomic_read(&fs_info->scrubs_paused) ==
+ atomic_read(&fs_info->scrubs_running));
+ mutex_lock(&fs_info->scrub_lock);
+ }
+ mutex_unlock(&fs_info->scrub_lock);
+}
+
+void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
+{
+ atomic_dec(&fs_info->scrub_pause_req);
+ wake_up(&fs_info->scrub_pause_wait);
+}
+
+int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
+{
+ mutex_lock(&fs_info->scrub_lock);
+ if (!atomic_read(&fs_info->scrubs_running)) {
+ mutex_unlock(&fs_info->scrub_lock);
+ return -ENOTCONN;
+ }
+
+ atomic_inc(&fs_info->scrub_cancel_req);
+ while (atomic_read(&fs_info->scrubs_running)) {
+ mutex_unlock(&fs_info->scrub_lock);
+ wait_event(fs_info->scrub_pause_wait,
+ atomic_read(&fs_info->scrubs_running) == 0);
+ mutex_lock(&fs_info->scrub_lock);
+ }
+ atomic_dec(&fs_info->scrub_cancel_req);
+ mutex_unlock(&fs_info->scrub_lock);
+
+ return 0;
+}
+
+int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
+{
+ struct btrfs_fs_info *fs_info = dev->fs_info;
+ struct scrub_ctx *sctx;
+
+ mutex_lock(&fs_info->scrub_lock);
+ sctx = dev->scrub_ctx;
+ if (!sctx) {
+ mutex_unlock(&fs_info->scrub_lock);
+ return -ENOTCONN;
+ }
+ atomic_inc(&sctx->cancel_req);
+ while (dev->scrub_ctx) {
+ mutex_unlock(&fs_info->scrub_lock);
+ wait_event(fs_info->scrub_pause_wait,
+ dev->scrub_ctx == NULL);
+ mutex_lock(&fs_info->scrub_lock);
+ }
+ mutex_unlock(&fs_info->scrub_lock);
+
+ return 0;
+}
+
+int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
+ struct btrfs_scrub_progress *progress)
+{
+ struct btrfs_dev_lookup_args args = { .devid = devid };
+ struct btrfs_device *dev;
+ struct scrub_ctx *sctx = NULL;
+
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
+ if (dev)
+ sctx = dev->scrub_ctx;
+ if (sctx)
+ memcpy(progress, &sctx->stat, sizeof(*progress));
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+ return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
+}
+
+static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
+ u64 extent_logical, u32 extent_len,
+ u64 *extent_physical,
+ struct btrfs_device **extent_dev,
+ int *extent_mirror_num)
+{
+ u64 mapped_length;
+ struct btrfs_io_context *bioc = NULL;
+ int ret;
+
+ mapped_length = extent_len;
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
+ &mapped_length, &bioc, 0);
+ if (ret || !bioc || mapped_length < extent_len ||
+ !bioc->stripes[0].dev->bdev) {
+ btrfs_put_bioc(bioc);
+ return;
+ }
+
+ *extent_physical = bioc->stripes[0].physical;
+ *extent_mirror_num = bioc->mirror_num;
+ *extent_dev = bioc->stripes[0].dev;
+ btrfs_put_bioc(bioc);
+}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
new file mode 100644
index 000000000..4a4d65b5e
--- /dev/null
+++ b/fs/btrfs/send.c
@@ -0,0 +1,8136 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2012 Alexander Block. All rights reserved.
+ */
+
+#include <linux/bsearch.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/sort.h>
+#include <linux/mount.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/radix-tree.h>
+#include <linux/vmalloc.h>
+#include <linux/string.h>
+#include <linux/compat.h>
+#include <linux/crc32c.h>
+#include <linux/fsverity.h>
+
+#include "send.h"
+#include "ctree.h"
+#include "backref.h"
+#include "locking.h"
+#include "disk-io.h"
+#include "btrfs_inode.h"
+#include "transaction.h"
+#include "compression.h"
+#include "xattr.h"
+#include "print-tree.h"
+
+/*
+ * Maximum number of references an extent can have in order for us to attempt to
+ * issue clone operations instead of write operations. This currently exists to
+ * avoid hitting limitations of the backreference walking code (taking a lot of
+ * time and using too much memory for extents with large number of references).
+ */
+#define SEND_MAX_EXTENT_REFS 64
+
+/*
+ * A fs_path is a helper to dynamically build path names with unknown size.
+ * It reallocates the internal buffer on demand.
+ * It allows fast adding of path elements on the right side (normal path) and
+ * fast adding to the left side (reversed path). A reversed path can also be
+ * unreversed if needed.
+ */
+struct fs_path {
+ union {
+ struct {
+ char *start;
+ char *end;
+
+ char *buf;
+ unsigned short buf_len:15;
+ unsigned short reversed:1;
+ char inline_buf[];
+ };
+ /*
+ * Average path length does not exceed 200 bytes, we'll have
+ * better packing in the slab and higher chance to satisfy
+ * a allocation later during send.
+ */
+ char pad[256];
+ };
+};
+#define FS_PATH_INLINE_SIZE \
+ (sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf))
+
+
+/* reused for each extent */
+struct clone_root {
+ struct btrfs_root *root;
+ u64 ino;
+ u64 offset;
+
+ u64 found_refs;
+};
+
+#define SEND_CTX_MAX_NAME_CACHE_SIZE 128
+#define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2)
+
+struct send_ctx {
+ struct file *send_filp;
+ loff_t send_off;
+ char *send_buf;
+ u32 send_size;
+ u32 send_max_size;
+ /*
+ * Whether BTRFS_SEND_A_DATA attribute was already added to current
+ * command (since protocol v2, data must be the last attribute).
+ */
+ bool put_data;
+ struct page **send_buf_pages;
+ u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */
+ /* Protocol version compatibility requested */
+ u32 proto;
+
+ struct btrfs_root *send_root;
+ struct btrfs_root *parent_root;
+ struct clone_root *clone_roots;
+ int clone_roots_cnt;
+
+ /* current state of the compare_tree call */
+ struct btrfs_path *left_path;
+ struct btrfs_path *right_path;
+ struct btrfs_key *cmp_key;
+
+ /*
+ * Keep track of the generation of the last transaction that was used
+ * for relocating a block group. This is periodically checked in order
+ * to detect if a relocation happened since the last check, so that we
+ * don't operate on stale extent buffers for nodes (level >= 1) or on
+ * stale disk_bytenr values of file extent items.
+ */
+ u64 last_reloc_trans;
+
+ /*
+ * infos of the currently processed inode. In case of deleted inodes,
+ * these are the values from the deleted inode.
+ */
+ u64 cur_ino;
+ u64 cur_inode_gen;
+ u64 cur_inode_size;
+ u64 cur_inode_mode;
+ u64 cur_inode_rdev;
+ u64 cur_inode_last_extent;
+ u64 cur_inode_next_write_offset;
+ bool cur_inode_new;
+ bool cur_inode_new_gen;
+ bool cur_inode_deleted;
+ bool ignore_cur_inode;
+ bool cur_inode_needs_verity;
+ void *verity_descriptor;
+
+ u64 send_progress;
+
+ struct list_head new_refs;
+ struct list_head deleted_refs;
+
+ struct radix_tree_root name_cache;
+ struct list_head name_cache_list;
+ int name_cache_size;
+
+ /*
+ * The inode we are currently processing. It's not NULL only when we
+ * need to issue write commands for data extents from this inode.
+ */
+ struct inode *cur_inode;
+ struct file_ra_state ra;
+ u64 page_cache_clear_start;
+ bool clean_page_cache;
+
+ /*
+ * We process inodes by their increasing order, so if before an
+ * incremental send we reverse the parent/child relationship of
+ * directories such that a directory with a lower inode number was
+ * the parent of a directory with a higher inode number, and the one
+ * becoming the new parent got renamed too, we can't rename/move the
+ * directory with lower inode number when we finish processing it - we
+ * must process the directory with higher inode number first, then
+ * rename/move it and then rename/move the directory with lower inode
+ * number. Example follows.
+ *
+ * Tree state when the first send was performed:
+ *
+ * .
+ * |-- a (ino 257)
+ * |-- b (ino 258)
+ * |
+ * |
+ * |-- c (ino 259)
+ * | |-- d (ino 260)
+ * |
+ * |-- c2 (ino 261)
+ *
+ * Tree state when the second (incremental) send is performed:
+ *
+ * .
+ * |-- a (ino 257)
+ * |-- b (ino 258)
+ * |-- c2 (ino 261)
+ * |-- d2 (ino 260)
+ * |-- cc (ino 259)
+ *
+ * The sequence of steps that lead to the second state was:
+ *
+ * mv /a/b/c/d /a/b/c2/d2
+ * mv /a/b/c /a/b/c2/d2/cc
+ *
+ * "c" has lower inode number, but we can't move it (2nd mv operation)
+ * before we move "d", which has higher inode number.
+ *
+ * So we just memorize which move/rename operations must be performed
+ * later when their respective parent is processed and moved/renamed.
+ */
+
+ /* Indexed by parent directory inode number. */
+ struct rb_root pending_dir_moves;
+
+ /*
+ * Reverse index, indexed by the inode number of a directory that
+ * is waiting for the move/rename of its immediate parent before its
+ * own move/rename can be performed.
+ */
+ struct rb_root waiting_dir_moves;
+
+ /*
+ * A directory that is going to be rm'ed might have a child directory
+ * which is in the pending directory moves index above. In this case,
+ * the directory can only be removed after the move/rename of its child
+ * is performed. Example:
+ *
+ * Parent snapshot:
+ *
+ * . (ino 256)
+ * |-- a/ (ino 257)
+ * |-- b/ (ino 258)
+ * |-- c/ (ino 259)
+ * | |-- x/ (ino 260)
+ * |
+ * |-- y/ (ino 261)
+ *
+ * Send snapshot:
+ *
+ * . (ino 256)
+ * |-- a/ (ino 257)
+ * |-- b/ (ino 258)
+ * |-- YY/ (ino 261)
+ * |-- x/ (ino 260)
+ *
+ * Sequence of steps that lead to the send snapshot:
+ * rm -f /a/b/c/foo.txt
+ * mv /a/b/y /a/b/YY
+ * mv /a/b/c/x /a/b/YY
+ * rmdir /a/b/c
+ *
+ * When the child is processed, its move/rename is delayed until its
+ * parent is processed (as explained above), but all other operations
+ * like update utimes, chown, chgrp, etc, are performed and the paths
+ * that it uses for those operations must use the orphanized name of
+ * its parent (the directory we're going to rm later), so we need to
+ * memorize that name.
+ *
+ * Indexed by the inode number of the directory to be deleted.
+ */
+ struct rb_root orphan_dirs;
+
+ struct rb_root rbtree_new_refs;
+ struct rb_root rbtree_deleted_refs;
+};
+
+struct pending_dir_move {
+ struct rb_node node;
+ struct list_head list;
+ u64 parent_ino;
+ u64 ino;
+ u64 gen;
+ struct list_head update_refs;
+};
+
+struct waiting_dir_move {
+ struct rb_node node;
+ u64 ino;
+ /*
+ * There might be some directory that could not be removed because it
+ * was waiting for this directory inode to be moved first. Therefore
+ * after this directory is moved, we can try to rmdir the ino rmdir_ino.
+ */
+ u64 rmdir_ino;
+ u64 rmdir_gen;
+ bool orphanized;
+};
+
+struct orphan_dir_info {
+ struct rb_node node;
+ u64 ino;
+ u64 gen;
+ u64 last_dir_index_offset;
+};
+
+struct name_cache_entry {
+ struct list_head list;
+ /*
+ * radix_tree has only 32bit entries but we need to handle 64bit inums.
+ * We use the lower 32bit of the 64bit inum to store it in the tree. If
+ * more then one inum would fall into the same entry, we use radix_list
+ * to store the additional entries. radix_list is also used to store
+ * entries where two entries have the same inum but different
+ * generations.
+ */
+ struct list_head radix_list;
+ u64 ino;
+ u64 gen;
+ u64 parent_ino;
+ u64 parent_gen;
+ int ret;
+ int need_later_update;
+ int name_len;
+ char name[];
+};
+
+#define ADVANCE 1
+#define ADVANCE_ONLY_NEXT -1
+
+enum btrfs_compare_tree_result {
+ BTRFS_COMPARE_TREE_NEW,
+ BTRFS_COMPARE_TREE_DELETED,
+ BTRFS_COMPARE_TREE_CHANGED,
+ BTRFS_COMPARE_TREE_SAME,
+};
+
+__cold
+static void inconsistent_snapshot_error(struct send_ctx *sctx,
+ enum btrfs_compare_tree_result result,
+ const char *what)
+{
+ const char *result_string;
+
+ switch (result) {
+ case BTRFS_COMPARE_TREE_NEW:
+ result_string = "new";
+ break;
+ case BTRFS_COMPARE_TREE_DELETED:
+ result_string = "deleted";
+ break;
+ case BTRFS_COMPARE_TREE_CHANGED:
+ result_string = "updated";
+ break;
+ case BTRFS_COMPARE_TREE_SAME:
+ ASSERT(0);
+ result_string = "unchanged";
+ break;
+ default:
+ ASSERT(0);
+ result_string = "unexpected";
+ }
+
+ btrfs_err(sctx->send_root->fs_info,
+ "Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu",
+ result_string, what, sctx->cmp_key->objectid,
+ sctx->send_root->root_key.objectid,
+ (sctx->parent_root ?
+ sctx->parent_root->root_key.objectid : 0));
+}
+
+__maybe_unused
+static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd)
+{
+ switch (sctx->proto) {
+ case 1: return cmd <= BTRFS_SEND_C_MAX_V1;
+ case 2: return cmd <= BTRFS_SEND_C_MAX_V2;
+ case 3: return cmd <= BTRFS_SEND_C_MAX_V3;
+ default: return false;
+ }
+}
+
+static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
+
+static struct waiting_dir_move *
+get_waiting_dir_move(struct send_ctx *sctx, u64 ino);
+
+static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen);
+
+static int need_send_hole(struct send_ctx *sctx)
+{
+ return (sctx->parent_root && !sctx->cur_inode_new &&
+ !sctx->cur_inode_new_gen && !sctx->cur_inode_deleted &&
+ S_ISREG(sctx->cur_inode_mode));
+}
+
+static void fs_path_reset(struct fs_path *p)
+{
+ if (p->reversed) {
+ p->start = p->buf + p->buf_len - 1;
+ p->end = p->start;
+ *p->start = 0;
+ } else {
+ p->start = p->buf;
+ p->end = p->start;
+ *p->start = 0;
+ }
+}
+
+static struct fs_path *fs_path_alloc(void)
+{
+ struct fs_path *p;
+
+ p = kmalloc(sizeof(*p), GFP_KERNEL);
+ if (!p)
+ return NULL;
+ p->reversed = 0;
+ p->buf = p->inline_buf;
+ p->buf_len = FS_PATH_INLINE_SIZE;
+ fs_path_reset(p);
+ return p;
+}
+
+static struct fs_path *fs_path_alloc_reversed(void)
+{
+ struct fs_path *p;
+
+ p = fs_path_alloc();
+ if (!p)
+ return NULL;
+ p->reversed = 1;
+ fs_path_reset(p);
+ return p;
+}
+
+static void fs_path_free(struct fs_path *p)
+{
+ if (!p)
+ return;
+ if (p->buf != p->inline_buf)
+ kfree(p->buf);
+ kfree(p);
+}
+
+static int fs_path_len(struct fs_path *p)
+{
+ return p->end - p->start;
+}
+
+static int fs_path_ensure_buf(struct fs_path *p, int len)
+{
+ char *tmp_buf;
+ int path_len;
+ int old_buf_len;
+
+ len++;
+
+ if (p->buf_len >= len)
+ return 0;
+
+ if (len > PATH_MAX) {
+ WARN_ON(1);
+ return -ENOMEM;
+ }
+
+ path_len = p->end - p->start;
+ old_buf_len = p->buf_len;
+
+ /*
+ * First time the inline_buf does not suffice
+ */
+ if (p->buf == p->inline_buf) {
+ tmp_buf = kmalloc(len, GFP_KERNEL);
+ if (tmp_buf)
+ memcpy(tmp_buf, p->buf, old_buf_len);
+ } else {
+ tmp_buf = krealloc(p->buf, len, GFP_KERNEL);
+ }
+ if (!tmp_buf)
+ return -ENOMEM;
+ p->buf = tmp_buf;
+ /*
+ * The real size of the buffer is bigger, this will let the fast path
+ * happen most of the time
+ */
+ p->buf_len = ksize(p->buf);
+
+ if (p->reversed) {
+ tmp_buf = p->buf + old_buf_len - path_len - 1;
+ p->end = p->buf + p->buf_len - 1;
+ p->start = p->end - path_len;
+ memmove(p->start, tmp_buf, path_len + 1);
+ } else {
+ p->start = p->buf;
+ p->end = p->start + path_len;
+ }
+ return 0;
+}
+
+static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
+ char **prepared)
+{
+ int ret;
+ int new_len;
+
+ new_len = p->end - p->start + name_len;
+ if (p->start != p->end)
+ new_len++;
+ ret = fs_path_ensure_buf(p, new_len);
+ if (ret < 0)
+ goto out;
+
+ if (p->reversed) {
+ if (p->start != p->end)
+ *--p->start = '/';
+ p->start -= name_len;
+ *prepared = p->start;
+ } else {
+ if (p->start != p->end)
+ *p->end++ = '/';
+ *prepared = p->end;
+ p->end += name_len;
+ *p->end = 0;
+ }
+
+out:
+ return ret;
+}
+
+static int fs_path_add(struct fs_path *p, const char *name, int name_len)
+{
+ int ret;
+ char *prepared;
+
+ ret = fs_path_prepare_for_add(p, name_len, &prepared);
+ if (ret < 0)
+ goto out;
+ memcpy(prepared, name, name_len);
+
+out:
+ return ret;
+}
+
+static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)
+{
+ int ret;
+ char *prepared;
+
+ ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared);
+ if (ret < 0)
+ goto out;
+ memcpy(prepared, p2->start, p2->end - p2->start);
+
+out:
+ return ret;
+}
+
+static int fs_path_add_from_extent_buffer(struct fs_path *p,
+ struct extent_buffer *eb,
+ unsigned long off, int len)
+{
+ int ret;
+ char *prepared;
+
+ ret = fs_path_prepare_for_add(p, len, &prepared);
+ if (ret < 0)
+ goto out;
+
+ read_extent_buffer(eb, prepared, off, len);
+
+out:
+ return ret;
+}
+
+static int fs_path_copy(struct fs_path *p, struct fs_path *from)
+{
+ p->reversed = from->reversed;
+ fs_path_reset(p);
+
+ return fs_path_add_path(p, from);
+}
+
+static void fs_path_unreverse(struct fs_path *p)
+{
+ char *tmp;
+ int len;
+
+ if (!p->reversed)
+ return;
+
+ tmp = p->start;
+ len = p->end - p->start;
+ p->start = p->buf;
+ p->end = p->start + len;
+ memmove(p->start, tmp, len + 1);
+ p->reversed = 0;
+}
+
+static struct btrfs_path *alloc_path_for_send(void)
+{
+ struct btrfs_path *path;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return NULL;
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+ path->need_commit_sem = 1;
+ return path;
+}
+
+static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
+{
+ int ret;
+ u32 pos = 0;
+
+ while (pos < len) {
+ ret = kernel_write(filp, buf + pos, len - pos, off);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ return -EIO;
+ pos += ret;
+ }
+
+ return 0;
+}
+
+static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
+{
+ struct btrfs_tlv_header *hdr;
+ int total_len = sizeof(*hdr) + len;
+ int left = sctx->send_max_size - sctx->send_size;
+
+ if (WARN_ON_ONCE(sctx->put_data))
+ return -EINVAL;
+
+ if (unlikely(left < total_len))
+ return -EOVERFLOW;
+
+ hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size);
+ put_unaligned_le16(attr, &hdr->tlv_type);
+ put_unaligned_le16(len, &hdr->tlv_len);
+ memcpy(hdr + 1, data, len);
+ sctx->send_size += total_len;
+
+ return 0;
+}
+
+#define TLV_PUT_DEFINE_INT(bits) \
+ static int tlv_put_u##bits(struct send_ctx *sctx, \
+ u##bits attr, u##bits value) \
+ { \
+ __le##bits __tmp = cpu_to_le##bits(value); \
+ return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \
+ }
+
+TLV_PUT_DEFINE_INT(8)
+TLV_PUT_DEFINE_INT(32)
+TLV_PUT_DEFINE_INT(64)
+
+static int tlv_put_string(struct send_ctx *sctx, u16 attr,
+ const char *str, int len)
+{
+ if (len == -1)
+ len = strlen(str);
+ return tlv_put(sctx, attr, str, len);
+}
+
+static int tlv_put_uuid(struct send_ctx *sctx, u16 attr,
+ const u8 *uuid)
+{
+ return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE);
+}
+
+static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
+ struct extent_buffer *eb,
+ struct btrfs_timespec *ts)
+{
+ struct btrfs_timespec bts;
+ read_extent_buffer(eb, &bts, (unsigned long)ts, sizeof(bts));
+ return tlv_put(sctx, attr, &bts, sizeof(bts));
+}
+
+
+#define TLV_PUT(sctx, attrtype, data, attrlen) \
+ do { \
+ ret = tlv_put(sctx, attrtype, data, attrlen); \
+ if (ret < 0) \
+ goto tlv_put_failure; \
+ } while (0)
+
+#define TLV_PUT_INT(sctx, attrtype, bits, value) \
+ do { \
+ ret = tlv_put_u##bits(sctx, attrtype, value); \
+ if (ret < 0) \
+ goto tlv_put_failure; \
+ } while (0)
+
+#define TLV_PUT_U8(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 8, data)
+#define TLV_PUT_U16(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 16, data)
+#define TLV_PUT_U32(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 32, data)
+#define TLV_PUT_U64(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 64, data)
+#define TLV_PUT_STRING(sctx, attrtype, str, len) \
+ do { \
+ ret = tlv_put_string(sctx, attrtype, str, len); \
+ if (ret < 0) \
+ goto tlv_put_failure; \
+ } while (0)
+#define TLV_PUT_PATH(sctx, attrtype, p) \
+ do { \
+ ret = tlv_put_string(sctx, attrtype, p->start, \
+ p->end - p->start); \
+ if (ret < 0) \
+ goto tlv_put_failure; \
+ } while(0)
+#define TLV_PUT_UUID(sctx, attrtype, uuid) \
+ do { \
+ ret = tlv_put_uuid(sctx, attrtype, uuid); \
+ if (ret < 0) \
+ goto tlv_put_failure; \
+ } while (0)
+#define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \
+ do { \
+ ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \
+ if (ret < 0) \
+ goto tlv_put_failure; \
+ } while (0)
+
+static int send_header(struct send_ctx *sctx)
+{
+ struct btrfs_stream_header hdr;
+
+ strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
+ hdr.version = cpu_to_le32(sctx->proto);
+ return write_buf(sctx->send_filp, &hdr, sizeof(hdr),
+ &sctx->send_off);
+}
+
+/*
+ * For each command/item we want to send to userspace, we call this function.
+ */
+static int begin_cmd(struct send_ctx *sctx, int cmd)
+{
+ struct btrfs_cmd_header *hdr;
+
+ if (WARN_ON(!sctx->send_buf))
+ return -EINVAL;
+
+ BUG_ON(sctx->send_size);
+
+ sctx->send_size += sizeof(*hdr);
+ hdr = (struct btrfs_cmd_header *)sctx->send_buf;
+ put_unaligned_le16(cmd, &hdr->cmd);
+
+ return 0;
+}
+
+static int send_cmd(struct send_ctx *sctx)
+{
+ int ret;
+ struct btrfs_cmd_header *hdr;
+ u32 crc;
+
+ hdr = (struct btrfs_cmd_header *)sctx->send_buf;
+ put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len);
+ put_unaligned_le32(0, &hdr->crc);
+
+ crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
+ put_unaligned_le32(crc, &hdr->crc);
+
+ ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
+ &sctx->send_off);
+
+ sctx->send_size = 0;
+ sctx->put_data = false;
+
+ return ret;
+}
+
+/*
+ * Sends a move instruction to user space
+ */
+static int send_rename(struct send_ctx *sctx,
+ struct fs_path *from, struct fs_path *to)
+{
+ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
+ int ret;
+
+ btrfs_debug(fs_info, "send_rename %s -> %s", from->start, to->start);
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from);
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ return ret;
+}
+
+/*
+ * Sends a link instruction to user space
+ */
+static int send_link(struct send_ctx *sctx,
+ struct fs_path *path, struct fs_path *lnk)
+{
+ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
+ int ret;
+
+ btrfs_debug(fs_info, "send_link %s -> %s", path->start, lnk->start);
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_LINK);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ return ret;
+}
+
+/*
+ * Sends an unlink instruction to user space
+ */
+static int send_unlink(struct send_ctx *sctx, struct fs_path *path)
+{
+ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
+ int ret;
+
+ btrfs_debug(fs_info, "send_unlink %s", path->start);
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ return ret;
+}
+
+/*
+ * Sends a rmdir instruction to user space
+ */
+static int send_rmdir(struct send_ctx *sctx, struct fs_path *path)
+{
+ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
+ int ret;
+
+ btrfs_debug(fs_info, "send_rmdir %s", path->start);
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ return ret;
+}
+
+struct btrfs_inode_info {
+ u64 size;
+ u64 gen;
+ u64 mode;
+ u64 uid;
+ u64 gid;
+ u64 rdev;
+ u64 fileattr;
+ u64 nlink;
+};
+
+/*
+ * Helper function to retrieve some fields from an inode item.
+ */
+static int get_inode_info(struct btrfs_root *root, u64 ino,
+ struct btrfs_inode_info *info)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_inode_item *ii;
+ struct btrfs_key key;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+ goto out;
+ }
+
+ if (!info)
+ goto out;
+
+ ii = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_item);
+ info->size = btrfs_inode_size(path->nodes[0], ii);
+ info->gen = btrfs_inode_generation(path->nodes[0], ii);
+ info->mode = btrfs_inode_mode(path->nodes[0], ii);
+ info->uid = btrfs_inode_uid(path->nodes[0], ii);
+ info->gid = btrfs_inode_gid(path->nodes[0], ii);
+ info->rdev = btrfs_inode_rdev(path->nodes[0], ii);
+ info->nlink = btrfs_inode_nlink(path->nodes[0], ii);
+ /*
+ * Transfer the unchanged u64 value of btrfs_inode_item::flags, that's
+ * otherwise logically split to 32/32 parts.
+ */
+ info->fileattr = btrfs_inode_flags(path->nodes[0], ii);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen)
+{
+ int ret;
+ struct btrfs_inode_info info;
+
+ if (!gen)
+ return -EPERM;
+
+ ret = get_inode_info(root, ino, &info);
+ if (!ret)
+ *gen = info.gen;
+ return ret;
+}
+
+typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index,
+ struct fs_path *p,
+ void *ctx);
+
+/*
+ * Helper function to iterate the entries in ONE btrfs_inode_ref or
+ * btrfs_inode_extref.
+ * The iterate callback may return a non zero value to stop iteration. This can
+ * be a negative value for error codes or 1 to simply stop it.
+ *
+ * path must point to the INODE_REF or INODE_EXTREF when called.
+ */
+static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *found_key, int resolve,
+ iterate_inode_ref_t iterate, void *ctx)
+{
+ struct extent_buffer *eb = path->nodes[0];
+ struct btrfs_inode_ref *iref;
+ struct btrfs_inode_extref *extref;
+ struct btrfs_path *tmp_path;
+ struct fs_path *p;
+ u32 cur = 0;
+ u32 total;
+ int slot = path->slots[0];
+ u32 name_len;
+ char *start;
+ int ret = 0;
+ int num = 0;
+ int index;
+ u64 dir;
+ unsigned long name_off;
+ unsigned long elem_size;
+ unsigned long ptr;
+
+ p = fs_path_alloc_reversed();
+ if (!p)
+ return -ENOMEM;
+
+ tmp_path = alloc_path_for_send();
+ if (!tmp_path) {
+ fs_path_free(p);
+ return -ENOMEM;
+ }
+
+
+ if (found_key->type == BTRFS_INODE_REF_KEY) {
+ ptr = (unsigned long)btrfs_item_ptr(eb, slot,
+ struct btrfs_inode_ref);
+ total = btrfs_item_size(eb, slot);
+ elem_size = sizeof(*iref);
+ } else {
+ ptr = btrfs_item_ptr_offset(eb, slot);
+ total = btrfs_item_size(eb, slot);
+ elem_size = sizeof(*extref);
+ }
+
+ while (cur < total) {
+ fs_path_reset(p);
+
+ if (found_key->type == BTRFS_INODE_REF_KEY) {
+ iref = (struct btrfs_inode_ref *)(ptr + cur);
+ name_len = btrfs_inode_ref_name_len(eb, iref);
+ name_off = (unsigned long)(iref + 1);
+ index = btrfs_inode_ref_index(eb, iref);
+ dir = found_key->offset;
+ } else {
+ extref = (struct btrfs_inode_extref *)(ptr + cur);
+ name_len = btrfs_inode_extref_name_len(eb, extref);
+ name_off = (unsigned long)&extref->name;
+ index = btrfs_inode_extref_index(eb, extref);
+ dir = btrfs_inode_extref_parent(eb, extref);
+ }
+
+ if (resolve) {
+ start = btrfs_ref_to_path(root, tmp_path, name_len,
+ name_off, eb, dir,
+ p->buf, p->buf_len);
+ if (IS_ERR(start)) {
+ ret = PTR_ERR(start);
+ goto out;
+ }
+ if (start < p->buf) {
+ /* overflow , try again with larger buffer */
+ ret = fs_path_ensure_buf(p,
+ p->buf_len + p->buf - start);
+ if (ret < 0)
+ goto out;
+ start = btrfs_ref_to_path(root, tmp_path,
+ name_len, name_off,
+ eb, dir,
+ p->buf, p->buf_len);
+ if (IS_ERR(start)) {
+ ret = PTR_ERR(start);
+ goto out;
+ }
+ BUG_ON(start < p->buf);
+ }
+ p->start = start;
+ } else {
+ ret = fs_path_add_from_extent_buffer(p, eb, name_off,
+ name_len);
+ if (ret < 0)
+ goto out;
+ }
+
+ cur += elem_size + name_len;
+ ret = iterate(num, dir, index, p, ctx);
+ if (ret)
+ goto out;
+ num++;
+ }
+
+out:
+ btrfs_free_path(tmp_path);
+ fs_path_free(p);
+ return ret;
+}
+
+typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
+ const char *name, int name_len,
+ const char *data, int data_len,
+ void *ctx);
+
+/*
+ * Helper function to iterate the entries in ONE btrfs_dir_item.
+ * The iterate callback may return a non zero value to stop iteration. This can
+ * be a negative value for error codes or 1 to simply stop it.
+ *
+ * path must point to the dir item when called.
+ */
+static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
+ iterate_dir_item_t iterate, void *ctx)
+{
+ int ret = 0;
+ struct extent_buffer *eb;
+ struct btrfs_dir_item *di;
+ struct btrfs_key di_key;
+ char *buf = NULL;
+ int buf_len;
+ u32 name_len;
+ u32 data_len;
+ u32 cur;
+ u32 len;
+ u32 total;
+ int slot;
+ int num;
+
+ /*
+ * Start with a small buffer (1 page). If later we end up needing more
+ * space, which can happen for xattrs on a fs with a leaf size greater
+ * then the page size, attempt to increase the buffer. Typically xattr
+ * values are small.
+ */
+ buf_len = PATH_MAX;
+ buf = kmalloc(buf_len, GFP_KERNEL);
+ if (!buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
+ cur = 0;
+ len = 0;
+ total = btrfs_item_size(eb, slot);
+
+ num = 0;
+ while (cur < total) {
+ name_len = btrfs_dir_name_len(eb, di);
+ data_len = btrfs_dir_data_len(eb, di);
+ btrfs_dir_item_key_to_cpu(eb, di, &di_key);
+
+ if (btrfs_dir_type(eb, di) == BTRFS_FT_XATTR) {
+ if (name_len > XATTR_NAME_MAX) {
+ ret = -ENAMETOOLONG;
+ goto out;
+ }
+ if (name_len + data_len >
+ BTRFS_MAX_XATTR_SIZE(root->fs_info)) {
+ ret = -E2BIG;
+ goto out;
+ }
+ } else {
+ /*
+ * Path too long
+ */
+ if (name_len + data_len > PATH_MAX) {
+ ret = -ENAMETOOLONG;
+ goto out;
+ }
+ }
+
+ if (name_len + data_len > buf_len) {
+ buf_len = name_len + data_len;
+ if (is_vmalloc_addr(buf)) {
+ vfree(buf);
+ buf = NULL;
+ } else {
+ char *tmp = krealloc(buf, buf_len,
+ GFP_KERNEL | __GFP_NOWARN);
+
+ if (!tmp)
+ kfree(buf);
+ buf = tmp;
+ }
+ if (!buf) {
+ buf = kvmalloc(buf_len, GFP_KERNEL);
+ if (!buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ }
+
+ read_extent_buffer(eb, buf, (unsigned long)(di + 1),
+ name_len + data_len);
+
+ len = sizeof(*di) + name_len + data_len;
+ di = (struct btrfs_dir_item *)((char *)di + len);
+ cur += len;
+
+ ret = iterate(num, &di_key, buf, name_len, buf + name_len,
+ data_len, ctx);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+
+ num++;
+ }
+
+out:
+ kvfree(buf);
+ return ret;
+}
+
+static int __copy_first_ref(int num, u64 dir, int index,
+ struct fs_path *p, void *ctx)
+{
+ int ret;
+ struct fs_path *pt = ctx;
+
+ ret = fs_path_copy(pt, p);
+ if (ret < 0)
+ return ret;
+
+ /* we want the first only */
+ return 1;
+}
+
+/*
+ * Retrieve the first path of an inode. If an inode has more then one
+ * ref/hardlink, this is ignored.
+ */
+static int get_inode_path(struct btrfs_root *root,
+ u64 ino, struct fs_path *path)
+{
+ int ret;
+ struct btrfs_key key, found_key;
+ struct btrfs_path *p;
+
+ p = alloc_path_for_send();
+ if (!p)
+ return -ENOMEM;
+
+ fs_path_reset(path);
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot_for_read(root, &key, p, 1, 0);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ ret = 1;
+ goto out;
+ }
+ btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]);
+ if (found_key.objectid != ino ||
+ (found_key.type != BTRFS_INODE_REF_KEY &&
+ found_key.type != BTRFS_INODE_EXTREF_KEY)) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = iterate_inode_ref(root, p, &found_key, 1,
+ __copy_first_ref, path);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+
+out:
+ btrfs_free_path(p);
+ return ret;
+}
+
+struct backref_ctx {
+ struct send_ctx *sctx;
+
+ /* number of total found references */
+ u64 found;
+
+ /*
+ * used for clones found in send_root. clones found behind cur_objectid
+ * and cur_offset are not considered as allowed clones.
+ */
+ u64 cur_objectid;
+ u64 cur_offset;
+
+ /* may be truncated in case it's the last extent in a file */
+ u64 extent_len;
+
+ /* Just to check for bugs in backref resolving */
+ int found_itself;
+};
+
+static int __clone_root_cmp_bsearch(const void *key, const void *elt)
+{
+ u64 root = (u64)(uintptr_t)key;
+ const struct clone_root *cr = elt;
+
+ if (root < cr->root->root_key.objectid)
+ return -1;
+ if (root > cr->root->root_key.objectid)
+ return 1;
+ return 0;
+}
+
+static int __clone_root_cmp_sort(const void *e1, const void *e2)
+{
+ const struct clone_root *cr1 = e1;
+ const struct clone_root *cr2 = e2;
+
+ if (cr1->root->root_key.objectid < cr2->root->root_key.objectid)
+ return -1;
+ if (cr1->root->root_key.objectid > cr2->root->root_key.objectid)
+ return 1;
+ return 0;
+}
+
+/*
+ * Called for every backref that is found for the current extent.
+ * Results are collected in sctx->clone_roots->ino/offset/found_refs
+ */
+static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
+{
+ struct backref_ctx *bctx = ctx_;
+ struct clone_root *found;
+
+ /* First check if the root is in the list of accepted clone sources */
+ found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots,
+ bctx->sctx->clone_roots_cnt,
+ sizeof(struct clone_root),
+ __clone_root_cmp_bsearch);
+ if (!found)
+ return 0;
+
+ if (found->root == bctx->sctx->send_root &&
+ ino == bctx->cur_objectid &&
+ offset == bctx->cur_offset) {
+ bctx->found_itself = 1;
+ }
+
+ /*
+ * Make sure we don't consider clones from send_root that are
+ * behind the current inode/offset.
+ */
+ if (found->root == bctx->sctx->send_root) {
+ /*
+ * If the source inode was not yet processed we can't issue a
+ * clone operation, as the source extent does not exist yet at
+ * the destination of the stream.
+ */
+ if (ino > bctx->cur_objectid)
+ return 0;
+ /*
+ * We clone from the inode currently being sent as long as the
+ * source extent is already processed, otherwise we could try
+ * to clone from an extent that does not exist yet at the
+ * destination of the stream.
+ */
+ if (ino == bctx->cur_objectid &&
+ offset + bctx->extent_len >
+ bctx->sctx->cur_inode_next_write_offset)
+ return 0;
+ }
+
+ bctx->found++;
+ found->found_refs++;
+ if (ino < found->ino) {
+ found->ino = ino;
+ found->offset = offset;
+ } else if (found->ino == ino) {
+ /*
+ * same extent found more then once in the same file.
+ */
+ if (found->offset > offset + bctx->extent_len)
+ found->offset = offset;
+ }
+
+ return 0;
+}
+
+/*
+ * Given an inode, offset and extent item, it finds a good clone for a clone
+ * instruction. Returns -ENOENT when none could be found. The function makes
+ * sure that the returned clone is usable at the point where sending is at the
+ * moment. This means, that no clones are accepted which lie behind the current
+ * inode+offset.
+ *
+ * path must point to the extent item when called.
+ */
+static int find_extent_clone(struct send_ctx *sctx,
+ struct btrfs_path *path,
+ u64 ino, u64 data_offset,
+ u64 ino_size,
+ struct clone_root **found)
+{
+ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
+ int ret;
+ int extent_type;
+ u64 logical;
+ u64 disk_byte;
+ u64 num_bytes;
+ u64 extent_item_pos;
+ u64 flags = 0;
+ struct btrfs_file_extent_item *fi;
+ struct extent_buffer *eb = path->nodes[0];
+ struct backref_ctx backref_ctx = {0};
+ struct clone_root *cur_clone_root;
+ struct btrfs_key found_key;
+ struct btrfs_path *tmp_path;
+ struct btrfs_extent_item *ei;
+ int compressed;
+ u32 i;
+
+ tmp_path = alloc_path_for_send();
+ if (!tmp_path)
+ return -ENOMEM;
+
+ /* We only use this path under the commit sem */
+ tmp_path->need_commit_sem = 0;
+
+ if (data_offset >= ino_size) {
+ /*
+ * There may be extents that lie behind the file's size.
+ * I at least had this in combination with snapshotting while
+ * writing large files.
+ */
+ ret = 0;
+ goto out;
+ }
+
+ fi = btrfs_item_ptr(eb, path->slots[0],
+ struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(eb, fi);
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ ret = -ENOENT;
+ goto out;
+ }
+ compressed = btrfs_file_extent_compression(eb, fi);
+
+ num_bytes = btrfs_file_extent_num_bytes(eb, fi);
+ disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+ if (disk_byte == 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+ logical = disk_byte + btrfs_file_extent_offset(eb, fi);
+
+ down_read(&fs_info->commit_root_sem);
+ ret = extent_from_logical(fs_info, disk_byte, tmp_path,
+ &found_key, &flags);
+ up_read(&fs_info->commit_root_sem);
+
+ if (ret < 0)
+ goto out;
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ ret = -EIO;
+ goto out;
+ }
+
+ ei = btrfs_item_ptr(tmp_path->nodes[0], tmp_path->slots[0],
+ struct btrfs_extent_item);
+ /*
+ * Backreference walking (iterate_extent_inodes() below) is currently
+ * too expensive when an extent has a large number of references, both
+ * in time spent and used memory. So for now just fallback to write
+ * operations instead of clone operations when an extent has more than
+ * a certain amount of references.
+ */
+ if (btrfs_extent_refs(tmp_path->nodes[0], ei) > SEND_MAX_EXTENT_REFS) {
+ ret = -ENOENT;
+ goto out;
+ }
+ btrfs_release_path(tmp_path);
+
+ /*
+ * Setup the clone roots.
+ */
+ for (i = 0; i < sctx->clone_roots_cnt; i++) {
+ cur_clone_root = sctx->clone_roots + i;
+ cur_clone_root->ino = (u64)-1;
+ cur_clone_root->offset = 0;
+ cur_clone_root->found_refs = 0;
+ }
+
+ backref_ctx.sctx = sctx;
+ backref_ctx.found = 0;
+ backref_ctx.cur_objectid = ino;
+ backref_ctx.cur_offset = data_offset;
+ backref_ctx.found_itself = 0;
+ backref_ctx.extent_len = num_bytes;
+
+ /*
+ * The last extent of a file may be too large due to page alignment.
+ * We need to adjust extent_len in this case so that the checks in
+ * __iterate_backrefs work.
+ */
+ if (data_offset + num_bytes >= ino_size)
+ backref_ctx.extent_len = ino_size - data_offset;
+
+ /*
+ * Now collect all backrefs.
+ */
+ if (compressed == BTRFS_COMPRESS_NONE)
+ extent_item_pos = logical - found_key.objectid;
+ else
+ extent_item_pos = 0;
+ ret = iterate_extent_inodes(fs_info, found_key.objectid,
+ extent_item_pos, 1, __iterate_backrefs,
+ &backref_ctx, false);
+
+ if (ret < 0)
+ goto out;
+
+ down_read(&fs_info->commit_root_sem);
+ if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
+ /*
+ * A transaction commit for a transaction in which block group
+ * relocation was done just happened.
+ * The disk_bytenr of the file extent item we processed is
+ * possibly stale, referring to the extent's location before
+ * relocation. So act as if we haven't found any clone sources
+ * and fallback to write commands, which will read the correct
+ * data from the new extent location. Otherwise we will fail
+ * below because we haven't found our own back reference or we
+ * could be getting incorrect sources in case the old extent
+ * was already reallocated after the relocation.
+ */
+ up_read(&fs_info->commit_root_sem);
+ ret = -ENOENT;
+ goto out;
+ }
+ up_read(&fs_info->commit_root_sem);
+
+ if (!backref_ctx.found_itself) {
+ /* found a bug in backref code? */
+ ret = -EIO;
+ btrfs_err(fs_info,
+ "did not find backref in send_root. inode=%llu, offset=%llu, disk_byte=%llu found extent=%llu",
+ ino, data_offset, disk_byte, found_key.objectid);
+ goto out;
+ }
+
+ btrfs_debug(fs_info,
+ "find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu",
+ data_offset, ino, num_bytes, logical);
+
+ if (!backref_ctx.found)
+ btrfs_debug(fs_info, "no clones found");
+
+ cur_clone_root = NULL;
+ for (i = 0; i < sctx->clone_roots_cnt; i++) {
+ if (sctx->clone_roots[i].found_refs) {
+ if (!cur_clone_root)
+ cur_clone_root = sctx->clone_roots + i;
+ else if (sctx->clone_roots[i].root == sctx->send_root)
+ /* prefer clones from send_root over others */
+ cur_clone_root = sctx->clone_roots + i;
+ }
+
+ }
+
+ if (cur_clone_root) {
+ *found = cur_clone_root;
+ ret = 0;
+ } else {
+ ret = -ENOENT;
+ }
+
+out:
+ btrfs_free_path(tmp_path);
+ return ret;
+}
+
+static int read_symlink(struct btrfs_root *root,
+ u64 ino,
+ struct fs_path *dest)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *ei;
+ u8 type;
+ u8 compression;
+ unsigned long off;
+ int len;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ /*
+ * An empty symlink inode. Can happen in rare error paths when
+ * creating a symlink (transaction committed before the inode
+ * eviction handler removed the symlink inode items and a crash
+ * happened in between or the subvol was snapshoted in between).
+ * Print an informative message to dmesg/syslog so that the user
+ * can delete the symlink.
+ */
+ btrfs_err(root->fs_info,
+ "Found empty symlink inode %llu at root %llu",
+ ino, root->root_key.objectid);
+ ret = -EIO;
+ goto out;
+ }
+
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ type = btrfs_file_extent_type(path->nodes[0], ei);
+ compression = btrfs_file_extent_compression(path->nodes[0], ei);
+ BUG_ON(type != BTRFS_FILE_EXTENT_INLINE);
+ BUG_ON(compression);
+
+ off = btrfs_file_extent_inline_start(ei);
+ len = btrfs_file_extent_ram_bytes(path->nodes[0], ei);
+
+ ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Helper function to generate a file name that is unique in the root of
+ * send_root and parent_root. This is used to generate names for orphan inodes.
+ */
+static int gen_unique_name(struct send_ctx *sctx,
+ u64 ino, u64 gen,
+ struct fs_path *dest)
+{
+ int ret = 0;
+ struct btrfs_path *path;
+ struct btrfs_dir_item *di;
+ char tmp[64];
+ int len;
+ u64 idx = 0;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ while (1) {
+ struct fscrypt_str tmp_name;
+
+ len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",
+ ino, gen, idx);
+ ASSERT(len < sizeof(tmp));
+ tmp_name.name = tmp;
+ tmp_name.len = strlen(tmp);
+
+ di = btrfs_lookup_dir_item(NULL, sctx->send_root,
+ path, BTRFS_FIRST_FREE_OBJECTID,
+ &tmp_name, 0);
+ btrfs_release_path(path);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ }
+ if (di) {
+ /* not unique, try again */
+ idx++;
+ continue;
+ }
+
+ if (!sctx->parent_root) {
+ /* unique */
+ ret = 0;
+ break;
+ }
+
+ di = btrfs_lookup_dir_item(NULL, sctx->parent_root,
+ path, BTRFS_FIRST_FREE_OBJECTID,
+ &tmp_name, 0);
+ btrfs_release_path(path);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ }
+ if (di) {
+ /* not unique, try again */
+ idx++;
+ continue;
+ }
+ /* unique */
+ break;
+ }
+
+ ret = fs_path_add(dest, tmp, strlen(tmp));
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+enum inode_state {
+ inode_state_no_change,
+ inode_state_will_create,
+ inode_state_did_create,
+ inode_state_will_delete,
+ inode_state_did_delete,
+};
+
+static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
+{
+ int ret;
+ int left_ret;
+ int right_ret;
+ u64 left_gen;
+ u64 right_gen = 0;
+ struct btrfs_inode_info info;
+
+ ret = get_inode_info(sctx->send_root, ino, &info);
+ if (ret < 0 && ret != -ENOENT)
+ goto out;
+ left_ret = (info.nlink == 0) ? -ENOENT : ret;
+ left_gen = info.gen;
+
+ if (!sctx->parent_root) {
+ right_ret = -ENOENT;
+ } else {
+ ret = get_inode_info(sctx->parent_root, ino, &info);
+ if (ret < 0 && ret != -ENOENT)
+ goto out;
+ right_ret = (info.nlink == 0) ? -ENOENT : ret;
+ right_gen = info.gen;
+ }
+
+ if (!left_ret && !right_ret) {
+ if (left_gen == gen && right_gen == gen) {
+ ret = inode_state_no_change;
+ } else if (left_gen == gen) {
+ if (ino < sctx->send_progress)
+ ret = inode_state_did_create;
+ else
+ ret = inode_state_will_create;
+ } else if (right_gen == gen) {
+ if (ino < sctx->send_progress)
+ ret = inode_state_did_delete;
+ else
+ ret = inode_state_will_delete;
+ } else {
+ ret = -ENOENT;
+ }
+ } else if (!left_ret) {
+ if (left_gen == gen) {
+ if (ino < sctx->send_progress)
+ ret = inode_state_did_create;
+ else
+ ret = inode_state_will_create;
+ } else {
+ ret = -ENOENT;
+ }
+ } else if (!right_ret) {
+ if (right_gen == gen) {
+ if (ino < sctx->send_progress)
+ ret = inode_state_did_delete;
+ else
+ ret = inode_state_will_delete;
+ } else {
+ ret = -ENOENT;
+ }
+ } else {
+ ret = -ENOENT;
+ }
+
+out:
+ return ret;
+}
+
+static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen)
+{
+ int ret;
+
+ if (ino == BTRFS_FIRST_FREE_OBJECTID)
+ return 1;
+
+ ret = get_cur_inode_state(sctx, ino, gen);
+ if (ret < 0)
+ goto out;
+
+ if (ret == inode_state_no_change ||
+ ret == inode_state_did_create ||
+ ret == inode_state_will_delete)
+ ret = 1;
+ else
+ ret = 0;
+
+out:
+ return ret;
+}
+
+/*
+ * Helper function to lookup a dir item in a dir.
+ */
+static int lookup_dir_item_inode(struct btrfs_root *root,
+ u64 dir, const char *name, int name_len,
+ u64 *found_inode)
+{
+ int ret = 0;
+ struct btrfs_dir_item *di;
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ struct fscrypt_str name_str = FSTR_INIT((char *)name, name_len);
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ di = btrfs_lookup_dir_item(NULL, root, path, dir, &name_str, 0);
+ if (IS_ERR_OR_NULL(di)) {
+ ret = di ? PTR_ERR(di) : -ENOENT;
+ goto out;
+ }
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
+ if (key.type == BTRFS_ROOT_ITEM_KEY) {
+ ret = -ENOENT;
+ goto out;
+ }
+ *found_inode = key.objectid;
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
+ * generation of the parent dir and the name of the dir entry.
+ */
+static int get_first_ref(struct btrfs_root *root, u64 ino,
+ u64 *dir, u64 *dir_gen, struct fs_path *name)
+{
+ int ret;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_path *path;
+ int len;
+ u64 parent_dir;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
+ if (ret < 0)
+ goto out;
+ if (!ret)
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ path->slots[0]);
+ if (ret || found_key.objectid != ino ||
+ (found_key.type != BTRFS_INODE_REF_KEY &&
+ found_key.type != BTRFS_INODE_EXTREF_KEY)) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ if (found_key.type == BTRFS_INODE_REF_KEY) {
+ struct btrfs_inode_ref *iref;
+ iref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_ref);
+ len = btrfs_inode_ref_name_len(path->nodes[0], iref);
+ ret = fs_path_add_from_extent_buffer(name, path->nodes[0],
+ (unsigned long)(iref + 1),
+ len);
+ parent_dir = found_key.offset;
+ } else {
+ struct btrfs_inode_extref *extref;
+ extref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_extref);
+ len = btrfs_inode_extref_name_len(path->nodes[0], extref);
+ ret = fs_path_add_from_extent_buffer(name, path->nodes[0],
+ (unsigned long)&extref->name, len);
+ parent_dir = btrfs_inode_extref_parent(path->nodes[0], extref);
+ }
+ if (ret < 0)
+ goto out;
+ btrfs_release_path(path);
+
+ if (dir_gen) {
+ ret = get_inode_gen(root, parent_dir, dir_gen);
+ if (ret < 0)
+ goto out;
+ }
+
+ *dir = parent_dir;
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int is_first_ref(struct btrfs_root *root,
+ u64 ino, u64 dir,
+ const char *name, int name_len)
+{
+ int ret;
+ struct fs_path *tmp_name;
+ u64 tmp_dir;
+
+ tmp_name = fs_path_alloc();
+ if (!tmp_name)
+ return -ENOMEM;
+
+ ret = get_first_ref(root, ino, &tmp_dir, NULL, tmp_name);
+ if (ret < 0)
+ goto out;
+
+ if (dir != tmp_dir || name_len != fs_path_len(tmp_name)) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = !memcmp(tmp_name->start, name, name_len);
+
+out:
+ fs_path_free(tmp_name);
+ return ret;
+}
+
+/*
+ * Used by process_recorded_refs to determine if a new ref would overwrite an
+ * already existing ref. In case it detects an overwrite, it returns the
+ * inode/gen in who_ino/who_gen.
+ * When an overwrite is detected, process_recorded_refs does proper orphanizing
+ * to make sure later references to the overwritten inode are possible.
+ * Orphanizing is however only required for the first ref of an inode.
+ * process_recorded_refs does an additional is_first_ref check to see if
+ * orphanizing is really required.
+ */
+static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
+ const char *name, int name_len,
+ u64 *who_ino, u64 *who_gen, u64 *who_mode)
+{
+ int ret = 0;
+ u64 gen;
+ u64 other_inode = 0;
+ struct btrfs_inode_info info;
+
+ if (!sctx->parent_root)
+ goto out;
+
+ ret = is_inode_existent(sctx, dir, dir_gen);
+ if (ret <= 0)
+ goto out;
+
+ /*
+ * If we have a parent root we need to verify that the parent dir was
+ * not deleted and then re-created, if it was then we have no overwrite
+ * and we can just unlink this entry.
+ */
+ if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) {
+ ret = get_inode_gen(sctx->parent_root, dir, &gen);
+ if (ret < 0 && ret != -ENOENT)
+ goto out;
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+ if (gen != dir_gen)
+ goto out;
+ }
+
+ ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len,
+ &other_inode);
+ if (ret < 0 && ret != -ENOENT)
+ goto out;
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+
+ /*
+ * Check if the overwritten ref was already processed. If yes, the ref
+ * was already unlinked/moved, so we can safely assume that we will not
+ * overwrite anything at this point in time.
+ */
+ if (other_inode > sctx->send_progress ||
+ is_waiting_for_move(sctx, other_inode)) {
+ ret = get_inode_info(sctx->parent_root, other_inode, &info);
+ if (ret < 0)
+ goto out;
+
+ ret = 1;
+ *who_ino = other_inode;
+ *who_gen = info.gen;
+ *who_mode = info.mode;
+ } else {
+ ret = 0;
+ }
+
+out:
+ return ret;
+}
+
+/*
+ * Checks if the ref was overwritten by an already processed inode. This is
+ * used by __get_cur_name_and_parent to find out if the ref was orphanized and
+ * thus the orphan name needs be used.
+ * process_recorded_refs also uses it to avoid unlinking of refs that were
+ * overwritten.
+ */
+static int did_overwrite_ref(struct send_ctx *sctx,
+ u64 dir, u64 dir_gen,
+ u64 ino, u64 ino_gen,
+ const char *name, int name_len)
+{
+ int ret = 0;
+ u64 gen;
+ u64 ow_inode;
+
+ if (!sctx->parent_root)
+ goto out;
+
+ ret = is_inode_existent(sctx, dir, dir_gen);
+ if (ret <= 0)
+ goto out;
+
+ if (dir != BTRFS_FIRST_FREE_OBJECTID) {
+ ret = get_inode_gen(sctx->send_root, dir, &gen);
+ if (ret < 0 && ret != -ENOENT)
+ goto out;
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+ if (gen != dir_gen)
+ goto out;
+ }
+
+ /* check if the ref was overwritten by another ref */
+ ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len,
+ &ow_inode);
+ if (ret < 0 && ret != -ENOENT)
+ goto out;
+ if (ret) {
+ /* was never and will never be overwritten */
+ ret = 0;
+ goto out;
+ }
+
+ ret = get_inode_gen(sctx->send_root, ow_inode, &gen);
+ if (ret < 0)
+ goto out;
+
+ if (ow_inode == ino && gen == ino_gen) {
+ ret = 0;
+ goto out;
+ }
+
+ /*
+ * We know that it is or will be overwritten. Check this now.
+ * The current inode being processed might have been the one that caused
+ * inode 'ino' to be orphanized, therefore check if ow_inode matches
+ * the current inode being processed.
+ */
+ if ((ow_inode < sctx->send_progress) ||
+ (ino != sctx->cur_ino && ow_inode == sctx->cur_ino &&
+ gen == sctx->cur_inode_gen))
+ ret = 1;
+ else
+ ret = 0;
+
+out:
+ return ret;
+}
+
+/*
+ * Same as did_overwrite_ref, but also checks if it is the first ref of an inode
+ * that got overwritten. This is used by process_recorded_refs to determine
+ * if it has to use the path as returned by get_cur_path or the orphan name.
+ */
+static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
+{
+ int ret = 0;
+ struct fs_path *name = NULL;
+ u64 dir;
+ u64 dir_gen;
+
+ if (!sctx->parent_root)
+ goto out;
+
+ name = fs_path_alloc();
+ if (!name)
+ return -ENOMEM;
+
+ ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name);
+ if (ret < 0)
+ goto out;
+
+ ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen,
+ name->start, fs_path_len(name));
+
+out:
+ fs_path_free(name);
+ return ret;
+}
+
+/*
+ * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit,
+ * so we need to do some special handling in case we have clashes. This function
+ * takes care of this with the help of name_cache_entry::radix_list.
+ * In case of error, nce is kfreed.
+ */
+static int name_cache_insert(struct send_ctx *sctx,
+ struct name_cache_entry *nce)
+{
+ int ret = 0;
+ struct list_head *nce_head;
+
+ nce_head = radix_tree_lookup(&sctx->name_cache,
+ (unsigned long)nce->ino);
+ if (!nce_head) {
+ nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL);
+ if (!nce_head) {
+ kfree(nce);
+ return -ENOMEM;
+ }
+ INIT_LIST_HEAD(nce_head);
+
+ ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
+ if (ret < 0) {
+ kfree(nce_head);
+ kfree(nce);
+ return ret;
+ }
+ }
+ list_add_tail(&nce->radix_list, nce_head);
+ list_add_tail(&nce->list, &sctx->name_cache_list);
+ sctx->name_cache_size++;
+
+ return ret;
+}
+
+static void name_cache_delete(struct send_ctx *sctx,
+ struct name_cache_entry *nce)
+{
+ struct list_head *nce_head;
+
+ nce_head = radix_tree_lookup(&sctx->name_cache,
+ (unsigned long)nce->ino);
+ if (!nce_head) {
+ btrfs_err(sctx->send_root->fs_info,
+ "name_cache_delete lookup failed ino %llu cache size %d, leaking memory",
+ nce->ino, sctx->name_cache_size);
+ }
+
+ list_del(&nce->radix_list);
+ list_del(&nce->list);
+ sctx->name_cache_size--;
+
+ /*
+ * We may not get to the final release of nce_head if the lookup fails
+ */
+ if (nce_head && list_empty(nce_head)) {
+ radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
+ kfree(nce_head);
+ }
+}
+
+static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
+ u64 ino, u64 gen)
+{
+ struct list_head *nce_head;
+ struct name_cache_entry *cur;
+
+ nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino);
+ if (!nce_head)
+ return NULL;
+
+ list_for_each_entry(cur, nce_head, radix_list) {
+ if (cur->ino == ino && cur->gen == gen)
+ return cur;
+ }
+ return NULL;
+}
+
+/*
+ * Remove some entries from the beginning of name_cache_list.
+ */
+static void name_cache_clean_unused(struct send_ctx *sctx)
+{
+ struct name_cache_entry *nce;
+
+ if (sctx->name_cache_size < SEND_CTX_NAME_CACHE_CLEAN_SIZE)
+ return;
+
+ while (sctx->name_cache_size > SEND_CTX_MAX_NAME_CACHE_SIZE) {
+ nce = list_entry(sctx->name_cache_list.next,
+ struct name_cache_entry, list);
+ name_cache_delete(sctx, nce);
+ kfree(nce);
+ }
+}
+
+static void name_cache_free(struct send_ctx *sctx)
+{
+ struct name_cache_entry *nce;
+
+ while (!list_empty(&sctx->name_cache_list)) {
+ nce = list_entry(sctx->name_cache_list.next,
+ struct name_cache_entry, list);
+ name_cache_delete(sctx, nce);
+ kfree(nce);
+ }
+}
+
+/*
+ * Used by get_cur_path for each ref up to the root.
+ * Returns 0 if it succeeded.
+ * Returns 1 if the inode is not existent or got overwritten. In that case, the
+ * name is an orphan name. This instructs get_cur_path to stop iterating. If 1
+ * is returned, parent_ino/parent_gen are not guaranteed to be valid.
+ * Returns <0 in case of error.
+ */
+static int __get_cur_name_and_parent(struct send_ctx *sctx,
+ u64 ino, u64 gen,
+ u64 *parent_ino,
+ u64 *parent_gen,
+ struct fs_path *dest)
+{
+ int ret;
+ int nce_ret;
+ struct name_cache_entry *nce = NULL;
+
+ /*
+ * First check if we already did a call to this function with the same
+ * ino/gen. If yes, check if the cache entry is still up-to-date. If yes
+ * return the cached result.
+ */
+ nce = name_cache_search(sctx, ino, gen);
+ if (nce) {
+ if (ino < sctx->send_progress && nce->need_later_update) {
+ name_cache_delete(sctx, nce);
+ kfree(nce);
+ nce = NULL;
+ } else {
+ /*
+ * Removes the entry from the list and adds it back to
+ * the end. This marks the entry as recently used so
+ * that name_cache_clean_unused does not remove it.
+ */
+ list_move_tail(&nce->list, &sctx->name_cache_list);
+
+ *parent_ino = nce->parent_ino;
+ *parent_gen = nce->parent_gen;
+ ret = fs_path_add(dest, nce->name, nce->name_len);
+ if (ret < 0)
+ goto out;
+ ret = nce->ret;
+ goto out;
+ }
+ }
+
+ /*
+ * If the inode is not existent yet, add the orphan name and return 1.
+ * This should only happen for the parent dir that we determine in
+ * record_new_ref_if_needed().
+ */
+ ret = is_inode_existent(sctx, ino, gen);
+ if (ret < 0)
+ goto out;
+
+ if (!ret) {
+ ret = gen_unique_name(sctx, ino, gen, dest);
+ if (ret < 0)
+ goto out;
+ ret = 1;
+ goto out_cache;
+ }
+
+ /*
+ * Depending on whether the inode was already processed or not, use
+ * send_root or parent_root for ref lookup.
+ */
+ if (ino < sctx->send_progress)
+ ret = get_first_ref(sctx->send_root, ino,
+ parent_ino, parent_gen, dest);
+ else
+ ret = get_first_ref(sctx->parent_root, ino,
+ parent_ino, parent_gen, dest);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * Check if the ref was overwritten by an inode's ref that was processed
+ * earlier. If yes, treat as orphan and return 1.
+ */
+ ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen,
+ dest->start, dest->end - dest->start);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ fs_path_reset(dest);
+ ret = gen_unique_name(sctx, ino, gen, dest);
+ if (ret < 0)
+ goto out;
+ ret = 1;
+ }
+
+out_cache:
+ /*
+ * Store the result of the lookup in the name cache.
+ */
+ nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_KERNEL);
+ if (!nce) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ nce->ino = ino;
+ nce->gen = gen;
+ nce->parent_ino = *parent_ino;
+ nce->parent_gen = *parent_gen;
+ nce->name_len = fs_path_len(dest);
+ nce->ret = ret;
+ strcpy(nce->name, dest->start);
+
+ if (ino < sctx->send_progress)
+ nce->need_later_update = 0;
+ else
+ nce->need_later_update = 1;
+
+ nce_ret = name_cache_insert(sctx, nce);
+ if (nce_ret < 0)
+ ret = nce_ret;
+ name_cache_clean_unused(sctx);
+
+out:
+ return ret;
+}
+
+/*
+ * Magic happens here. This function returns the first ref to an inode as it
+ * would look like while receiving the stream at this point in time.
+ * We walk the path up to the root. For every inode in between, we check if it
+ * was already processed/sent. If yes, we continue with the parent as found
+ * in send_root. If not, we continue with the parent as found in parent_root.
+ * If we encounter an inode that was deleted at this point in time, we use the
+ * inodes "orphan" name instead of the real name and stop. Same with new inodes
+ * that were not created yet and overwritten inodes/refs.
+ *
+ * When do we have orphan inodes:
+ * 1. When an inode is freshly created and thus no valid refs are available yet
+ * 2. When a directory lost all it's refs (deleted) but still has dir items
+ * inside which were not processed yet (pending for move/delete). If anyone
+ * tried to get the path to the dir items, it would get a path inside that
+ * orphan directory.
+ * 3. When an inode is moved around or gets new links, it may overwrite the ref
+ * of an unprocessed inode. If in that case the first ref would be
+ * overwritten, the overwritten inode gets "orphanized". Later when we
+ * process this overwritten inode, it is restored at a new place by moving
+ * the orphan inode.
+ *
+ * sctx->send_progress tells this function at which point in time receiving
+ * would be.
+ */
+static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
+ struct fs_path *dest)
+{
+ int ret = 0;
+ struct fs_path *name = NULL;
+ u64 parent_inode = 0;
+ u64 parent_gen = 0;
+ int stop = 0;
+
+ name = fs_path_alloc();
+ if (!name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ dest->reversed = 1;
+ fs_path_reset(dest);
+
+ while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
+ struct waiting_dir_move *wdm;
+
+ fs_path_reset(name);
+
+ if (is_waiting_for_rm(sctx, ino, gen)) {
+ ret = gen_unique_name(sctx, ino, gen, name);
+ if (ret < 0)
+ goto out;
+ ret = fs_path_add_path(dest, name);
+ break;
+ }
+
+ wdm = get_waiting_dir_move(sctx, ino);
+ if (wdm && wdm->orphanized) {
+ ret = gen_unique_name(sctx, ino, gen, name);
+ stop = 1;
+ } else if (wdm) {
+ ret = get_first_ref(sctx->parent_root, ino,
+ &parent_inode, &parent_gen, name);
+ } else {
+ ret = __get_cur_name_and_parent(sctx, ino, gen,
+ &parent_inode,
+ &parent_gen, name);
+ if (ret)
+ stop = 1;
+ }
+
+ if (ret < 0)
+ goto out;
+
+ ret = fs_path_add_path(dest, name);
+ if (ret < 0)
+ goto out;
+
+ ino = parent_inode;
+ gen = parent_gen;
+ }
+
+out:
+ fs_path_free(name);
+ if (!ret)
+ fs_path_unreverse(dest);
+ return ret;
+}
+
+/*
+ * Sends a BTRFS_SEND_C_SUBVOL command/item to userspace
+ */
+static int send_subvol_begin(struct send_ctx *sctx)
+{
+ int ret;
+ struct btrfs_root *send_root = sctx->send_root;
+ struct btrfs_root *parent_root = sctx->parent_root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_root_ref *ref;
+ struct extent_buffer *leaf;
+ char *name = NULL;
+ int namelen;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
+ if (!name) {
+ btrfs_free_path(path);
+ return -ENOMEM;
+ }
+
+ key.objectid = send_root->root_key.objectid;
+ key.type = BTRFS_ROOT_BACKREF_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot_for_read(send_root->fs_info->tree_root,
+ &key, path, 1, 0);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.type != BTRFS_ROOT_BACKREF_KEY ||
+ key.objectid != send_root->root_key.objectid) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
+ namelen = btrfs_root_ref_name_len(leaf, ref);
+ read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen);
+ btrfs_release_path(path);
+
+ if (parent_root) {
+ ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT);
+ if (ret < 0)
+ goto out;
+ } else {
+ ret = begin_cmd(sctx, BTRFS_SEND_C_SUBVOL);
+ if (ret < 0)
+ goto out;
+ }
+
+ TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen);
+
+ if (!btrfs_is_empty_uuid(sctx->send_root->root_item.received_uuid))
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
+ sctx->send_root->root_item.received_uuid);
+ else
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
+ sctx->send_root->root_item.uuid);
+
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
+ btrfs_root_ctransid(&sctx->send_root->root_item));
+ if (parent_root) {
+ if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid))
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+ parent_root->root_item.received_uuid);
+ else
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+ parent_root->root_item.uuid);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
+ btrfs_root_ctransid(&sctx->parent_root->root_item));
+ }
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ btrfs_free_path(path);
+ kfree(name);
+ return ret;
+}
+
+static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
+{
+ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
+ int ret = 0;
+ struct fs_path *p;
+
+ btrfs_debug(fs_info, "send_truncate %llu size=%llu", ino, size);
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, ino, gen, p);
+ if (ret < 0)
+ goto out;
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ fs_path_free(p);
+ return ret;
+}
+
+static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
+{
+ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
+ int ret = 0;
+ struct fs_path *p;
+
+ btrfs_debug(fs_info, "send_chmod %llu mode=%llu", ino, mode);
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, ino, gen, p);
+ if (ret < 0)
+ goto out;
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ fs_path_free(p);
+ return ret;
+}
+
+static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr)
+{
+ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
+ int ret = 0;
+ struct fs_path *p;
+
+ if (sctx->proto < 2)
+ return 0;
+
+ btrfs_debug(fs_info, "send_fileattr %llu fileattr=%llu", ino, fileattr);
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_FILEATTR);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, ino, gen, p);
+ if (ret < 0)
+ goto out;
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILEATTR, fileattr);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ fs_path_free(p);
+ return ret;
+}
+
+static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
+{
+ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
+ int ret = 0;
+ struct fs_path *p;
+
+ btrfs_debug(fs_info, "send_chown %llu uid=%llu, gid=%llu",
+ ino, uid, gid);
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, ino, gen, p);
+ if (ret < 0)
+ goto out;
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ fs_path_free(p);
+ return ret;
+}
+
+static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
+{
+ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
+ int ret = 0;
+ struct fs_path *p = NULL;
+ struct btrfs_inode_item *ii;
+ struct btrfs_path *path = NULL;
+ struct extent_buffer *eb;
+ struct btrfs_key key;
+ int slot;
+
+ btrfs_debug(fs_info, "send_utimes %llu", ino);
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ path = alloc_path_for_send();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret < 0)
+ goto out;
+
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_UTIMES);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, ino, gen, p);
+ if (ret < 0)
+ goto out;
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime);
+ TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime);
+ TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime);
+ if (sctx->proto >= 2)
+ TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_OTIME, eb, &ii->otime);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ fs_path_free(p);
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have
+ * a valid path yet because we did not process the refs yet. So, the inode
+ * is created as orphan.
+ */
+static int send_create_inode(struct send_ctx *sctx, u64 ino)
+{
+ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
+ int ret = 0;
+ struct fs_path *p;
+ int cmd;
+ struct btrfs_inode_info info;
+ u64 gen;
+ u64 mode;
+ u64 rdev;
+
+ btrfs_debug(fs_info, "send_create_inode %llu", ino);
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ if (ino != sctx->cur_ino) {
+ ret = get_inode_info(sctx->send_root, ino, &info);
+ if (ret < 0)
+ goto out;
+ gen = info.gen;
+ mode = info.mode;
+ rdev = info.rdev;
+ } else {
+ gen = sctx->cur_inode_gen;
+ mode = sctx->cur_inode_mode;
+ rdev = sctx->cur_inode_rdev;
+ }
+
+ if (S_ISREG(mode)) {
+ cmd = BTRFS_SEND_C_MKFILE;
+ } else if (S_ISDIR(mode)) {
+ cmd = BTRFS_SEND_C_MKDIR;
+ } else if (S_ISLNK(mode)) {
+ cmd = BTRFS_SEND_C_SYMLINK;
+ } else if (S_ISCHR(mode) || S_ISBLK(mode)) {
+ cmd = BTRFS_SEND_C_MKNOD;
+ } else if (S_ISFIFO(mode)) {
+ cmd = BTRFS_SEND_C_MKFIFO;
+ } else if (S_ISSOCK(mode)) {
+ cmd = BTRFS_SEND_C_MKSOCK;
+ } else {
+ btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o",
+ (int)(mode & S_IFMT));
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ ret = begin_cmd(sctx, cmd);
+ if (ret < 0)
+ goto out;
+
+ ret = gen_unique_name(sctx, ino, gen, p);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino);
+
+ if (S_ISLNK(mode)) {
+ fs_path_reset(p);
+ ret = read_symlink(sctx->send_root, ino, p);
+ if (ret < 0)
+ goto out;
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
+ } else if (S_ISCHR(mode) || S_ISBLK(mode) ||
+ S_ISFIFO(mode) || S_ISSOCK(mode)) {
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, new_encode_dev(rdev));
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode);
+ }
+
+ ret = send_cmd(sctx);
+ if (ret < 0)
+ goto out;
+
+
+tlv_put_failure:
+out:
+ fs_path_free(p);
+ return ret;
+}
+
+/*
+ * We need some special handling for inodes that get processed before the parent
+ * directory got created. See process_recorded_refs for details.
+ * This function does the check if we already created the dir out of order.
+ */
+static int did_create_dir(struct send_ctx *sctx, u64 dir)
+{
+ int ret = 0;
+ int iter_ret = 0;
+ struct btrfs_path *path = NULL;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_key di_key;
+ struct btrfs_dir_item *di;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = dir;
+ key.type = BTRFS_DIR_INDEX_KEY;
+ key.offset = 0;
+
+ btrfs_for_each_slot(sctx->send_root, &key, &found_key, path, iter_ret) {
+ struct extent_buffer *eb = path->nodes[0];
+
+ if (found_key.objectid != key.objectid ||
+ found_key.type != key.type) {
+ ret = 0;
+ break;
+ }
+
+ di = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dir_item);
+ btrfs_dir_item_key_to_cpu(eb, di, &di_key);
+
+ if (di_key.type != BTRFS_ROOT_ITEM_KEY &&
+ di_key.objectid < sctx->send_progress) {
+ ret = 1;
+ break;
+ }
+ }
+ /* Catch error found during iteration */
+ if (iter_ret < 0)
+ ret = iter_ret;
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Only creates the inode if it is:
+ * 1. Not a directory
+ * 2. Or a directory which was not created already due to out of order
+ * directories. See did_create_dir and process_recorded_refs for details.
+ */
+static int send_create_inode_if_needed(struct send_ctx *sctx)
+{
+ int ret;
+
+ if (S_ISDIR(sctx->cur_inode_mode)) {
+ ret = did_create_dir(sctx, sctx->cur_ino);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ return 0;
+ }
+
+ return send_create_inode(sctx, sctx->cur_ino);
+}
+
+struct recorded_ref {
+ struct list_head list;
+ char *name;
+ struct fs_path *full_path;
+ u64 dir;
+ u64 dir_gen;
+ int name_len;
+ struct rb_node node;
+ struct rb_root *root;
+};
+
+static struct recorded_ref *recorded_ref_alloc(void)
+{
+ struct recorded_ref *ref;
+
+ ref = kzalloc(sizeof(*ref), GFP_KERNEL);
+ if (!ref)
+ return NULL;
+ RB_CLEAR_NODE(&ref->node);
+ INIT_LIST_HEAD(&ref->list);
+ return ref;
+}
+
+static void recorded_ref_free(struct recorded_ref *ref)
+{
+ if (!ref)
+ return;
+ if (!RB_EMPTY_NODE(&ref->node))
+ rb_erase(&ref->node, ref->root);
+ list_del(&ref->list);
+ fs_path_free(ref->full_path);
+ kfree(ref);
+}
+
+static void set_ref_path(struct recorded_ref *ref, struct fs_path *path)
+{
+ ref->full_path = path;
+ ref->name = (char *)kbasename(ref->full_path->start);
+ ref->name_len = ref->full_path->end - ref->name;
+}
+
+static int dup_ref(struct recorded_ref *ref, struct list_head *list)
+{
+ struct recorded_ref *new;
+
+ new = recorded_ref_alloc();
+ if (!new)
+ return -ENOMEM;
+
+ new->dir = ref->dir;
+ new->dir_gen = ref->dir_gen;
+ list_add_tail(&new->list, list);
+ return 0;
+}
+
+static void __free_recorded_refs(struct list_head *head)
+{
+ struct recorded_ref *cur;
+
+ while (!list_empty(head)) {
+ cur = list_entry(head->next, struct recorded_ref, list);
+ recorded_ref_free(cur);
+ }
+}
+
+static void free_recorded_refs(struct send_ctx *sctx)
+{
+ __free_recorded_refs(&sctx->new_refs);
+ __free_recorded_refs(&sctx->deleted_refs);
+}
+
+/*
+ * Renames/moves a file/dir to its orphan name. Used when the first
+ * ref of an unprocessed inode gets overwritten and for all non empty
+ * directories.
+ */
+static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
+ struct fs_path *path)
+{
+ int ret;
+ struct fs_path *orphan;
+
+ orphan = fs_path_alloc();
+ if (!orphan)
+ return -ENOMEM;
+
+ ret = gen_unique_name(sctx, ino, gen, orphan);
+ if (ret < 0)
+ goto out;
+
+ ret = send_rename(sctx, path, orphan);
+
+out:
+ fs_path_free(orphan);
+ return ret;
+}
+
+static struct orphan_dir_info *add_orphan_dir_info(struct send_ctx *sctx,
+ u64 dir_ino, u64 dir_gen)
+{
+ struct rb_node **p = &sctx->orphan_dirs.rb_node;
+ struct rb_node *parent = NULL;
+ struct orphan_dir_info *entry, *odi;
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct orphan_dir_info, node);
+ if (dir_ino < entry->ino)
+ p = &(*p)->rb_left;
+ else if (dir_ino > entry->ino)
+ p = &(*p)->rb_right;
+ else if (dir_gen < entry->gen)
+ p = &(*p)->rb_left;
+ else if (dir_gen > entry->gen)
+ p = &(*p)->rb_right;
+ else
+ return entry;
+ }
+
+ odi = kmalloc(sizeof(*odi), GFP_KERNEL);
+ if (!odi)
+ return ERR_PTR(-ENOMEM);
+ odi->ino = dir_ino;
+ odi->gen = dir_gen;
+ odi->last_dir_index_offset = 0;
+
+ rb_link_node(&odi->node, parent, p);
+ rb_insert_color(&odi->node, &sctx->orphan_dirs);
+ return odi;
+}
+
+static struct orphan_dir_info *get_orphan_dir_info(struct send_ctx *sctx,
+ u64 dir_ino, u64 gen)
+{
+ struct rb_node *n = sctx->orphan_dirs.rb_node;
+ struct orphan_dir_info *entry;
+
+ while (n) {
+ entry = rb_entry(n, struct orphan_dir_info, node);
+ if (dir_ino < entry->ino)
+ n = n->rb_left;
+ else if (dir_ino > entry->ino)
+ n = n->rb_right;
+ else if (gen < entry->gen)
+ n = n->rb_left;
+ else if (gen > entry->gen)
+ n = n->rb_right;
+ else
+ return entry;
+ }
+ return NULL;
+}
+
+static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen)
+{
+ struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino, gen);
+
+ return odi != NULL;
+}
+
+static void free_orphan_dir_info(struct send_ctx *sctx,
+ struct orphan_dir_info *odi)
+{
+ if (!odi)
+ return;
+ rb_erase(&odi->node, &sctx->orphan_dirs);
+ kfree(odi);
+}
+
+/*
+ * Returns 1 if a directory can be removed at this point in time.
+ * We check this by iterating all dir items and checking if the inode behind
+ * the dir item was already processed.
+ */
+static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
+ u64 send_progress)
+{
+ int ret = 0;
+ int iter_ret = 0;
+ struct btrfs_root *root = sctx->parent_root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_key loc;
+ struct btrfs_dir_item *di;
+ struct orphan_dir_info *odi = NULL;
+
+ /*
+ * Don't try to rmdir the top/root subvolume dir.
+ */
+ if (dir == BTRFS_FIRST_FREE_OBJECTID)
+ return 0;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = dir;
+ key.type = BTRFS_DIR_INDEX_KEY;
+ key.offset = 0;
+
+ odi = get_orphan_dir_info(sctx, dir, dir_gen);
+ if (odi)
+ key.offset = odi->last_dir_index_offset;
+
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
+ struct waiting_dir_move *dm;
+
+ if (found_key.objectid != key.objectid ||
+ found_key.type != key.type)
+ break;
+
+ di = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_dir_item);
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);
+
+ dm = get_waiting_dir_move(sctx, loc.objectid);
+ if (dm) {
+ odi = add_orphan_dir_info(sctx, dir, dir_gen);
+ if (IS_ERR(odi)) {
+ ret = PTR_ERR(odi);
+ goto out;
+ }
+ odi->gen = dir_gen;
+ odi->last_dir_index_offset = found_key.offset;
+ dm->rmdir_ino = dir;
+ dm->rmdir_gen = dir_gen;
+ ret = 0;
+ goto out;
+ }
+
+ if (loc.objectid > send_progress) {
+ odi = add_orphan_dir_info(sctx, dir, dir_gen);
+ if (IS_ERR(odi)) {
+ ret = PTR_ERR(odi);
+ goto out;
+ }
+ odi->gen = dir_gen;
+ odi->last_dir_index_offset = found_key.offset;
+ ret = 0;
+ goto out;
+ }
+ }
+ if (iter_ret < 0) {
+ ret = iter_ret;
+ goto out;
+ }
+ free_orphan_dir_info(sctx, odi);
+
+ ret = 1;
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
+{
+ struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino);
+
+ return entry != NULL;
+}
+
+static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized)
+{
+ struct rb_node **p = &sctx->waiting_dir_moves.rb_node;
+ struct rb_node *parent = NULL;
+ struct waiting_dir_move *entry, *dm;
+
+ dm = kmalloc(sizeof(*dm), GFP_KERNEL);
+ if (!dm)
+ return -ENOMEM;
+ dm->ino = ino;
+ dm->rmdir_ino = 0;
+ dm->rmdir_gen = 0;
+ dm->orphanized = orphanized;
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct waiting_dir_move, node);
+ if (ino < entry->ino) {
+ p = &(*p)->rb_left;
+ } else if (ino > entry->ino) {
+ p = &(*p)->rb_right;
+ } else {
+ kfree(dm);
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(&dm->node, parent, p);
+ rb_insert_color(&dm->node, &sctx->waiting_dir_moves);
+ return 0;
+}
+
+static struct waiting_dir_move *
+get_waiting_dir_move(struct send_ctx *sctx, u64 ino)
+{
+ struct rb_node *n = sctx->waiting_dir_moves.rb_node;
+ struct waiting_dir_move *entry;
+
+ while (n) {
+ entry = rb_entry(n, struct waiting_dir_move, node);
+ if (ino < entry->ino)
+ n = n->rb_left;
+ else if (ino > entry->ino)
+ n = n->rb_right;
+ else
+ return entry;
+ }
+ return NULL;
+}
+
+static void free_waiting_dir_move(struct send_ctx *sctx,
+ struct waiting_dir_move *dm)
+{
+ if (!dm)
+ return;
+ rb_erase(&dm->node, &sctx->waiting_dir_moves);
+ kfree(dm);
+}
+
+static int add_pending_dir_move(struct send_ctx *sctx,
+ u64 ino,
+ u64 ino_gen,
+ u64 parent_ino,
+ struct list_head *new_refs,
+ struct list_head *deleted_refs,
+ const bool is_orphan)
+{
+ struct rb_node **p = &sctx->pending_dir_moves.rb_node;
+ struct rb_node *parent = NULL;
+ struct pending_dir_move *entry = NULL, *pm;
+ struct recorded_ref *cur;
+ int exists = 0;
+ int ret;
+
+ pm = kmalloc(sizeof(*pm), GFP_KERNEL);
+ if (!pm)
+ return -ENOMEM;
+ pm->parent_ino = parent_ino;
+ pm->ino = ino;
+ pm->gen = ino_gen;
+ INIT_LIST_HEAD(&pm->list);
+ INIT_LIST_HEAD(&pm->update_refs);
+ RB_CLEAR_NODE(&pm->node);
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct pending_dir_move, node);
+ if (parent_ino < entry->parent_ino) {
+ p = &(*p)->rb_left;
+ } else if (parent_ino > entry->parent_ino) {
+ p = &(*p)->rb_right;
+ } else {
+ exists = 1;
+ break;
+ }
+ }
+
+ list_for_each_entry(cur, deleted_refs, list) {
+ ret = dup_ref(cur, &pm->update_refs);
+ if (ret < 0)
+ goto out;
+ }
+ list_for_each_entry(cur, new_refs, list) {
+ ret = dup_ref(cur, &pm->update_refs);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = add_waiting_dir_move(sctx, pm->ino, is_orphan);
+ if (ret)
+ goto out;
+
+ if (exists) {
+ list_add_tail(&pm->list, &entry->list);
+ } else {
+ rb_link_node(&pm->node, parent, p);
+ rb_insert_color(&pm->node, &sctx->pending_dir_moves);
+ }
+ ret = 0;
+out:
+ if (ret) {
+ __free_recorded_refs(&pm->update_refs);
+ kfree(pm);
+ }
+ return ret;
+}
+
+static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx,
+ u64 parent_ino)
+{
+ struct rb_node *n = sctx->pending_dir_moves.rb_node;
+ struct pending_dir_move *entry;
+
+ while (n) {
+ entry = rb_entry(n, struct pending_dir_move, node);
+ if (parent_ino < entry->parent_ino)
+ n = n->rb_left;
+ else if (parent_ino > entry->parent_ino)
+ n = n->rb_right;
+ else
+ return entry;
+ }
+ return NULL;
+}
+
+static int path_loop(struct send_ctx *sctx, struct fs_path *name,
+ u64 ino, u64 gen, u64 *ancestor_ino)
+{
+ int ret = 0;
+ u64 parent_inode = 0;
+ u64 parent_gen = 0;
+ u64 start_ino = ino;
+
+ *ancestor_ino = 0;
+ while (ino != BTRFS_FIRST_FREE_OBJECTID) {
+ fs_path_reset(name);
+
+ if (is_waiting_for_rm(sctx, ino, gen))
+ break;
+ if (is_waiting_for_move(sctx, ino)) {
+ if (*ancestor_ino == 0)
+ *ancestor_ino = ino;
+ ret = get_first_ref(sctx->parent_root, ino,
+ &parent_inode, &parent_gen, name);
+ } else {
+ ret = __get_cur_name_and_parent(sctx, ino, gen,
+ &parent_inode,
+ &parent_gen, name);
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ }
+ if (ret < 0)
+ break;
+ if (parent_inode == start_ino) {
+ ret = 1;
+ if (*ancestor_ino == 0)
+ *ancestor_ino = ino;
+ break;
+ }
+ ino = parent_inode;
+ gen = parent_gen;
+ }
+ return ret;
+}
+
+static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
+{
+ struct fs_path *from_path = NULL;
+ struct fs_path *to_path = NULL;
+ struct fs_path *name = NULL;
+ u64 orig_progress = sctx->send_progress;
+ struct recorded_ref *cur;
+ u64 parent_ino, parent_gen;
+ struct waiting_dir_move *dm = NULL;
+ u64 rmdir_ino = 0;
+ u64 rmdir_gen;
+ u64 ancestor;
+ bool is_orphan;
+ int ret;
+
+ name = fs_path_alloc();
+ from_path = fs_path_alloc();
+ if (!name || !from_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ dm = get_waiting_dir_move(sctx, pm->ino);
+ ASSERT(dm);
+ rmdir_ino = dm->rmdir_ino;
+ rmdir_gen = dm->rmdir_gen;
+ is_orphan = dm->orphanized;
+ free_waiting_dir_move(sctx, dm);
+
+ if (is_orphan) {
+ ret = gen_unique_name(sctx, pm->ino,
+ pm->gen, from_path);
+ } else {
+ ret = get_first_ref(sctx->parent_root, pm->ino,
+ &parent_ino, &parent_gen, name);
+ if (ret < 0)
+ goto out;
+ ret = get_cur_path(sctx, parent_ino, parent_gen,
+ from_path);
+ if (ret < 0)
+ goto out;
+ ret = fs_path_add_path(from_path, name);
+ }
+ if (ret < 0)
+ goto out;
+
+ sctx->send_progress = sctx->cur_ino + 1;
+ ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ LIST_HEAD(deleted_refs);
+ ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
+ ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor,
+ &pm->update_refs, &deleted_refs,
+ is_orphan);
+ if (ret < 0)
+ goto out;
+ if (rmdir_ino) {
+ dm = get_waiting_dir_move(sctx, pm->ino);
+ ASSERT(dm);
+ dm->rmdir_ino = rmdir_ino;
+ dm->rmdir_gen = rmdir_gen;
+ }
+ goto out;
+ }
+ fs_path_reset(name);
+ to_path = name;
+ name = NULL;
+ ret = get_cur_path(sctx, pm->ino, pm->gen, to_path);
+ if (ret < 0)
+ goto out;
+
+ ret = send_rename(sctx, from_path, to_path);
+ if (ret < 0)
+ goto out;
+
+ if (rmdir_ino) {
+ struct orphan_dir_info *odi;
+ u64 gen;
+
+ odi = get_orphan_dir_info(sctx, rmdir_ino, rmdir_gen);
+ if (!odi) {
+ /* already deleted */
+ goto finish;
+ }
+ gen = odi->gen;
+
+ ret = can_rmdir(sctx, rmdir_ino, gen, sctx->cur_ino);
+ if (ret < 0)
+ goto out;
+ if (!ret)
+ goto finish;
+
+ name = fs_path_alloc();
+ if (!name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = get_cur_path(sctx, rmdir_ino, gen, name);
+ if (ret < 0)
+ goto out;
+ ret = send_rmdir(sctx, name);
+ if (ret < 0)
+ goto out;
+ }
+
+finish:
+ ret = send_utimes(sctx, pm->ino, pm->gen);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * After rename/move, need to update the utimes of both new parent(s)
+ * and old parent(s).
+ */
+ list_for_each_entry(cur, &pm->update_refs, list) {
+ /*
+ * The parent inode might have been deleted in the send snapshot
+ */
+ ret = get_inode_info(sctx->send_root, cur->dir, NULL);
+ if (ret == -ENOENT) {
+ ret = 0;
+ continue;
+ }
+ if (ret < 0)
+ goto out;
+
+ ret = send_utimes(sctx, cur->dir, cur->dir_gen);
+ if (ret < 0)
+ goto out;
+ }
+
+out:
+ fs_path_free(name);
+ fs_path_free(from_path);
+ fs_path_free(to_path);
+ sctx->send_progress = orig_progress;
+
+ return ret;
+}
+
+static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m)
+{
+ if (!list_empty(&m->list))
+ list_del(&m->list);
+ if (!RB_EMPTY_NODE(&m->node))
+ rb_erase(&m->node, &sctx->pending_dir_moves);
+ __free_recorded_refs(&m->update_refs);
+ kfree(m);
+}
+
+static void tail_append_pending_moves(struct send_ctx *sctx,
+ struct pending_dir_move *moves,
+ struct list_head *stack)
+{
+ if (list_empty(&moves->list)) {
+ list_add_tail(&moves->list, stack);
+ } else {
+ LIST_HEAD(list);
+ list_splice_init(&moves->list, &list);
+ list_add_tail(&moves->list, stack);
+ list_splice_tail(&list, stack);
+ }
+ if (!RB_EMPTY_NODE(&moves->node)) {
+ rb_erase(&moves->node, &sctx->pending_dir_moves);
+ RB_CLEAR_NODE(&moves->node);
+ }
+}
+
+static int apply_children_dir_moves(struct send_ctx *sctx)
+{
+ struct pending_dir_move *pm;
+ struct list_head stack;
+ u64 parent_ino = sctx->cur_ino;
+ int ret = 0;
+
+ pm = get_pending_dir_moves(sctx, parent_ino);
+ if (!pm)
+ return 0;
+
+ INIT_LIST_HEAD(&stack);
+ tail_append_pending_moves(sctx, pm, &stack);
+
+ while (!list_empty(&stack)) {
+ pm = list_first_entry(&stack, struct pending_dir_move, list);
+ parent_ino = pm->ino;
+ ret = apply_dir_move(sctx, pm);
+ free_pending_move(sctx, pm);
+ if (ret)
+ goto out;
+ pm = get_pending_dir_moves(sctx, parent_ino);
+ if (pm)
+ tail_append_pending_moves(sctx, pm, &stack);
+ }
+ return 0;
+
+out:
+ while (!list_empty(&stack)) {
+ pm = list_first_entry(&stack, struct pending_dir_move, list);
+ free_pending_move(sctx, pm);
+ }
+ return ret;
+}
+
+/*
+ * We might need to delay a directory rename even when no ancestor directory
+ * (in the send root) with a higher inode number than ours (sctx->cur_ino) was
+ * renamed. This happens when we rename a directory to the old name (the name
+ * in the parent root) of some other unrelated directory that got its rename
+ * delayed due to some ancestor with higher number that got renamed.
+ *
+ * Example:
+ *
+ * Parent snapshot:
+ * . (ino 256)
+ * |---- a/ (ino 257)
+ * | |---- file (ino 260)
+ * |
+ * |---- b/ (ino 258)
+ * |---- c/ (ino 259)
+ *
+ * Send snapshot:
+ * . (ino 256)
+ * |---- a/ (ino 258)
+ * |---- x/ (ino 259)
+ * |---- y/ (ino 257)
+ * |----- file (ino 260)
+ *
+ * Here we can not rename 258 from 'b' to 'a' without the rename of inode 257
+ * from 'a' to 'x/y' happening first, which in turn depends on the rename of
+ * inode 259 from 'c' to 'x'. So the order of rename commands the send stream
+ * must issue is:
+ *
+ * 1 - rename 259 from 'c' to 'x'
+ * 2 - rename 257 from 'a' to 'x/y'
+ * 3 - rename 258 from 'b' to 'a'
+ *
+ * Returns 1 if the rename of sctx->cur_ino needs to be delayed, 0 if it can
+ * be done right away and < 0 on error.
+ */
+static int wait_for_dest_dir_move(struct send_ctx *sctx,
+ struct recorded_ref *parent_ref,
+ const bool is_orphan)
+{
+ struct btrfs_fs_info *fs_info = sctx->parent_root->fs_info;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_key di_key;
+ struct btrfs_dir_item *di;
+ u64 left_gen;
+ u64 right_gen;
+ int ret = 0;
+ struct waiting_dir_move *wdm;
+
+ if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves))
+ return 0;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = parent_ref->dir;
+ key.type = BTRFS_DIR_ITEM_KEY;
+ key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len);
+
+ ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = 0;
+ goto out;
+ }
+
+ di = btrfs_match_dir_item_name(fs_info, path, parent_ref->name,
+ parent_ref->name_len);
+ if (!di) {
+ ret = 0;
+ goto out;
+ }
+ /*
+ * di_key.objectid has the number of the inode that has a dentry in the
+ * parent directory with the same name that sctx->cur_ino is being
+ * renamed to. We need to check if that inode is in the send root as
+ * well and if it is currently marked as an inode with a pending rename,
+ * if it is, we need to delay the rename of sctx->cur_ino as well, so
+ * that it happens after that other inode is renamed.
+ */
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key);
+ if (di_key.type != BTRFS_INODE_ITEM_KEY) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = get_inode_gen(sctx->parent_root, di_key.objectid, &left_gen);
+ if (ret < 0)
+ goto out;
+ ret = get_inode_gen(sctx->send_root, di_key.objectid, &right_gen);
+ if (ret < 0) {
+ if (ret == -ENOENT)
+ ret = 0;
+ goto out;
+ }
+
+ /* Different inode, no need to delay the rename of sctx->cur_ino */
+ if (right_gen != left_gen) {
+ ret = 0;
+ goto out;
+ }
+
+ wdm = get_waiting_dir_move(sctx, di_key.objectid);
+ if (wdm && !wdm->orphanized) {
+ ret = add_pending_dir_move(sctx,
+ sctx->cur_ino,
+ sctx->cur_inode_gen,
+ di_key.objectid,
+ &sctx->new_refs,
+ &sctx->deleted_refs,
+ is_orphan);
+ if (!ret)
+ ret = 1;
+ }
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Check if inode ino2, or any of its ancestors, is inode ino1.
+ * Return 1 if true, 0 if false and < 0 on error.
+ */
+static int check_ino_in_path(struct btrfs_root *root,
+ const u64 ino1,
+ const u64 ino1_gen,
+ const u64 ino2,
+ const u64 ino2_gen,
+ struct fs_path *fs_path)
+{
+ u64 ino = ino2;
+
+ if (ino1 == ino2)
+ return ino1_gen == ino2_gen;
+
+ while (ino > BTRFS_FIRST_FREE_OBJECTID) {
+ u64 parent;
+ u64 parent_gen;
+ int ret;
+
+ fs_path_reset(fs_path);
+ ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path);
+ if (ret < 0)
+ return ret;
+ if (parent == ino1)
+ return parent_gen == ino1_gen;
+ ino = parent;
+ }
+ return 0;
+}
+
+/*
+ * Check if inode ino1 is an ancestor of inode ino2 in the given root for any
+ * possible path (in case ino2 is not a directory and has multiple hard links).
+ * Return 1 if true, 0 if false and < 0 on error.
+ */
+static int is_ancestor(struct btrfs_root *root,
+ const u64 ino1,
+ const u64 ino1_gen,
+ const u64 ino2,
+ struct fs_path *fs_path)
+{
+ bool free_fs_path = false;
+ int ret = 0;
+ int iter_ret = 0;
+ struct btrfs_path *path = NULL;
+ struct btrfs_key key;
+
+ if (!fs_path) {
+ fs_path = fs_path_alloc();
+ if (!fs_path)
+ return -ENOMEM;
+ free_fs_path = true;
+ }
+
+ path = alloc_path_for_send();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ key.objectid = ino2;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = 0;
+
+ btrfs_for_each_slot(root, &key, &key, path, iter_ret) {
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+ u32 cur_offset = 0;
+ u32 item_size;
+
+ if (key.objectid != ino2)
+ break;
+ if (key.type != BTRFS_INODE_REF_KEY &&
+ key.type != BTRFS_INODE_EXTREF_KEY)
+ break;
+
+ item_size = btrfs_item_size(leaf, slot);
+ while (cur_offset < item_size) {
+ u64 parent;
+ u64 parent_gen;
+
+ if (key.type == BTRFS_INODE_EXTREF_KEY) {
+ unsigned long ptr;
+ struct btrfs_inode_extref *extref;
+
+ ptr = btrfs_item_ptr_offset(leaf, slot);
+ extref = (struct btrfs_inode_extref *)
+ (ptr + cur_offset);
+ parent = btrfs_inode_extref_parent(leaf,
+ extref);
+ cur_offset += sizeof(*extref);
+ cur_offset += btrfs_inode_extref_name_len(leaf,
+ extref);
+ } else {
+ parent = key.offset;
+ cur_offset = item_size;
+ }
+
+ ret = get_inode_gen(root, parent, &parent_gen);
+ if (ret < 0)
+ goto out;
+ ret = check_ino_in_path(root, ino1, ino1_gen,
+ parent, parent_gen, fs_path);
+ if (ret)
+ goto out;
+ }
+ }
+ ret = 0;
+ if (iter_ret < 0)
+ ret = iter_ret;
+
+out:
+ btrfs_free_path(path);
+ if (free_fs_path)
+ fs_path_free(fs_path);
+ return ret;
+}
+
+static int wait_for_parent_move(struct send_ctx *sctx,
+ struct recorded_ref *parent_ref,
+ const bool is_orphan)
+{
+ int ret = 0;
+ u64 ino = parent_ref->dir;
+ u64 ino_gen = parent_ref->dir_gen;
+ u64 parent_ino_before, parent_ino_after;
+ struct fs_path *path_before = NULL;
+ struct fs_path *path_after = NULL;
+ int len1, len2;
+
+ path_after = fs_path_alloc();
+ path_before = fs_path_alloc();
+ if (!path_after || !path_before) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Our current directory inode may not yet be renamed/moved because some
+ * ancestor (immediate or not) has to be renamed/moved first. So find if
+ * such ancestor exists and make sure our own rename/move happens after
+ * that ancestor is processed to avoid path build infinite loops (done
+ * at get_cur_path()).
+ */
+ while (ino > BTRFS_FIRST_FREE_OBJECTID) {
+ u64 parent_ino_after_gen;
+
+ if (is_waiting_for_move(sctx, ino)) {
+ /*
+ * If the current inode is an ancestor of ino in the
+ * parent root, we need to delay the rename of the
+ * current inode, otherwise don't delayed the rename
+ * because we can end up with a circular dependency
+ * of renames, resulting in some directories never
+ * getting the respective rename operations issued in
+ * the send stream or getting into infinite path build
+ * loops.
+ */
+ ret = is_ancestor(sctx->parent_root,
+ sctx->cur_ino, sctx->cur_inode_gen,
+ ino, path_before);
+ if (ret)
+ break;
+ }
+
+ fs_path_reset(path_before);
+ fs_path_reset(path_after);
+
+ ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
+ &parent_ino_after_gen, path_after);
+ if (ret < 0)
+ goto out;
+ ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,
+ NULL, path_before);
+ if (ret < 0 && ret != -ENOENT) {
+ goto out;
+ } else if (ret == -ENOENT) {
+ ret = 0;
+ break;
+ }
+
+ len1 = fs_path_len(path_before);
+ len2 = fs_path_len(path_after);
+ if (ino > sctx->cur_ino &&
+ (parent_ino_before != parent_ino_after || len1 != len2 ||
+ memcmp(path_before->start, path_after->start, len1))) {
+ u64 parent_ino_gen;
+
+ ret = get_inode_gen(sctx->parent_root, ino, &parent_ino_gen);
+ if (ret < 0)
+ goto out;
+ if (ino_gen == parent_ino_gen) {
+ ret = 1;
+ break;
+ }
+ }
+ ino = parent_ino_after;
+ ino_gen = parent_ino_after_gen;
+ }
+
+out:
+ fs_path_free(path_before);
+ fs_path_free(path_after);
+
+ if (ret == 1) {
+ ret = add_pending_dir_move(sctx,
+ sctx->cur_ino,
+ sctx->cur_inode_gen,
+ ino,
+ &sctx->new_refs,
+ &sctx->deleted_refs,
+ is_orphan);
+ if (!ret)
+ ret = 1;
+ }
+
+ return ret;
+}
+
+static int update_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
+{
+ int ret;
+ struct fs_path *new_path;
+
+ /*
+ * Our reference's name member points to its full_path member string, so
+ * we use here a new path.
+ */
+ new_path = fs_path_alloc();
+ if (!new_path)
+ return -ENOMEM;
+
+ ret = get_cur_path(sctx, ref->dir, ref->dir_gen, new_path);
+ if (ret < 0) {
+ fs_path_free(new_path);
+ return ret;
+ }
+ ret = fs_path_add(new_path, ref->name, ref->name_len);
+ if (ret < 0) {
+ fs_path_free(new_path);
+ return ret;
+ }
+
+ fs_path_free(ref->full_path);
+ set_ref_path(ref, new_path);
+
+ return 0;
+}
+
+/*
+ * When processing the new references for an inode we may orphanize an existing
+ * directory inode because its old name conflicts with one of the new references
+ * of the current inode. Later, when processing another new reference of our
+ * inode, we might need to orphanize another inode, but the path we have in the
+ * reference reflects the pre-orphanization name of the directory we previously
+ * orphanized. For example:
+ *
+ * parent snapshot looks like:
+ *
+ * . (ino 256)
+ * |----- f1 (ino 257)
+ * |----- f2 (ino 258)
+ * |----- d1/ (ino 259)
+ * |----- d2/ (ino 260)
+ *
+ * send snapshot looks like:
+ *
+ * . (ino 256)
+ * |----- d1 (ino 258)
+ * |----- f2/ (ino 259)
+ * |----- f2_link/ (ino 260)
+ * | |----- f1 (ino 257)
+ * |
+ * |----- d2 (ino 258)
+ *
+ * When processing inode 257 we compute the name for inode 259 as "d1", and we
+ * cache it in the name cache. Later when we start processing inode 258, when
+ * collecting all its new references we set a full path of "d1/d2" for its new
+ * reference with name "d2". When we start processing the new references we
+ * start by processing the new reference with name "d1", and this results in
+ * orphanizing inode 259, since its old reference causes a conflict. Then we
+ * move on the next new reference, with name "d2", and we find out we must
+ * orphanize inode 260, as its old reference conflicts with ours - but for the
+ * orphanization we use a source path corresponding to the path we stored in the
+ * new reference, which is "d1/d2" and not "o259-6-0/d2" - this makes the
+ * receiver fail since the path component "d1/" no longer exists, it was renamed
+ * to "o259-6-0/" when processing the previous new reference. So in this case we
+ * must recompute the path in the new reference and use it for the new
+ * orphanization operation.
+ */
+static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
+{
+ char *name;
+ int ret;
+
+ name = kmemdup(ref->name, ref->name_len, GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
+
+ fs_path_reset(ref->full_path);
+ ret = get_cur_path(sctx, ref->dir, ref->dir_gen, ref->full_path);
+ if (ret < 0)
+ goto out;
+
+ ret = fs_path_add(ref->full_path, name, ref->name_len);
+ if (ret < 0)
+ goto out;
+
+ /* Update the reference's base name pointer. */
+ set_ref_path(ref, ref->full_path);
+out:
+ kfree(name);
+ return ret;
+}
+
+/*
+ * This does all the move/link/unlink/rmdir magic.
+ */
+static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
+{
+ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
+ int ret = 0;
+ struct recorded_ref *cur;
+ struct recorded_ref *cur2;
+ struct list_head check_dirs;
+ struct fs_path *valid_path = NULL;
+ u64 ow_inode = 0;
+ u64 ow_gen;
+ u64 ow_mode;
+ int did_overwrite = 0;
+ int is_orphan = 0;
+ u64 last_dir_ino_rm = 0;
+ bool can_rename = true;
+ bool orphanized_dir = false;
+ bool orphanized_ancestor = false;
+
+ btrfs_debug(fs_info, "process_recorded_refs %llu", sctx->cur_ino);
+
+ /*
+ * This should never happen as the root dir always has the same ref
+ * which is always '..'
+ */
+ BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
+ INIT_LIST_HEAD(&check_dirs);
+
+ valid_path = fs_path_alloc();
+ if (!valid_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * First, check if the first ref of the current inode was overwritten
+ * before. If yes, we know that the current inode was already orphanized
+ * and thus use the orphan name. If not, we can use get_cur_path to
+ * get the path of the first ref as it would like while receiving at
+ * this point in time.
+ * New inodes are always orphan at the beginning, so force to use the
+ * orphan name in this case.
+ * The first ref is stored in valid_path and will be updated if it
+ * gets moved around.
+ */
+ if (!sctx->cur_inode_new) {
+ ret = did_overwrite_first_ref(sctx, sctx->cur_ino,
+ sctx->cur_inode_gen);
+ if (ret < 0)
+ goto out;
+ if (ret)
+ did_overwrite = 1;
+ }
+ if (sctx->cur_inode_new || did_overwrite) {
+ ret = gen_unique_name(sctx, sctx->cur_ino,
+ sctx->cur_inode_gen, valid_path);
+ if (ret < 0)
+ goto out;
+ is_orphan = 1;
+ } else {
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+ valid_path);
+ if (ret < 0)
+ goto out;
+ }
+
+ /*
+ * Before doing any rename and link operations, do a first pass on the
+ * new references to orphanize any unprocessed inodes that may have a
+ * reference that conflicts with one of the new references of the current
+ * inode. This needs to happen first because a new reference may conflict
+ * with the old reference of a parent directory, so we must make sure
+ * that the path used for link and rename commands don't use an
+ * orphanized name when an ancestor was not yet orphanized.
+ *
+ * Example:
+ *
+ * Parent snapshot:
+ *
+ * . (ino 256)
+ * |----- testdir/ (ino 259)
+ * | |----- a (ino 257)
+ * |
+ * |----- b (ino 258)
+ *
+ * Send snapshot:
+ *
+ * . (ino 256)
+ * |----- testdir_2/ (ino 259)
+ * | |----- a (ino 260)
+ * |
+ * |----- testdir (ino 257)
+ * |----- b (ino 257)
+ * |----- b2 (ino 258)
+ *
+ * Processing the new reference for inode 257 with name "b" may happen
+ * before processing the new reference with name "testdir". If so, we
+ * must make sure that by the time we send a link command to create the
+ * hard link "b", inode 259 was already orphanized, since the generated
+ * path in "valid_path" already contains the orphanized name for 259.
+ * We are processing inode 257, so only later when processing 259 we do
+ * the rename operation to change its temporary (orphanized) name to
+ * "testdir_2".
+ */
+ list_for_each_entry(cur, &sctx->new_refs, list) {
+ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
+ if (ret < 0)
+ goto out;
+ if (ret == inode_state_will_create)
+ continue;
+
+ /*
+ * Check if this new ref would overwrite the first ref of another
+ * unprocessed inode. If yes, orphanize the overwritten inode.
+ * If we find an overwritten ref that is not the first ref,
+ * simply unlink it.
+ */
+ ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen,
+ cur->name, cur->name_len,
+ &ow_inode, &ow_gen, &ow_mode);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ ret = is_first_ref(sctx->parent_root,
+ ow_inode, cur->dir, cur->name,
+ cur->name_len);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ struct name_cache_entry *nce;
+ struct waiting_dir_move *wdm;
+
+ if (orphanized_dir) {
+ ret = refresh_ref_path(sctx, cur);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = orphanize_inode(sctx, ow_inode, ow_gen,
+ cur->full_path);
+ if (ret < 0)
+ goto out;
+ if (S_ISDIR(ow_mode))
+ orphanized_dir = true;
+
+ /*
+ * If ow_inode has its rename operation delayed
+ * make sure that its orphanized name is used in
+ * the source path when performing its rename
+ * operation.
+ */
+ if (is_waiting_for_move(sctx, ow_inode)) {
+ wdm = get_waiting_dir_move(sctx,
+ ow_inode);
+ ASSERT(wdm);
+ wdm->orphanized = true;
+ }
+
+ /*
+ * Make sure we clear our orphanized inode's
+ * name from the name cache. This is because the
+ * inode ow_inode might be an ancestor of some
+ * other inode that will be orphanized as well
+ * later and has an inode number greater than
+ * sctx->send_progress. We need to prevent
+ * future name lookups from using the old name
+ * and get instead the orphan name.
+ */
+ nce = name_cache_search(sctx, ow_inode, ow_gen);
+ if (nce) {
+ name_cache_delete(sctx, nce);
+ kfree(nce);
+ }
+
+ /*
+ * ow_inode might currently be an ancestor of
+ * cur_ino, therefore compute valid_path (the
+ * current path of cur_ino) again because it
+ * might contain the pre-orphanization name of
+ * ow_inode, which is no longer valid.
+ */
+ ret = is_ancestor(sctx->parent_root,
+ ow_inode, ow_gen,
+ sctx->cur_ino, NULL);
+ if (ret > 0) {
+ orphanized_ancestor = true;
+ fs_path_reset(valid_path);
+ ret = get_cur_path(sctx, sctx->cur_ino,
+ sctx->cur_inode_gen,
+ valid_path);
+ }
+ if (ret < 0)
+ goto out;
+ } else {
+ /*
+ * If we previously orphanized a directory that
+ * collided with a new reference that we already
+ * processed, recompute the current path because
+ * that directory may be part of the path.
+ */
+ if (orphanized_dir) {
+ ret = refresh_ref_path(sctx, cur);
+ if (ret < 0)
+ goto out;
+ }
+ ret = send_unlink(sctx, cur->full_path);
+ if (ret < 0)
+ goto out;
+ }
+ }
+
+ }
+
+ list_for_each_entry(cur, &sctx->new_refs, list) {
+ /*
+ * We may have refs where the parent directory does not exist
+ * yet. This happens if the parent directories inum is higher
+ * than the current inum. To handle this case, we create the
+ * parent directory out of order. But we need to check if this
+ * did already happen before due to other refs in the same dir.
+ */
+ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
+ if (ret < 0)
+ goto out;
+ if (ret == inode_state_will_create) {
+ ret = 0;
+ /*
+ * First check if any of the current inodes refs did
+ * already create the dir.
+ */
+ list_for_each_entry(cur2, &sctx->new_refs, list) {
+ if (cur == cur2)
+ break;
+ if (cur2->dir == cur->dir) {
+ ret = 1;
+ break;
+ }
+ }
+
+ /*
+ * If that did not happen, check if a previous inode
+ * did already create the dir.
+ */
+ if (!ret)
+ ret = did_create_dir(sctx, cur->dir);
+ if (ret < 0)
+ goto out;
+ if (!ret) {
+ ret = send_create_inode(sctx, cur->dir);
+ if (ret < 0)
+ goto out;
+ }
+ }
+
+ if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) {
+ ret = wait_for_dest_dir_move(sctx, cur, is_orphan);
+ if (ret < 0)
+ goto out;
+ if (ret == 1) {
+ can_rename = false;
+ *pending_move = 1;
+ }
+ }
+
+ if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root &&
+ can_rename) {
+ ret = wait_for_parent_move(sctx, cur, is_orphan);
+ if (ret < 0)
+ goto out;
+ if (ret == 1) {
+ can_rename = false;
+ *pending_move = 1;
+ }
+ }
+
+ /*
+ * link/move the ref to the new place. If we have an orphan
+ * inode, move it and update valid_path. If not, link or move
+ * it depending on the inode mode.
+ */
+ if (is_orphan && can_rename) {
+ ret = send_rename(sctx, valid_path, cur->full_path);
+ if (ret < 0)
+ goto out;
+ is_orphan = 0;
+ ret = fs_path_copy(valid_path, cur->full_path);
+ if (ret < 0)
+ goto out;
+ } else if (can_rename) {
+ if (S_ISDIR(sctx->cur_inode_mode)) {
+ /*
+ * Dirs can't be linked, so move it. For moved
+ * dirs, we always have one new and one deleted
+ * ref. The deleted ref is ignored later.
+ */
+ ret = send_rename(sctx, valid_path,
+ cur->full_path);
+ if (!ret)
+ ret = fs_path_copy(valid_path,
+ cur->full_path);
+ if (ret < 0)
+ goto out;
+ } else {
+ /*
+ * We might have previously orphanized an inode
+ * which is an ancestor of our current inode,
+ * so our reference's full path, which was
+ * computed before any such orphanizations, must
+ * be updated.
+ */
+ if (orphanized_dir) {
+ ret = update_ref_path(sctx, cur);
+ if (ret < 0)
+ goto out;
+ }
+ ret = send_link(sctx, cur->full_path,
+ valid_path);
+ if (ret < 0)
+ goto out;
+ }
+ }
+ ret = dup_ref(cur, &check_dirs);
+ if (ret < 0)
+ goto out;
+ }
+
+ if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_deleted) {
+ /*
+ * Check if we can already rmdir the directory. If not,
+ * orphanize it. For every dir item inside that gets deleted
+ * later, we do this check again and rmdir it then if possible.
+ * See the use of check_dirs for more details.
+ */
+ ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+ sctx->cur_ino);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ ret = send_rmdir(sctx, valid_path);
+ if (ret < 0)
+ goto out;
+ } else if (!is_orphan) {
+ ret = orphanize_inode(sctx, sctx->cur_ino,
+ sctx->cur_inode_gen, valid_path);
+ if (ret < 0)
+ goto out;
+ is_orphan = 1;
+ }
+
+ list_for_each_entry(cur, &sctx->deleted_refs, list) {
+ ret = dup_ref(cur, &check_dirs);
+ if (ret < 0)
+ goto out;
+ }
+ } else if (S_ISDIR(sctx->cur_inode_mode) &&
+ !list_empty(&sctx->deleted_refs)) {
+ /*
+ * We have a moved dir. Add the old parent to check_dirs
+ */
+ cur = list_entry(sctx->deleted_refs.next, struct recorded_ref,
+ list);
+ ret = dup_ref(cur, &check_dirs);
+ if (ret < 0)
+ goto out;
+ } else if (!S_ISDIR(sctx->cur_inode_mode)) {
+ /*
+ * We have a non dir inode. Go through all deleted refs and
+ * unlink them if they were not already overwritten by other
+ * inodes.
+ */
+ list_for_each_entry(cur, &sctx->deleted_refs, list) {
+ ret = did_overwrite_ref(sctx, cur->dir, cur->dir_gen,
+ sctx->cur_ino, sctx->cur_inode_gen,
+ cur->name, cur->name_len);
+ if (ret < 0)
+ goto out;
+ if (!ret) {
+ /*
+ * If we orphanized any ancestor before, we need
+ * to recompute the full path for deleted names,
+ * since any such path was computed before we
+ * processed any references and orphanized any
+ * ancestor inode.
+ */
+ if (orphanized_ancestor) {
+ ret = update_ref_path(sctx, cur);
+ if (ret < 0)
+ goto out;
+ }
+ ret = send_unlink(sctx, cur->full_path);
+ if (ret < 0)
+ goto out;
+ }
+ ret = dup_ref(cur, &check_dirs);
+ if (ret < 0)
+ goto out;
+ }
+ /*
+ * If the inode is still orphan, unlink the orphan. This may
+ * happen when a previous inode did overwrite the first ref
+ * of this inode and no new refs were added for the current
+ * inode. Unlinking does not mean that the inode is deleted in
+ * all cases. There may still be links to this inode in other
+ * places.
+ */
+ if (is_orphan) {
+ ret = send_unlink(sctx, valid_path);
+ if (ret < 0)
+ goto out;
+ }
+ }
+
+ /*
+ * We did collect all parent dirs where cur_inode was once located. We
+ * now go through all these dirs and check if they are pending for
+ * deletion and if it's finally possible to perform the rmdir now.
+ * We also update the inode stats of the parent dirs here.
+ */
+ list_for_each_entry(cur, &check_dirs, list) {
+ /*
+ * In case we had refs into dirs that were not processed yet,
+ * we don't need to do the utime and rmdir logic for these dirs.
+ * The dir will be processed later.
+ */
+ if (cur->dir > sctx->cur_ino)
+ continue;
+
+ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
+ if (ret < 0)
+ goto out;
+
+ if (ret == inode_state_did_create ||
+ ret == inode_state_no_change) {
+ /* TODO delayed utimes */
+ ret = send_utimes(sctx, cur->dir, cur->dir_gen);
+ if (ret < 0)
+ goto out;
+ } else if (ret == inode_state_did_delete &&
+ cur->dir != last_dir_ino_rm) {
+ ret = can_rmdir(sctx, cur->dir, cur->dir_gen,
+ sctx->cur_ino);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ ret = get_cur_path(sctx, cur->dir,
+ cur->dir_gen, valid_path);
+ if (ret < 0)
+ goto out;
+ ret = send_rmdir(sctx, valid_path);
+ if (ret < 0)
+ goto out;
+ last_dir_ino_rm = cur->dir;
+ }
+ }
+ }
+
+ ret = 0;
+
+out:
+ __free_recorded_refs(&check_dirs);
+ free_recorded_refs(sctx);
+ fs_path_free(valid_path);
+ return ret;
+}
+
+static int rbtree_ref_comp(const void *k, const struct rb_node *node)
+{
+ const struct recorded_ref *data = k;
+ const struct recorded_ref *ref = rb_entry(node, struct recorded_ref, node);
+ int result;
+
+ if (data->dir > ref->dir)
+ return 1;
+ if (data->dir < ref->dir)
+ return -1;
+ if (data->dir_gen > ref->dir_gen)
+ return 1;
+ if (data->dir_gen < ref->dir_gen)
+ return -1;
+ if (data->name_len > ref->name_len)
+ return 1;
+ if (data->name_len < ref->name_len)
+ return -1;
+ result = strcmp(data->name, ref->name);
+ if (result > 0)
+ return 1;
+ if (result < 0)
+ return -1;
+ return 0;
+}
+
+static bool rbtree_ref_less(struct rb_node *node, const struct rb_node *parent)
+{
+ const struct recorded_ref *entry = rb_entry(node, struct recorded_ref, node);
+
+ return rbtree_ref_comp(entry, parent) < 0;
+}
+
+static int record_ref_in_tree(struct rb_root *root, struct list_head *refs,
+ struct fs_path *name, u64 dir, u64 dir_gen,
+ struct send_ctx *sctx)
+{
+ int ret = 0;
+ struct fs_path *path = NULL;
+ struct recorded_ref *ref = NULL;
+
+ path = fs_path_alloc();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ref = recorded_ref_alloc();
+ if (!ref) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = get_cur_path(sctx, dir, dir_gen, path);
+ if (ret < 0)
+ goto out;
+ ret = fs_path_add_path(path, name);
+ if (ret < 0)
+ goto out;
+
+ ref->dir = dir;
+ ref->dir_gen = dir_gen;
+ set_ref_path(ref, path);
+ list_add_tail(&ref->list, refs);
+ rb_add(&ref->node, root, rbtree_ref_less);
+ ref->root = root;
+out:
+ if (ret) {
+ if (path && (!ref || !ref->full_path))
+ fs_path_free(path);
+ recorded_ref_free(ref);
+ }
+ return ret;
+}
+
+static int record_new_ref_if_needed(int num, u64 dir, int index,
+ struct fs_path *name, void *ctx)
+{
+ int ret = 0;
+ struct send_ctx *sctx = ctx;
+ struct rb_node *node = NULL;
+ struct recorded_ref data;
+ struct recorded_ref *ref;
+ u64 dir_gen;
+
+ ret = get_inode_gen(sctx->send_root, dir, &dir_gen);
+ if (ret < 0)
+ goto out;
+
+ data.dir = dir;
+ data.dir_gen = dir_gen;
+ set_ref_path(&data, name);
+ node = rb_find(&data, &sctx->rbtree_deleted_refs, rbtree_ref_comp);
+ if (node) {
+ ref = rb_entry(node, struct recorded_ref, node);
+ recorded_ref_free(ref);
+ } else {
+ ret = record_ref_in_tree(&sctx->rbtree_new_refs,
+ &sctx->new_refs, name, dir, dir_gen,
+ sctx);
+ }
+out:
+ return ret;
+}
+
+static int record_deleted_ref_if_needed(int num, u64 dir, int index,
+ struct fs_path *name, void *ctx)
+{
+ int ret = 0;
+ struct send_ctx *sctx = ctx;
+ struct rb_node *node = NULL;
+ struct recorded_ref data;
+ struct recorded_ref *ref;
+ u64 dir_gen;
+
+ ret = get_inode_gen(sctx->parent_root, dir, &dir_gen);
+ if (ret < 0)
+ goto out;
+
+ data.dir = dir;
+ data.dir_gen = dir_gen;
+ set_ref_path(&data, name);
+ node = rb_find(&data, &sctx->rbtree_new_refs, rbtree_ref_comp);
+ if (node) {
+ ref = rb_entry(node, struct recorded_ref, node);
+ recorded_ref_free(ref);
+ } else {
+ ret = record_ref_in_tree(&sctx->rbtree_deleted_refs,
+ &sctx->deleted_refs, name, dir,
+ dir_gen, sctx);
+ }
+out:
+ return ret;
+}
+
+static int record_new_ref(struct send_ctx *sctx)
+{
+ int ret;
+
+ ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
+ sctx->cmp_key, 0, record_new_ref_if_needed, sctx);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+
+out:
+ return ret;
+}
+
+static int record_deleted_ref(struct send_ctx *sctx)
+{
+ int ret;
+
+ ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
+ sctx->cmp_key, 0, record_deleted_ref_if_needed,
+ sctx);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+
+out:
+ return ret;
+}
+
+static int record_changed_ref(struct send_ctx *sctx)
+{
+ int ret = 0;
+
+ ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
+ sctx->cmp_key, 0, record_new_ref_if_needed, sctx);
+ if (ret < 0)
+ goto out;
+ ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
+ sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+
+out:
+ return ret;
+}
+
+/*
+ * Record and process all refs at once. Needed when an inode changes the
+ * generation number, which means that it was deleted and recreated.
+ */
+static int process_all_refs(struct send_ctx *sctx,
+ enum btrfs_compare_tree_result cmd)
+{
+ int ret = 0;
+ int iter_ret = 0;
+ struct btrfs_root *root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ iterate_inode_ref_t cb;
+ int pending_move = 0;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ if (cmd == BTRFS_COMPARE_TREE_NEW) {
+ root = sctx->send_root;
+ cb = record_new_ref_if_needed;
+ } else if (cmd == BTRFS_COMPARE_TREE_DELETED) {
+ root = sctx->parent_root;
+ cb = record_deleted_ref_if_needed;
+ } else {
+ btrfs_err(sctx->send_root->fs_info,
+ "Wrong command %d in process_all_refs", cmd);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ key.objectid = sctx->cmp_key->objectid;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = 0;
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
+ if (found_key.objectid != key.objectid ||
+ (found_key.type != BTRFS_INODE_REF_KEY &&
+ found_key.type != BTRFS_INODE_EXTREF_KEY))
+ break;
+
+ ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
+ if (ret < 0)
+ goto out;
+ }
+ /* Catch error found during iteration */
+ if (iter_ret < 0) {
+ ret = iter_ret;
+ goto out;
+ }
+ btrfs_release_path(path);
+
+ /*
+ * We don't actually care about pending_move as we are simply
+ * re-creating this inode and will be rename'ing it into place once we
+ * rename the parent directory.
+ */
+ ret = process_recorded_refs(sctx, &pending_move);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int send_set_xattr(struct send_ctx *sctx,
+ struct fs_path *path,
+ const char *name, int name_len,
+ const char *data, int data_len)
+{
+ int ret = 0;
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
+ TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
+ TLV_PUT(sctx, BTRFS_SEND_A_XATTR_DATA, data, data_len);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ return ret;
+}
+
+static int send_remove_xattr(struct send_ctx *sctx,
+ struct fs_path *path,
+ const char *name, int name_len)
+{
+ int ret = 0;
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
+ TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ return ret;
+}
+
+static int __process_new_xattr(int num, struct btrfs_key *di_key,
+ const char *name, int name_len, const char *data,
+ int data_len, void *ctx)
+{
+ int ret;
+ struct send_ctx *sctx = ctx;
+ struct fs_path *p;
+ struct posix_acl_xattr_header dummy_acl;
+
+ /* Capabilities are emitted by finish_inode_if_needed */
+ if (!strncmp(name, XATTR_NAME_CAPS, name_len))
+ return 0;
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ /*
+ * This hack is needed because empty acls are stored as zero byte
+ * data in xattrs. Problem with that is, that receiving these zero byte
+ * acls will fail later. To fix this, we send a dummy acl list that
+ * only contains the version number and no entries.
+ */
+ if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) ||
+ !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, name_len)) {
+ if (data_len == 0) {
+ dummy_acl.a_version =
+ cpu_to_le32(POSIX_ACL_XATTR_VERSION);
+ data = (char *)&dummy_acl;
+ data_len = sizeof(dummy_acl);
+ }
+ }
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+ if (ret < 0)
+ goto out;
+
+ ret = send_set_xattr(sctx, p, name, name_len, data, data_len);
+
+out:
+ fs_path_free(p);
+ return ret;
+}
+
+static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
+ const char *name, int name_len,
+ const char *data, int data_len, void *ctx)
+{
+ int ret;
+ struct send_ctx *sctx = ctx;
+ struct fs_path *p;
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+ if (ret < 0)
+ goto out;
+
+ ret = send_remove_xattr(sctx, p, name, name_len);
+
+out:
+ fs_path_free(p);
+ return ret;
+}
+
+static int process_new_xattr(struct send_ctx *sctx)
+{
+ int ret = 0;
+
+ ret = iterate_dir_item(sctx->send_root, sctx->left_path,
+ __process_new_xattr, sctx);
+
+ return ret;
+}
+
+static int process_deleted_xattr(struct send_ctx *sctx)
+{
+ return iterate_dir_item(sctx->parent_root, sctx->right_path,
+ __process_deleted_xattr, sctx);
+}
+
+struct find_xattr_ctx {
+ const char *name;
+ int name_len;
+ int found_idx;
+ char *found_data;
+ int found_data_len;
+};
+
+static int __find_xattr(int num, struct btrfs_key *di_key, const char *name,
+ int name_len, const char *data, int data_len, void *vctx)
+{
+ struct find_xattr_ctx *ctx = vctx;
+
+ if (name_len == ctx->name_len &&
+ strncmp(name, ctx->name, name_len) == 0) {
+ ctx->found_idx = num;
+ ctx->found_data_len = data_len;
+ ctx->found_data = kmemdup(data, data_len, GFP_KERNEL);
+ if (!ctx->found_data)
+ return -ENOMEM;
+ return 1;
+ }
+ return 0;
+}
+
+static int find_xattr(struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *key,
+ const char *name, int name_len,
+ char **data, int *data_len)
+{
+ int ret;
+ struct find_xattr_ctx ctx;
+
+ ctx.name = name;
+ ctx.name_len = name_len;
+ ctx.found_idx = -1;
+ ctx.found_data = NULL;
+ ctx.found_data_len = 0;
+
+ ret = iterate_dir_item(root, path, __find_xattr, &ctx);
+ if (ret < 0)
+ return ret;
+
+ if (ctx.found_idx == -1)
+ return -ENOENT;
+ if (data) {
+ *data = ctx.found_data;
+ *data_len = ctx.found_data_len;
+ } else {
+ kfree(ctx.found_data);
+ }
+ return ctx.found_idx;
+}
+
+
+static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
+ const char *name, int name_len,
+ const char *data, int data_len,
+ void *ctx)
+{
+ int ret;
+ struct send_ctx *sctx = ctx;
+ char *found_data = NULL;
+ int found_data_len = 0;
+
+ ret = find_xattr(sctx->parent_root, sctx->right_path,
+ sctx->cmp_key, name, name_len, &found_data,
+ &found_data_len);
+ if (ret == -ENOENT) {
+ ret = __process_new_xattr(num, di_key, name, name_len, data,
+ data_len, ctx);
+ } else if (ret >= 0) {
+ if (data_len != found_data_len ||
+ memcmp(data, found_data, data_len)) {
+ ret = __process_new_xattr(num, di_key, name, name_len,
+ data, data_len, ctx);
+ } else {
+ ret = 0;
+ }
+ }
+
+ kfree(found_data);
+ return ret;
+}
+
+static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
+ const char *name, int name_len,
+ const char *data, int data_len,
+ void *ctx)
+{
+ int ret;
+ struct send_ctx *sctx = ctx;
+
+ ret = find_xattr(sctx->send_root, sctx->left_path, sctx->cmp_key,
+ name, name_len, NULL, NULL);
+ if (ret == -ENOENT)
+ ret = __process_deleted_xattr(num, di_key, name, name_len, data,
+ data_len, ctx);
+ else if (ret >= 0)
+ ret = 0;
+
+ return ret;
+}
+
+static int process_changed_xattr(struct send_ctx *sctx)
+{
+ int ret = 0;
+
+ ret = iterate_dir_item(sctx->send_root, sctx->left_path,
+ __process_changed_new_xattr, sctx);
+ if (ret < 0)
+ goto out;
+ ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
+ __process_changed_deleted_xattr, sctx);
+
+out:
+ return ret;
+}
+
+static int process_all_new_xattrs(struct send_ctx *sctx)
+{
+ int ret = 0;
+ int iter_ret = 0;
+ struct btrfs_root *root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ root = sctx->send_root;
+
+ key.objectid = sctx->cmp_key->objectid;
+ key.type = BTRFS_XATTR_ITEM_KEY;
+ key.offset = 0;
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
+ if (found_key.objectid != key.objectid ||
+ found_key.type != key.type) {
+ ret = 0;
+ break;
+ }
+
+ ret = iterate_dir_item(root, path, __process_new_xattr, sctx);
+ if (ret < 0)
+ break;
+ }
+ /* Catch error found during iteration */
+ if (iter_ret < 0)
+ ret = iter_ret;
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int send_verity(struct send_ctx *sctx, struct fs_path *path,
+ struct fsverity_descriptor *desc)
+{
+ int ret;
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_ENABLE_VERITY);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
+ TLV_PUT_U8(sctx, BTRFS_SEND_A_VERITY_ALGORITHM,
+ le8_to_cpu(desc->hash_algorithm));
+ TLV_PUT_U32(sctx, BTRFS_SEND_A_VERITY_BLOCK_SIZE,
+ 1U << le8_to_cpu(desc->log_blocksize));
+ TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SALT_DATA, desc->salt,
+ le8_to_cpu(desc->salt_size));
+ TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SIG_DATA, desc->signature,
+ le32_to_cpu(desc->sig_size));
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ return ret;
+}
+
+static int process_verity(struct send_ctx *sctx)
+{
+ int ret = 0;
+ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
+ struct inode *inode;
+ struct fs_path *p;
+
+ inode = btrfs_iget(fs_info->sb, sctx->cur_ino, sctx->send_root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ ret = btrfs_get_verity_descriptor(inode, NULL, 0);
+ if (ret < 0)
+ goto iput;
+
+ if (ret > FS_VERITY_MAX_DESCRIPTOR_SIZE) {
+ ret = -EMSGSIZE;
+ goto iput;
+ }
+ if (!sctx->verity_descriptor) {
+ sctx->verity_descriptor = kvmalloc(FS_VERITY_MAX_DESCRIPTOR_SIZE,
+ GFP_KERNEL);
+ if (!sctx->verity_descriptor) {
+ ret = -ENOMEM;
+ goto iput;
+ }
+ }
+
+ ret = btrfs_get_verity_descriptor(inode, sctx->verity_descriptor, ret);
+ if (ret < 0)
+ goto iput;
+
+ p = fs_path_alloc();
+ if (!p) {
+ ret = -ENOMEM;
+ goto iput;
+ }
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+ if (ret < 0)
+ goto free_path;
+
+ ret = send_verity(sctx, p, sctx->verity_descriptor);
+ if (ret < 0)
+ goto free_path;
+
+free_path:
+ fs_path_free(p);
+iput:
+ iput(inode);
+ return ret;
+}
+
+static inline u64 max_send_read_size(const struct send_ctx *sctx)
+{
+ return sctx->send_max_size - SZ_16K;
+}
+
+static int put_data_header(struct send_ctx *sctx, u32 len)
+{
+ if (WARN_ON_ONCE(sctx->put_data))
+ return -EINVAL;
+ sctx->put_data = true;
+ if (sctx->proto >= 2) {
+ /*
+ * Since v2, the data attribute header doesn't include a length,
+ * it is implicitly to the end of the command.
+ */
+ if (sctx->send_max_size - sctx->send_size < sizeof(__le16) + len)
+ return -EOVERFLOW;
+ put_unaligned_le16(BTRFS_SEND_A_DATA, sctx->send_buf + sctx->send_size);
+ sctx->send_size += sizeof(__le16);
+ } else {
+ struct btrfs_tlv_header *hdr;
+
+ if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len)
+ return -EOVERFLOW;
+ hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size);
+ put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type);
+ put_unaligned_le16(len, &hdr->tlv_len);
+ sctx->send_size += sizeof(*hdr);
+ }
+ return 0;
+}
+
+static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
+{
+ struct btrfs_root *root = sctx->send_root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct page *page;
+ pgoff_t index = offset >> PAGE_SHIFT;
+ pgoff_t last_index;
+ unsigned pg_offset = offset_in_page(offset);
+ int ret;
+
+ ret = put_data_header(sctx, len);
+ if (ret)
+ return ret;
+
+ last_index = (offset + len - 1) >> PAGE_SHIFT;
+
+ while (index <= last_index) {
+ unsigned cur_len = min_t(unsigned, len,
+ PAGE_SIZE - pg_offset);
+
+ page = find_lock_page(sctx->cur_inode->i_mapping, index);
+ if (!page) {
+ page_cache_sync_readahead(sctx->cur_inode->i_mapping,
+ &sctx->ra, NULL, index,
+ last_index + 1 - index);
+
+ page = find_or_create_page(sctx->cur_inode->i_mapping,
+ index, GFP_KERNEL);
+ if (!page) {
+ ret = -ENOMEM;
+ break;
+ }
+ }
+
+ if (PageReadahead(page))
+ page_cache_async_readahead(sctx->cur_inode->i_mapping,
+ &sctx->ra, NULL, page_folio(page),
+ index, last_index + 1 - index);
+
+ if (!PageUptodate(page)) {
+ btrfs_read_folio(NULL, page_folio(page));
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ btrfs_err(fs_info,
+ "send: IO error at offset %llu for inode %llu root %llu",
+ page_offset(page), sctx->cur_ino,
+ sctx->send_root->root_key.objectid);
+ put_page(page);
+ ret = -EIO;
+ break;
+ }
+ }
+
+ memcpy_from_page(sctx->send_buf + sctx->send_size, page,
+ pg_offset, cur_len);
+ unlock_page(page);
+ put_page(page);
+ index++;
+ pg_offset = 0;
+ len -= cur_len;
+ sctx->send_size += cur_len;
+ }
+
+ return ret;
+}
+
+/*
+ * Read some bytes from the current inode/file and send a write command to
+ * user space.
+ */
+static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
+{
+ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
+ int ret = 0;
+ struct fs_path *p;
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len);
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+ ret = put_file_data(sctx, offset, len);
+ if (ret < 0)
+ goto out;
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ fs_path_free(p);
+ return ret;
+}
+
+/*
+ * Send a clone command to user space.
+ */
+static int send_clone(struct send_ctx *sctx,
+ u64 offset, u32 len,
+ struct clone_root *clone_root)
+{
+ int ret = 0;
+ struct fs_path *p;
+ u64 gen;
+
+ btrfs_debug(sctx->send_root->fs_info,
+ "send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu",
+ offset, len, clone_root->root->root_key.objectid,
+ clone_root->ino, clone_root->offset);
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_CLONE);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+
+ if (clone_root->root == sctx->send_root) {
+ ret = get_inode_gen(sctx->send_root, clone_root->ino, &gen);
+ if (ret < 0)
+ goto out;
+ ret = get_cur_path(sctx, clone_root->ino, gen, p);
+ } else {
+ ret = get_inode_path(clone_root->root, clone_root->ino, p);
+ }
+ if (ret < 0)
+ goto out;
+
+ /*
+ * If the parent we're using has a received_uuid set then use that as
+ * our clone source as that is what we will look for when doing a
+ * receive.
+ *
+ * This covers the case that we create a snapshot off of a received
+ * subvolume and then use that as the parent and try to receive on a
+ * different host.
+ */
+ if (!btrfs_is_empty_uuid(clone_root->root->root_item.received_uuid))
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+ clone_root->root->root_item.received_uuid);
+ else
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+ clone_root->root->root_item.uuid);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
+ btrfs_root_ctransid(&clone_root->root->root_item));
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
+ clone_root->offset);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ fs_path_free(p);
+ return ret;
+}
+
+/*
+ * Send an update extent command to user space.
+ */
+static int send_update_extent(struct send_ctx *sctx,
+ u64 offset, u32 len)
+{
+ int ret = 0;
+ struct fs_path *p;
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ fs_path_free(p);
+ return ret;
+}
+
+static int send_hole(struct send_ctx *sctx, u64 end)
+{
+ struct fs_path *p = NULL;
+ u64 read_size = max_send_read_size(sctx);
+ u64 offset = sctx->cur_inode_last_extent;
+ int ret = 0;
+
+ /*
+ * A hole that starts at EOF or beyond it. Since we do not yet support
+ * fallocate (for extent preallocation and hole punching), sending a
+ * write of zeroes starting at EOF or beyond would later require issuing
+ * a truncate operation which would undo the write and achieve nothing.
+ */
+ if (offset >= sctx->cur_inode_size)
+ return 0;
+
+ /*
+ * Don't go beyond the inode's i_size due to prealloc extents that start
+ * after the i_size.
+ */
+ end = min_t(u64, end, sctx->cur_inode_size);
+
+ if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
+ return send_update_extent(sctx, offset, end - offset);
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+ if (ret < 0)
+ goto tlv_put_failure;
+ while (offset < end) {
+ u64 len = min(end - offset, read_size);
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
+ if (ret < 0)
+ break;
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+ ret = put_data_header(sctx, len);
+ if (ret < 0)
+ break;
+ memset(sctx->send_buf + sctx->send_size, 0, len);
+ sctx->send_size += len;
+ ret = send_cmd(sctx);
+ if (ret < 0)
+ break;
+ offset += len;
+ }
+ sctx->cur_inode_next_write_offset = offset;
+tlv_put_failure:
+ fs_path_free(p);
+ return ret;
+}
+
+static int send_encoded_inline_extent(struct send_ctx *sctx,
+ struct btrfs_path *path, u64 offset,
+ u64 len)
+{
+ struct btrfs_root *root = sctx->send_root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct inode *inode;
+ struct fs_path *fspath;
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *ei;
+ u64 ram_bytes;
+ size_t inline_size;
+ int ret;
+
+ inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ fspath = fs_path_alloc();
+ if (!fspath) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
+ if (ret < 0)
+ goto out;
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+ ram_bytes = btrfs_file_extent_ram_bytes(leaf, ei);
+ inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN,
+ min(key.offset + ram_bytes - offset, len));
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN, ram_bytes);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET, offset - key.offset);
+ ret = btrfs_encoded_io_compression_from_extent(fs_info,
+ btrfs_file_extent_compression(leaf, ei));
+ if (ret < 0)
+ goto out;
+ TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret);
+
+ ret = put_data_header(sctx, inline_size);
+ if (ret < 0)
+ goto out;
+ read_extent_buffer(leaf, sctx->send_buf + sctx->send_size,
+ btrfs_file_extent_inline_start(ei), inline_size);
+ sctx->send_size += inline_size;
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ fs_path_free(fspath);
+ iput(inode);
+ return ret;
+}
+
+static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
+ u64 offset, u64 len)
+{
+ struct btrfs_root *root = sctx->send_root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct inode *inode;
+ struct fs_path *fspath;
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *ei;
+ u64 disk_bytenr, disk_num_bytes;
+ u32 data_offset;
+ struct btrfs_cmd_header *hdr;
+ u32 crc;
+ int ret;
+
+ inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ fspath = fs_path_alloc();
+ if (!fspath) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
+ if (ret < 0)
+ goto out;
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, ei);
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN,
+ min(key.offset + btrfs_file_extent_num_bytes(leaf, ei) - offset,
+ len));
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN,
+ btrfs_file_extent_ram_bytes(leaf, ei));
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET,
+ offset - key.offset + btrfs_file_extent_offset(leaf, ei));
+ ret = btrfs_encoded_io_compression_from_extent(fs_info,
+ btrfs_file_extent_compression(leaf, ei));
+ if (ret < 0)
+ goto out;
+ TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret);
+ TLV_PUT_U32(sctx, BTRFS_SEND_A_ENCRYPTION, 0);
+
+ ret = put_data_header(sctx, disk_num_bytes);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * We want to do I/O directly into the send buffer, so get the next page
+ * boundary in the send buffer. This means that there may be a gap
+ * between the beginning of the command and the file data.
+ */
+ data_offset = ALIGN(sctx->send_size, PAGE_SIZE);
+ if (data_offset > sctx->send_max_size ||
+ sctx->send_max_size - data_offset < disk_num_bytes) {
+ ret = -EOVERFLOW;
+ goto out;
+ }
+
+ /*
+ * Note that send_buf is a mapping of send_buf_pages, so this is really
+ * reading into send_buf.
+ */
+ ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), offset,
+ disk_bytenr, disk_num_bytes,
+ sctx->send_buf_pages +
+ (data_offset >> PAGE_SHIFT));
+ if (ret)
+ goto out;
+
+ hdr = (struct btrfs_cmd_header *)sctx->send_buf;
+ hdr->len = cpu_to_le32(sctx->send_size + disk_num_bytes - sizeof(*hdr));
+ hdr->crc = 0;
+ crc = btrfs_crc32c(0, sctx->send_buf, sctx->send_size);
+ crc = btrfs_crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes);
+ hdr->crc = cpu_to_le32(crc);
+
+ ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
+ &sctx->send_off);
+ if (!ret) {
+ ret = write_buf(sctx->send_filp, sctx->send_buf + data_offset,
+ disk_num_bytes, &sctx->send_off);
+ }
+ sctx->send_size = 0;
+ sctx->put_data = false;
+
+tlv_put_failure:
+out:
+ fs_path_free(fspath);
+ iput(inode);
+ return ret;
+}
+
+static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path,
+ const u64 offset, const u64 len)
+{
+ const u64 end = offset + len;
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_file_extent_item *ei;
+ u64 read_size = max_send_read_size(sctx);
+ u64 sent = 0;
+
+ if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
+ return send_update_extent(sctx, offset, len);
+
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) &&
+ btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) {
+ bool is_inline = (btrfs_file_extent_type(leaf, ei) ==
+ BTRFS_FILE_EXTENT_INLINE);
+
+ /*
+ * Send the compressed extent unless the compressed data is
+ * larger than the decompressed data. This can happen if we're
+ * not sending the entire extent, either because it has been
+ * partially overwritten/truncated or because this is a part of
+ * the extent that we couldn't clone in clone_range().
+ */
+ if (is_inline &&
+ btrfs_file_extent_inline_item_len(leaf,
+ path->slots[0]) <= len) {
+ return send_encoded_inline_extent(sctx, path, offset,
+ len);
+ } else if (!is_inline &&
+ btrfs_file_extent_disk_num_bytes(leaf, ei) <= len) {
+ return send_encoded_extent(sctx, path, offset, len);
+ }
+ }
+
+ if (sctx->cur_inode == NULL) {
+ struct btrfs_root *root = sctx->send_root;
+
+ sctx->cur_inode = btrfs_iget(root->fs_info->sb, sctx->cur_ino, root);
+ if (IS_ERR(sctx->cur_inode)) {
+ int err = PTR_ERR(sctx->cur_inode);
+
+ sctx->cur_inode = NULL;
+ return err;
+ }
+ memset(&sctx->ra, 0, sizeof(struct file_ra_state));
+ file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping);
+
+ /*
+ * It's very likely there are no pages from this inode in the page
+ * cache, so after reading extents and sending their data, we clean
+ * the page cache to avoid trashing the page cache (adding pressure
+ * to the page cache and forcing eviction of other data more useful
+ * for applications).
+ *
+ * We decide if we should clean the page cache simply by checking
+ * if the inode's mapping nrpages is 0 when we first open it, and
+ * not by using something like filemap_range_has_page() before
+ * reading an extent because when we ask the readahead code to
+ * read a given file range, it may (and almost always does) read
+ * pages from beyond that range (see the documentation for
+ * page_cache_sync_readahead()), so it would not be reliable,
+ * because after reading the first extent future calls to
+ * filemap_range_has_page() would return true because the readahead
+ * on the previous extent resulted in reading pages of the current
+ * extent as well.
+ */
+ sctx->clean_page_cache = (sctx->cur_inode->i_mapping->nrpages == 0);
+ sctx->page_cache_clear_start = round_down(offset, PAGE_SIZE);
+ }
+
+ while (sent < len) {
+ u64 size = min(len - sent, read_size);
+ int ret;
+
+ ret = send_write(sctx, offset + sent, size);
+ if (ret < 0)
+ return ret;
+ sent += size;
+ }
+
+ if (sctx->clean_page_cache && IS_ALIGNED(end, PAGE_SIZE)) {
+ /*
+ * Always operate only on ranges that are a multiple of the page
+ * size. This is not only to prevent zeroing parts of a page in
+ * the case of subpage sector size, but also to guarantee we evict
+ * pages, as passing a range that is smaller than page size does
+ * not evict the respective page (only zeroes part of its content).
+ *
+ * Always start from the end offset of the last range cleared.
+ * This is because the readahead code may (and very often does)
+ * reads pages beyond the range we request for readahead. So if
+ * we have an extent layout like this:
+ *
+ * [ extent A ] [ extent B ] [ extent C ]
+ *
+ * When we ask page_cache_sync_readahead() to read extent A, it
+ * may also trigger reads for pages of extent B. If we are doing
+ * an incremental send and extent B has not changed between the
+ * parent and send snapshots, some or all of its pages may end
+ * up being read and placed in the page cache. So when truncating
+ * the page cache we always start from the end offset of the
+ * previously processed extent up to the end of the current
+ * extent.
+ */
+ truncate_inode_pages_range(&sctx->cur_inode->i_data,
+ sctx->page_cache_clear_start,
+ end - 1);
+ sctx->page_cache_clear_start = end;
+ }
+
+ return 0;
+}
+
+/*
+ * Search for a capability xattr related to sctx->cur_ino. If the capability is
+ * found, call send_set_xattr function to emit it.
+ *
+ * Return 0 if there isn't a capability, or when the capability was emitted
+ * successfully, or < 0 if an error occurred.
+ */
+static int send_capabilities(struct send_ctx *sctx)
+{
+ struct fs_path *fspath = NULL;
+ struct btrfs_path *path;
+ struct btrfs_dir_item *di;
+ struct extent_buffer *leaf;
+ unsigned long data_ptr;
+ char *buf = NULL;
+ int buf_len;
+ int ret = 0;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ di = btrfs_lookup_xattr(NULL, sctx->send_root, path, sctx->cur_ino,
+ XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), 0);
+ if (!di) {
+ /* There is no xattr for this inode */
+ goto out;
+ } else if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ buf_len = btrfs_dir_data_len(leaf, di);
+
+ fspath = fs_path_alloc();
+ buf = kmalloc(buf_len, GFP_KERNEL);
+ if (!fspath || !buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
+ if (ret < 0)
+ goto out;
+
+ data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di);
+ read_extent_buffer(leaf, buf, data_ptr, buf_len);
+
+ ret = send_set_xattr(sctx, fspath, XATTR_NAME_CAPS,
+ strlen(XATTR_NAME_CAPS), buf, buf_len);
+out:
+ kfree(buf);
+ fs_path_free(fspath);
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
+ struct clone_root *clone_root, const u64 disk_byte,
+ u64 data_offset, u64 offset, u64 len)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret;
+ struct btrfs_inode_info info;
+ u64 clone_src_i_size = 0;
+
+ /*
+ * Prevent cloning from a zero offset with a length matching the sector
+ * size because in some scenarios this will make the receiver fail.
+ *
+ * For example, if in the source filesystem the extent at offset 0
+ * has a length of sectorsize and it was written using direct IO, then
+ * it can never be an inline extent (even if compression is enabled).
+ * Then this extent can be cloned in the original filesystem to a non
+ * zero file offset, but it may not be possible to clone in the
+ * destination filesystem because it can be inlined due to compression
+ * on the destination filesystem (as the receiver's write operations are
+ * always done using buffered IO). The same happens when the original
+ * filesystem does not have compression enabled but the destination
+ * filesystem has.
+ */
+ if (clone_root->offset == 0 &&
+ len == sctx->send_root->fs_info->sectorsize)
+ return send_extent_data(sctx, dst_path, offset, len);
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ /*
+ * There are inodes that have extents that lie behind its i_size. Don't
+ * accept clones from these extents.
+ */
+ ret = get_inode_info(clone_root->root, clone_root->ino, &info);
+ btrfs_release_path(path);
+ if (ret < 0)
+ goto out;
+ clone_src_i_size = info.size;
+
+ /*
+ * We can't send a clone operation for the entire range if we find
+ * extent items in the respective range in the source file that
+ * refer to different extents or if we find holes.
+ * So check for that and do a mix of clone and regular write/copy
+ * operations if needed.
+ *
+ * Example:
+ *
+ * mkfs.btrfs -f /dev/sda
+ * mount /dev/sda /mnt
+ * xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo
+ * cp --reflink=always /mnt/foo /mnt/bar
+ * xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo
+ * btrfs subvolume snapshot -r /mnt /mnt/snap
+ *
+ * If when we send the snapshot and we are processing file bar (which
+ * has a higher inode number than foo) we blindly send a clone operation
+ * for the [0, 100K[ range from foo to bar, the receiver ends up getting
+ * a file bar that matches the content of file foo - iow, doesn't match
+ * the content from bar in the original filesystem.
+ */
+ key.objectid = clone_root->ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = clone_root->offset;
+ ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0 && path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+ if (key.objectid == clone_root->ino &&
+ key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
+
+ while (true) {
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+ struct btrfs_file_extent_item *ei;
+ u8 type;
+ u64 ext_len;
+ u64 clone_len;
+ u64 clone_data_offset;
+ bool crossed_src_i_size = false;
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(clone_root->root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ /*
+ * We might have an implicit trailing hole (NO_HOLES feature
+ * enabled). We deal with it after leaving this loop.
+ */
+ if (key.objectid != clone_root->ino ||
+ key.type != BTRFS_EXTENT_DATA_KEY)
+ break;
+
+ ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+ type = btrfs_file_extent_type(leaf, ei);
+ if (type == BTRFS_FILE_EXTENT_INLINE) {
+ ext_len = btrfs_file_extent_ram_bytes(leaf, ei);
+ ext_len = PAGE_ALIGN(ext_len);
+ } else {
+ ext_len = btrfs_file_extent_num_bytes(leaf, ei);
+ }
+
+ if (key.offset + ext_len <= clone_root->offset)
+ goto next;
+
+ if (key.offset > clone_root->offset) {
+ /* Implicit hole, NO_HOLES feature enabled. */
+ u64 hole_len = key.offset - clone_root->offset;
+
+ if (hole_len > len)
+ hole_len = len;
+ ret = send_extent_data(sctx, dst_path, offset,
+ hole_len);
+ if (ret < 0)
+ goto out;
+
+ len -= hole_len;
+ if (len == 0)
+ break;
+ offset += hole_len;
+ clone_root->offset += hole_len;
+ data_offset += hole_len;
+ }
+
+ if (key.offset >= clone_root->offset + len)
+ break;
+
+ if (key.offset >= clone_src_i_size)
+ break;
+
+ if (key.offset + ext_len > clone_src_i_size) {
+ ext_len = clone_src_i_size - key.offset;
+ crossed_src_i_size = true;
+ }
+
+ clone_data_offset = btrfs_file_extent_offset(leaf, ei);
+ if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) {
+ clone_root->offset = key.offset;
+ if (clone_data_offset < data_offset &&
+ clone_data_offset + ext_len > data_offset) {
+ u64 extent_offset;
+
+ extent_offset = data_offset - clone_data_offset;
+ ext_len -= extent_offset;
+ clone_data_offset += extent_offset;
+ clone_root->offset += extent_offset;
+ }
+ }
+
+ clone_len = min_t(u64, ext_len, len);
+
+ if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte &&
+ clone_data_offset == data_offset) {
+ const u64 src_end = clone_root->offset + clone_len;
+ const u64 sectorsize = SZ_64K;
+
+ /*
+ * We can't clone the last block, when its size is not
+ * sector size aligned, into the middle of a file. If we
+ * do so, the receiver will get a failure (-EINVAL) when
+ * trying to clone or will silently corrupt the data in
+ * the destination file if it's on a kernel without the
+ * fix introduced by commit ac765f83f1397646
+ * ("Btrfs: fix data corruption due to cloning of eof
+ * block).
+ *
+ * So issue a clone of the aligned down range plus a
+ * regular write for the eof block, if we hit that case.
+ *
+ * Also, we use the maximum possible sector size, 64K,
+ * because we don't know what's the sector size of the
+ * filesystem that receives the stream, so we have to
+ * assume the largest possible sector size.
+ */
+ if (src_end == clone_src_i_size &&
+ !IS_ALIGNED(src_end, sectorsize) &&
+ offset + clone_len < sctx->cur_inode_size) {
+ u64 slen;
+
+ slen = ALIGN_DOWN(src_end - clone_root->offset,
+ sectorsize);
+ if (slen > 0) {
+ ret = send_clone(sctx, offset, slen,
+ clone_root);
+ if (ret < 0)
+ goto out;
+ }
+ ret = send_extent_data(sctx, dst_path,
+ offset + slen,
+ clone_len - slen);
+ } else {
+ ret = send_clone(sctx, offset, clone_len,
+ clone_root);
+ }
+ } else if (crossed_src_i_size && clone_len < len) {
+ /*
+ * If we are at i_size of the clone source inode and we
+ * can not clone from it, terminate the loop. This is
+ * to avoid sending two write operations, one with a
+ * length matching clone_len and the final one after
+ * this loop with a length of len - clone_len.
+ *
+ * When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED
+ * was passed to the send ioctl), this helps avoid
+ * sending an encoded write for an offset that is not
+ * sector size aligned, in case the i_size of the source
+ * inode is not sector size aligned. That will make the
+ * receiver fallback to decompression of the data and
+ * writing it using regular buffered IO, therefore while
+ * not incorrect, it's not optimal due decompression and
+ * possible re-compression at the receiver.
+ */
+ break;
+ } else {
+ ret = send_extent_data(sctx, dst_path, offset,
+ clone_len);
+ }
+
+ if (ret < 0)
+ goto out;
+
+ len -= clone_len;
+ if (len == 0)
+ break;
+ offset += clone_len;
+ clone_root->offset += clone_len;
+
+ /*
+ * If we are cloning from the file we are currently processing,
+ * and using the send root as the clone root, we must stop once
+ * the current clone offset reaches the current eof of the file
+ * at the receiver, otherwise we would issue an invalid clone
+ * operation (source range going beyond eof) and cause the
+ * receiver to fail. So if we reach the current eof, bail out
+ * and fallback to a regular write.
+ */
+ if (clone_root->root == sctx->send_root &&
+ clone_root->ino == sctx->cur_ino &&
+ clone_root->offset >= sctx->cur_inode_next_write_offset)
+ break;
+
+ data_offset += clone_len;
+next:
+ path->slots[0]++;
+ }
+
+ if (len > 0)
+ ret = send_extent_data(sctx, dst_path, offset, len);
+ else
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int send_write_or_clone(struct send_ctx *sctx,
+ struct btrfs_path *path,
+ struct btrfs_key *key,
+ struct clone_root *clone_root)
+{
+ int ret = 0;
+ u64 offset = key->offset;
+ u64 end;
+ u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
+
+ end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size);
+ if (offset >= end)
+ return 0;
+
+ if (clone_root && IS_ALIGNED(end, bs)) {
+ struct btrfs_file_extent_item *ei;
+ u64 disk_byte;
+ u64 data_offset;
+
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
+ data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
+ ret = clone_range(sctx, path, clone_root, disk_byte,
+ data_offset, offset, end - offset);
+ } else {
+ ret = send_extent_data(sctx, path, offset, end - offset);
+ }
+ sctx->cur_inode_next_write_offset = end;
+ return ret;
+}
+
+static int is_extent_unchanged(struct send_ctx *sctx,
+ struct btrfs_path *left_path,
+ struct btrfs_key *ekey)
+{
+ int ret = 0;
+ struct btrfs_key key;
+ struct btrfs_path *path = NULL;
+ struct extent_buffer *eb;
+ int slot;
+ struct btrfs_key found_key;
+ struct btrfs_file_extent_item *ei;
+ u64 left_disknr;
+ u64 right_disknr;
+ u64 left_offset;
+ u64 right_offset;
+ u64 left_offset_fixed;
+ u64 left_len;
+ u64 right_len;
+ u64 left_gen;
+ u64 right_gen;
+ u8 left_type;
+ u8 right_type;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ eb = left_path->nodes[0];
+ slot = left_path->slots[0];
+ ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+ left_type = btrfs_file_extent_type(eb, ei);
+
+ if (left_type != BTRFS_FILE_EXTENT_REG) {
+ ret = 0;
+ goto out;
+ }
+ left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
+ left_len = btrfs_file_extent_num_bytes(eb, ei);
+ left_offset = btrfs_file_extent_offset(eb, ei);
+ left_gen = btrfs_file_extent_generation(eb, ei);
+
+ /*
+ * Following comments will refer to these graphics. L is the left
+ * extents which we are checking at the moment. 1-8 are the right
+ * extents that we iterate.
+ *
+ * |-----L-----|
+ * |-1-|-2a-|-3-|-4-|-5-|-6-|
+ *
+ * |-----L-----|
+ * |--1--|-2b-|...(same as above)
+ *
+ * Alternative situation. Happens on files where extents got split.
+ * |-----L-----|
+ * |-----------7-----------|-6-|
+ *
+ * Alternative situation. Happens on files which got larger.
+ * |-----L-----|
+ * |-8-|
+ * Nothing follows after 8.
+ */
+
+ key.objectid = ekey->objectid;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = ekey->offset;
+ ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+
+ /*
+ * Handle special case where the right side has no extents at all.
+ */
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(eb, &found_key, slot);
+ if (found_key.objectid != key.objectid ||
+ found_key.type != key.type) {
+ /* If we're a hole then just pretend nothing changed */
+ ret = (left_disknr) ? 0 : 1;
+ goto out;
+ }
+
+ /*
+ * We're now on 2a, 2b or 7.
+ */
+ key = found_key;
+ while (key.offset < ekey->offset + left_len) {
+ ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+ right_type = btrfs_file_extent_type(eb, ei);
+ if (right_type != BTRFS_FILE_EXTENT_REG &&
+ right_type != BTRFS_FILE_EXTENT_INLINE) {
+ ret = 0;
+ goto out;
+ }
+
+ if (right_type == BTRFS_FILE_EXTENT_INLINE) {
+ right_len = btrfs_file_extent_ram_bytes(eb, ei);
+ right_len = PAGE_ALIGN(right_len);
+ } else {
+ right_len = btrfs_file_extent_num_bytes(eb, ei);
+ }
+
+ /*
+ * Are we at extent 8? If yes, we know the extent is changed.
+ * This may only happen on the first iteration.
+ */
+ if (found_key.offset + right_len <= ekey->offset) {
+ /* If we're a hole just pretend nothing changed */
+ ret = (left_disknr) ? 0 : 1;
+ goto out;
+ }
+
+ /*
+ * We just wanted to see if when we have an inline extent, what
+ * follows it is a regular extent (wanted to check the above
+ * condition for inline extents too). This should normally not
+ * happen but it's possible for example when we have an inline
+ * compressed extent representing data with a size matching
+ * the page size (currently the same as sector size).
+ */
+ if (right_type == BTRFS_FILE_EXTENT_INLINE) {
+ ret = 0;
+ goto out;
+ }
+
+ right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
+ right_offset = btrfs_file_extent_offset(eb, ei);
+ right_gen = btrfs_file_extent_generation(eb, ei);
+
+ left_offset_fixed = left_offset;
+ if (key.offset < ekey->offset) {
+ /* Fix the right offset for 2a and 7. */
+ right_offset += ekey->offset - key.offset;
+ } else {
+ /* Fix the left offset for all behind 2a and 2b */
+ left_offset_fixed += key.offset - ekey->offset;
+ }
+
+ /*
+ * Check if we have the same extent.
+ */
+ if (left_disknr != right_disknr ||
+ left_offset_fixed != right_offset ||
+ left_gen != right_gen) {
+ ret = 0;
+ goto out;
+ }
+
+ /*
+ * Go to the next extent.
+ */
+ ret = btrfs_next_item(sctx->parent_root, path);
+ if (ret < 0)
+ goto out;
+ if (!ret) {
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(eb, &found_key, slot);
+ }
+ if (ret || found_key.objectid != key.objectid ||
+ found_key.type != key.type) {
+ key.offset += right_len;
+ break;
+ }
+ if (found_key.offset != key.offset + right_len) {
+ ret = 0;
+ goto out;
+ }
+ key = found_key;
+ }
+
+ /*
+ * We're now behind the left extent (treat as unchanged) or at the end
+ * of the right side (treat as changed).
+ */
+ if (key.offset >= ekey->offset + left_len)
+ ret = 1;
+ else
+ ret = 0;
+
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int get_last_extent(struct send_ctx *sctx, u64 offset)
+{
+ struct btrfs_path *path;
+ struct btrfs_root *root = sctx->send_root;
+ struct btrfs_key key;
+ int ret;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ sctx->cur_inode_last_extent = 0;
+
+ key.objectid = sctx->cur_ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = offset;
+ ret = btrfs_search_slot_for_read(root, &key, path, 0, 1);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY)
+ goto out;
+
+ sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int range_is_hole_in_parent(struct send_ctx *sctx,
+ const u64 start,
+ const u64 end)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_root *root = sctx->parent_root;
+ u64 search_start = start;
+ int ret;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = sctx->cur_ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = search_start;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0 && path->slots[0] > 0)
+ path->slots[0]--;
+
+ while (search_start < end) {
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+ struct btrfs_file_extent_item *fi;
+ u64 extent_end;
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid < sctx->cur_ino ||
+ key.type < BTRFS_EXTENT_DATA_KEY)
+ goto next;
+ if (key.objectid > sctx->cur_ino ||
+ key.type > BTRFS_EXTENT_DATA_KEY ||
+ key.offset >= end)
+ break;
+
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+ extent_end = btrfs_file_extent_end(path);
+ if (extent_end <= start)
+ goto next;
+ if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) {
+ search_start = extent_end;
+ goto next;
+ }
+ ret = 0;
+ goto out;
+next:
+ path->slots[0]++;
+ }
+ ret = 1;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
+ struct btrfs_key *key)
+{
+ int ret = 0;
+
+ if (sctx->cur_ino != key->objectid || !need_send_hole(sctx))
+ return 0;
+
+ if (sctx->cur_inode_last_extent == (u64)-1) {
+ ret = get_last_extent(sctx, key->offset - 1);
+ if (ret)
+ return ret;
+ }
+
+ if (path->slots[0] == 0 &&
+ sctx->cur_inode_last_extent < key->offset) {
+ /*
+ * We might have skipped entire leafs that contained only
+ * file extent items for our current inode. These leafs have
+ * a generation number smaller (older) than the one in the
+ * current leaf and the leaf our last extent came from, and
+ * are located between these 2 leafs.
+ */
+ ret = get_last_extent(sctx, key->offset - 1);
+ if (ret)
+ return ret;
+ }
+
+ if (sctx->cur_inode_last_extent < key->offset) {
+ ret = range_is_hole_in_parent(sctx,
+ sctx->cur_inode_last_extent,
+ key->offset);
+ if (ret < 0)
+ return ret;
+ else if (ret == 0)
+ ret = send_hole(sctx, key->offset);
+ else
+ ret = 0;
+ }
+ sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
+ return ret;
+}
+
+static int process_extent(struct send_ctx *sctx,
+ struct btrfs_path *path,
+ struct btrfs_key *key)
+{
+ struct clone_root *found_clone = NULL;
+ int ret = 0;
+
+ if (S_ISLNK(sctx->cur_inode_mode))
+ return 0;
+
+ if (sctx->parent_root && !sctx->cur_inode_new) {
+ ret = is_extent_unchanged(sctx, path, key);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ ret = 0;
+ goto out_hole;
+ }
+ } else {
+ struct btrfs_file_extent_item *ei;
+ u8 type;
+
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ type = btrfs_file_extent_type(path->nodes[0], ei);
+ if (type == BTRFS_FILE_EXTENT_PREALLOC ||
+ type == BTRFS_FILE_EXTENT_REG) {
+ /*
+ * The send spec does not have a prealloc command yet,
+ * so just leave a hole for prealloc'ed extents until
+ * we have enough commands queued up to justify rev'ing
+ * the send spec.
+ */
+ if (type == BTRFS_FILE_EXTENT_PREALLOC) {
+ ret = 0;
+ goto out;
+ }
+
+ /* Have a hole, just skip it. */
+ if (btrfs_file_extent_disk_bytenr(path->nodes[0], ei) == 0) {
+ ret = 0;
+ goto out;
+ }
+ }
+ }
+
+ ret = find_extent_clone(sctx, path, key->objectid, key->offset,
+ sctx->cur_inode_size, &found_clone);
+ if (ret != -ENOENT && ret < 0)
+ goto out;
+
+ ret = send_write_or_clone(sctx, path, key, found_clone);
+ if (ret)
+ goto out;
+out_hole:
+ ret = maybe_send_hole(sctx, path, key);
+out:
+ return ret;
+}
+
+static int process_all_extents(struct send_ctx *sctx)
+{
+ int ret = 0;
+ int iter_ret = 0;
+ struct btrfs_root *root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+
+ root = sctx->send_root;
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = sctx->cmp_key->objectid;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = 0;
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
+ if (found_key.objectid != key.objectid ||
+ found_key.type != key.type) {
+ ret = 0;
+ break;
+ }
+
+ ret = process_extent(sctx, path, &found_key);
+ if (ret < 0)
+ break;
+ }
+ /* Catch error found during iteration */
+ if (iter_ret < 0)
+ ret = iter_ret;
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end,
+ int *pending_move,
+ int *refs_processed)
+{
+ int ret = 0;
+
+ if (sctx->cur_ino == 0)
+ goto out;
+ if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid &&
+ sctx->cmp_key->type <= BTRFS_INODE_EXTREF_KEY)
+ goto out;
+ if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs))
+ goto out;
+
+ ret = process_recorded_refs(sctx, pending_move);
+ if (ret < 0)
+ goto out;
+
+ *refs_processed = 1;
+out:
+ return ret;
+}
+
+static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
+{
+ int ret = 0;
+ struct btrfs_inode_info info;
+ u64 left_mode;
+ u64 left_uid;
+ u64 left_gid;
+ u64 left_fileattr;
+ u64 right_mode;
+ u64 right_uid;
+ u64 right_gid;
+ u64 right_fileattr;
+ int need_chmod = 0;
+ int need_chown = 0;
+ bool need_fileattr = false;
+ int need_truncate = 1;
+ int pending_move = 0;
+ int refs_processed = 0;
+
+ if (sctx->ignore_cur_inode)
+ return 0;
+
+ ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move,
+ &refs_processed);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * We have processed the refs and thus need to advance send_progress.
+ * Now, calls to get_cur_xxx will take the updated refs of the current
+ * inode into account.
+ *
+ * On the other hand, if our current inode is a directory and couldn't
+ * be moved/renamed because its parent was renamed/moved too and it has
+ * a higher inode number, we can only move/rename our current inode
+ * after we moved/renamed its parent. Therefore in this case operate on
+ * the old path (pre move/rename) of our current inode, and the
+ * move/rename will be performed later.
+ */
+ if (refs_processed && !pending_move)
+ sctx->send_progress = sctx->cur_ino + 1;
+
+ if (sctx->cur_ino == 0 || sctx->cur_inode_deleted)
+ goto out;
+ if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino)
+ goto out;
+ ret = get_inode_info(sctx->send_root, sctx->cur_ino, &info);
+ if (ret < 0)
+ goto out;
+ left_mode = info.mode;
+ left_uid = info.uid;
+ left_gid = info.gid;
+ left_fileattr = info.fileattr;
+
+ if (!sctx->parent_root || sctx->cur_inode_new) {
+ need_chown = 1;
+ if (!S_ISLNK(sctx->cur_inode_mode))
+ need_chmod = 1;
+ if (sctx->cur_inode_next_write_offset == sctx->cur_inode_size)
+ need_truncate = 0;
+ } else {
+ u64 old_size;
+
+ ret = get_inode_info(sctx->parent_root, sctx->cur_ino, &info);
+ if (ret < 0)
+ goto out;
+ old_size = info.size;
+ right_mode = info.mode;
+ right_uid = info.uid;
+ right_gid = info.gid;
+ right_fileattr = info.fileattr;
+
+ if (left_uid != right_uid || left_gid != right_gid)
+ need_chown = 1;
+ if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode)
+ need_chmod = 1;
+ if (!S_ISLNK(sctx->cur_inode_mode) && left_fileattr != right_fileattr)
+ need_fileattr = true;
+ if ((old_size == sctx->cur_inode_size) ||
+ (sctx->cur_inode_size > old_size &&
+ sctx->cur_inode_next_write_offset == sctx->cur_inode_size))
+ need_truncate = 0;
+ }
+
+ if (S_ISREG(sctx->cur_inode_mode)) {
+ if (need_send_hole(sctx)) {
+ if (sctx->cur_inode_last_extent == (u64)-1 ||
+ sctx->cur_inode_last_extent <
+ sctx->cur_inode_size) {
+ ret = get_last_extent(sctx, (u64)-1);
+ if (ret)
+ goto out;
+ }
+ if (sctx->cur_inode_last_extent <
+ sctx->cur_inode_size) {
+ ret = send_hole(sctx, sctx->cur_inode_size);
+ if (ret)
+ goto out;
+ }
+ }
+ if (need_truncate) {
+ ret = send_truncate(sctx, sctx->cur_ino,
+ sctx->cur_inode_gen,
+ sctx->cur_inode_size);
+ if (ret < 0)
+ goto out;
+ }
+ }
+
+ if (need_chown) {
+ ret = send_chown(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+ left_uid, left_gid);
+ if (ret < 0)
+ goto out;
+ }
+ if (need_chmod) {
+ ret = send_chmod(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+ left_mode);
+ if (ret < 0)
+ goto out;
+ }
+ if (need_fileattr) {
+ ret = send_fileattr(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+ left_fileattr);
+ if (ret < 0)
+ goto out;
+ }
+
+ if (proto_cmd_ok(sctx, BTRFS_SEND_C_ENABLE_VERITY)
+ && sctx->cur_inode_needs_verity) {
+ ret = process_verity(sctx);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = send_capabilities(sctx);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * If other directory inodes depended on our current directory
+ * inode's move/rename, now do their move/rename operations.
+ */
+ if (!is_waiting_for_move(sctx, sctx->cur_ino)) {
+ ret = apply_children_dir_moves(sctx);
+ if (ret)
+ goto out;
+ /*
+ * Need to send that every time, no matter if it actually
+ * changed between the two trees as we have done changes to
+ * the inode before. If our inode is a directory and it's
+ * waiting to be moved/renamed, we will send its utimes when
+ * it's moved/renamed, therefore we don't need to do it here.
+ */
+ sctx->send_progress = sctx->cur_ino + 1;
+ ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
+ if (ret < 0)
+ goto out;
+ }
+
+out:
+ return ret;
+}
+
+static void close_current_inode(struct send_ctx *sctx)
+{
+ u64 i_size;
+
+ if (sctx->cur_inode == NULL)
+ return;
+
+ i_size = i_size_read(sctx->cur_inode);
+
+ /*
+ * If we are doing an incremental send, we may have extents between the
+ * last processed extent and the i_size that have not been processed
+ * because they haven't changed but we may have read some of their pages
+ * through readahead, see the comments at send_extent_data().
+ */
+ if (sctx->clean_page_cache && sctx->page_cache_clear_start < i_size)
+ truncate_inode_pages_range(&sctx->cur_inode->i_data,
+ sctx->page_cache_clear_start,
+ round_up(i_size, PAGE_SIZE) - 1);
+
+ iput(sctx->cur_inode);
+ sctx->cur_inode = NULL;
+}
+
+static int changed_inode(struct send_ctx *sctx,
+ enum btrfs_compare_tree_result result)
+{
+ int ret = 0;
+ struct btrfs_key *key = sctx->cmp_key;
+ struct btrfs_inode_item *left_ii = NULL;
+ struct btrfs_inode_item *right_ii = NULL;
+ u64 left_gen = 0;
+ u64 right_gen = 0;
+
+ close_current_inode(sctx);
+
+ sctx->cur_ino = key->objectid;
+ sctx->cur_inode_new_gen = false;
+ sctx->cur_inode_last_extent = (u64)-1;
+ sctx->cur_inode_next_write_offset = 0;
+ sctx->ignore_cur_inode = false;
+
+ /*
+ * Set send_progress to current inode. This will tell all get_cur_xxx
+ * functions that the current inode's refs are not updated yet. Later,
+ * when process_recorded_refs is finished, it is set to cur_ino + 1.
+ */
+ sctx->send_progress = sctx->cur_ino;
+
+ if (result == BTRFS_COMPARE_TREE_NEW ||
+ result == BTRFS_COMPARE_TREE_CHANGED) {
+ left_ii = btrfs_item_ptr(sctx->left_path->nodes[0],
+ sctx->left_path->slots[0],
+ struct btrfs_inode_item);
+ left_gen = btrfs_inode_generation(sctx->left_path->nodes[0],
+ left_ii);
+ } else {
+ right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
+ sctx->right_path->slots[0],
+ struct btrfs_inode_item);
+ right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
+ right_ii);
+ }
+ if (result == BTRFS_COMPARE_TREE_CHANGED) {
+ right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
+ sctx->right_path->slots[0],
+ struct btrfs_inode_item);
+
+ right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
+ right_ii);
+
+ /*
+ * The cur_ino = root dir case is special here. We can't treat
+ * the inode as deleted+reused because it would generate a
+ * stream that tries to delete/mkdir the root dir.
+ */
+ if (left_gen != right_gen &&
+ sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
+ sctx->cur_inode_new_gen = true;
+ }
+
+ /*
+ * Normally we do not find inodes with a link count of zero (orphans)
+ * because the most common case is to create a snapshot and use it
+ * for a send operation. However other less common use cases involve
+ * using a subvolume and send it after turning it to RO mode just
+ * after deleting all hard links of a file while holding an open
+ * file descriptor against it or turning a RO snapshot into RW mode,
+ * keep an open file descriptor against a file, delete it and then
+ * turn the snapshot back to RO mode before using it for a send
+ * operation. The former is what the receiver operation does.
+ * Therefore, if we want to send these snapshots soon after they're
+ * received, we need to handle orphan inodes as well. Moreover, orphans
+ * can appear not only in the send snapshot but also in the parent
+ * snapshot. Here are several cases:
+ *
+ * Case 1: BTRFS_COMPARE_TREE_NEW
+ * | send snapshot | action
+ * --------------------------------
+ * nlink | 0 | ignore
+ *
+ * Case 2: BTRFS_COMPARE_TREE_DELETED
+ * | parent snapshot | action
+ * ----------------------------------
+ * nlink | 0 | as usual
+ * Note: No unlinks will be sent because there're no paths for it.
+ *
+ * Case 3: BTRFS_COMPARE_TREE_CHANGED
+ * | | parent snapshot | send snapshot | action
+ * -----------------------------------------------------------------------
+ * subcase 1 | nlink | 0 | 0 | ignore
+ * subcase 2 | nlink | >0 | 0 | new_gen(deletion)
+ * subcase 3 | nlink | 0 | >0 | new_gen(creation)
+ *
+ */
+ if (result == BTRFS_COMPARE_TREE_NEW) {
+ if (btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii) == 0) {
+ sctx->ignore_cur_inode = true;
+ goto out;
+ }
+ sctx->cur_inode_gen = left_gen;
+ sctx->cur_inode_new = true;
+ sctx->cur_inode_deleted = false;
+ sctx->cur_inode_size = btrfs_inode_size(
+ sctx->left_path->nodes[0], left_ii);
+ sctx->cur_inode_mode = btrfs_inode_mode(
+ sctx->left_path->nodes[0], left_ii);
+ sctx->cur_inode_rdev = btrfs_inode_rdev(
+ sctx->left_path->nodes[0], left_ii);
+ if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
+ ret = send_create_inode_if_needed(sctx);
+ } else if (result == BTRFS_COMPARE_TREE_DELETED) {
+ sctx->cur_inode_gen = right_gen;
+ sctx->cur_inode_new = false;
+ sctx->cur_inode_deleted = true;
+ sctx->cur_inode_size = btrfs_inode_size(
+ sctx->right_path->nodes[0], right_ii);
+ sctx->cur_inode_mode = btrfs_inode_mode(
+ sctx->right_path->nodes[0], right_ii);
+ } else if (result == BTRFS_COMPARE_TREE_CHANGED) {
+ u32 new_nlinks, old_nlinks;
+
+ new_nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii);
+ old_nlinks = btrfs_inode_nlink(sctx->right_path->nodes[0], right_ii);
+ if (new_nlinks == 0 && old_nlinks == 0) {
+ sctx->ignore_cur_inode = true;
+ goto out;
+ } else if (new_nlinks == 0 || old_nlinks == 0) {
+ sctx->cur_inode_new_gen = 1;
+ }
+ /*
+ * We need to do some special handling in case the inode was
+ * reported as changed with a changed generation number. This
+ * means that the original inode was deleted and new inode
+ * reused the same inum. So we have to treat the old inode as
+ * deleted and the new one as new.
+ */
+ if (sctx->cur_inode_new_gen) {
+ /*
+ * First, process the inode as if it was deleted.
+ */
+ if (old_nlinks > 0) {
+ sctx->cur_inode_gen = right_gen;
+ sctx->cur_inode_new = false;
+ sctx->cur_inode_deleted = true;
+ sctx->cur_inode_size = btrfs_inode_size(
+ sctx->right_path->nodes[0], right_ii);
+ sctx->cur_inode_mode = btrfs_inode_mode(
+ sctx->right_path->nodes[0], right_ii);
+ ret = process_all_refs(sctx,
+ BTRFS_COMPARE_TREE_DELETED);
+ if (ret < 0)
+ goto out;
+ }
+
+ /*
+ * Now process the inode as if it was new.
+ */
+ if (new_nlinks > 0) {
+ sctx->cur_inode_gen = left_gen;
+ sctx->cur_inode_new = true;
+ sctx->cur_inode_deleted = false;
+ sctx->cur_inode_size = btrfs_inode_size(
+ sctx->left_path->nodes[0],
+ left_ii);
+ sctx->cur_inode_mode = btrfs_inode_mode(
+ sctx->left_path->nodes[0],
+ left_ii);
+ sctx->cur_inode_rdev = btrfs_inode_rdev(
+ sctx->left_path->nodes[0],
+ left_ii);
+ ret = send_create_inode_if_needed(sctx);
+ if (ret < 0)
+ goto out;
+
+ ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
+ if (ret < 0)
+ goto out;
+ /*
+ * Advance send_progress now as we did not get
+ * into process_recorded_refs_if_needed in the
+ * new_gen case.
+ */
+ sctx->send_progress = sctx->cur_ino + 1;
+
+ /*
+ * Now process all extents and xattrs of the
+ * inode as if they were all new.
+ */
+ ret = process_all_extents(sctx);
+ if (ret < 0)
+ goto out;
+ ret = process_all_new_xattrs(sctx);
+ if (ret < 0)
+ goto out;
+ }
+ } else {
+ sctx->cur_inode_gen = left_gen;
+ sctx->cur_inode_new = false;
+ sctx->cur_inode_new_gen = false;
+ sctx->cur_inode_deleted = false;
+ sctx->cur_inode_size = btrfs_inode_size(
+ sctx->left_path->nodes[0], left_ii);
+ sctx->cur_inode_mode = btrfs_inode_mode(
+ sctx->left_path->nodes[0], left_ii);
+ }
+ }
+
+out:
+ return ret;
+}
+
+/*
+ * We have to process new refs before deleted refs, but compare_trees gives us
+ * the new and deleted refs mixed. To fix this, we record the new/deleted refs
+ * first and later process them in process_recorded_refs.
+ * For the cur_inode_new_gen case, we skip recording completely because
+ * changed_inode did already initiate processing of refs. The reason for this is
+ * that in this case, compare_tree actually compares the refs of 2 different
+ * inodes. To fix this, process_all_refs is used in changed_inode to handle all
+ * refs of the right tree as deleted and all refs of the left tree as new.
+ */
+static int changed_ref(struct send_ctx *sctx,
+ enum btrfs_compare_tree_result result)
+{
+ int ret = 0;
+
+ if (sctx->cur_ino != sctx->cmp_key->objectid) {
+ inconsistent_snapshot_error(sctx, result, "reference");
+ return -EIO;
+ }
+
+ if (!sctx->cur_inode_new_gen &&
+ sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ if (result == BTRFS_COMPARE_TREE_NEW)
+ ret = record_new_ref(sctx);
+ else if (result == BTRFS_COMPARE_TREE_DELETED)
+ ret = record_deleted_ref(sctx);
+ else if (result == BTRFS_COMPARE_TREE_CHANGED)
+ ret = record_changed_ref(sctx);
+ }
+
+ return ret;
+}
+
+/*
+ * Process new/deleted/changed xattrs. We skip processing in the
+ * cur_inode_new_gen case because changed_inode did already initiate processing
+ * of xattrs. The reason is the same as in changed_ref
+ */
+static int changed_xattr(struct send_ctx *sctx,
+ enum btrfs_compare_tree_result result)
+{
+ int ret = 0;
+
+ if (sctx->cur_ino != sctx->cmp_key->objectid) {
+ inconsistent_snapshot_error(sctx, result, "xattr");
+ return -EIO;
+ }
+
+ if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
+ if (result == BTRFS_COMPARE_TREE_NEW)
+ ret = process_new_xattr(sctx);
+ else if (result == BTRFS_COMPARE_TREE_DELETED)
+ ret = process_deleted_xattr(sctx);
+ else if (result == BTRFS_COMPARE_TREE_CHANGED)
+ ret = process_changed_xattr(sctx);
+ }
+
+ return ret;
+}
+
+/*
+ * Process new/deleted/changed extents. We skip processing in the
+ * cur_inode_new_gen case because changed_inode did already initiate processing
+ * of extents. The reason is the same as in changed_ref
+ */
+static int changed_extent(struct send_ctx *sctx,
+ enum btrfs_compare_tree_result result)
+{
+ int ret = 0;
+
+ /*
+ * We have found an extent item that changed without the inode item
+ * having changed. This can happen either after relocation (where the
+ * disk_bytenr of an extent item is replaced at
+ * relocation.c:replace_file_extents()) or after deduplication into a
+ * file in both the parent and send snapshots (where an extent item can
+ * get modified or replaced with a new one). Note that deduplication
+ * updates the inode item, but it only changes the iversion (sequence
+ * field in the inode item) of the inode, so if a file is deduplicated
+ * the same amount of times in both the parent and send snapshots, its
+ * iversion becomes the same in both snapshots, whence the inode item is
+ * the same on both snapshots.
+ */
+ if (sctx->cur_ino != sctx->cmp_key->objectid)
+ return 0;
+
+ if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
+ if (result != BTRFS_COMPARE_TREE_DELETED)
+ ret = process_extent(sctx, sctx->left_path,
+ sctx->cmp_key);
+ }
+
+ return ret;
+}
+
+static int changed_verity(struct send_ctx *sctx, enum btrfs_compare_tree_result result)
+{
+ int ret = 0;
+
+ if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
+ if (result == BTRFS_COMPARE_TREE_NEW)
+ sctx->cur_inode_needs_verity = true;
+ }
+ return ret;
+}
+
+static int dir_changed(struct send_ctx *sctx, u64 dir)
+{
+ u64 orig_gen, new_gen;
+ int ret;
+
+ ret = get_inode_gen(sctx->send_root, dir, &new_gen);
+ if (ret)
+ return ret;
+
+ ret = get_inode_gen(sctx->parent_root, dir, &orig_gen);
+ if (ret)
+ return ret;
+
+ return (orig_gen != new_gen) ? 1 : 0;
+}
+
+static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path,
+ struct btrfs_key *key)
+{
+ struct btrfs_inode_extref *extref;
+ struct extent_buffer *leaf;
+ u64 dirid = 0, last_dirid = 0;
+ unsigned long ptr;
+ u32 item_size;
+ u32 cur_offset = 0;
+ int ref_name_len;
+ int ret = 0;
+
+ /* Easy case, just check this one dirid */
+ if (key->type == BTRFS_INODE_REF_KEY) {
+ dirid = key->offset;
+
+ ret = dir_changed(sctx, dirid);
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size(leaf, path->slots[0]);
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ while (cur_offset < item_size) {
+ extref = (struct btrfs_inode_extref *)(ptr +
+ cur_offset);
+ dirid = btrfs_inode_extref_parent(leaf, extref);
+ ref_name_len = btrfs_inode_extref_name_len(leaf, extref);
+ cur_offset += ref_name_len + sizeof(*extref);
+ if (dirid == last_dirid)
+ continue;
+ ret = dir_changed(sctx, dirid);
+ if (ret)
+ break;
+ last_dirid = dirid;
+ }
+out:
+ return ret;
+}
+
+/*
+ * Updates compare related fields in sctx and simply forwards to the actual
+ * changed_xxx functions.
+ */
+static int changed_cb(struct btrfs_path *left_path,
+ struct btrfs_path *right_path,
+ struct btrfs_key *key,
+ enum btrfs_compare_tree_result result,
+ struct send_ctx *sctx)
+{
+ int ret = 0;
+
+ /*
+ * We can not hold the commit root semaphore here. This is because in
+ * the case of sending and receiving to the same filesystem, using a
+ * pipe, could result in a deadlock:
+ *
+ * 1) The task running send blocks on the pipe because it's full;
+ *
+ * 2) The task running receive, which is the only consumer of the pipe,
+ * is waiting for a transaction commit (for example due to a space
+ * reservation when doing a write or triggering a transaction commit
+ * when creating a subvolume);
+ *
+ * 3) The transaction is waiting to write lock the commit root semaphore,
+ * but can not acquire it since it's being held at 1).
+ *
+ * Down this call chain we write to the pipe through kernel_write().
+ * The same type of problem can also happen when sending to a file that
+ * is stored in the same filesystem - when reserving space for a write
+ * into the file, we can trigger a transaction commit.
+ *
+ * Our caller has supplied us with clones of leaves from the send and
+ * parent roots, so we're safe here from a concurrent relocation and
+ * further reallocation of metadata extents while we are here. Below we
+ * also assert that the leaves are clones.
+ */
+ lockdep_assert_not_held(&sctx->send_root->fs_info->commit_root_sem);
+
+ /*
+ * We always have a send root, so left_path is never NULL. We will not
+ * have a leaf when we have reached the end of the send root but have
+ * not yet reached the end of the parent root.
+ */
+ if (left_path->nodes[0])
+ ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
+ &left_path->nodes[0]->bflags));
+ /*
+ * When doing a full send we don't have a parent root, so right_path is
+ * NULL. When doing an incremental send, we may have reached the end of
+ * the parent root already, so we don't have a leaf at right_path.
+ */
+ if (right_path && right_path->nodes[0])
+ ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
+ &right_path->nodes[0]->bflags));
+
+ if (result == BTRFS_COMPARE_TREE_SAME) {
+ if (key->type == BTRFS_INODE_REF_KEY ||
+ key->type == BTRFS_INODE_EXTREF_KEY) {
+ ret = compare_refs(sctx, left_path, key);
+ if (!ret)
+ return 0;
+ if (ret < 0)
+ return ret;
+ } else if (key->type == BTRFS_EXTENT_DATA_KEY) {
+ return maybe_send_hole(sctx, left_path, key);
+ } else {
+ return 0;
+ }
+ result = BTRFS_COMPARE_TREE_CHANGED;
+ ret = 0;
+ }
+
+ sctx->left_path = left_path;
+ sctx->right_path = right_path;
+ sctx->cmp_key = key;
+
+ ret = finish_inode_if_needed(sctx, 0);
+ if (ret < 0)
+ goto out;
+
+ /* Ignore non-FS objects */
+ if (key->objectid == BTRFS_FREE_INO_OBJECTID ||
+ key->objectid == BTRFS_FREE_SPACE_OBJECTID)
+ goto out;
+
+ if (key->type == BTRFS_INODE_ITEM_KEY) {
+ ret = changed_inode(sctx, result);
+ } else if (!sctx->ignore_cur_inode) {
+ if (key->type == BTRFS_INODE_REF_KEY ||
+ key->type == BTRFS_INODE_EXTREF_KEY)
+ ret = changed_ref(sctx, result);
+ else if (key->type == BTRFS_XATTR_ITEM_KEY)
+ ret = changed_xattr(sctx, result);
+ else if (key->type == BTRFS_EXTENT_DATA_KEY)
+ ret = changed_extent(sctx, result);
+ else if (key->type == BTRFS_VERITY_DESC_ITEM_KEY &&
+ key->offset == 0)
+ ret = changed_verity(sctx, result);
+ }
+
+out:
+ return ret;
+}
+
+static int search_key_again(const struct send_ctx *sctx,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct btrfs_key *key)
+{
+ int ret;
+
+ if (!path->need_commit_sem)
+ lockdep_assert_held_read(&root->fs_info->commit_root_sem);
+
+ /*
+ * Roots used for send operations are readonly and no one can add,
+ * update or remove keys from them, so we should be able to find our
+ * key again. The only exception is deduplication, which can operate on
+ * readonly roots and add, update or remove keys to/from them - but at
+ * the moment we don't allow it to run in parallel with send.
+ */
+ ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ ASSERT(ret <= 0);
+ if (ret > 0) {
+ btrfs_print_tree(path->nodes[path->lowest_level], false);
+ btrfs_err(root->fs_info,
+"send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d",
+ key->objectid, key->type, key->offset,
+ (root == sctx->parent_root ? "parent" : "send"),
+ root->root_key.objectid, path->lowest_level,
+ path->slots[path->lowest_level]);
+ return -EUCLEAN;
+ }
+
+ return ret;
+}
+
+static int full_send_tree(struct send_ctx *sctx)
+{
+ int ret;
+ struct btrfs_root *send_root = sctx->send_root;
+ struct btrfs_key key;
+ struct btrfs_fs_info *fs_info = send_root->fs_info;
+ struct btrfs_path *path;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+ path->reada = READA_FORWARD_ALWAYS;
+
+ key.objectid = BTRFS_FIRST_FREE_OBJECTID;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ down_read(&fs_info->commit_root_sem);
+ sctx->last_reloc_trans = fs_info->last_reloc_trans;
+ up_read(&fs_info->commit_root_sem);
+
+ ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
+ if (ret < 0)
+ goto out;
+ if (ret)
+ goto out_finish;
+
+ while (1) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ ret = changed_cb(path, NULL, &key,
+ BTRFS_COMPARE_TREE_NEW, sctx);
+ if (ret < 0)
+ goto out;
+
+ down_read(&fs_info->commit_root_sem);
+ if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
+ sctx->last_reloc_trans = fs_info->last_reloc_trans;
+ up_read(&fs_info->commit_root_sem);
+ /*
+ * A transaction used for relocating a block group was
+ * committed or is about to finish its commit. Release
+ * our path (leaf) and restart the search, so that we
+ * avoid operating on any file extent items that are
+ * stale, with a disk_bytenr that reflects a pre
+ * relocation value. This way we avoid as much as
+ * possible to fallback to regular writes when checking
+ * if we can clone file ranges.
+ */
+ btrfs_release_path(path);
+ ret = search_key_again(sctx, send_root, path, &key);
+ if (ret < 0)
+ goto out;
+ } else {
+ up_read(&fs_info->commit_root_sem);
+ }
+
+ ret = btrfs_next_item(send_root, path);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ ret = 0;
+ break;
+ }
+ }
+
+out_finish:
+ ret = finish_inode_if_needed(sctx, 1);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int replace_node_with_clone(struct btrfs_path *path, int level)
+{
+ struct extent_buffer *clone;
+
+ clone = btrfs_clone_extent_buffer(path->nodes[level]);
+ if (!clone)
+ return -ENOMEM;
+
+ free_extent_buffer(path->nodes[level]);
+ path->nodes[level] = clone;
+
+ return 0;
+}
+
+static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen)
+{
+ struct extent_buffer *eb;
+ struct extent_buffer *parent = path->nodes[*level];
+ int slot = path->slots[*level];
+ const int nritems = btrfs_header_nritems(parent);
+ u64 reada_max;
+ u64 reada_done = 0;
+
+ lockdep_assert_held_read(&parent->fs_info->commit_root_sem);
+
+ BUG_ON(*level == 0);
+ eb = btrfs_read_node_slot(parent, slot);
+ if (IS_ERR(eb))
+ return PTR_ERR(eb);
+
+ /*
+ * Trigger readahead for the next leaves we will process, so that it is
+ * very likely that when we need them they are already in memory and we
+ * will not block on disk IO. For nodes we only do readahead for one,
+ * since the time window between processing nodes is typically larger.
+ */
+ reada_max = (*level == 1 ? SZ_128K : eb->fs_info->nodesize);
+
+ for (slot++; slot < nritems && reada_done < reada_max; slot++) {
+ if (btrfs_node_ptr_generation(parent, slot) > reada_min_gen) {
+ btrfs_readahead_node_child(parent, slot);
+ reada_done += eb->fs_info->nodesize;
+ }
+ }
+
+ path->nodes[*level - 1] = eb;
+ path->slots[*level - 1] = 0;
+ (*level)--;
+
+ if (*level == 0)
+ return replace_node_with_clone(path, 0);
+
+ return 0;
+}
+
+static int tree_move_next_or_upnext(struct btrfs_path *path,
+ int *level, int root_level)
+{
+ int ret = 0;
+ int nritems;
+ nritems = btrfs_header_nritems(path->nodes[*level]);
+
+ path->slots[*level]++;
+
+ while (path->slots[*level] >= nritems) {
+ if (*level == root_level) {
+ path->slots[*level] = nritems - 1;
+ return -1;
+ }
+
+ /* move upnext */
+ path->slots[*level] = 0;
+ free_extent_buffer(path->nodes[*level]);
+ path->nodes[*level] = NULL;
+ (*level)++;
+ path->slots[*level]++;
+
+ nritems = btrfs_header_nritems(path->nodes[*level]);
+ ret = 1;
+ }
+ return ret;
+}
+
+/*
+ * Returns 1 if it had to move up and next. 0 is returned if it moved only next
+ * or down.
+ */
+static int tree_advance(struct btrfs_path *path,
+ int *level, int root_level,
+ int allow_down,
+ struct btrfs_key *key,
+ u64 reada_min_gen)
+{
+ int ret;
+
+ if (*level == 0 || !allow_down) {
+ ret = tree_move_next_or_upnext(path, level, root_level);
+ } else {
+ ret = tree_move_down(path, level, reada_min_gen);
+ }
+
+ /*
+ * Even if we have reached the end of a tree, ret is -1, update the key
+ * anyway, so that in case we need to restart due to a block group
+ * relocation, we can assert that the last key of the root node still
+ * exists in the tree.
+ */
+ if (*level == 0)
+ btrfs_item_key_to_cpu(path->nodes[*level], key,
+ path->slots[*level]);
+ else
+ btrfs_node_key_to_cpu(path->nodes[*level], key,
+ path->slots[*level]);
+
+ return ret;
+}
+
+static int tree_compare_item(struct btrfs_path *left_path,
+ struct btrfs_path *right_path,
+ char *tmp_buf)
+{
+ int cmp;
+ int len1, len2;
+ unsigned long off1, off2;
+
+ len1 = btrfs_item_size(left_path->nodes[0], left_path->slots[0]);
+ len2 = btrfs_item_size(right_path->nodes[0], right_path->slots[0]);
+ if (len1 != len2)
+ return 1;
+
+ off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
+ off2 = btrfs_item_ptr_offset(right_path->nodes[0],
+ right_path->slots[0]);
+
+ read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
+
+ cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
+ if (cmp)
+ return 1;
+ return 0;
+}
+
+/*
+ * A transaction used for relocating a block group was committed or is about to
+ * finish its commit. Release our paths and restart the search, so that we are
+ * not using stale extent buffers:
+ *
+ * 1) For levels > 0, we are only holding references of extent buffers, without
+ * any locks on them, which does not prevent them from having been relocated
+ * and reallocated after the last time we released the commit root semaphore.
+ * The exception are the root nodes, for which we always have a clone, see
+ * the comment at btrfs_compare_trees();
+ *
+ * 2) For leaves, level 0, we are holding copies (clones) of extent buffers, so
+ * we are safe from the concurrent relocation and reallocation. However they
+ * can have file extent items with a pre relocation disk_bytenr value, so we
+ * restart the start from the current commit roots and clone the new leaves so
+ * that we get the post relocation disk_bytenr values. Not doing so, could
+ * make us clone the wrong data in case there are new extents using the old
+ * disk_bytenr that happen to be shared.
+ */
+static int restart_after_relocation(struct btrfs_path *left_path,
+ struct btrfs_path *right_path,
+ const struct btrfs_key *left_key,
+ const struct btrfs_key *right_key,
+ int left_level,
+ int right_level,
+ const struct send_ctx *sctx)
+{
+ int root_level;
+ int ret;
+
+ lockdep_assert_held_read(&sctx->send_root->fs_info->commit_root_sem);
+
+ btrfs_release_path(left_path);
+ btrfs_release_path(right_path);
+
+ /*
+ * Since keys can not be added or removed to/from our roots because they
+ * are readonly and we do not allow deduplication to run in parallel
+ * (which can add, remove or change keys), the layout of the trees should
+ * not change.
+ */
+ left_path->lowest_level = left_level;
+ ret = search_key_again(sctx, sctx->send_root, left_path, left_key);
+ if (ret < 0)
+ return ret;
+
+ right_path->lowest_level = right_level;
+ ret = search_key_again(sctx, sctx->parent_root, right_path, right_key);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * If the lowest level nodes are leaves, clone them so that they can be
+ * safely used by changed_cb() while not under the protection of the
+ * commit root semaphore, even if relocation and reallocation happens in
+ * parallel.
+ */
+ if (left_level == 0) {
+ ret = replace_node_with_clone(left_path, 0);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (right_level == 0) {
+ ret = replace_node_with_clone(right_path, 0);
+ if (ret < 0)
+ return ret;
+ }
+
+ /*
+ * Now clone the root nodes (unless they happen to be the leaves we have
+ * already cloned). This is to protect against concurrent snapshotting of
+ * the send and parent roots (see the comment at btrfs_compare_trees()).
+ */
+ root_level = btrfs_header_level(sctx->send_root->commit_root);
+ if (root_level > 0) {
+ ret = replace_node_with_clone(left_path, root_level);
+ if (ret < 0)
+ return ret;
+ }
+
+ root_level = btrfs_header_level(sctx->parent_root->commit_root);
+ if (root_level > 0) {
+ ret = replace_node_with_clone(right_path, root_level);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * This function compares two trees and calls the provided callback for
+ * every changed/new/deleted item it finds.
+ * If shared tree blocks are encountered, whole subtrees are skipped, making
+ * the compare pretty fast on snapshotted subvolumes.
+ *
+ * This currently works on commit roots only. As commit roots are read only,
+ * we don't do any locking. The commit roots are protected with transactions.
+ * Transactions are ended and rejoined when a commit is tried in between.
+ *
+ * This function checks for modifications done to the trees while comparing.
+ * If it detects a change, it aborts immediately.
+ */
+static int btrfs_compare_trees(struct btrfs_root *left_root,
+ struct btrfs_root *right_root, struct send_ctx *sctx)
+{
+ struct btrfs_fs_info *fs_info = left_root->fs_info;
+ int ret;
+ int cmp;
+ struct btrfs_path *left_path = NULL;
+ struct btrfs_path *right_path = NULL;
+ struct btrfs_key left_key;
+ struct btrfs_key right_key;
+ char *tmp_buf = NULL;
+ int left_root_level;
+ int right_root_level;
+ int left_level;
+ int right_level;
+ int left_end_reached = 0;
+ int right_end_reached = 0;
+ int advance_left = 0;
+ int advance_right = 0;
+ u64 left_blockptr;
+ u64 right_blockptr;
+ u64 left_gen;
+ u64 right_gen;
+ u64 reada_min_gen;
+
+ left_path = btrfs_alloc_path();
+ if (!left_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ right_path = btrfs_alloc_path();
+ if (!right_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
+ if (!tmp_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ left_path->search_commit_root = 1;
+ left_path->skip_locking = 1;
+ right_path->search_commit_root = 1;
+ right_path->skip_locking = 1;
+
+ /*
+ * Strategy: Go to the first items of both trees. Then do
+ *
+ * If both trees are at level 0
+ * Compare keys of current items
+ * If left < right treat left item as new, advance left tree
+ * and repeat
+ * If left > right treat right item as deleted, advance right tree
+ * and repeat
+ * If left == right do deep compare of items, treat as changed if
+ * needed, advance both trees and repeat
+ * If both trees are at the same level but not at level 0
+ * Compare keys of current nodes/leafs
+ * If left < right advance left tree and repeat
+ * If left > right advance right tree and repeat
+ * If left == right compare blockptrs of the next nodes/leafs
+ * If they match advance both trees but stay at the same level
+ * and repeat
+ * If they don't match advance both trees while allowing to go
+ * deeper and repeat
+ * If tree levels are different
+ * Advance the tree that needs it and repeat
+ *
+ * Advancing a tree means:
+ * If we are at level 0, try to go to the next slot. If that's not
+ * possible, go one level up and repeat. Stop when we found a level
+ * where we could go to the next slot. We may at this point be on a
+ * node or a leaf.
+ *
+ * If we are not at level 0 and not on shared tree blocks, go one
+ * level deeper.
+ *
+ * If we are not at level 0 and on shared tree blocks, go one slot to
+ * the right if possible or go up and right.
+ */
+
+ down_read(&fs_info->commit_root_sem);
+ left_level = btrfs_header_level(left_root->commit_root);
+ left_root_level = left_level;
+ /*
+ * We clone the root node of the send and parent roots to prevent races
+ * with snapshot creation of these roots. Snapshot creation COWs the
+ * root node of a tree, so after the transaction is committed the old
+ * extent can be reallocated while this send operation is still ongoing.
+ * So we clone them, under the commit root semaphore, to be race free.
+ */
+ left_path->nodes[left_level] =
+ btrfs_clone_extent_buffer(left_root->commit_root);
+ if (!left_path->nodes[left_level]) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ right_level = btrfs_header_level(right_root->commit_root);
+ right_root_level = right_level;
+ right_path->nodes[right_level] =
+ btrfs_clone_extent_buffer(right_root->commit_root);
+ if (!right_path->nodes[right_level]) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+ /*
+ * Our right root is the parent root, while the left root is the "send"
+ * root. We know that all new nodes/leaves in the left root must have
+ * a generation greater than the right root's generation, so we trigger
+ * readahead for those nodes and leaves of the left root, as we know we
+ * will need to read them at some point.
+ */
+ reada_min_gen = btrfs_header_generation(right_root->commit_root);
+
+ if (left_level == 0)
+ btrfs_item_key_to_cpu(left_path->nodes[left_level],
+ &left_key, left_path->slots[left_level]);
+ else
+ btrfs_node_key_to_cpu(left_path->nodes[left_level],
+ &left_key, left_path->slots[left_level]);
+ if (right_level == 0)
+ btrfs_item_key_to_cpu(right_path->nodes[right_level],
+ &right_key, right_path->slots[right_level]);
+ else
+ btrfs_node_key_to_cpu(right_path->nodes[right_level],
+ &right_key, right_path->slots[right_level]);
+
+ sctx->last_reloc_trans = fs_info->last_reloc_trans;
+
+ while (1) {
+ if (need_resched() ||
+ rwsem_is_contended(&fs_info->commit_root_sem)) {
+ up_read(&fs_info->commit_root_sem);
+ cond_resched();
+ down_read(&fs_info->commit_root_sem);
+ }
+
+ if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
+ ret = restart_after_relocation(left_path, right_path,
+ &left_key, &right_key,
+ left_level, right_level,
+ sctx);
+ if (ret < 0)
+ goto out_unlock;
+ sctx->last_reloc_trans = fs_info->last_reloc_trans;
+ }
+
+ if (advance_left && !left_end_reached) {
+ ret = tree_advance(left_path, &left_level,
+ left_root_level,
+ advance_left != ADVANCE_ONLY_NEXT,
+ &left_key, reada_min_gen);
+ if (ret == -1)
+ left_end_reached = ADVANCE;
+ else if (ret < 0)
+ goto out_unlock;
+ advance_left = 0;
+ }
+ if (advance_right && !right_end_reached) {
+ ret = tree_advance(right_path, &right_level,
+ right_root_level,
+ advance_right != ADVANCE_ONLY_NEXT,
+ &right_key, reada_min_gen);
+ if (ret == -1)
+ right_end_reached = ADVANCE;
+ else if (ret < 0)
+ goto out_unlock;
+ advance_right = 0;
+ }
+
+ if (left_end_reached && right_end_reached) {
+ ret = 0;
+ goto out_unlock;
+ } else if (left_end_reached) {
+ if (right_level == 0) {
+ up_read(&fs_info->commit_root_sem);
+ ret = changed_cb(left_path, right_path,
+ &right_key,
+ BTRFS_COMPARE_TREE_DELETED,
+ sctx);
+ if (ret < 0)
+ goto out;
+ down_read(&fs_info->commit_root_sem);
+ }
+ advance_right = ADVANCE;
+ continue;
+ } else if (right_end_reached) {
+ if (left_level == 0) {
+ up_read(&fs_info->commit_root_sem);
+ ret = changed_cb(left_path, right_path,
+ &left_key,
+ BTRFS_COMPARE_TREE_NEW,
+ sctx);
+ if (ret < 0)
+ goto out;
+ down_read(&fs_info->commit_root_sem);
+ }
+ advance_left = ADVANCE;
+ continue;
+ }
+
+ if (left_level == 0 && right_level == 0) {
+ up_read(&fs_info->commit_root_sem);
+ cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
+ if (cmp < 0) {
+ ret = changed_cb(left_path, right_path,
+ &left_key,
+ BTRFS_COMPARE_TREE_NEW,
+ sctx);
+ advance_left = ADVANCE;
+ } else if (cmp > 0) {
+ ret = changed_cb(left_path, right_path,
+ &right_key,
+ BTRFS_COMPARE_TREE_DELETED,
+ sctx);
+ advance_right = ADVANCE;
+ } else {
+ enum btrfs_compare_tree_result result;
+
+ WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
+ ret = tree_compare_item(left_path, right_path,
+ tmp_buf);
+ if (ret)
+ result = BTRFS_COMPARE_TREE_CHANGED;
+ else
+ result = BTRFS_COMPARE_TREE_SAME;
+ ret = changed_cb(left_path, right_path,
+ &left_key, result, sctx);
+ advance_left = ADVANCE;
+ advance_right = ADVANCE;
+ }
+
+ if (ret < 0)
+ goto out;
+ down_read(&fs_info->commit_root_sem);
+ } else if (left_level == right_level) {
+ cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
+ if (cmp < 0) {
+ advance_left = ADVANCE;
+ } else if (cmp > 0) {
+ advance_right = ADVANCE;
+ } else {
+ left_blockptr = btrfs_node_blockptr(
+ left_path->nodes[left_level],
+ left_path->slots[left_level]);
+ right_blockptr = btrfs_node_blockptr(
+ right_path->nodes[right_level],
+ right_path->slots[right_level]);
+ left_gen = btrfs_node_ptr_generation(
+ left_path->nodes[left_level],
+ left_path->slots[left_level]);
+ right_gen = btrfs_node_ptr_generation(
+ right_path->nodes[right_level],
+ right_path->slots[right_level]);
+ if (left_blockptr == right_blockptr &&
+ left_gen == right_gen) {
+ /*
+ * As we're on a shared block, don't
+ * allow to go deeper.
+ */
+ advance_left = ADVANCE_ONLY_NEXT;
+ advance_right = ADVANCE_ONLY_NEXT;
+ } else {
+ advance_left = ADVANCE;
+ advance_right = ADVANCE;
+ }
+ }
+ } else if (left_level < right_level) {
+ advance_right = ADVANCE;
+ } else {
+ advance_left = ADVANCE;
+ }
+ }
+
+out_unlock:
+ up_read(&fs_info->commit_root_sem);
+out:
+ btrfs_free_path(left_path);
+ btrfs_free_path(right_path);
+ kvfree(tmp_buf);
+ return ret;
+}
+
+static int send_subvol(struct send_ctx *sctx)
+{
+ int ret;
+
+ if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_STREAM_HEADER)) {
+ ret = send_header(sctx);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = send_subvol_begin(sctx);
+ if (ret < 0)
+ goto out;
+
+ if (sctx->parent_root) {
+ ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, sctx);
+ if (ret < 0)
+ goto out;
+ ret = finish_inode_if_needed(sctx, 1);
+ if (ret < 0)
+ goto out;
+ } else {
+ ret = full_send_tree(sctx);
+ if (ret < 0)
+ goto out;
+ }
+
+out:
+ free_recorded_refs(sctx);
+ return ret;
+}
+
+/*
+ * If orphan cleanup did remove any orphans from a root, it means the tree
+ * was modified and therefore the commit root is not the same as the current
+ * root anymore. This is a problem, because send uses the commit root and
+ * therefore can see inode items that don't exist in the current root anymore,
+ * and for example make calls to btrfs_iget, which will do tree lookups based
+ * on the current root and not on the commit root. Those lookups will fail,
+ * returning a -ESTALE error, and making send fail with that error. So make
+ * sure a send does not see any orphans we have just removed, and that it will
+ * see the same inodes regardless of whether a transaction commit happened
+ * before it started (meaning that the commit root will be the same as the
+ * current root) or not.
+ */
+static int ensure_commit_roots_uptodate(struct send_ctx *sctx)
+{
+ int i;
+ struct btrfs_trans_handle *trans = NULL;
+
+again:
+ if (sctx->parent_root &&
+ sctx->parent_root->node != sctx->parent_root->commit_root)
+ goto commit_trans;
+
+ for (i = 0; i < sctx->clone_roots_cnt; i++)
+ if (sctx->clone_roots[i].root->node !=
+ sctx->clone_roots[i].root->commit_root)
+ goto commit_trans;
+
+ if (trans)
+ return btrfs_end_transaction(trans);
+
+ return 0;
+
+commit_trans:
+ /* Use any root, all fs roots will get their commit roots updated. */
+ if (!trans) {
+ trans = btrfs_join_transaction(sctx->send_root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ goto again;
+ }
+
+ return btrfs_commit_transaction(trans);
+}
+
+/*
+ * Make sure any existing dellaloc is flushed for any root used by a send
+ * operation so that we do not miss any data and we do not race with writeback
+ * finishing and changing a tree while send is using the tree. This could
+ * happen if a subvolume is in RW mode, has delalloc, is turned to RO mode and
+ * a send operation then uses the subvolume.
+ * After flushing delalloc ensure_commit_roots_uptodate() must be called.
+ */
+static int flush_delalloc_roots(struct send_ctx *sctx)
+{
+ struct btrfs_root *root = sctx->parent_root;
+ int ret;
+ int i;
+
+ if (root) {
+ ret = btrfs_start_delalloc_snapshot(root, false);
+ if (ret)
+ return ret;
+ btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
+ }
+
+ for (i = 0; i < sctx->clone_roots_cnt; i++) {
+ root = sctx->clone_roots[i].root;
+ ret = btrfs_start_delalloc_snapshot(root, false);
+ if (ret)
+ return ret;
+ btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
+ }
+
+ return 0;
+}
+
+static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
+{
+ spin_lock(&root->root_item_lock);
+ root->send_in_progress--;
+ /*
+ * Not much left to do, we don't know why it's unbalanced and
+ * can't blindly reset it to 0.
+ */
+ if (root->send_in_progress < 0)
+ btrfs_err(root->fs_info,
+ "send_in_progress unbalanced %d root %llu",
+ root->send_in_progress, root->root_key.objectid);
+ spin_unlock(&root->root_item_lock);
+}
+
+static void dedupe_in_progress_warn(const struct btrfs_root *root)
+{
+ btrfs_warn_rl(root->fs_info,
+"cannot use root %llu for send while deduplications on it are in progress (%d in progress)",
+ root->root_key.objectid, root->dedupe_in_progress);
+}
+
+long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
+{
+ int ret = 0;
+ struct btrfs_root *send_root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = send_root->fs_info;
+ struct btrfs_root *clone_root;
+ struct send_ctx *sctx = NULL;
+ u32 i;
+ u64 *clone_sources_tmp = NULL;
+ int clone_sources_to_rollback = 0;
+ size_t alloc_size;
+ int sort_clone_roots = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /*
+ * The subvolume must remain read-only during send, protect against
+ * making it RW. This also protects against deletion.
+ */
+ spin_lock(&send_root->root_item_lock);
+ if (btrfs_root_readonly(send_root) && send_root->dedupe_in_progress) {
+ dedupe_in_progress_warn(send_root);
+ spin_unlock(&send_root->root_item_lock);
+ return -EAGAIN;
+ }
+ send_root->send_in_progress++;
+ spin_unlock(&send_root->root_item_lock);
+
+ /*
+ * Userspace tools do the checks and warn the user if it's
+ * not RO.
+ */
+ if (!btrfs_root_readonly(send_root)) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ /*
+ * Check that we don't overflow at later allocations, we request
+ * clone_sources_count + 1 items, and compare to unsigned long inside
+ * access_ok. Also set an upper limit for allocation size so this can't
+ * easily exhaust memory. Max number of clone sources is about 200K.
+ */
+ if (arg->clone_sources_count > SZ_8M / sizeof(struct clone_root)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (arg->flags & ~BTRFS_SEND_FLAG_MASK) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ sctx = kzalloc(sizeof(struct send_ctx), GFP_KERNEL);
+ if (!sctx) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ INIT_LIST_HEAD(&sctx->new_refs);
+ INIT_LIST_HEAD(&sctx->deleted_refs);
+ INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL);
+ INIT_LIST_HEAD(&sctx->name_cache_list);
+
+ sctx->flags = arg->flags;
+
+ if (arg->flags & BTRFS_SEND_FLAG_VERSION) {
+ if (arg->version > BTRFS_SEND_STREAM_VERSION) {
+ ret = -EPROTO;
+ goto out;
+ }
+ /* Zero means "use the highest version" */
+ sctx->proto = arg->version ?: BTRFS_SEND_STREAM_VERSION;
+ } else {
+ sctx->proto = 1;
+ }
+ if ((arg->flags & BTRFS_SEND_FLAG_COMPRESSED) && sctx->proto < 2) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ sctx->send_filp = fget(arg->send_fd);
+ if (!sctx->send_filp || !(sctx->send_filp->f_mode & FMODE_WRITE)) {
+ ret = -EBADF;
+ goto out;
+ }
+
+ sctx->send_root = send_root;
+ /*
+ * Unlikely but possible, if the subvolume is marked for deletion but
+ * is slow to remove the directory entry, send can still be started
+ */
+ if (btrfs_root_dead(sctx->send_root)) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ sctx->clone_roots_cnt = arg->clone_sources_count;
+
+ if (sctx->proto >= 2) {
+ u32 send_buf_num_pages;
+
+ sctx->send_max_size = ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE);
+ sctx->send_buf = vmalloc(sctx->send_max_size);
+ if (!sctx->send_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ send_buf_num_pages = sctx->send_max_size >> PAGE_SHIFT;
+ sctx->send_buf_pages = kcalloc(send_buf_num_pages,
+ sizeof(*sctx->send_buf_pages),
+ GFP_KERNEL);
+ if (!sctx->send_buf_pages) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ for (i = 0; i < send_buf_num_pages; i++) {
+ sctx->send_buf_pages[i] =
+ vmalloc_to_page(sctx->send_buf + (i << PAGE_SHIFT));
+ }
+ } else {
+ sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V1;
+ sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL);
+ }
+ if (!sctx->send_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ sctx->pending_dir_moves = RB_ROOT;
+ sctx->waiting_dir_moves = RB_ROOT;
+ sctx->orphan_dirs = RB_ROOT;
+ sctx->rbtree_new_refs = RB_ROOT;
+ sctx->rbtree_deleted_refs = RB_ROOT;
+
+ sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots),
+ arg->clone_sources_count + 1,
+ GFP_KERNEL);
+ if (!sctx->clone_roots) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ alloc_size = array_size(sizeof(*arg->clone_sources),
+ arg->clone_sources_count);
+
+ if (arg->clone_sources_count) {
+ clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL);
+ if (!clone_sources_tmp) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = copy_from_user(clone_sources_tmp, arg->clone_sources,
+ alloc_size);
+ if (ret) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ for (i = 0; i < arg->clone_sources_count; i++) {
+ clone_root = btrfs_get_fs_root(fs_info,
+ clone_sources_tmp[i], true);
+ if (IS_ERR(clone_root)) {
+ ret = PTR_ERR(clone_root);
+ goto out;
+ }
+ spin_lock(&clone_root->root_item_lock);
+ if (!btrfs_root_readonly(clone_root) ||
+ btrfs_root_dead(clone_root)) {
+ spin_unlock(&clone_root->root_item_lock);
+ btrfs_put_root(clone_root);
+ ret = -EPERM;
+ goto out;
+ }
+ if (clone_root->dedupe_in_progress) {
+ dedupe_in_progress_warn(clone_root);
+ spin_unlock(&clone_root->root_item_lock);
+ btrfs_put_root(clone_root);
+ ret = -EAGAIN;
+ goto out;
+ }
+ clone_root->send_in_progress++;
+ spin_unlock(&clone_root->root_item_lock);
+
+ sctx->clone_roots[i].root = clone_root;
+ clone_sources_to_rollback = i + 1;
+ }
+ kvfree(clone_sources_tmp);
+ clone_sources_tmp = NULL;
+ }
+
+ if (arg->parent_root) {
+ sctx->parent_root = btrfs_get_fs_root(fs_info, arg->parent_root,
+ true);
+ if (IS_ERR(sctx->parent_root)) {
+ ret = PTR_ERR(sctx->parent_root);
+ goto out;
+ }
+
+ spin_lock(&sctx->parent_root->root_item_lock);
+ sctx->parent_root->send_in_progress++;
+ if (!btrfs_root_readonly(sctx->parent_root) ||
+ btrfs_root_dead(sctx->parent_root)) {
+ spin_unlock(&sctx->parent_root->root_item_lock);
+ ret = -EPERM;
+ goto out;
+ }
+ if (sctx->parent_root->dedupe_in_progress) {
+ dedupe_in_progress_warn(sctx->parent_root);
+ spin_unlock(&sctx->parent_root->root_item_lock);
+ ret = -EAGAIN;
+ goto out;
+ }
+ spin_unlock(&sctx->parent_root->root_item_lock);
+ }
+
+ /*
+ * Clones from send_root are allowed, but only if the clone source
+ * is behind the current send position. This is checked while searching
+ * for possible clone sources.
+ */
+ sctx->clone_roots[sctx->clone_roots_cnt++].root =
+ btrfs_grab_root(sctx->send_root);
+
+ /* We do a bsearch later */
+ sort(sctx->clone_roots, sctx->clone_roots_cnt,
+ sizeof(*sctx->clone_roots), __clone_root_cmp_sort,
+ NULL);
+ sort_clone_roots = 1;
+
+ ret = flush_delalloc_roots(sctx);
+ if (ret)
+ goto out;
+
+ ret = ensure_commit_roots_uptodate(sctx);
+ if (ret)
+ goto out;
+
+ ret = send_subvol(sctx);
+ if (ret < 0)
+ goto out;
+
+ if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) {
+ ret = begin_cmd(sctx, BTRFS_SEND_C_END);
+ if (ret < 0)
+ goto out;
+ ret = send_cmd(sctx);
+ if (ret < 0)
+ goto out;
+ }
+
+out:
+ WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves));
+ while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) {
+ struct rb_node *n;
+ struct pending_dir_move *pm;
+
+ n = rb_first(&sctx->pending_dir_moves);
+ pm = rb_entry(n, struct pending_dir_move, node);
+ while (!list_empty(&pm->list)) {
+ struct pending_dir_move *pm2;
+
+ pm2 = list_first_entry(&pm->list,
+ struct pending_dir_move, list);
+ free_pending_move(sctx, pm2);
+ }
+ free_pending_move(sctx, pm);
+ }
+
+ WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves));
+ while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) {
+ struct rb_node *n;
+ struct waiting_dir_move *dm;
+
+ n = rb_first(&sctx->waiting_dir_moves);
+ dm = rb_entry(n, struct waiting_dir_move, node);
+ rb_erase(&dm->node, &sctx->waiting_dir_moves);
+ kfree(dm);
+ }
+
+ WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs));
+ while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
+ struct rb_node *n;
+ struct orphan_dir_info *odi;
+
+ n = rb_first(&sctx->orphan_dirs);
+ odi = rb_entry(n, struct orphan_dir_info, node);
+ free_orphan_dir_info(sctx, odi);
+ }
+
+ if (sort_clone_roots) {
+ for (i = 0; i < sctx->clone_roots_cnt; i++) {
+ btrfs_root_dec_send_in_progress(
+ sctx->clone_roots[i].root);
+ btrfs_put_root(sctx->clone_roots[i].root);
+ }
+ } else {
+ for (i = 0; sctx && i < clone_sources_to_rollback; i++) {
+ btrfs_root_dec_send_in_progress(
+ sctx->clone_roots[i].root);
+ btrfs_put_root(sctx->clone_roots[i].root);
+ }
+
+ btrfs_root_dec_send_in_progress(send_root);
+ }
+ if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) {
+ btrfs_root_dec_send_in_progress(sctx->parent_root);
+ btrfs_put_root(sctx->parent_root);
+ }
+
+ kvfree(clone_sources_tmp);
+
+ if (sctx) {
+ if (sctx->send_filp)
+ fput(sctx->send_filp);
+
+ kvfree(sctx->clone_roots);
+ kfree(sctx->send_buf_pages);
+ kvfree(sctx->send_buf);
+ kvfree(sctx->verity_descriptor);
+
+ name_cache_free(sctx);
+
+ close_current_inode(sctx);
+
+ kfree(sctx);
+ }
+
+ return ret;
+}
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
new file mode 100644
index 000000000..f7585cfa7
--- /dev/null
+++ b/fs/btrfs/send.h
@@ -0,0 +1,183 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2012 Alexander Block. All rights reserved.
+ * Copyright (C) 2012 STRATO. All rights reserved.
+ */
+
+#ifndef BTRFS_SEND_H
+#define BTRFS_SEND_H
+
+#include <linux/types.h>
+
+#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
+/* Conditional support for the upcoming protocol version. */
+#ifdef CONFIG_BTRFS_DEBUG
+#define BTRFS_SEND_STREAM_VERSION 3
+#else
+#define BTRFS_SEND_STREAM_VERSION 2
+#endif
+
+/*
+ * In send stream v1, no command is larger than 64K. In send stream v2, no limit
+ * should be assumed.
+ */
+#define BTRFS_SEND_BUF_SIZE_V1 SZ_64K
+
+struct inode;
+struct btrfs_ioctl_send_args;
+
+enum btrfs_tlv_type {
+ BTRFS_TLV_U8,
+ BTRFS_TLV_U16,
+ BTRFS_TLV_U32,
+ BTRFS_TLV_U64,
+ BTRFS_TLV_BINARY,
+ BTRFS_TLV_STRING,
+ BTRFS_TLV_UUID,
+ BTRFS_TLV_TIMESPEC,
+};
+
+struct btrfs_stream_header {
+ char magic[sizeof(BTRFS_SEND_STREAM_MAGIC)];
+ __le32 version;
+} __attribute__ ((__packed__));
+
+struct btrfs_cmd_header {
+ /* len excluding the header */
+ __le32 len;
+ __le16 cmd;
+ /* crc including the header with zero crc field */
+ __le32 crc;
+} __attribute__ ((__packed__));
+
+struct btrfs_tlv_header {
+ __le16 tlv_type;
+ /* len excluding the header */
+ __le16 tlv_len;
+} __attribute__ ((__packed__));
+
+/* commands */
+enum btrfs_send_cmd {
+ BTRFS_SEND_C_UNSPEC = 0,
+
+ /* Version 1 */
+ BTRFS_SEND_C_SUBVOL = 1,
+ BTRFS_SEND_C_SNAPSHOT = 2,
+
+ BTRFS_SEND_C_MKFILE = 3,
+ BTRFS_SEND_C_MKDIR = 4,
+ BTRFS_SEND_C_MKNOD = 5,
+ BTRFS_SEND_C_MKFIFO = 6,
+ BTRFS_SEND_C_MKSOCK = 7,
+ BTRFS_SEND_C_SYMLINK = 8,
+
+ BTRFS_SEND_C_RENAME = 9,
+ BTRFS_SEND_C_LINK = 10,
+ BTRFS_SEND_C_UNLINK = 11,
+ BTRFS_SEND_C_RMDIR = 12,
+
+ BTRFS_SEND_C_SET_XATTR = 13,
+ BTRFS_SEND_C_REMOVE_XATTR = 14,
+
+ BTRFS_SEND_C_WRITE = 15,
+ BTRFS_SEND_C_CLONE = 16,
+
+ BTRFS_SEND_C_TRUNCATE = 17,
+ BTRFS_SEND_C_CHMOD = 18,
+ BTRFS_SEND_C_CHOWN = 19,
+ BTRFS_SEND_C_UTIMES = 20,
+
+ BTRFS_SEND_C_END = 21,
+ BTRFS_SEND_C_UPDATE_EXTENT = 22,
+ BTRFS_SEND_C_MAX_V1 = 22,
+
+ /* Version 2 */
+ BTRFS_SEND_C_FALLOCATE = 23,
+ BTRFS_SEND_C_FILEATTR = 24,
+ BTRFS_SEND_C_ENCODED_WRITE = 25,
+ BTRFS_SEND_C_MAX_V2 = 25,
+
+ /* Version 3 */
+ BTRFS_SEND_C_ENABLE_VERITY = 26,
+ BTRFS_SEND_C_MAX_V3 = 26,
+ /* End */
+ BTRFS_SEND_C_MAX = 26,
+};
+
+/* attributes in send stream */
+enum {
+ BTRFS_SEND_A_UNSPEC = 0,
+
+ /* Version 1 */
+ BTRFS_SEND_A_UUID = 1,
+ BTRFS_SEND_A_CTRANSID = 2,
+
+ BTRFS_SEND_A_INO = 3,
+ BTRFS_SEND_A_SIZE = 4,
+ BTRFS_SEND_A_MODE = 5,
+ BTRFS_SEND_A_UID = 6,
+ BTRFS_SEND_A_GID = 7,
+ BTRFS_SEND_A_RDEV = 8,
+ BTRFS_SEND_A_CTIME = 9,
+ BTRFS_SEND_A_MTIME = 10,
+ BTRFS_SEND_A_ATIME = 11,
+ BTRFS_SEND_A_OTIME = 12,
+
+ BTRFS_SEND_A_XATTR_NAME = 13,
+ BTRFS_SEND_A_XATTR_DATA = 14,
+
+ BTRFS_SEND_A_PATH = 15,
+ BTRFS_SEND_A_PATH_TO = 16,
+ BTRFS_SEND_A_PATH_LINK = 17,
+
+ BTRFS_SEND_A_FILE_OFFSET = 18,
+ /*
+ * As of send stream v2, this attribute is special: it must be the last
+ * attribute in a command, its header contains only the type, and its
+ * length is implicitly the remaining length of the command.
+ */
+ BTRFS_SEND_A_DATA = 19,
+
+ BTRFS_SEND_A_CLONE_UUID = 20,
+ BTRFS_SEND_A_CLONE_CTRANSID = 21,
+ BTRFS_SEND_A_CLONE_PATH = 22,
+ BTRFS_SEND_A_CLONE_OFFSET = 23,
+ BTRFS_SEND_A_CLONE_LEN = 24,
+
+ BTRFS_SEND_A_MAX_V1 = 24,
+
+ /* Version 2 */
+ BTRFS_SEND_A_FALLOCATE_MODE = 25,
+
+ /*
+ * File attributes from the FS_*_FL namespace (i_flags, xflags),
+ * translated to BTRFS_INODE_* bits (BTRFS_INODE_FLAG_MASK) and stored
+ * in btrfs_inode_item::flags (represented by btrfs_inode::flags and
+ * btrfs_inode::ro_flags).
+ */
+ BTRFS_SEND_A_FILEATTR = 26,
+
+ BTRFS_SEND_A_UNENCODED_FILE_LEN = 27,
+ BTRFS_SEND_A_UNENCODED_LEN = 28,
+ BTRFS_SEND_A_UNENCODED_OFFSET = 29,
+ /*
+ * COMPRESSION and ENCRYPTION default to NONE (0) if omitted from
+ * BTRFS_SEND_C_ENCODED_WRITE.
+ */
+ BTRFS_SEND_A_COMPRESSION = 30,
+ BTRFS_SEND_A_ENCRYPTION = 31,
+ BTRFS_SEND_A_MAX_V2 = 31,
+
+ /* Version 3 */
+ BTRFS_SEND_A_VERITY_ALGORITHM = 32,
+ BTRFS_SEND_A_VERITY_BLOCK_SIZE = 33,
+ BTRFS_SEND_A_VERITY_SALT_DATA = 34,
+ BTRFS_SEND_A_VERITY_SIG_DATA = 35,
+ BTRFS_SEND_A_MAX_V3 = 35,
+
+ __BTRFS_SEND_A_MAX = 35,
+};
+
+long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg);
+
+#endif
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
new file mode 100644
index 000000000..2635fb4bf
--- /dev/null
+++ b/fs/btrfs/space-info.c
@@ -0,0 +1,1786 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "misc.h"
+#include "ctree.h"
+#include "space-info.h"
+#include "sysfs.h"
+#include "volumes.h"
+#include "free-space-cache.h"
+#include "ordered-data.h"
+#include "transaction.h"
+#include "block-group.h"
+#include "zoned.h"
+
+/*
+ * HOW DOES SPACE RESERVATION WORK
+ *
+ * If you want to know about delalloc specifically, there is a separate comment
+ * for that with the delalloc code. This comment is about how the whole system
+ * works generally.
+ *
+ * BASIC CONCEPTS
+ *
+ * 1) space_info. This is the ultimate arbiter of how much space we can use.
+ * There's a description of the bytes_ fields with the struct declaration,
+ * refer to that for specifics on each field. Suffice it to say that for
+ * reservations we care about total_bytes - SUM(space_info->bytes_) when
+ * determining if there is space to make an allocation. There is a space_info
+ * for METADATA, SYSTEM, and DATA areas.
+ *
+ * 2) block_rsv's. These are basically buckets for every different type of
+ * metadata reservation we have. You can see the comment in the block_rsv
+ * code on the rules for each type, but generally block_rsv->reserved is how
+ * much space is accounted for in space_info->bytes_may_use.
+ *
+ * 3) btrfs_calc*_size. These are the worst case calculations we used based
+ * on the number of items we will want to modify. We have one for changing
+ * items, and one for inserting new items. Generally we use these helpers to
+ * determine the size of the block reserves, and then use the actual bytes
+ * values to adjust the space_info counters.
+ *
+ * MAKING RESERVATIONS, THE NORMAL CASE
+ *
+ * We call into either btrfs_reserve_data_bytes() or
+ * btrfs_reserve_metadata_bytes(), depending on which we're looking for, with
+ * num_bytes we want to reserve.
+ *
+ * ->reserve
+ * space_info->bytes_may_reserve += num_bytes
+ *
+ * ->extent allocation
+ * Call btrfs_add_reserved_bytes() which does
+ * space_info->bytes_may_reserve -= num_bytes
+ * space_info->bytes_reserved += extent_bytes
+ *
+ * ->insert reference
+ * Call btrfs_update_block_group() which does
+ * space_info->bytes_reserved -= extent_bytes
+ * space_info->bytes_used += extent_bytes
+ *
+ * MAKING RESERVATIONS, FLUSHING NORMALLY (non-priority)
+ *
+ * Assume we are unable to simply make the reservation because we do not have
+ * enough space
+ *
+ * -> __reserve_bytes
+ * create a reserve_ticket with ->bytes set to our reservation, add it to
+ * the tail of space_info->tickets, kick async flush thread
+ *
+ * ->handle_reserve_ticket
+ * wait on ticket->wait for ->bytes to be reduced to 0, or ->error to be set
+ * on the ticket.
+ *
+ * -> btrfs_async_reclaim_metadata_space/btrfs_async_reclaim_data_space
+ * Flushes various things attempting to free up space.
+ *
+ * -> btrfs_try_granting_tickets()
+ * This is called by anything that either subtracts space from
+ * space_info->bytes_may_use, ->bytes_pinned, etc, or adds to the
+ * space_info->total_bytes. This loops through the ->priority_tickets and
+ * then the ->tickets list checking to see if the reservation can be
+ * completed. If it can the space is added to space_info->bytes_may_use and
+ * the ticket is woken up.
+ *
+ * -> ticket wakeup
+ * Check if ->bytes == 0, if it does we got our reservation and we can carry
+ * on, if not return the appropriate error (ENOSPC, but can be EINTR if we
+ * were interrupted.)
+ *
+ * MAKING RESERVATIONS, FLUSHING HIGH PRIORITY
+ *
+ * Same as the above, except we add ourselves to the
+ * space_info->priority_tickets, and we do not use ticket->wait, we simply
+ * call flush_space() ourselves for the states that are safe for us to call
+ * without deadlocking and hope for the best.
+ *
+ * THE FLUSHING STATES
+ *
+ * Generally speaking we will have two cases for each state, a "nice" state
+ * and a "ALL THE THINGS" state. In btrfs we delay a lot of work in order to
+ * reduce the locking over head on the various trees, and even to keep from
+ * doing any work at all in the case of delayed refs. Each of these delayed
+ * things however hold reservations, and so letting them run allows us to
+ * reclaim space so we can make new reservations.
+ *
+ * FLUSH_DELAYED_ITEMS
+ * Every inode has a delayed item to update the inode. Take a simple write
+ * for example, we would update the inode item at write time to update the
+ * mtime, and then again at finish_ordered_io() time in order to update the
+ * isize or bytes. We keep these delayed items to coalesce these operations
+ * into a single operation done on demand. These are an easy way to reclaim
+ * metadata space.
+ *
+ * FLUSH_DELALLOC
+ * Look at the delalloc comment to get an idea of how much space is reserved
+ * for delayed allocation. We can reclaim some of this space simply by
+ * running delalloc, but usually we need to wait for ordered extents to
+ * reclaim the bulk of this space.
+ *
+ * FLUSH_DELAYED_REFS
+ * We have a block reserve for the outstanding delayed refs space, and every
+ * delayed ref operation holds a reservation. Running these is a quick way
+ * to reclaim space, but we want to hold this until the end because COW can
+ * churn a lot and we can avoid making some extent tree modifications if we
+ * are able to delay for as long as possible.
+ *
+ * ALLOC_CHUNK
+ * We will skip this the first time through space reservation, because of
+ * overcommit and we don't want to have a lot of useless metadata space when
+ * our worst case reservations will likely never come true.
+ *
+ * RUN_DELAYED_IPUTS
+ * If we're freeing inodes we're likely freeing checksums, file extent
+ * items, and extent tree items. Loads of space could be freed up by these
+ * operations, however they won't be usable until the transaction commits.
+ *
+ * COMMIT_TRANS
+ * This will commit the transaction. Historically we had a lot of logic
+ * surrounding whether or not we'd commit the transaction, but this waits born
+ * out of a pre-tickets era where we could end up committing the transaction
+ * thousands of times in a row without making progress. Now thanks to our
+ * ticketing system we know if we're not making progress and can error
+ * everybody out after a few commits rather than burning the disk hoping for
+ * a different answer.
+ *
+ * OVERCOMMIT
+ *
+ * Because we hold so many reservations for metadata we will allow you to
+ * reserve more space than is currently free in the currently allocate
+ * metadata space. This only happens with metadata, data does not allow
+ * overcommitting.
+ *
+ * You can see the current logic for when we allow overcommit in
+ * btrfs_can_overcommit(), but it only applies to unallocated space. If there
+ * is no unallocated space to be had, all reservations are kept within the
+ * free space in the allocated metadata chunks.
+ *
+ * Because of overcommitting, you generally want to use the
+ * btrfs_can_overcommit() logic for metadata allocations, as it does the right
+ * thing with or without extra unallocated space.
+ */
+
+u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
+ bool may_use_included)
+{
+ ASSERT(s_info);
+ return s_info->bytes_used + s_info->bytes_reserved +
+ s_info->bytes_pinned + s_info->bytes_readonly +
+ s_info->bytes_zone_unusable +
+ (may_use_included ? s_info->bytes_may_use : 0);
+}
+
+/*
+ * after adding space to the filesystem, we need to clear the full flags
+ * on all the space infos.
+ */
+void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
+{
+ struct list_head *head = &info->space_info;
+ struct btrfs_space_info *found;
+
+ list_for_each_entry(found, head, list)
+ found->full = 0;
+}
+
+/*
+ * Block groups with more than this value (percents) of unusable space will be
+ * scheduled for background reclaim.
+ */
+#define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75)
+
+/*
+ * Calculate chunk size depending on volume type (regular or zoned).
+ */
+static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags)
+{
+ if (btrfs_is_zoned(fs_info))
+ return fs_info->zone_size;
+
+ ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
+
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ return BTRFS_MAX_DATA_CHUNK_SIZE;
+ else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ return SZ_32M;
+
+ /* Handle BTRFS_BLOCK_GROUP_METADATA */
+ if (fs_info->fs_devices->total_rw_bytes > 50ULL * SZ_1G)
+ return SZ_1G;
+
+ return SZ_256M;
+}
+
+/*
+ * Update default chunk size.
+ */
+void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
+ u64 chunk_size)
+{
+ WRITE_ONCE(space_info->chunk_size, chunk_size);
+}
+
+static int create_space_info(struct btrfs_fs_info *info, u64 flags)
+{
+
+ struct btrfs_space_info *space_info;
+ int i;
+ int ret;
+
+ space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
+ if (!space_info)
+ return -ENOMEM;
+
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+ INIT_LIST_HEAD(&space_info->block_groups[i]);
+ init_rwsem(&space_info->groups_sem);
+ spin_lock_init(&space_info->lock);
+ space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
+ space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
+ INIT_LIST_HEAD(&space_info->ro_bgs);
+ INIT_LIST_HEAD(&space_info->tickets);
+ INIT_LIST_HEAD(&space_info->priority_tickets);
+ space_info->clamp = 1;
+ btrfs_update_space_info_chunk_size(space_info, calc_chunk_size(info, flags));
+
+ if (btrfs_is_zoned(info))
+ space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH;
+
+ ret = btrfs_sysfs_add_space_info_type(info, space_info);
+ if (ret)
+ return ret;
+
+ list_add(&space_info->list, &info->space_info);
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ info->data_sinfo = space_info;
+
+ return ret;
+}
+
+int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_super_block *disk_super;
+ u64 features;
+ u64 flags;
+ int mixed = 0;
+ int ret;
+
+ disk_super = fs_info->super_copy;
+ if (!btrfs_super_root(disk_super))
+ return -EINVAL;
+
+ features = btrfs_super_incompat_flags(disk_super);
+ if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+ mixed = 1;
+
+ flags = BTRFS_BLOCK_GROUP_SYSTEM;
+ ret = create_space_info(fs_info, flags);
+ if (ret)
+ goto out;
+
+ if (mixed) {
+ flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
+ ret = create_space_info(fs_info, flags);
+ } else {
+ flags = BTRFS_BLOCK_GROUP_METADATA;
+ ret = create_space_info(fs_info, flags);
+ if (ret)
+ goto out;
+
+ flags = BTRFS_BLOCK_GROUP_DATA;
+ ret = create_space_info(fs_info, flags);
+ }
+out:
+ return ret;
+}
+
+void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
+ struct btrfs_block_group *block_group)
+{
+ struct btrfs_space_info *found;
+ int factor, index;
+
+ factor = btrfs_bg_type_to_factor(block_group->flags);
+
+ found = btrfs_find_space_info(info, block_group->flags);
+ ASSERT(found);
+ spin_lock(&found->lock);
+ found->total_bytes += block_group->length;
+ if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
+ found->active_total_bytes += block_group->length;
+ found->disk_total += block_group->length * factor;
+ found->bytes_used += block_group->used;
+ found->disk_used += block_group->used * factor;
+ found->bytes_readonly += block_group->bytes_super;
+ found->bytes_zone_unusable += block_group->zone_unusable;
+ if (block_group->length > 0)
+ found->full = 0;
+ btrfs_try_granting_tickets(info, found);
+ spin_unlock(&found->lock);
+
+ block_group->space_info = found;
+
+ index = btrfs_bg_flags_to_raid_index(block_group->flags);
+ down_write(&found->groups_sem);
+ list_add_tail(&block_group->list, &found->block_groups[index]);
+ up_write(&found->groups_sem);
+}
+
+struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
+ u64 flags)
+{
+ struct list_head *head = &info->space_info;
+ struct btrfs_space_info *found;
+
+ flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
+
+ list_for_each_entry(found, head, list) {
+ if (found->flags & flags)
+ return found;
+ }
+ return NULL;
+}
+
+static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ enum btrfs_reserve_flush_enum flush)
+{
+ u64 profile;
+ u64 avail;
+ int factor;
+
+ if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ profile = btrfs_system_alloc_profile(fs_info);
+ else
+ profile = btrfs_metadata_alloc_profile(fs_info);
+
+ avail = atomic64_read(&fs_info->free_chunk_space);
+
+ /*
+ * If we have dup, raid1 or raid10 then only half of the free
+ * space is actually usable. For raid56, the space info used
+ * doesn't include the parity drive, so we don't have to
+ * change the math
+ */
+ factor = btrfs_bg_type_to_factor(profile);
+ avail = div_u64(avail, factor);
+
+ /*
+ * If we aren't flushing all things, let us overcommit up to
+ * 1/2th of the space. If we can flush, don't let us overcommit
+ * too much, let it overcommit up to 1/8 of the space.
+ */
+ if (flush == BTRFS_RESERVE_FLUSH_ALL)
+ avail >>= 3;
+ else
+ avail >>= 1;
+ return avail;
+}
+
+static inline u64 writable_total_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info)
+{
+ /*
+ * On regular filesystem, all total_bytes are always writable. On zoned
+ * filesystem, there may be a limitation imposed by max_active_zones.
+ * For metadata allocation, we cannot finish an existing active block
+ * group to avoid a deadlock. Thus, we need to consider only the active
+ * groups to be writable for metadata space.
+ */
+ if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
+ return space_info->total_bytes;
+
+ return space_info->active_total_bytes;
+}
+
+int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info, u64 bytes,
+ enum btrfs_reserve_flush_enum flush)
+{
+ u64 avail;
+ u64 used;
+
+ /* Don't overcommit when in mixed mode */
+ if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
+ return 0;
+
+ used = btrfs_space_info_used(space_info, true);
+ avail = calc_available_free_space(fs_info, space_info, flush);
+
+ if (used + bytes < writable_total_bytes(fs_info, space_info) + avail)
+ return 1;
+ return 0;
+}
+
+static void remove_ticket(struct btrfs_space_info *space_info,
+ struct reserve_ticket *ticket)
+{
+ if (!list_empty(&ticket->list)) {
+ list_del_init(&ticket->list);
+ ASSERT(space_info->reclaim_size >= ticket->bytes);
+ space_info->reclaim_size -= ticket->bytes;
+ }
+}
+
+/*
+ * This is for space we already have accounted in space_info->bytes_may_use, so
+ * basically when we're returning space from block_rsv's.
+ */
+void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info)
+{
+ struct list_head *head;
+ enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
+
+ lockdep_assert_held(&space_info->lock);
+
+ head = &space_info->priority_tickets;
+again:
+ while (!list_empty(head)) {
+ struct reserve_ticket *ticket;
+ u64 used = btrfs_space_info_used(space_info, true);
+
+ ticket = list_first_entry(head, struct reserve_ticket, list);
+
+ /* Check and see if our ticket can be satisfied now. */
+ if ((used + ticket->bytes <= writable_total_bytes(fs_info, space_info)) ||
+ btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
+ flush)) {
+ btrfs_space_info_update_bytes_may_use(fs_info,
+ space_info,
+ ticket->bytes);
+ remove_ticket(space_info, ticket);
+ ticket->bytes = 0;
+ space_info->tickets_id++;
+ wake_up(&ticket->wait);
+ } else {
+ break;
+ }
+ }
+
+ if (head == &space_info->priority_tickets) {
+ head = &space_info->tickets;
+ flush = BTRFS_RESERVE_FLUSH_ALL;
+ goto again;
+ }
+}
+
+#define DUMP_BLOCK_RSV(fs_info, rsv_name) \
+do { \
+ struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \
+ spin_lock(&__rsv->lock); \
+ btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \
+ __rsv->size, __rsv->reserved); \
+ spin_unlock(&__rsv->lock); \
+} while (0)
+
+static const char *space_info_flag_to_str(const struct btrfs_space_info *space_info)
+{
+ switch (space_info->flags) {
+ case BTRFS_BLOCK_GROUP_SYSTEM:
+ return "SYSTEM";
+ case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA:
+ return "DATA+METADATA";
+ case BTRFS_BLOCK_GROUP_DATA:
+ return "DATA";
+ case BTRFS_BLOCK_GROUP_METADATA:
+ return "METADATA";
+ default:
+ return "UNKNOWN";
+ }
+}
+
+static void dump_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+ DUMP_BLOCK_RSV(fs_info, global_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
+}
+
+static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *info)
+{
+ const char *flag_str = space_info_flag_to_str(info);
+ lockdep_assert_held(&info->lock);
+
+ /* The free space could be negative in case of overcommit */
+ btrfs_info(fs_info, "space_info %s has %lld free, is %sfull",
+ flag_str,
+ (s64)(info->total_bytes - btrfs_space_info_used(info, true)),
+ info->full ? "" : "not ");
+ btrfs_info(fs_info,
+"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu",
+ info->total_bytes, info->bytes_used, info->bytes_pinned,
+ info->bytes_reserved, info->bytes_may_use,
+ info->bytes_readonly, info->bytes_zone_unusable);
+}
+
+void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *info, u64 bytes,
+ int dump_block_groups)
+{
+ struct btrfs_block_group *cache;
+ int index = 0;
+
+ spin_lock(&info->lock);
+ __btrfs_dump_space_info(fs_info, info);
+ dump_global_block_rsv(fs_info);
+ spin_unlock(&info->lock);
+
+ if (!dump_block_groups)
+ return;
+
+ down_read(&info->groups_sem);
+again:
+ list_for_each_entry(cache, &info->block_groups[index], list) {
+ spin_lock(&cache->lock);
+ btrfs_info(fs_info,
+ "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu zone_unusable %s",
+ cache->start, cache->length, cache->used, cache->pinned,
+ cache->reserved, cache->zone_unusable,
+ cache->ro ? "[readonly]" : "");
+ spin_unlock(&cache->lock);
+ btrfs_dump_free_space(cache, bytes);
+ }
+ if (++index < BTRFS_NR_RAID_TYPES)
+ goto again;
+ up_read(&info->groups_sem);
+}
+
+static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
+ u64 to_reclaim)
+{
+ u64 bytes;
+ u64 nr;
+
+ bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
+ nr = div64_u64(to_reclaim, bytes);
+ if (!nr)
+ nr = 1;
+ return nr;
+}
+
+#define EXTENT_SIZE_PER_ITEM SZ_256K
+
+/*
+ * shrink metadata reservation for delalloc
+ */
+static void shrink_delalloc(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ u64 to_reclaim, bool wait_ordered,
+ bool for_preempt)
+{
+ struct btrfs_trans_handle *trans;
+ u64 delalloc_bytes;
+ u64 ordered_bytes;
+ u64 items;
+ long time_left;
+ int loops;
+
+ delalloc_bytes = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
+ ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
+ if (delalloc_bytes == 0 && ordered_bytes == 0)
+ return;
+
+ /* Calc the number of the pages we need flush for space reservation */
+ if (to_reclaim == U64_MAX) {
+ items = U64_MAX;
+ } else {
+ /*
+ * to_reclaim is set to however much metadata we need to
+ * reclaim, but reclaiming that much data doesn't really track
+ * exactly. What we really want to do is reclaim full inode's
+ * worth of reservations, however that's not available to us
+ * here. We will take a fraction of the delalloc bytes for our
+ * flushing loops and hope for the best. Delalloc will expand
+ * the amount we write to cover an entire dirty extent, which
+ * will reclaim the metadata reservation for that range. If
+ * it's not enough subsequent flush stages will be more
+ * aggressive.
+ */
+ to_reclaim = max(to_reclaim, delalloc_bytes >> 3);
+ items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
+ }
+
+ trans = current->journal_info;
+
+ /*
+ * If we are doing more ordered than delalloc we need to just wait on
+ * ordered extents, otherwise we'll waste time trying to flush delalloc
+ * that likely won't give us the space back we need.
+ */
+ if (ordered_bytes > delalloc_bytes && !for_preempt)
+ wait_ordered = true;
+
+ loops = 0;
+ while ((delalloc_bytes || ordered_bytes) && loops < 3) {
+ u64 temp = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
+ long nr_pages = min_t(u64, temp, LONG_MAX);
+ int async_pages;
+
+ btrfs_start_delalloc_roots(fs_info, nr_pages, true);
+
+ /*
+ * We need to make sure any outstanding async pages are now
+ * processed before we continue. This is because things like
+ * sync_inode() try to be smart and skip writing if the inode is
+ * marked clean. We don't use filemap_fwrite for flushing
+ * because we want to control how many pages we write out at a
+ * time, thus this is the only safe way to make sure we've
+ * waited for outstanding compressed workers to have started
+ * their jobs and thus have ordered extents set up properly.
+ *
+ * This exists because we do not want to wait for each
+ * individual inode to finish its async work, we simply want to
+ * start the IO on everybody, and then come back here and wait
+ * for all of the async work to catch up. Once we're done with
+ * that we know we'll have ordered extents for everything and we
+ * can decide if we wait for that or not.
+ *
+ * If we choose to replace this in the future, make absolutely
+ * sure that the proper waiting is being done in the async case,
+ * as there have been bugs in that area before.
+ */
+ async_pages = atomic_read(&fs_info->async_delalloc_pages);
+ if (!async_pages)
+ goto skip_async;
+
+ /*
+ * We don't want to wait forever, if we wrote less pages in this
+ * loop than we have outstanding, only wait for that number of
+ * pages, otherwise we can wait for all async pages to finish
+ * before continuing.
+ */
+ if (async_pages > nr_pages)
+ async_pages -= nr_pages;
+ else
+ async_pages = 0;
+ wait_event(fs_info->async_submit_wait,
+ atomic_read(&fs_info->async_delalloc_pages) <=
+ async_pages);
+skip_async:
+ loops++;
+ if (wait_ordered && !trans) {
+ btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
+ } else {
+ time_left = schedule_timeout_killable(1);
+ if (time_left)
+ break;
+ }
+
+ /*
+ * If we are for preemption we just want a one-shot of delalloc
+ * flushing so we can stop flushing if we decide we don't need
+ * to anymore.
+ */
+ if (for_preempt)
+ break;
+
+ spin_lock(&space_info->lock);
+ if (list_empty(&space_info->tickets) &&
+ list_empty(&space_info->priority_tickets)) {
+ spin_unlock(&space_info->lock);
+ break;
+ }
+ spin_unlock(&space_info->lock);
+
+ delalloc_bytes = percpu_counter_sum_positive(
+ &fs_info->delalloc_bytes);
+ ordered_bytes = percpu_counter_sum_positive(
+ &fs_info->ordered_bytes);
+ }
+}
+
+/*
+ * Try to flush some data based on policy set by @state. This is only advisory
+ * and may fail for various reasons. The caller is supposed to examine the
+ * state of @space_info to detect the outcome.
+ */
+static void flush_space(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info, u64 num_bytes,
+ enum btrfs_flush_state state, bool for_preempt)
+{
+ struct btrfs_root *root = fs_info->tree_root;
+ struct btrfs_trans_handle *trans;
+ int nr;
+ int ret = 0;
+
+ switch (state) {
+ case FLUSH_DELAYED_ITEMS_NR:
+ case FLUSH_DELAYED_ITEMS:
+ if (state == FLUSH_DELAYED_ITEMS_NR)
+ nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
+ else
+ nr = -1;
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+ ret = btrfs_run_delayed_items_nr(trans, nr);
+ btrfs_end_transaction(trans);
+ break;
+ case FLUSH_DELALLOC:
+ case FLUSH_DELALLOC_WAIT:
+ case FLUSH_DELALLOC_FULL:
+ if (state == FLUSH_DELALLOC_FULL)
+ num_bytes = U64_MAX;
+ shrink_delalloc(fs_info, space_info, num_bytes,
+ state != FLUSH_DELALLOC, for_preempt);
+ break;
+ case FLUSH_DELAYED_REFS_NR:
+ case FLUSH_DELAYED_REFS:
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+ if (state == FLUSH_DELAYED_REFS_NR)
+ nr = calc_reclaim_items_nr(fs_info, num_bytes);
+ else
+ nr = 0;
+ btrfs_run_delayed_refs(trans, nr);
+ btrfs_end_transaction(trans);
+ break;
+ case ALLOC_CHUNK:
+ case ALLOC_CHUNK_FORCE:
+ /*
+ * For metadata space on zoned filesystem, reaching here means we
+ * don't have enough space left in active_total_bytes. Try to
+ * activate a block group first, because we may have inactive
+ * block group already allocated.
+ */
+ ret = btrfs_zoned_activate_one_bg(fs_info, space_info, false);
+ if (ret < 0)
+ break;
+ else if (ret == 1)
+ break;
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+ ret = btrfs_chunk_alloc(trans,
+ btrfs_get_alloc_profile(fs_info, space_info->flags),
+ (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
+ CHUNK_ALLOC_FORCE);
+ btrfs_end_transaction(trans);
+
+ /*
+ * For metadata space on zoned filesystem, allocating a new chunk
+ * is not enough. We still need to activate the block * group.
+ * Active the newly allocated block group by (maybe) finishing
+ * a block group.
+ */
+ if (ret == 1) {
+ ret = btrfs_zoned_activate_one_bg(fs_info, space_info, true);
+ /*
+ * Revert to the original ret regardless we could finish
+ * one block group or not.
+ */
+ if (ret >= 0)
+ ret = 1;
+ }
+
+ if (ret > 0 || ret == -ENOSPC)
+ ret = 0;
+ break;
+ case RUN_DELAYED_IPUTS:
+ /*
+ * If we have pending delayed iputs then we could free up a
+ * bunch of pinned space, so make sure we run the iputs before
+ * we do our pinned bytes check below.
+ */
+ btrfs_run_delayed_iputs(fs_info);
+ btrfs_wait_on_delayed_iputs(fs_info);
+ break;
+ case COMMIT_TRANS:
+ ASSERT(current->journal_info == NULL);
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+ ret = btrfs_commit_transaction(trans);
+ break;
+ default:
+ ret = -ENOSPC;
+ break;
+ }
+
+ trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
+ ret, for_preempt);
+ return;
+}
+
+static inline u64
+btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info)
+{
+ u64 used;
+ u64 avail;
+ u64 total;
+ u64 to_reclaim = space_info->reclaim_size;
+
+ lockdep_assert_held(&space_info->lock);
+
+ avail = calc_available_free_space(fs_info, space_info,
+ BTRFS_RESERVE_FLUSH_ALL);
+ used = btrfs_space_info_used(space_info, true);
+
+ /*
+ * We may be flushing because suddenly we have less space than we had
+ * before, and now we're well over-committed based on our current free
+ * space. If that's the case add in our overage so we make sure to put
+ * appropriate pressure on the flushing state machine.
+ */
+ total = writable_total_bytes(fs_info, space_info);
+ if (total + avail < used)
+ to_reclaim += used - (total + avail);
+
+ return to_reclaim;
+}
+
+static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info)
+{
+ u64 global_rsv_size = fs_info->global_block_rsv.reserved;
+ u64 ordered, delalloc;
+ u64 total = writable_total_bytes(fs_info, space_info);
+ u64 thresh;
+ u64 used;
+
+ thresh = div_factor_fine(total, 90);
+
+ lockdep_assert_held(&space_info->lock);
+
+ /* If we're just plain full then async reclaim just slows us down. */
+ if ((space_info->bytes_used + space_info->bytes_reserved +
+ global_rsv_size) >= thresh)
+ return false;
+
+ used = space_info->bytes_may_use + space_info->bytes_pinned;
+
+ /* The total flushable belongs to the global rsv, don't flush. */
+ if (global_rsv_size >= used)
+ return false;
+
+ /*
+ * 128MiB is 1/4 of the maximum global rsv size. If we have less than
+ * that devoted to other reservations then there's no sense in flushing,
+ * we don't have a lot of things that need flushing.
+ */
+ if (used - global_rsv_size <= SZ_128M)
+ return false;
+
+ /*
+ * We have tickets queued, bail so we don't compete with the async
+ * flushers.
+ */
+ if (space_info->reclaim_size)
+ return false;
+
+ /*
+ * If we have over half of the free space occupied by reservations or
+ * pinned then we want to start flushing.
+ *
+ * We do not do the traditional thing here, which is to say
+ *
+ * if (used >= ((total_bytes + avail) / 2))
+ * return 1;
+ *
+ * because this doesn't quite work how we want. If we had more than 50%
+ * of the space_info used by bytes_used and we had 0 available we'd just
+ * constantly run the background flusher. Instead we want it to kick in
+ * if our reclaimable space exceeds our clamped free space.
+ *
+ * Our clamping range is 2^1 -> 2^8. Practically speaking that means
+ * the following:
+ *
+ * Amount of RAM Minimum threshold Maximum threshold
+ *
+ * 256GiB 1GiB 128GiB
+ * 128GiB 512MiB 64GiB
+ * 64GiB 256MiB 32GiB
+ * 32GiB 128MiB 16GiB
+ * 16GiB 64MiB 8GiB
+ *
+ * These are the range our thresholds will fall in, corresponding to how
+ * much delalloc we need for the background flusher to kick in.
+ */
+
+ thresh = calc_available_free_space(fs_info, space_info,
+ BTRFS_RESERVE_FLUSH_ALL);
+ used = space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_readonly + global_rsv_size;
+ if (used < total)
+ thresh += total - used;
+ thresh >>= space_info->clamp;
+
+ used = space_info->bytes_pinned;
+
+ /*
+ * If we have more ordered bytes than delalloc bytes then we're either
+ * doing a lot of DIO, or we simply don't have a lot of delalloc waiting
+ * around. Preemptive flushing is only useful in that it can free up
+ * space before tickets need to wait for things to finish. In the case
+ * of ordered extents, preemptively waiting on ordered extents gets us
+ * nothing, if our reservations are tied up in ordered extents we'll
+ * simply have to slow down writers by forcing them to wait on ordered
+ * extents.
+ *
+ * In the case that ordered is larger than delalloc, only include the
+ * block reserves that we would actually be able to directly reclaim
+ * from. In this case if we're heavy on metadata operations this will
+ * clearly be heavy enough to warrant preemptive flushing. In the case
+ * of heavy DIO or ordered reservations, preemptive flushing will just
+ * waste time and cause us to slow down.
+ *
+ * We want to make sure we truly are maxed out on ordered however, so
+ * cut ordered in half, and if it's still higher than delalloc then we
+ * can keep flushing. This is to avoid the case where we start
+ * flushing, and now delalloc == ordered and we stop preemptively
+ * flushing when we could still have several gigs of delalloc to flush.
+ */
+ ordered = percpu_counter_read_positive(&fs_info->ordered_bytes) >> 1;
+ delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes);
+ if (ordered >= delalloc)
+ used += fs_info->delayed_refs_rsv.reserved +
+ fs_info->delayed_block_rsv.reserved;
+ else
+ used += space_info->bytes_may_use - global_rsv_size;
+
+ return (used >= thresh && !btrfs_fs_closing(fs_info) &&
+ !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
+}
+
+static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ struct reserve_ticket *ticket)
+{
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ u64 min_bytes;
+
+ if (!ticket->steal)
+ return false;
+
+ if (global_rsv->space_info != space_info)
+ return false;
+
+ spin_lock(&global_rsv->lock);
+ min_bytes = div_factor(global_rsv->size, 1);
+ if (global_rsv->reserved < min_bytes + ticket->bytes) {
+ spin_unlock(&global_rsv->lock);
+ return false;
+ }
+ global_rsv->reserved -= ticket->bytes;
+ remove_ticket(space_info, ticket);
+ ticket->bytes = 0;
+ wake_up(&ticket->wait);
+ space_info->tickets_id++;
+ if (global_rsv->reserved < global_rsv->size)
+ global_rsv->full = 0;
+ spin_unlock(&global_rsv->lock);
+
+ return true;
+}
+
+/*
+ * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets
+ * @fs_info - fs_info for this fs
+ * @space_info - the space info we were flushing
+ *
+ * We call this when we've exhausted our flushing ability and haven't made
+ * progress in satisfying tickets. The reservation code handles tickets in
+ * order, so if there is a large ticket first and then smaller ones we could
+ * very well satisfy the smaller tickets. This will attempt to wake up any
+ * tickets in the list to catch this case.
+ *
+ * This function returns true if it was able to make progress by clearing out
+ * other tickets, or if it stumbles across a ticket that was smaller than the
+ * first ticket.
+ */
+static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info)
+{
+ struct reserve_ticket *ticket;
+ u64 tickets_id = space_info->tickets_id;
+ const bool aborted = BTRFS_FS_ERROR(fs_info);
+
+ trace_btrfs_fail_all_tickets(fs_info, space_info);
+
+ if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
+ btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
+ __btrfs_dump_space_info(fs_info, space_info);
+ }
+
+ while (!list_empty(&space_info->tickets) &&
+ tickets_id == space_info->tickets_id) {
+ ticket = list_first_entry(&space_info->tickets,
+ struct reserve_ticket, list);
+
+ if (!aborted && steal_from_global_rsv(fs_info, space_info, ticket))
+ return true;
+
+ if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+ btrfs_info(fs_info, "failing ticket with %llu bytes",
+ ticket->bytes);
+
+ remove_ticket(space_info, ticket);
+ if (aborted)
+ ticket->error = -EIO;
+ else
+ ticket->error = -ENOSPC;
+ wake_up(&ticket->wait);
+
+ /*
+ * We're just throwing tickets away, so more flushing may not
+ * trip over btrfs_try_granting_tickets, so we need to call it
+ * here to see if we can make progress with the next ticket in
+ * the list.
+ */
+ if (!aborted)
+ btrfs_try_granting_tickets(fs_info, space_info);
+ }
+ return (tickets_id != space_info->tickets_id);
+}
+
+/*
+ * This is for normal flushers, we can wait all goddamned day if we want to. We
+ * will loop and continuously try to flush as long as we are making progress.
+ * We count progress as clearing off tickets each time we have to loop.
+ */
+static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_space_info *space_info;
+ u64 to_reclaim;
+ enum btrfs_flush_state flush_state;
+ int commit_cycles = 0;
+ u64 last_tickets_id;
+
+ fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
+ space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+
+ spin_lock(&space_info->lock);
+ to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
+ if (!to_reclaim) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ last_tickets_id = space_info->tickets_id;
+ spin_unlock(&space_info->lock);
+
+ flush_state = FLUSH_DELAYED_ITEMS_NR;
+ do {
+ flush_space(fs_info, space_info, to_reclaim, flush_state, false);
+ spin_lock(&space_info->lock);
+ if (list_empty(&space_info->tickets)) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
+ space_info);
+ if (last_tickets_id == space_info->tickets_id) {
+ flush_state++;
+ } else {
+ last_tickets_id = space_info->tickets_id;
+ flush_state = FLUSH_DELAYED_ITEMS_NR;
+ if (commit_cycles)
+ commit_cycles--;
+ }
+
+ /*
+ * We do not want to empty the system of delalloc unless we're
+ * under heavy pressure, so allow one trip through the flushing
+ * logic before we start doing a FLUSH_DELALLOC_FULL.
+ */
+ if (flush_state == FLUSH_DELALLOC_FULL && !commit_cycles)
+ flush_state++;
+
+ /*
+ * We don't want to force a chunk allocation until we've tried
+ * pretty hard to reclaim space. Think of the case where we
+ * freed up a bunch of space and so have a lot of pinned space
+ * to reclaim. We would rather use that than possibly create a
+ * underutilized metadata chunk. So if this is our first run
+ * through the flushing state machine skip ALLOC_CHUNK_FORCE and
+ * commit the transaction. If nothing has changed the next go
+ * around then we can force a chunk allocation.
+ */
+ if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
+ flush_state++;
+
+ if (flush_state > COMMIT_TRANS) {
+ commit_cycles++;
+ if (commit_cycles > 2) {
+ if (maybe_fail_all_tickets(fs_info, space_info)) {
+ flush_state = FLUSH_DELAYED_ITEMS_NR;
+ commit_cycles--;
+ } else {
+ space_info->flush = 0;
+ }
+ } else {
+ flush_state = FLUSH_DELAYED_ITEMS_NR;
+ }
+ }
+ spin_unlock(&space_info->lock);
+ } while (flush_state <= COMMIT_TRANS);
+}
+
+/*
+ * This handles pre-flushing of metadata space before we get to the point that
+ * we need to start blocking threads on tickets. The logic here is different
+ * from the other flush paths because it doesn't rely on tickets to tell us how
+ * much we need to flush, instead it attempts to keep us below the 80% full
+ * watermark of space by flushing whichever reservation pool is currently the
+ * largest.
+ */
+static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_space_info *space_info;
+ struct btrfs_block_rsv *delayed_block_rsv;
+ struct btrfs_block_rsv *delayed_refs_rsv;
+ struct btrfs_block_rsv *global_rsv;
+ struct btrfs_block_rsv *trans_rsv;
+ int loops = 0;
+
+ fs_info = container_of(work, struct btrfs_fs_info,
+ preempt_reclaim_work);
+ space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+ delayed_block_rsv = &fs_info->delayed_block_rsv;
+ delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+ global_rsv = &fs_info->global_block_rsv;
+ trans_rsv = &fs_info->trans_block_rsv;
+
+ spin_lock(&space_info->lock);
+ while (need_preemptive_reclaim(fs_info, space_info)) {
+ enum btrfs_flush_state flush;
+ u64 delalloc_size = 0;
+ u64 to_reclaim, block_rsv_size;
+ u64 global_rsv_size = global_rsv->reserved;
+
+ loops++;
+
+ /*
+ * We don't have a precise counter for the metadata being
+ * reserved for delalloc, so we'll approximate it by subtracting
+ * out the block rsv's space from the bytes_may_use. If that
+ * amount is higher than the individual reserves, then we can
+ * assume it's tied up in delalloc reservations.
+ */
+ block_rsv_size = global_rsv_size +
+ delayed_block_rsv->reserved +
+ delayed_refs_rsv->reserved +
+ trans_rsv->reserved;
+ if (block_rsv_size < space_info->bytes_may_use)
+ delalloc_size = space_info->bytes_may_use - block_rsv_size;
+
+ /*
+ * We don't want to include the global_rsv in our calculation,
+ * because that's space we can't touch. Subtract it from the
+ * block_rsv_size for the next checks.
+ */
+ block_rsv_size -= global_rsv_size;
+
+ /*
+ * We really want to avoid flushing delalloc too much, as it
+ * could result in poor allocation patterns, so only flush it if
+ * it's larger than the rest of the pools combined.
+ */
+ if (delalloc_size > block_rsv_size) {
+ to_reclaim = delalloc_size;
+ flush = FLUSH_DELALLOC;
+ } else if (space_info->bytes_pinned >
+ (delayed_block_rsv->reserved +
+ delayed_refs_rsv->reserved)) {
+ to_reclaim = space_info->bytes_pinned;
+ flush = COMMIT_TRANS;
+ } else if (delayed_block_rsv->reserved >
+ delayed_refs_rsv->reserved) {
+ to_reclaim = delayed_block_rsv->reserved;
+ flush = FLUSH_DELAYED_ITEMS_NR;
+ } else {
+ to_reclaim = delayed_refs_rsv->reserved;
+ flush = FLUSH_DELAYED_REFS_NR;
+ }
+
+ spin_unlock(&space_info->lock);
+
+ /*
+ * We don't want to reclaim everything, just a portion, so scale
+ * down the to_reclaim by 1/4. If it takes us down to 0,
+ * reclaim 1 items worth.
+ */
+ to_reclaim >>= 2;
+ if (!to_reclaim)
+ to_reclaim = btrfs_calc_insert_metadata_size(fs_info, 1);
+ flush_space(fs_info, space_info, to_reclaim, flush, true);
+ cond_resched();
+ spin_lock(&space_info->lock);
+ }
+
+ /* We only went through once, back off our clamping. */
+ if (loops == 1 && !space_info->reclaim_size)
+ space_info->clamp = max(1, space_info->clamp - 1);
+ trace_btrfs_done_preemptive_reclaim(fs_info, space_info);
+ spin_unlock(&space_info->lock);
+}
+
+/*
+ * FLUSH_DELALLOC_WAIT:
+ * Space is freed from flushing delalloc in one of two ways.
+ *
+ * 1) compression is on and we allocate less space than we reserved
+ * 2) we are overwriting existing space
+ *
+ * For #1 that extra space is reclaimed as soon as the delalloc pages are
+ * COWed, by way of btrfs_add_reserved_bytes() which adds the actual extent
+ * length to ->bytes_reserved, and subtracts the reserved space from
+ * ->bytes_may_use.
+ *
+ * For #2 this is trickier. Once the ordered extent runs we will drop the
+ * extent in the range we are overwriting, which creates a delayed ref for
+ * that freed extent. This however is not reclaimed until the transaction
+ * commits, thus the next stages.
+ *
+ * RUN_DELAYED_IPUTS
+ * If we are freeing inodes, we want to make sure all delayed iputs have
+ * completed, because they could have been on an inode with i_nlink == 0, and
+ * thus have been truncated and freed up space. But again this space is not
+ * immediately re-usable, it comes in the form of a delayed ref, which must be
+ * run and then the transaction must be committed.
+ *
+ * COMMIT_TRANS
+ * This is where we reclaim all of the pinned space generated by running the
+ * iputs
+ *
+ * ALLOC_CHUNK_FORCE
+ * For data we start with alloc chunk force, however we could have been full
+ * before, and then the transaction commit could have freed new block groups,
+ * so if we now have space to allocate do the force chunk allocation.
+ */
+static const enum btrfs_flush_state data_flush_states[] = {
+ FLUSH_DELALLOC_FULL,
+ RUN_DELAYED_IPUTS,
+ COMMIT_TRANS,
+ ALLOC_CHUNK_FORCE,
+};
+
+static void btrfs_async_reclaim_data_space(struct work_struct *work)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_space_info *space_info;
+ u64 last_tickets_id;
+ enum btrfs_flush_state flush_state = 0;
+
+ fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work);
+ space_info = fs_info->data_sinfo;
+
+ spin_lock(&space_info->lock);
+ if (list_empty(&space_info->tickets)) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ last_tickets_id = space_info->tickets_id;
+ spin_unlock(&space_info->lock);
+
+ while (!space_info->full) {
+ flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
+ spin_lock(&space_info->lock);
+ if (list_empty(&space_info->tickets)) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
+ return;
+ }
+
+ /* Something happened, fail everything and bail. */
+ if (BTRFS_FS_ERROR(fs_info))
+ goto aborted_fs;
+ last_tickets_id = space_info->tickets_id;
+ spin_unlock(&space_info->lock);
+ }
+
+ while (flush_state < ARRAY_SIZE(data_flush_states)) {
+ flush_space(fs_info, space_info, U64_MAX,
+ data_flush_states[flush_state], false);
+ spin_lock(&space_info->lock);
+ if (list_empty(&space_info->tickets)) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
+ return;
+ }
+
+ if (last_tickets_id == space_info->tickets_id) {
+ flush_state++;
+ } else {
+ last_tickets_id = space_info->tickets_id;
+ flush_state = 0;
+ }
+
+ if (flush_state >= ARRAY_SIZE(data_flush_states)) {
+ if (space_info->full) {
+ if (maybe_fail_all_tickets(fs_info, space_info))
+ flush_state = 0;
+ else
+ space_info->flush = 0;
+ } else {
+ flush_state = 0;
+ }
+
+ /* Something happened, fail everything and bail. */
+ if (BTRFS_FS_ERROR(fs_info))
+ goto aborted_fs;
+
+ }
+ spin_unlock(&space_info->lock);
+ }
+ return;
+
+aborted_fs:
+ maybe_fail_all_tickets(fs_info, space_info);
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
+}
+
+void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
+{
+ INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space);
+ INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space);
+ INIT_WORK(&fs_info->preempt_reclaim_work,
+ btrfs_preempt_reclaim_metadata_space);
+}
+
+static const enum btrfs_flush_state priority_flush_states[] = {
+ FLUSH_DELAYED_ITEMS_NR,
+ FLUSH_DELAYED_ITEMS,
+ ALLOC_CHUNK,
+};
+
+static const enum btrfs_flush_state evict_flush_states[] = {
+ FLUSH_DELAYED_ITEMS_NR,
+ FLUSH_DELAYED_ITEMS,
+ FLUSH_DELAYED_REFS_NR,
+ FLUSH_DELAYED_REFS,
+ FLUSH_DELALLOC,
+ FLUSH_DELALLOC_WAIT,
+ FLUSH_DELALLOC_FULL,
+ ALLOC_CHUNK,
+ COMMIT_TRANS,
+};
+
+static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ struct reserve_ticket *ticket,
+ const enum btrfs_flush_state *states,
+ int states_nr)
+{
+ u64 to_reclaim;
+ int flush_state = 0;
+
+ spin_lock(&space_info->lock);
+ to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
+ /*
+ * This is the priority reclaim path, so to_reclaim could be >0 still
+ * because we may have only satisfied the priority tickets and still
+ * left non priority tickets on the list. We would then have
+ * to_reclaim but ->bytes == 0.
+ */
+ if (ticket->bytes == 0) {
+ spin_unlock(&space_info->lock);
+ return;
+ }
+
+ while (flush_state < states_nr) {
+ spin_unlock(&space_info->lock);
+ flush_space(fs_info, space_info, to_reclaim, states[flush_state],
+ false);
+ flush_state++;
+ spin_lock(&space_info->lock);
+ if (ticket->bytes == 0) {
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ }
+
+ /* Attempt to steal from the global rsv if we can. */
+ if (!steal_from_global_rsv(fs_info, space_info, ticket)) {
+ ticket->error = -ENOSPC;
+ remove_ticket(space_info, ticket);
+ }
+
+ /*
+ * We must run try_granting_tickets here because we could be a large
+ * ticket in front of a smaller ticket that can now be satisfied with
+ * the available space.
+ */
+ btrfs_try_granting_tickets(fs_info, space_info);
+ spin_unlock(&space_info->lock);
+}
+
+static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ struct reserve_ticket *ticket)
+{
+ spin_lock(&space_info->lock);
+
+ /* We could have been granted before we got here. */
+ if (ticket->bytes == 0) {
+ spin_unlock(&space_info->lock);
+ return;
+ }
+
+ while (!space_info->full) {
+ spin_unlock(&space_info->lock);
+ flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
+ spin_lock(&space_info->lock);
+ if (ticket->bytes == 0) {
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ }
+
+ ticket->error = -ENOSPC;
+ remove_ticket(space_info, ticket);
+ btrfs_try_granting_tickets(fs_info, space_info);
+ spin_unlock(&space_info->lock);
+}
+
+static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ struct reserve_ticket *ticket)
+
+{
+ DEFINE_WAIT(wait);
+ int ret = 0;
+
+ spin_lock(&space_info->lock);
+ while (ticket->bytes > 0 && ticket->error == 0) {
+ ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
+ if (ret) {
+ /*
+ * Delete us from the list. After we unlock the space
+ * info, we don't want the async reclaim job to reserve
+ * space for this ticket. If that would happen, then the
+ * ticket's task would not known that space was reserved
+ * despite getting an error, resulting in a space leak
+ * (bytes_may_use counter of our space_info).
+ */
+ remove_ticket(space_info, ticket);
+ ticket->error = -EINTR;
+ break;
+ }
+ spin_unlock(&space_info->lock);
+
+ schedule();
+
+ finish_wait(&ticket->wait, &wait);
+ spin_lock(&space_info->lock);
+ }
+ spin_unlock(&space_info->lock);
+}
+
+/**
+ * Do the appropriate flushing and waiting for a ticket
+ *
+ * @fs_info: the filesystem
+ * @space_info: space info for the reservation
+ * @ticket: ticket for the reservation
+ * @start_ns: timestamp when the reservation started
+ * @orig_bytes: amount of bytes originally reserved
+ * @flush: how much we can flush
+ *
+ * This does the work of figuring out how to flush for the ticket, waiting for
+ * the reservation, and returning the appropriate error if there is one.
+ */
+static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ struct reserve_ticket *ticket,
+ u64 start_ns, u64 orig_bytes,
+ enum btrfs_reserve_flush_enum flush)
+{
+ int ret;
+
+ switch (flush) {
+ case BTRFS_RESERVE_FLUSH_DATA:
+ case BTRFS_RESERVE_FLUSH_ALL:
+ case BTRFS_RESERVE_FLUSH_ALL_STEAL:
+ wait_reserve_ticket(fs_info, space_info, ticket);
+ break;
+ case BTRFS_RESERVE_FLUSH_LIMIT:
+ priority_reclaim_metadata_space(fs_info, space_info, ticket,
+ priority_flush_states,
+ ARRAY_SIZE(priority_flush_states));
+ break;
+ case BTRFS_RESERVE_FLUSH_EVICT:
+ priority_reclaim_metadata_space(fs_info, space_info, ticket,
+ evict_flush_states,
+ ARRAY_SIZE(evict_flush_states));
+ break;
+ case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE:
+ priority_reclaim_data_space(fs_info, space_info, ticket);
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+
+ ret = ticket->error;
+ ASSERT(list_empty(&ticket->list));
+ /*
+ * Check that we can't have an error set if the reservation succeeded,
+ * as that would confuse tasks and lead them to error out without
+ * releasing reserved space (if an error happens the expectation is that
+ * space wasn't reserved at all).
+ */
+ ASSERT(!(ticket->bytes == 0 && ticket->error));
+ trace_btrfs_reserve_ticket(fs_info, space_info->flags, orig_bytes,
+ start_ns, flush, ticket->error);
+ return ret;
+}
+
+/*
+ * This returns true if this flush state will go through the ordinary flushing
+ * code.
+ */
+static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush)
+{
+ return (flush == BTRFS_RESERVE_FLUSH_ALL) ||
+ (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
+}
+
+static inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info)
+{
+ u64 ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes);
+ u64 delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
+
+ /*
+ * If we're heavy on ordered operations then clamping won't help us. We
+ * need to clamp specifically to keep up with dirty'ing buffered
+ * writers, because there's not a 1:1 correlation of writing delalloc
+ * and freeing space, like there is with flushing delayed refs or
+ * delayed nodes. If we're already more ordered than delalloc then
+ * we're keeping up, otherwise we aren't and should probably clamp.
+ */
+ if (ordered < delalloc)
+ space_info->clamp = min(space_info->clamp + 1, 8);
+}
+
+static inline bool can_steal(enum btrfs_reserve_flush_enum flush)
+{
+ return (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL ||
+ flush == BTRFS_RESERVE_FLUSH_EVICT);
+}
+
+/**
+ * Try to reserve bytes from the block_rsv's space
+ *
+ * @fs_info: the filesystem
+ * @space_info: space info we want to allocate from
+ * @orig_bytes: number of bytes we want
+ * @flush: whether or not we can flush to make our reservation
+ *
+ * This will reserve orig_bytes number of bytes from the space info associated
+ * with the block_rsv. If there is not enough space it will make an attempt to
+ * flush out space to make room. It will do this by flushing delalloc if
+ * possible or committing the transaction. If flush is 0 then no attempts to
+ * regain reservations will be made and this will fail if there is not enough
+ * space already.
+ */
+static int __reserve_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info, u64 orig_bytes,
+ enum btrfs_reserve_flush_enum flush)
+{
+ struct work_struct *async_work;
+ struct reserve_ticket ticket;
+ u64 start_ns = 0;
+ u64 used;
+ int ret = 0;
+ bool pending_tickets;
+
+ ASSERT(orig_bytes);
+ ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
+
+ if (flush == BTRFS_RESERVE_FLUSH_DATA)
+ async_work = &fs_info->async_data_reclaim_work;
+ else
+ async_work = &fs_info->async_reclaim_work;
+
+ spin_lock(&space_info->lock);
+ ret = -ENOSPC;
+ used = btrfs_space_info_used(space_info, true);
+
+ /*
+ * We don't want NO_FLUSH allocations to jump everybody, they can
+ * generally handle ENOSPC in a different way, so treat them the same as
+ * normal flushers when it comes to skipping pending tickets.
+ */
+ if (is_normal_flushing(flush) || (flush == BTRFS_RESERVE_NO_FLUSH))
+ pending_tickets = !list_empty(&space_info->tickets) ||
+ !list_empty(&space_info->priority_tickets);
+ else
+ pending_tickets = !list_empty(&space_info->priority_tickets);
+
+ /*
+ * Carry on if we have enough space (short-circuit) OR call
+ * can_overcommit() to ensure we can overcommit to continue.
+ */
+ if (!pending_tickets &&
+ ((used + orig_bytes <= writable_total_bytes(fs_info, space_info)) ||
+ btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {
+ btrfs_space_info_update_bytes_may_use(fs_info, space_info,
+ orig_bytes);
+ ret = 0;
+ }
+
+ /*
+ * If we couldn't make a reservation then setup our reservation ticket
+ * and kick the async worker if it's not already running.
+ *
+ * If we are a priority flusher then we just need to add our ticket to
+ * the list and we will do our own flushing further down.
+ */
+ if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
+ ticket.bytes = orig_bytes;
+ ticket.error = 0;
+ space_info->reclaim_size += ticket.bytes;
+ init_waitqueue_head(&ticket.wait);
+ ticket.steal = can_steal(flush);
+ if (trace_btrfs_reserve_ticket_enabled())
+ start_ns = ktime_get_ns();
+
+ if (flush == BTRFS_RESERVE_FLUSH_ALL ||
+ flush == BTRFS_RESERVE_FLUSH_ALL_STEAL ||
+ flush == BTRFS_RESERVE_FLUSH_DATA) {
+ list_add_tail(&ticket.list, &space_info->tickets);
+ if (!space_info->flush) {
+ /*
+ * We were forced to add a reserve ticket, so
+ * our preemptive flushing is unable to keep
+ * up. Clamp down on the threshold for the
+ * preemptive flushing in order to keep up with
+ * the workload.
+ */
+ maybe_clamp_preempt(fs_info, space_info);
+
+ space_info->flush = 1;
+ trace_btrfs_trigger_flush(fs_info,
+ space_info->flags,
+ orig_bytes, flush,
+ "enospc");
+ queue_work(system_unbound_wq, async_work);
+ }
+ } else {
+ list_add_tail(&ticket.list,
+ &space_info->priority_tickets);
+ }
+ } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
+ /*
+ * We will do the space reservation dance during log replay,
+ * which means we won't have fs_info->fs_root set, so don't do
+ * the async reclaim as we will panic.
+ */
+ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
+ !work_busy(&fs_info->preempt_reclaim_work) &&
+ need_preemptive_reclaim(fs_info, space_info)) {
+ trace_btrfs_trigger_flush(fs_info, space_info->flags,
+ orig_bytes, flush, "preempt");
+ queue_work(system_unbound_wq,
+ &fs_info->preempt_reclaim_work);
+ }
+ }
+ spin_unlock(&space_info->lock);
+ if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
+ return ret;
+
+ return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns,
+ orig_bytes, flush);
+}
+
+/**
+ * Trye to reserve metadata bytes from the block_rsv's space
+ *
+ * @fs_info: the filesystem
+ * @block_rsv: block_rsv we're allocating for
+ * @orig_bytes: number of bytes we want
+ * @flush: whether or not we can flush to make our reservation
+ *
+ * This will reserve orig_bytes number of bytes from the space info associated
+ * with the block_rsv. If there is not enough space it will make an attempt to
+ * flush out space to make room. It will do this by flushing delalloc if
+ * possible or committing the transaction. If flush is 0 then no attempts to
+ * regain reservations will be made and this will fail if there is not enough
+ * space already.
+ */
+int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv,
+ u64 orig_bytes,
+ enum btrfs_reserve_flush_enum flush)
+{
+ int ret;
+
+ ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush);
+ if (ret == -ENOSPC) {
+ trace_btrfs_space_reservation(fs_info, "space_info:enospc",
+ block_rsv->space_info->flags,
+ orig_bytes, 1);
+
+ if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+ btrfs_dump_space_info(fs_info, block_rsv->space_info,
+ orig_bytes, 0);
+ }
+ return ret;
+}
+
+/**
+ * Try to reserve data bytes for an allocation
+ *
+ * @fs_info: the filesystem
+ * @bytes: number of bytes we need
+ * @flush: how we are allowed to flush
+ *
+ * This will reserve bytes from the data space info. If there is not enough
+ * space then we will attempt to flush space as specified by flush.
+ */
+int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
+ enum btrfs_reserve_flush_enum flush)
+{
+ struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
+ int ret;
+
+ ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA ||
+ flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE ||
+ flush == BTRFS_RESERVE_NO_FLUSH);
+ ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA);
+
+ ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush);
+ if (ret == -ENOSPC) {
+ trace_btrfs_space_reservation(fs_info, "space_info:enospc",
+ data_sinfo->flags, bytes, 1);
+ if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+ btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0);
+ }
+ return ret;
+}
+
+/* Dump all the space infos when we abort a transaction due to ENOSPC. */
+__cold void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_space_info *space_info;
+
+ btrfs_info(fs_info, "dumping space info:");
+ list_for_each_entry(space_info, &fs_info->space_info, list) {
+ spin_lock(&space_info->lock);
+ __btrfs_dump_space_info(fs_info, space_info);
+ spin_unlock(&space_info->lock);
+ }
+ dump_global_block_rsv(fs_info);
+}
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
new file mode 100644
index 000000000..ce66023a9
--- /dev/null
+++ b/fs/btrfs/space-info.h
@@ -0,0 +1,163 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_SPACE_INFO_H
+#define BTRFS_SPACE_INFO_H
+
+#include "volumes.h"
+
+struct btrfs_space_info {
+ spinlock_t lock;
+
+ u64 total_bytes; /* total bytes in the space,
+ this doesn't take mirrors into account */
+ u64 bytes_used; /* total bytes used,
+ this doesn't take mirrors into account */
+ u64 bytes_pinned; /* total bytes pinned, will be freed when the
+ transaction finishes */
+ u64 bytes_reserved; /* total bytes the allocator has reserved for
+ current allocations */
+ u64 bytes_may_use; /* number of bytes that may be used for
+ delalloc/allocations */
+ u64 bytes_readonly; /* total bytes that are read only */
+ /* Total bytes in the space, but only accounts active block groups. */
+ u64 active_total_bytes;
+ u64 bytes_zone_unusable; /* total bytes that are unusable until
+ resetting the device zone */
+
+ u64 max_extent_size; /* This will hold the maximum extent size of
+ the space info if we had an ENOSPC in the
+ allocator. */
+ /* Chunk size in bytes */
+ u64 chunk_size;
+
+ /*
+ * Once a block group drops below this threshold (percents) we'll
+ * schedule it for reclaim.
+ */
+ int bg_reclaim_threshold;
+
+ int clamp; /* Used to scale our threshold for preemptive
+ flushing. The value is >> clamp, so turns
+ out to be a 2^clamp divisor. */
+
+ unsigned int full:1; /* indicates that we cannot allocate any more
+ chunks for this space */
+ unsigned int chunk_alloc:1; /* set if we are allocating a chunk */
+
+ unsigned int flush:1; /* set if we are trying to make space */
+
+ unsigned int force_alloc; /* set if we need to force a chunk
+ alloc for this space */
+
+ u64 disk_used; /* total bytes used on disk */
+ u64 disk_total; /* total bytes on disk, takes mirrors into
+ account */
+
+ u64 flags;
+
+ struct list_head list;
+ /* Protected by the spinlock 'lock'. */
+ struct list_head ro_bgs;
+ struct list_head priority_tickets;
+ struct list_head tickets;
+
+ /*
+ * Size of space that needs to be reclaimed in order to satisfy pending
+ * tickets
+ */
+ u64 reclaim_size;
+
+ /*
+ * tickets_id just indicates the next ticket will be handled, so note
+ * it's not stored per ticket.
+ */
+ u64 tickets_id;
+
+ struct rw_semaphore groups_sem;
+ /* for block groups in our same type */
+ struct list_head block_groups[BTRFS_NR_RAID_TYPES];
+
+ struct kobject kobj;
+ struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES];
+};
+
+struct reserve_ticket {
+ u64 bytes;
+ int error;
+ bool steal;
+ struct list_head list;
+ wait_queue_head_t wait;
+};
+
+static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
+{
+ return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
+ (space_info->flags & BTRFS_BLOCK_GROUP_DATA));
+}
+
+/*
+ *
+ * Declare a helper function to detect underflow of various space info members
+ */
+#define DECLARE_SPACE_INFO_UPDATE(name, trace_name) \
+static inline void \
+btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info, \
+ struct btrfs_space_info *sinfo, \
+ s64 bytes) \
+{ \
+ const u64 abs_bytes = (bytes < 0) ? -bytes : bytes; \
+ lockdep_assert_held(&sinfo->lock); \
+ trace_update_##name(fs_info, sinfo, sinfo->name, bytes); \
+ trace_btrfs_space_reservation(fs_info, trace_name, \
+ sinfo->flags, abs_bytes, \
+ bytes > 0); \
+ if (bytes < 0 && sinfo->name < -bytes) { \
+ WARN_ON(1); \
+ sinfo->name = 0; \
+ return; \
+ } \
+ sinfo->name += bytes; \
+}
+
+DECLARE_SPACE_INFO_UPDATE(bytes_may_use, "space_info");
+DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned");
+
+int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
+void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
+ struct btrfs_block_group *block_group);
+void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
+ u64 chunk_size);
+struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
+ u64 flags);
+u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
+ bool may_use_included);
+void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
+void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *info, u64 bytes,
+ int dump_block_groups);
+int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv,
+ u64 orig_bytes,
+ enum btrfs_reserve_flush_enum flush);
+void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info);
+int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info, u64 bytes,
+ enum btrfs_reserve_flush_enum flush);
+
+static inline void btrfs_space_info_free_bytes_may_use(
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ u64 num_bytes)
+{
+ spin_lock(&space_info->lock);
+ btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes);
+ btrfs_try_granting_tickets(fs_info, space_info);
+ spin_unlock(&space_info->lock);
+}
+int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
+ enum btrfs_reserve_flush_enum flush);
+void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info);
+void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info);
+
+#endif /* BTRFS_SPACE_INFO_H */
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
new file mode 100644
index 000000000..12455b2b4
--- /dev/null
+++ b/fs/btrfs/struct-funcs.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ */
+
+#include <asm/unaligned.h>
+
+#include "ctree.h"
+
+static bool check_setget_bounds(const struct extent_buffer *eb,
+ const void *ptr, unsigned off, int size)
+{
+ const unsigned long member_offset = (unsigned long)ptr + off;
+
+ if (unlikely(member_offset + size > eb->len)) {
+ btrfs_warn(eb->fs_info,
+ "bad eb member %s: ptr 0x%lx start %llu member offset %lu size %d",
+ (member_offset > eb->len ? "start" : "end"),
+ (unsigned long)ptr, eb->start, member_offset, size);
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Macro templates that define helpers to read/write extent buffer data of a
+ * given size, that are also used via ctree.h for access to item members by
+ * specialized helpers.
+ *
+ * Generic helpers:
+ * - btrfs_set_8 (for 8/16/32/64)
+ * - btrfs_get_8 (for 8/16/32/64)
+ *
+ * Generic helpers with a token (cached address of the most recently accessed
+ * page):
+ * - btrfs_set_token_8 (for 8/16/32/64)
+ * - btrfs_get_token_8 (for 8/16/32/64)
+ *
+ * The set/get functions handle data spanning two pages transparently, in case
+ * metadata block size is larger than page. Every pointer to metadata items is
+ * an offset into the extent buffer page array, cast to a specific type. This
+ * gives us all the type checking.
+ *
+ * The extent buffer pages stored in the array pages do not form a contiguous
+ * phyusical range, but the API functions assume the linear offset to the range
+ * from 0 to metadata node size.
+ */
+
+#define DEFINE_BTRFS_SETGET_BITS(bits) \
+u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \
+ const void *ptr, unsigned long off) \
+{ \
+ const unsigned long member_offset = (unsigned long)ptr + off; \
+ const unsigned long idx = get_eb_page_index(member_offset); \
+ const unsigned long oip = get_eb_offset_in_page(token->eb, \
+ member_offset); \
+ const int size = sizeof(u##bits); \
+ u8 lebytes[sizeof(u##bits)]; \
+ const int part = PAGE_SIZE - oip; \
+ \
+ ASSERT(token); \
+ ASSERT(token->kaddr); \
+ ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \
+ if (token->offset <= member_offset && \
+ member_offset + size <= token->offset + PAGE_SIZE) { \
+ return get_unaligned_le##bits(token->kaddr + oip); \
+ } \
+ token->kaddr = page_address(token->eb->pages[idx]); \
+ token->offset = idx << PAGE_SHIFT; \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE ) \
+ return get_unaligned_le##bits(token->kaddr + oip); \
+ \
+ memcpy(lebytes, token->kaddr + oip, part); \
+ token->kaddr = page_address(token->eb->pages[idx + 1]); \
+ token->offset = (idx + 1) << PAGE_SHIFT; \
+ memcpy(lebytes + part, token->kaddr, size - part); \
+ return get_unaligned_le##bits(lebytes); \
+} \
+u##bits btrfs_get_##bits(const struct extent_buffer *eb, \
+ const void *ptr, unsigned long off) \
+{ \
+ const unsigned long member_offset = (unsigned long)ptr + off; \
+ const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \
+ const unsigned long idx = get_eb_page_index(member_offset); \
+ char *kaddr = page_address(eb->pages[idx]); \
+ const int size = sizeof(u##bits); \
+ const int part = PAGE_SIZE - oip; \
+ u8 lebytes[sizeof(u##bits)]; \
+ \
+ ASSERT(check_setget_bounds(eb, ptr, off, size)); \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) \
+ return get_unaligned_le##bits(kaddr + oip); \
+ \
+ memcpy(lebytes, kaddr + oip, part); \
+ kaddr = page_address(eb->pages[idx + 1]); \
+ memcpy(lebytes + part, kaddr, size - part); \
+ return get_unaligned_le##bits(lebytes); \
+} \
+void btrfs_set_token_##bits(struct btrfs_map_token *token, \
+ const void *ptr, unsigned long off, \
+ u##bits val) \
+{ \
+ const unsigned long member_offset = (unsigned long)ptr + off; \
+ const unsigned long idx = get_eb_page_index(member_offset); \
+ const unsigned long oip = get_eb_offset_in_page(token->eb, \
+ member_offset); \
+ const int size = sizeof(u##bits); \
+ u8 lebytes[sizeof(u##bits)]; \
+ const int part = PAGE_SIZE - oip; \
+ \
+ ASSERT(token); \
+ ASSERT(token->kaddr); \
+ ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \
+ if (token->offset <= member_offset && \
+ member_offset + size <= token->offset + PAGE_SIZE) { \
+ put_unaligned_le##bits(val, token->kaddr + oip); \
+ return; \
+ } \
+ token->kaddr = page_address(token->eb->pages[idx]); \
+ token->offset = idx << PAGE_SHIFT; \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \
+ put_unaligned_le##bits(val, token->kaddr + oip); \
+ return; \
+ } \
+ put_unaligned_le##bits(val, lebytes); \
+ memcpy(token->kaddr + oip, lebytes, part); \
+ token->kaddr = page_address(token->eb->pages[idx + 1]); \
+ token->offset = (idx + 1) << PAGE_SHIFT; \
+ memcpy(token->kaddr, lebytes + part, size - part); \
+} \
+void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \
+ unsigned long off, u##bits val) \
+{ \
+ const unsigned long member_offset = (unsigned long)ptr + off; \
+ const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \
+ const unsigned long idx = get_eb_page_index(member_offset); \
+ char *kaddr = page_address(eb->pages[idx]); \
+ const int size = sizeof(u##bits); \
+ const int part = PAGE_SIZE - oip; \
+ u8 lebytes[sizeof(u##bits)]; \
+ \
+ ASSERT(check_setget_bounds(eb, ptr, off, size)); \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \
+ put_unaligned_le##bits(val, kaddr + oip); \
+ return; \
+ } \
+ \
+ put_unaligned_le##bits(val, lebytes); \
+ memcpy(kaddr + oip, lebytes, part); \
+ kaddr = page_address(eb->pages[idx + 1]); \
+ memcpy(kaddr, lebytes + part, size - part); \
+}
+
+DEFINE_BTRFS_SETGET_BITS(8)
+DEFINE_BTRFS_SETGET_BITS(16)
+DEFINE_BTRFS_SETGET_BITS(32)
+DEFINE_BTRFS_SETGET_BITS(64)
+
+void btrfs_node_key(const struct extent_buffer *eb,
+ struct btrfs_disk_key *disk_key, int nr)
+{
+ unsigned long ptr = btrfs_node_key_ptr_offset(nr);
+ read_eb_member(eb, (struct btrfs_key_ptr *)ptr,
+ struct btrfs_key_ptr, key, disk_key);
+}
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
new file mode 100644
index 000000000..9a176af84
--- /dev/null
+++ b/fs/btrfs/subpage.c
@@ -0,0 +1,768 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/slab.h>
+#include "ctree.h"
+#include "subpage.h"
+#include "btrfs_inode.h"
+
+/*
+ * Subpage (sectorsize < PAGE_SIZE) support overview:
+ *
+ * Limitations:
+ *
+ * - Only support 64K page size for now
+ * This is to make metadata handling easier, as 64K page would ensure
+ * all nodesize would fit inside one page, thus we don't need to handle
+ * cases where a tree block crosses several pages.
+ *
+ * - Only metadata read-write for now
+ * The data read-write part is in development.
+ *
+ * - Metadata can't cross 64K page boundary
+ * btrfs-progs and kernel have done that for a while, thus only ancient
+ * filesystems could have such problem. For such case, do a graceful
+ * rejection.
+ *
+ * Special behavior:
+ *
+ * - Metadata
+ * Metadata read is fully supported.
+ * Meaning when reading one tree block will only trigger the read for the
+ * needed range, other unrelated range in the same page will not be touched.
+ *
+ * Metadata write support is partial.
+ * The writeback is still for the full page, but we will only submit
+ * the dirty extent buffers in the page.
+ *
+ * This means, if we have a metadata page like this:
+ *
+ * Page offset
+ * 0 16K 32K 48K 64K
+ * |/////////| |///////////|
+ * \- Tree block A \- Tree block B
+ *
+ * Even if we just want to writeback tree block A, we will also writeback
+ * tree block B if it's also dirty.
+ *
+ * This may cause extra metadata writeback which results more COW.
+ *
+ * Implementation:
+ *
+ * - Common
+ * Both metadata and data will use a new structure, btrfs_subpage, to
+ * record the status of each sector inside a page. This provides the extra
+ * granularity needed.
+ *
+ * - Metadata
+ * Since we have multiple tree blocks inside one page, we can't rely on page
+ * locking anymore, or we will have greatly reduced concurrency or even
+ * deadlocks (hold one tree lock while trying to lock another tree lock in
+ * the same page).
+ *
+ * Thus for metadata locking, subpage support relies on io_tree locking only.
+ * This means a slightly higher tree locking latency.
+ */
+
+bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page)
+{
+ if (fs_info->sectorsize >= PAGE_SIZE)
+ return false;
+
+ /*
+ * Only data pages (either through DIO or compression) can have no
+ * mapping. And if page->mapping->host is data inode, it's subpage.
+ * As we have ruled our sectorsize >= PAGE_SIZE case already.
+ */
+ if (!page->mapping || !page->mapping->host ||
+ is_data_inode(page->mapping->host))
+ return true;
+
+ /*
+ * Now the only remaining case is metadata, which we only go subpage
+ * routine if nodesize < PAGE_SIZE.
+ */
+ if (fs_info->nodesize < PAGE_SIZE)
+ return true;
+ return false;
+}
+
+void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize)
+{
+ unsigned int cur = 0;
+ unsigned int nr_bits;
+
+ ASSERT(IS_ALIGNED(PAGE_SIZE, sectorsize));
+
+ nr_bits = PAGE_SIZE / sectorsize;
+ subpage_info->bitmap_nr_bits = nr_bits;
+
+ subpage_info->uptodate_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->error_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->dirty_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->writeback_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->ordered_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->checked_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->total_nr_bits = cur;
+}
+
+int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
+ struct page *page, enum btrfs_subpage_type type)
+{
+ struct btrfs_subpage *subpage;
+
+ /*
+ * We have cases like a dummy extent buffer page, which is not mapped
+ * and doesn't need to be locked.
+ */
+ if (page->mapping)
+ ASSERT(PageLocked(page));
+
+ /* Either not subpage, or the page already has private attached */
+ if (!btrfs_is_subpage(fs_info, page) || PagePrivate(page))
+ return 0;
+
+ subpage = btrfs_alloc_subpage(fs_info, type);
+ if (IS_ERR(subpage))
+ return PTR_ERR(subpage);
+
+ attach_page_private(page, subpage);
+ return 0;
+}
+
+void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
+ struct page *page)
+{
+ struct btrfs_subpage *subpage;
+
+ /* Either not subpage, or already detached */
+ if (!btrfs_is_subpage(fs_info, page) || !PagePrivate(page))
+ return;
+
+ subpage = detach_page_private(page);
+ ASSERT(subpage);
+ btrfs_free_subpage(subpage);
+}
+
+struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
+ enum btrfs_subpage_type type)
+{
+ struct btrfs_subpage *ret;
+ unsigned int real_size;
+
+ ASSERT(fs_info->sectorsize < PAGE_SIZE);
+
+ real_size = struct_size(ret, bitmaps,
+ BITS_TO_LONGS(fs_info->subpage_info->total_nr_bits));
+ ret = kzalloc(real_size, GFP_NOFS);
+ if (!ret)
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock_init(&ret->lock);
+ if (type == BTRFS_SUBPAGE_METADATA) {
+ atomic_set(&ret->eb_refs, 0);
+ } else {
+ atomic_set(&ret->readers, 0);
+ atomic_set(&ret->writers, 0);
+ }
+ return ret;
+}
+
+void btrfs_free_subpage(struct btrfs_subpage *subpage)
+{
+ kfree(subpage);
+}
+
+/*
+ * Increase the eb_refs of current subpage.
+ *
+ * This is important for eb allocation, to prevent race with last eb freeing
+ * of the same page.
+ * With the eb_refs increased before the eb inserted into radix tree,
+ * detach_extent_buffer_page() won't detach the page private while we're still
+ * allocating the extent buffer.
+ */
+void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
+ struct page *page)
+{
+ struct btrfs_subpage *subpage;
+
+ if (!btrfs_is_subpage(fs_info, page))
+ return;
+
+ ASSERT(PagePrivate(page) && page->mapping);
+ lockdep_assert_held(&page->mapping->private_lock);
+
+ subpage = (struct btrfs_subpage *)page->private;
+ atomic_inc(&subpage->eb_refs);
+}
+
+void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
+ struct page *page)
+{
+ struct btrfs_subpage *subpage;
+
+ if (!btrfs_is_subpage(fs_info, page))
+ return;
+
+ ASSERT(PagePrivate(page) && page->mapping);
+ lockdep_assert_held(&page->mapping->private_lock);
+
+ subpage = (struct btrfs_subpage *)page->private;
+ ASSERT(atomic_read(&subpage->eb_refs));
+ atomic_dec(&subpage->eb_refs);
+}
+
+static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ /* Basic checks */
+ ASSERT(PagePrivate(page) && page->private);
+ ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
+ IS_ALIGNED(len, fs_info->sectorsize));
+ /*
+ * The range check only works for mapped page, we can still have
+ * unmapped page like dummy extent buffer pages.
+ */
+ if (page->mapping)
+ ASSERT(page_offset(page) <= start &&
+ start + len <= page_offset(page) + PAGE_SIZE);
+}
+
+void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ const int nbits = len >> fs_info->sectorsize_bits;
+
+ btrfs_subpage_assert(fs_info, page, start, len);
+
+ atomic_add(nbits, &subpage->readers);
+}
+
+void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ const int nbits = len >> fs_info->sectorsize_bits;
+ bool is_data;
+ bool last;
+
+ btrfs_subpage_assert(fs_info, page, start, len);
+ is_data = is_data_inode(page->mapping->host);
+ ASSERT(atomic_read(&subpage->readers) >= nbits);
+ last = atomic_sub_and_test(nbits, &subpage->readers);
+
+ /*
+ * For data we need to unlock the page if the last read has finished.
+ *
+ * And please don't replace @last with atomic_sub_and_test() call
+ * inside if () condition.
+ * As we want the atomic_sub_and_test() to be always executed.
+ */
+ if (is_data && last)
+ unlock_page(page);
+}
+
+static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len)
+{
+ u64 orig_start = *start;
+ u32 orig_len = *len;
+
+ *start = max_t(u64, page_offset(page), orig_start);
+ /*
+ * For certain call sites like btrfs_drop_pages(), we may have pages
+ * beyond the target range. In that case, just set @len to 0, subpage
+ * helpers can handle @len == 0 without any problem.
+ */
+ if (page_offset(page) >= orig_start + orig_len)
+ *len = 0;
+ else
+ *len = min_t(u64, page_offset(page) + PAGE_SIZE,
+ orig_start + orig_len) - *start;
+}
+
+void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ const int nbits = (len >> fs_info->sectorsize_bits);
+ int ret;
+
+ btrfs_subpage_assert(fs_info, page, start, len);
+
+ ASSERT(atomic_read(&subpage->readers) == 0);
+ ret = atomic_add_return(nbits, &subpage->writers);
+ ASSERT(ret == nbits);
+}
+
+bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ const int nbits = (len >> fs_info->sectorsize_bits);
+
+ btrfs_subpage_assert(fs_info, page, start, len);
+
+ /*
+ * We have call sites passing @lock_page into
+ * extent_clear_unlock_delalloc() for compression path.
+ *
+ * This @locked_page is locked by plain lock_page(), thus its
+ * subpage::writers is 0. Handle them in a special way.
+ */
+ if (atomic_read(&subpage->writers) == 0)
+ return true;
+
+ ASSERT(atomic_read(&subpage->writers) >= nbits);
+ return atomic_sub_and_test(nbits, &subpage->writers);
+}
+
+/*
+ * Lock a page for delalloc page writeback.
+ *
+ * Return -EAGAIN if the page is not properly initialized.
+ * Return 0 with the page locked, and writer counter updated.
+ *
+ * Even with 0 returned, the page still need extra check to make sure
+ * it's really the correct page, as the caller is using
+ * filemap_get_folios_contig(), which can race with page invalidating.
+ */
+int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) {
+ lock_page(page);
+ return 0;
+ }
+ lock_page(page);
+ if (!PagePrivate(page) || !page->private) {
+ unlock_page(page);
+ return -EAGAIN;
+ }
+ btrfs_subpage_clamp_range(page, &start, &len);
+ btrfs_subpage_start_writer(fs_info, page, start, len);
+ return 0;
+}
+
+void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page))
+ return unlock_page(page);
+ btrfs_subpage_clamp_range(page, &start, &len);
+ if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len))
+ unlock_page(page);
+}
+
+static bool bitmap_test_range_all_set(unsigned long *addr, unsigned int start,
+ unsigned int nbits)
+{
+ unsigned int found_zero;
+
+ found_zero = find_next_zero_bit(addr, start + nbits, start);
+ if (found_zero == start + nbits)
+ return true;
+ return false;
+}
+
+static bool bitmap_test_range_all_zero(unsigned long *addr, unsigned int start,
+ unsigned int nbits)
+{
+ unsigned int found_set;
+
+ found_set = find_next_bit(addr, start + nbits, start);
+ if (found_set == start + nbits)
+ return true;
+ return false;
+}
+
+#define subpage_calc_start_bit(fs_info, page, name, start, len) \
+({ \
+ unsigned int start_bit; \
+ \
+ btrfs_subpage_assert(fs_info, page, start, len); \
+ start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \
+ start_bit += fs_info->subpage_info->name##_offset; \
+ start_bit; \
+})
+
+#define subpage_test_bitmap_all_set(fs_info, subpage, name) \
+ bitmap_test_range_all_set(subpage->bitmaps, \
+ fs_info->subpage_info->name##_offset, \
+ fs_info->subpage_info->bitmap_nr_bits)
+
+#define subpage_test_bitmap_all_zero(fs_info, subpage, name) \
+ bitmap_test_range_all_zero(subpage->bitmaps, \
+ fs_info->subpage_info->name##_offset, \
+ fs_info->subpage_info->bitmap_nr_bits)
+
+void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ uptodate, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_set(fs_info, subpage, uptodate))
+ SetPageUptodate(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ uptodate, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ ClearPageUptodate(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ error, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ SetPageError(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ error, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, subpage, error))
+ ClearPageError(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ dirty, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+ set_page_dirty(page);
+}
+
+/*
+ * Extra clear_and_test function for subpage dirty bitmap.
+ *
+ * Return true if we're the last bits in the dirty_bitmap and clear the
+ * dirty_bitmap.
+ * Return false otherwise.
+ *
+ * NOTE: Callers should manually clear page dirty for true case, as we have
+ * extra handling for tree blocks.
+ */
+bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ dirty, start, len);
+ unsigned long flags;
+ bool last = false;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, subpage, dirty))
+ last = true;
+ spin_unlock_irqrestore(&subpage->lock, flags);
+ return last;
+}
+
+void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ bool last;
+
+ last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len);
+ if (last)
+ clear_page_dirty_for_io(page);
+}
+
+void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ writeback, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ set_page_writeback(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ writeback, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, subpage, writeback)) {
+ ASSERT(PageWriteback(page));
+ end_page_writeback(page);
+ }
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ ordered, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ SetPageOrdered(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ ordered, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, subpage, ordered))
+ ClearPageOrdered(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ checked, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_set(fs_info, subpage, checked))
+ SetPageChecked(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ checked, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ ClearPageChecked(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+/*
+ * Unlike set/clear which is dependent on each page status, for test all bits
+ * are tested in the same way.
+ */
+#define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name) \
+bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len) \
+{ \
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page, \
+ name, start, len); \
+ unsigned long flags; \
+ bool ret; \
+ \
+ spin_lock_irqsave(&subpage->lock, flags); \
+ ret = bitmap_test_range_all_set(subpage->bitmaps, start_bit, \
+ len >> fs_info->sectorsize_bits); \
+ spin_unlock_irqrestore(&subpage->lock, flags); \
+ return ret; \
+}
+IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate);
+IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error);
+IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty);
+IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback);
+IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered);
+IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked);
+
+/*
+ * Note that, in selftests (extent-io-tests), we can have empty fs_info passed
+ * in. We only test sectorsize == PAGE_SIZE cases so far, thus we can fall
+ * back to regular sectorsize branch.
+ */
+#define IMPLEMENT_BTRFS_PAGE_OPS(name, set_page_func, clear_page_func, \
+ test_page_func) \
+void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len) \
+{ \
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
+ set_page_func(page); \
+ return; \
+ } \
+ btrfs_subpage_set_##name(fs_info, page, start, len); \
+} \
+void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len) \
+{ \
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
+ clear_page_func(page); \
+ return; \
+ } \
+ btrfs_subpage_clear_##name(fs_info, page, start, len); \
+} \
+bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len) \
+{ \
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \
+ return test_page_func(page); \
+ return btrfs_subpage_test_##name(fs_info, page, start, len); \
+} \
+void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len) \
+{ \
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
+ set_page_func(page); \
+ return; \
+ } \
+ btrfs_subpage_clamp_range(page, &start, &len); \
+ btrfs_subpage_set_##name(fs_info, page, start, len); \
+} \
+void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len) \
+{ \
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
+ clear_page_func(page); \
+ return; \
+ } \
+ btrfs_subpage_clamp_range(page, &start, &len); \
+ btrfs_subpage_clear_##name(fs_info, page, start, len); \
+} \
+bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len) \
+{ \
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \
+ return test_page_func(page); \
+ btrfs_subpage_clamp_range(page, &start, &len); \
+ return btrfs_subpage_test_##name(fs_info, page, start, len); \
+}
+IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate,
+ PageUptodate);
+IMPLEMENT_BTRFS_PAGE_OPS(error, SetPageError, ClearPageError, PageError);
+IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io,
+ PageDirty);
+IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
+ PageWriteback);
+IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered,
+ PageOrdered);
+IMPLEMENT_BTRFS_PAGE_OPS(checked, SetPageChecked, ClearPageChecked, PageChecked);
+
+/*
+ * Make sure not only the page dirty bit is cleared, but also subpage dirty bit
+ * is cleared.
+ */
+void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
+ struct page *page)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+
+ if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
+ return;
+
+ ASSERT(!PageDirty(page));
+ if (!btrfs_is_subpage(fs_info, page))
+ return;
+
+ ASSERT(PagePrivate(page) && page->private);
+ ASSERT(subpage_test_bitmap_all_zero(fs_info, subpage, dirty));
+}
+
+/*
+ * Handle different locked pages with different page sizes:
+ *
+ * - Page locked by plain lock_page()
+ * It should not have any subpage::writers count.
+ * Can be unlocked by unlock_page().
+ * This is the most common locked page for __extent_writepage() called
+ * inside extent_write_cache_pages().
+ * Rarer cases include the @locked_page from extent_write_locked_range().
+ *
+ * - Page locked by lock_delalloc_pages()
+ * There is only one caller, all pages except @locked_page for
+ * extent_write_locked_range().
+ * In this case, we have to call subpage helper to handle the case.
+ */
+void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
+ u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage;
+
+ ASSERT(PageLocked(page));
+ /* For non-subpage case, we just unlock the page */
+ if (!btrfs_is_subpage(fs_info, page))
+ return unlock_page(page);
+
+ ASSERT(PagePrivate(page) && page->private);
+ subpage = (struct btrfs_subpage *)page->private;
+
+ /*
+ * For subpage case, there are two types of locked page. With or
+ * without writers number.
+ *
+ * Since we own the page lock, no one else could touch subpage::writers
+ * and we are safe to do several atomic operations without spinlock.
+ */
+ if (atomic_read(&subpage->writers) == 0)
+ /* No writers, locked by plain lock_page() */
+ return unlock_page(page);
+
+ /* Have writers, use proper subpage helper to end it */
+ btrfs_page_end_writer_lock(fs_info, page, start, len);
+}
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
new file mode 100644
index 000000000..0e80ad336
--- /dev/null
+++ b/fs/btrfs/subpage.h
@@ -0,0 +1,158 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_SUBPAGE_H
+#define BTRFS_SUBPAGE_H
+
+#include <linux/spinlock.h>
+
+/*
+ * Extra info for subpapge bitmap.
+ *
+ * For subpage we pack all uptodate/error/dirty/writeback/ordered bitmaps into
+ * one larger bitmap.
+ *
+ * This structure records how they are organized in the bitmap:
+ *
+ * /- uptodate_offset /- error_offset /- dirty_offset
+ * | | |
+ * v v v
+ * |u|u|u|u|........|u|u|e|e|.......|e|e| ... |o|o|
+ * |<- bitmap_nr_bits ->|
+ * |<--------------- total_nr_bits ---------------->|
+ */
+struct btrfs_subpage_info {
+ /* Number of bits for each bitmap */
+ unsigned int bitmap_nr_bits;
+
+ /* Total number of bits for the whole bitmap */
+ unsigned int total_nr_bits;
+
+ /*
+ * *_start indicates where the bitmap starts, the length is always
+ * @bitmap_size, which is calculated from PAGE_SIZE / sectorsize.
+ */
+ unsigned int uptodate_offset;
+ unsigned int error_offset;
+ unsigned int dirty_offset;
+ unsigned int writeback_offset;
+ unsigned int ordered_offset;
+ unsigned int checked_offset;
+};
+
+/*
+ * Structure to trace status of each sector inside a page, attached to
+ * page::private for both data and metadata inodes.
+ */
+struct btrfs_subpage {
+ /* Common members for both data and metadata pages */
+ spinlock_t lock;
+ /*
+ * Both data and metadata needs to track how many readers are for the
+ * page.
+ * Data relies on @readers to unlock the page when last reader finished.
+ * While metadata doesn't need page unlock, it needs to prevent
+ * page::private get cleared before the last end_page_read().
+ */
+ atomic_t readers;
+ union {
+ /*
+ * Structures only used by metadata
+ *
+ * @eb_refs should only be operated under private_lock, as it
+ * manages whether the subpage can be detached.
+ */
+ atomic_t eb_refs;
+
+ /* Structures only used by data */
+ atomic_t writers;
+ };
+ unsigned long bitmaps[];
+};
+
+enum btrfs_subpage_type {
+ BTRFS_SUBPAGE_METADATA,
+ BTRFS_SUBPAGE_DATA,
+};
+
+bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page);
+
+void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize);
+int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
+ struct page *page, enum btrfs_subpage_type type);
+void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
+ struct page *page);
+
+/* Allocate additional data where page represents more than one sector */
+struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
+ enum btrfs_subpage_type type);
+void btrfs_free_subpage(struct btrfs_subpage *subpage);
+
+void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
+ struct page *page);
+void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
+ struct page *page);
+
+void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
+void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
+
+void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
+bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
+int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
+void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
+
+/*
+ * Template for subpage related operations.
+ *
+ * btrfs_subpage_*() are for call sites where the page has subpage attached and
+ * the range is ensured to be inside the page.
+ *
+ * btrfs_page_*() are for call sites where the page can either be subpage
+ * specific or regular page. The function will handle both cases.
+ * But the range still needs to be inside the page.
+ *
+ * btrfs_page_clamp_*() are similar to btrfs_page_*(), except the range doesn't
+ * need to be inside the page. Those functions will truncate the range
+ * automatically.
+ */
+#define DECLARE_BTRFS_SUBPAGE_OPS(name) \
+void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len); \
+void btrfs_subpage_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len); \
+bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len); \
+void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len); \
+void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len); \
+bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len); \
+void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len); \
+void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len); \
+bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len);
+
+DECLARE_BTRFS_SUBPAGE_OPS(uptodate);
+DECLARE_BTRFS_SUBPAGE_OPS(error);
+DECLARE_BTRFS_SUBPAGE_OPS(dirty);
+DECLARE_BTRFS_SUBPAGE_OPS(writeback);
+DECLARE_BTRFS_SUBPAGE_OPS(ordered);
+DECLARE_BTRFS_SUBPAGE_OPS(checked);
+
+bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
+
+void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
+ struct page *page);
+void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
+ u64 start, u32 len);
+
+#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
new file mode 100644
index 000000000..6fc5fa18d
--- /dev/null
+++ b/fs/btrfs/super.c
@@ -0,0 +1,2857 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/seq_file.h>
+#include <linux/string.h>
+#include <linux/backing-dev.h>
+#include <linux/mount.h>
+#include <linux/writeback.h>
+#include <linux/statfs.h>
+#include <linux/compat.h>
+#include <linux/parser.h>
+#include <linux/ctype.h>
+#include <linux/namei.h>
+#include <linux/miscdevice.h>
+#include <linux/magic.h>
+#include <linux/slab.h>
+#include <linux/ratelimit.h>
+#include <linux/crc32c.h>
+#include <linux/btrfs.h>
+#include "delayed-inode.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "print-tree.h"
+#include "props.h"
+#include "xattr.h"
+#include "volumes.h"
+#include "export.h"
+#include "compression.h"
+#include "rcu-string.h"
+#include "dev-replace.h"
+#include "free-space-cache.h"
+#include "backref.h"
+#include "space-info.h"
+#include "sysfs.h"
+#include "zoned.h"
+#include "tests/btrfs-tests.h"
+#include "block-group.h"
+#include "discard.h"
+#include "qgroup.h"
+#include "raid56.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/btrfs.h>
+
+static const struct super_operations btrfs_super_ops;
+
+/*
+ * Types for mounting the default subvolume and a subvolume explicitly
+ * requested by subvol=/path. That way the callchain is straightforward and we
+ * don't have to play tricks with the mount options and recursive calls to
+ * btrfs_mount.
+ *
+ * The new btrfs_root_fs_type also servers as a tag for the bdev_holder.
+ */
+static struct file_system_type btrfs_fs_type;
+static struct file_system_type btrfs_root_fs_type;
+
+static int btrfs_remount(struct super_block *sb, int *flags, char *data);
+
+#ifdef CONFIG_PRINTK
+
+#define STATE_STRING_PREFACE ": state "
+#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT)
+
+/*
+ * Characters to print to indicate error conditions or uncommon filesystem state.
+ * RO is not an error.
+ */
+static const char fs_state_chars[] = {
+ [BTRFS_FS_STATE_ERROR] = 'E',
+ [BTRFS_FS_STATE_REMOUNTING] = 'M',
+ [BTRFS_FS_STATE_RO] = 0,
+ [BTRFS_FS_STATE_TRANS_ABORTED] = 'A',
+ [BTRFS_FS_STATE_DEV_REPLACING] = 'R',
+ [BTRFS_FS_STATE_DUMMY_FS_INFO] = 0,
+ [BTRFS_FS_STATE_NO_CSUMS] = 'C',
+ [BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L',
+};
+
+static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf)
+{
+ unsigned int bit;
+ bool states_printed = false;
+ unsigned long fs_state = READ_ONCE(info->fs_state);
+ char *curr = buf;
+
+ memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE));
+ curr += sizeof(STATE_STRING_PREFACE) - 1;
+
+ for_each_set_bit(bit, &fs_state, sizeof(fs_state)) {
+ WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT);
+ if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) {
+ *curr++ = fs_state_chars[bit];
+ states_printed = true;
+ }
+ }
+
+ /* If no states were printed, reset the buffer */
+ if (!states_printed)
+ curr = buf;
+
+ *curr++ = 0;
+}
+#endif
+
+/*
+ * Generally the error codes correspond to their respective errors, but there
+ * are a few special cases.
+ *
+ * EUCLEAN: Any sort of corruption that we encounter. The tree-checker for
+ * instance will return EUCLEAN if any of the blocks are corrupted in
+ * a way that is problematic. We want to reserve EUCLEAN for these
+ * sort of corruptions.
+ *
+ * EROFS: If we check BTRFS_FS_STATE_ERROR and fail out with a return error, we
+ * need to use EROFS for this case. We will have no idea of the
+ * original failure, that will have been reported at the time we tripped
+ * over the error. Each subsequent error that doesn't have any context
+ * of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR.
+ */
+const char * __attribute_const__ btrfs_decode_error(int errno)
+{
+ char *errstr = "unknown";
+
+ switch (errno) {
+ case -ENOENT: /* -2 */
+ errstr = "No such entry";
+ break;
+ case -EIO: /* -5 */
+ errstr = "IO failure";
+ break;
+ case -ENOMEM: /* -12*/
+ errstr = "Out of memory";
+ break;
+ case -EEXIST: /* -17 */
+ errstr = "Object already exists";
+ break;
+ case -ENOSPC: /* -28 */
+ errstr = "No space left";
+ break;
+ case -EROFS: /* -30 */
+ errstr = "Readonly filesystem";
+ break;
+ case -EOPNOTSUPP: /* -95 */
+ errstr = "Operation not supported";
+ break;
+ case -EUCLEAN: /* -117 */
+ errstr = "Filesystem corrupted";
+ break;
+ case -EDQUOT: /* -122 */
+ errstr = "Quota exceeded";
+ break;
+ }
+
+ return errstr;
+}
+
+/*
+ * __btrfs_handle_fs_error decodes expected errors from the caller and
+ * invokes the appropriate error response.
+ */
+__cold
+void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int errno, const char *fmt, ...)
+{
+ struct super_block *sb = fs_info->sb;
+#ifdef CONFIG_PRINTK
+ char statestr[STATE_STRING_BUF_LEN];
+ const char *errstr;
+#endif
+
+ /*
+ * Special case: if the error is EROFS, and we're already
+ * under SB_RDONLY, then it is safe here.
+ */
+ if (errno == -EROFS && sb_rdonly(sb))
+ return;
+
+#ifdef CONFIG_PRINTK
+ errstr = btrfs_decode_error(errno);
+ btrfs_state_to_string(fs_info, statestr);
+ if (fmt) {
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n",
+ sb->s_id, statestr, function, line, errno, errstr, &vaf);
+ va_end(args);
+ } else {
+ pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n",
+ sb->s_id, statestr, function, line, errno, errstr);
+ }
+#endif
+
+ /*
+ * Today we only save the error info to memory. Long term we'll
+ * also send it down to the disk
+ */
+ set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
+
+ /* Don't go through full error handling during mount */
+ if (!(sb->s_flags & SB_BORN))
+ return;
+
+ if (sb_rdonly(sb))
+ return;
+
+ btrfs_discard_stop(fs_info);
+
+ /* btrfs handle error by forcing the filesystem readonly */
+ btrfs_set_sb_rdonly(sb);
+ btrfs_info(fs_info, "forced readonly");
+ /*
+ * Note that a running device replace operation is not canceled here
+ * although there is no way to update the progress. It would add the
+ * risk of a deadlock, therefore the canceling is omitted. The only
+ * penalty is that some I/O remains active until the procedure
+ * completes. The next time when the filesystem is mounted writable
+ * again, the device replace operation continues.
+ */
+}
+
+#ifdef CONFIG_PRINTK
+static const char * const logtypes[] = {
+ "emergency",
+ "alert",
+ "critical",
+ "error",
+ "warning",
+ "notice",
+ "info",
+ "debug",
+};
+
+
+/*
+ * Use one ratelimit state per log level so that a flood of less important
+ * messages doesn't cause more important ones to be dropped.
+ */
+static struct ratelimit_state printk_limits[] = {
+ RATELIMIT_STATE_INIT(printk_limits[0], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[1], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[2], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[3], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[4], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[5], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[6], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100),
+};
+
+void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
+{
+ char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0";
+ struct va_format vaf;
+ va_list args;
+ int kern_level;
+ const char *type = logtypes[4];
+ struct ratelimit_state *ratelimit = &printk_limits[4];
+
+ va_start(args, fmt);
+
+ while ((kern_level = printk_get_level(fmt)) != 0) {
+ size_t size = printk_skip_level(fmt) - fmt;
+
+ if (kern_level >= '0' && kern_level <= '7') {
+ memcpy(lvl, fmt, size);
+ lvl[size] = '\0';
+ type = logtypes[kern_level - '0'];
+ ratelimit = &printk_limits[kern_level - '0'];
+ }
+ fmt += size;
+ }
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ if (__ratelimit(ratelimit)) {
+ if (fs_info) {
+ char statestr[STATE_STRING_BUF_LEN];
+
+ btrfs_state_to_string(fs_info, statestr);
+ _printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type,
+ fs_info->sb->s_id, statestr, &vaf);
+ } else {
+ _printk("%sBTRFS %s: %pV\n", lvl, type, &vaf);
+ }
+ }
+
+ va_end(args);
+}
+#endif
+
+#if BITS_PER_LONG == 32
+void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info)
+{
+ if (!test_and_set_bit(BTRFS_FS_32BIT_WARN, &fs_info->flags)) {
+ btrfs_warn(fs_info, "reaching 32bit limit for logical addresses");
+ btrfs_warn(fs_info,
+"due to page cache limit on 32bit systems, btrfs can't access metadata at or beyond %lluT",
+ BTRFS_32BIT_MAX_FILE_SIZE >> 40);
+ btrfs_warn(fs_info,
+ "please consider upgrading to 64bit kernel/hardware");
+ }
+}
+
+void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info)
+{
+ if (!test_and_set_bit(BTRFS_FS_32BIT_ERROR, &fs_info->flags)) {
+ btrfs_err(fs_info, "reached 32bit limit for logical addresses");
+ btrfs_err(fs_info,
+"due to page cache limit on 32bit systems, metadata beyond %lluT can't be accessed",
+ BTRFS_32BIT_MAX_FILE_SIZE >> 40);
+ btrfs_err(fs_info,
+ "please consider upgrading to 64bit kernel/hardware");
+ }
+}
+#endif
+
+/*
+ * We only mark the transaction aborted and then set the file system read-only.
+ * This will prevent new transactions from starting or trying to join this
+ * one.
+ *
+ * This means that error recovery at the call site is limited to freeing
+ * any local memory allocations and passing the error code up without
+ * further cleanup. The transaction should complete as it normally would
+ * in the call path but will return -EIO.
+ *
+ * We'll complete the cleanup in btrfs_end_transaction and
+ * btrfs_commit_transaction.
+ */
+__cold
+void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
+ const char *function,
+ unsigned int line, int errno, bool first_hit)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+
+ WRITE_ONCE(trans->aborted, errno);
+ WRITE_ONCE(trans->transaction->aborted, errno);
+ if (first_hit && errno == -ENOSPC)
+ btrfs_dump_space_info_for_trans_abort(fs_info);
+ /* Wake up anybody who may be waiting on this transaction */
+ wake_up(&fs_info->transaction_wait);
+ wake_up(&fs_info->transaction_blocked_wait);
+ __btrfs_handle_fs_error(fs_info, function, line, errno, NULL);
+}
+/*
+ * __btrfs_panic decodes unexpected, fatal errors from the caller,
+ * issues an alert, and either panics or BUGs, depending on mount options.
+ */
+__cold
+void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int errno, const char *fmt, ...)
+{
+ char *s_id = "<unknown>";
+ const char *errstr;
+ struct va_format vaf = { .fmt = fmt };
+ va_list args;
+
+ if (fs_info)
+ s_id = fs_info->sb->s_id;
+
+ va_start(args, fmt);
+ vaf.va = &args;
+
+ errstr = btrfs_decode_error(errno);
+ if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR)))
+ panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n",
+ s_id, function, line, &vaf, errno, errstr);
+
+ btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)",
+ function, line, &vaf, errno, errstr);
+ va_end(args);
+ /* Caller calls BUG() */
+}
+
+static void btrfs_put_super(struct super_block *sb)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+
+ btrfs_info(fs_info, "last unmount of filesystem %pU", fs_info->fs_devices->fsid);
+ close_ctree(fs_info);
+}
+
+enum {
+ Opt_acl, Opt_noacl,
+ Opt_clear_cache,
+ Opt_commit_interval,
+ Opt_compress,
+ Opt_compress_force,
+ Opt_compress_force_type,
+ Opt_compress_type,
+ Opt_degraded,
+ Opt_device,
+ Opt_fatal_errors,
+ Opt_flushoncommit, Opt_noflushoncommit,
+ Opt_max_inline,
+ Opt_barrier, Opt_nobarrier,
+ Opt_datacow, Opt_nodatacow,
+ Opt_datasum, Opt_nodatasum,
+ Opt_defrag, Opt_nodefrag,
+ Opt_discard, Opt_nodiscard,
+ Opt_discard_mode,
+ Opt_norecovery,
+ Opt_ratio,
+ Opt_rescan_uuid_tree,
+ Opt_skip_balance,
+ Opt_space_cache, Opt_no_space_cache,
+ Opt_space_cache_version,
+ Opt_ssd, Opt_nossd,
+ Opt_ssd_spread, Opt_nossd_spread,
+ Opt_subvol,
+ Opt_subvol_empty,
+ Opt_subvolid,
+ Opt_thread_pool,
+ Opt_treelog, Opt_notreelog,
+ Opt_user_subvol_rm_allowed,
+
+ /* Rescue options */
+ Opt_rescue,
+ Opt_usebackuproot,
+ Opt_nologreplay,
+ Opt_ignorebadroots,
+ Opt_ignoredatacsums,
+ Opt_rescue_all,
+
+ /* Deprecated options */
+ Opt_recovery,
+ Opt_inode_cache, Opt_noinode_cache,
+
+ /* Debugging options */
+ Opt_check_integrity,
+ Opt_check_integrity_including_extent_data,
+ Opt_check_integrity_print_mask,
+ Opt_enospc_debug, Opt_noenospc_debug,
+#ifdef CONFIG_BTRFS_DEBUG
+ Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
+#endif
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ Opt_ref_verify,
+#endif
+ Opt_err,
+};
+
+static const match_table_t tokens = {
+ {Opt_acl, "acl"},
+ {Opt_noacl, "noacl"},
+ {Opt_clear_cache, "clear_cache"},
+ {Opt_commit_interval, "commit=%u"},
+ {Opt_compress, "compress"},
+ {Opt_compress_type, "compress=%s"},
+ {Opt_compress_force, "compress-force"},
+ {Opt_compress_force_type, "compress-force=%s"},
+ {Opt_degraded, "degraded"},
+ {Opt_device, "device=%s"},
+ {Opt_fatal_errors, "fatal_errors=%s"},
+ {Opt_flushoncommit, "flushoncommit"},
+ {Opt_noflushoncommit, "noflushoncommit"},
+ {Opt_inode_cache, "inode_cache"},
+ {Opt_noinode_cache, "noinode_cache"},
+ {Opt_max_inline, "max_inline=%s"},
+ {Opt_barrier, "barrier"},
+ {Opt_nobarrier, "nobarrier"},
+ {Opt_datacow, "datacow"},
+ {Opt_nodatacow, "nodatacow"},
+ {Opt_datasum, "datasum"},
+ {Opt_nodatasum, "nodatasum"},
+ {Opt_defrag, "autodefrag"},
+ {Opt_nodefrag, "noautodefrag"},
+ {Opt_discard, "discard"},
+ {Opt_discard_mode, "discard=%s"},
+ {Opt_nodiscard, "nodiscard"},
+ {Opt_norecovery, "norecovery"},
+ {Opt_ratio, "metadata_ratio=%u"},
+ {Opt_rescan_uuid_tree, "rescan_uuid_tree"},
+ {Opt_skip_balance, "skip_balance"},
+ {Opt_space_cache, "space_cache"},
+ {Opt_no_space_cache, "nospace_cache"},
+ {Opt_space_cache_version, "space_cache=%s"},
+ {Opt_ssd, "ssd"},
+ {Opt_nossd, "nossd"},
+ {Opt_ssd_spread, "ssd_spread"},
+ {Opt_nossd_spread, "nossd_spread"},
+ {Opt_subvol, "subvol=%s"},
+ {Opt_subvol_empty, "subvol="},
+ {Opt_subvolid, "subvolid=%s"},
+ {Opt_thread_pool, "thread_pool=%u"},
+ {Opt_treelog, "treelog"},
+ {Opt_notreelog, "notreelog"},
+ {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
+
+ /* Rescue options */
+ {Opt_rescue, "rescue=%s"},
+ /* Deprecated, with alias rescue=nologreplay */
+ {Opt_nologreplay, "nologreplay"},
+ /* Deprecated, with alias rescue=usebackuproot */
+ {Opt_usebackuproot, "usebackuproot"},
+
+ /* Deprecated options */
+ {Opt_recovery, "recovery"},
+
+ /* Debugging options */
+ {Opt_check_integrity, "check_int"},
+ {Opt_check_integrity_including_extent_data, "check_int_data"},
+ {Opt_check_integrity_print_mask, "check_int_print_mask=%u"},
+ {Opt_enospc_debug, "enospc_debug"},
+ {Opt_noenospc_debug, "noenospc_debug"},
+#ifdef CONFIG_BTRFS_DEBUG
+ {Opt_fragment_data, "fragment=data"},
+ {Opt_fragment_metadata, "fragment=metadata"},
+ {Opt_fragment_all, "fragment=all"},
+#endif
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ {Opt_ref_verify, "ref_verify"},
+#endif
+ {Opt_err, NULL},
+};
+
+static const match_table_t rescue_tokens = {
+ {Opt_usebackuproot, "usebackuproot"},
+ {Opt_nologreplay, "nologreplay"},
+ {Opt_ignorebadroots, "ignorebadroots"},
+ {Opt_ignorebadroots, "ibadroots"},
+ {Opt_ignoredatacsums, "ignoredatacsums"},
+ {Opt_ignoredatacsums, "idatacsums"},
+ {Opt_rescue_all, "all"},
+ {Opt_err, NULL},
+};
+
+static bool check_ro_option(struct btrfs_fs_info *fs_info, unsigned long opt,
+ const char *opt_name)
+{
+ if (fs_info->mount_opt & opt) {
+ btrfs_err(fs_info, "%s must be used with ro mount option",
+ opt_name);
+ return true;
+ }
+ return false;
+}
+
+static int parse_rescue_options(struct btrfs_fs_info *info, const char *options)
+{
+ char *opts;
+ char *orig;
+ char *p;
+ substring_t args[MAX_OPT_ARGS];
+ int ret = 0;
+
+ opts = kstrdup(options, GFP_KERNEL);
+ if (!opts)
+ return -ENOMEM;
+ orig = opts;
+
+ while ((p = strsep(&opts, ":")) != NULL) {
+ int token;
+
+ if (!*p)
+ continue;
+ token = match_token(p, rescue_tokens, args);
+ switch (token){
+ case Opt_usebackuproot:
+ btrfs_info(info,
+ "trying to use backup root at mount time");
+ btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
+ break;
+ case Opt_nologreplay:
+ btrfs_set_and_info(info, NOLOGREPLAY,
+ "disabling log replay at mount time");
+ break;
+ case Opt_ignorebadroots:
+ btrfs_set_and_info(info, IGNOREBADROOTS,
+ "ignoring bad roots");
+ break;
+ case Opt_ignoredatacsums:
+ btrfs_set_and_info(info, IGNOREDATACSUMS,
+ "ignoring data csums");
+ break;
+ case Opt_rescue_all:
+ btrfs_info(info, "enabling all of the rescue options");
+ btrfs_set_and_info(info, IGNOREDATACSUMS,
+ "ignoring data csums");
+ btrfs_set_and_info(info, IGNOREBADROOTS,
+ "ignoring bad roots");
+ btrfs_set_and_info(info, NOLOGREPLAY,
+ "disabling log replay at mount time");
+ break;
+ case Opt_err:
+ btrfs_info(info, "unrecognized rescue option '%s'", p);
+ ret = -EINVAL;
+ goto out;
+ default:
+ break;
+ }
+
+ }
+out:
+ kfree(orig);
+ return ret;
+}
+
+/*
+ * Regular mount options parser. Everything that is needed only when
+ * reading in a new superblock is parsed here.
+ * XXX JDM: This needs to be cleaned up for remount.
+ */
+int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
+ unsigned long new_flags)
+{
+ substring_t args[MAX_OPT_ARGS];
+ char *p, *num;
+ int intarg;
+ int ret = 0;
+ char *compress_type;
+ bool compress_force = false;
+ enum btrfs_compression_type saved_compress_type;
+ int saved_compress_level;
+ bool saved_compress_force;
+ int no_compress = 0;
+ const bool remounting = test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state);
+
+ if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE))
+ btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE);
+ else if (btrfs_free_space_cache_v1_active(info)) {
+ if (btrfs_is_zoned(info)) {
+ btrfs_info(info,
+ "zoned: clearing existing space cache");
+ btrfs_set_super_cache_generation(info->super_copy, 0);
+ } else {
+ btrfs_set_opt(info->mount_opt, SPACE_CACHE);
+ }
+ }
+
+ /*
+ * Even the options are empty, we still need to do extra check
+ * against new flags
+ */
+ if (!options)
+ goto check;
+
+ while ((p = strsep(&options, ",")) != NULL) {
+ int token;
+ if (!*p)
+ continue;
+
+ token = match_token(p, tokens, args);
+ switch (token) {
+ case Opt_degraded:
+ btrfs_info(info, "allowing degraded mounts");
+ btrfs_set_opt(info->mount_opt, DEGRADED);
+ break;
+ case Opt_subvol:
+ case Opt_subvol_empty:
+ case Opt_subvolid:
+ case Opt_device:
+ /*
+ * These are parsed by btrfs_parse_subvol_options or
+ * btrfs_parse_device_options and can be ignored here.
+ */
+ break;
+ case Opt_nodatasum:
+ btrfs_set_and_info(info, NODATASUM,
+ "setting nodatasum");
+ break;
+ case Opt_datasum:
+ if (btrfs_test_opt(info, NODATASUM)) {
+ if (btrfs_test_opt(info, NODATACOW))
+ btrfs_info(info,
+ "setting datasum, datacow enabled");
+ else
+ btrfs_info(info, "setting datasum");
+ }
+ btrfs_clear_opt(info->mount_opt, NODATACOW);
+ btrfs_clear_opt(info->mount_opt, NODATASUM);
+ break;
+ case Opt_nodatacow:
+ if (!btrfs_test_opt(info, NODATACOW)) {
+ if (!btrfs_test_opt(info, COMPRESS) ||
+ !btrfs_test_opt(info, FORCE_COMPRESS)) {
+ btrfs_info(info,
+ "setting nodatacow, compression disabled");
+ } else {
+ btrfs_info(info, "setting nodatacow");
+ }
+ }
+ btrfs_clear_opt(info->mount_opt, COMPRESS);
+ btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
+ btrfs_set_opt(info->mount_opt, NODATACOW);
+ btrfs_set_opt(info->mount_opt, NODATASUM);
+ break;
+ case Opt_datacow:
+ btrfs_clear_and_info(info, NODATACOW,
+ "setting datacow");
+ break;
+ case Opt_compress_force:
+ case Opt_compress_force_type:
+ compress_force = true;
+ fallthrough;
+ case Opt_compress:
+ case Opt_compress_type:
+ saved_compress_type = btrfs_test_opt(info,
+ COMPRESS) ?
+ info->compress_type : BTRFS_COMPRESS_NONE;
+ saved_compress_force =
+ btrfs_test_opt(info, FORCE_COMPRESS);
+ saved_compress_level = info->compress_level;
+ if (token == Opt_compress ||
+ token == Opt_compress_force ||
+ strncmp(args[0].from, "zlib", 4) == 0) {
+ compress_type = "zlib";
+
+ info->compress_type = BTRFS_COMPRESS_ZLIB;
+ info->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL;
+ /*
+ * args[0] contains uninitialized data since
+ * for these tokens we don't expect any
+ * parameter.
+ */
+ if (token != Opt_compress &&
+ token != Opt_compress_force)
+ info->compress_level =
+ btrfs_compress_str2level(
+ BTRFS_COMPRESS_ZLIB,
+ args[0].from + 4);
+ btrfs_set_opt(info->mount_opt, COMPRESS);
+ btrfs_clear_opt(info->mount_opt, NODATACOW);
+ btrfs_clear_opt(info->mount_opt, NODATASUM);
+ no_compress = 0;
+ } else if (strncmp(args[0].from, "lzo", 3) == 0) {
+ compress_type = "lzo";
+ info->compress_type = BTRFS_COMPRESS_LZO;
+ info->compress_level = 0;
+ btrfs_set_opt(info->mount_opt, COMPRESS);
+ btrfs_clear_opt(info->mount_opt, NODATACOW);
+ btrfs_clear_opt(info->mount_opt, NODATASUM);
+ btrfs_set_fs_incompat(info, COMPRESS_LZO);
+ no_compress = 0;
+ } else if (strncmp(args[0].from, "zstd", 4) == 0) {
+ compress_type = "zstd";
+ info->compress_type = BTRFS_COMPRESS_ZSTD;
+ info->compress_level =
+ btrfs_compress_str2level(
+ BTRFS_COMPRESS_ZSTD,
+ args[0].from + 4);
+ btrfs_set_opt(info->mount_opt, COMPRESS);
+ btrfs_clear_opt(info->mount_opt, NODATACOW);
+ btrfs_clear_opt(info->mount_opt, NODATASUM);
+ btrfs_set_fs_incompat(info, COMPRESS_ZSTD);
+ no_compress = 0;
+ } else if (strncmp(args[0].from, "no", 2) == 0) {
+ compress_type = "no";
+ info->compress_level = 0;
+ info->compress_type = 0;
+ btrfs_clear_opt(info->mount_opt, COMPRESS);
+ btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
+ compress_force = false;
+ no_compress++;
+ } else {
+ btrfs_err(info, "unrecognized compression value %s",
+ args[0].from);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (compress_force) {
+ btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
+ } else {
+ /*
+ * If we remount from compress-force=xxx to
+ * compress=xxx, we need clear FORCE_COMPRESS
+ * flag, otherwise, there is no way for users
+ * to disable forcible compression separately.
+ */
+ btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
+ }
+ if (no_compress == 1) {
+ btrfs_info(info, "use no compression");
+ } else if ((info->compress_type != saved_compress_type) ||
+ (compress_force != saved_compress_force) ||
+ (info->compress_level != saved_compress_level)) {
+ btrfs_info(info, "%s %s compression, level %d",
+ (compress_force) ? "force" : "use",
+ compress_type, info->compress_level);
+ }
+ compress_force = false;
+ break;
+ case Opt_ssd:
+ btrfs_set_and_info(info, SSD,
+ "enabling ssd optimizations");
+ btrfs_clear_opt(info->mount_opt, NOSSD);
+ break;
+ case Opt_ssd_spread:
+ btrfs_set_and_info(info, SSD,
+ "enabling ssd optimizations");
+ btrfs_set_and_info(info, SSD_SPREAD,
+ "using spread ssd allocation scheme");
+ btrfs_clear_opt(info->mount_opt, NOSSD);
+ break;
+ case Opt_nossd:
+ btrfs_set_opt(info->mount_opt, NOSSD);
+ btrfs_clear_and_info(info, SSD,
+ "not using ssd optimizations");
+ fallthrough;
+ case Opt_nossd_spread:
+ btrfs_clear_and_info(info, SSD_SPREAD,
+ "not using spread ssd allocation scheme");
+ break;
+ case Opt_barrier:
+ btrfs_clear_and_info(info, NOBARRIER,
+ "turning on barriers");
+ break;
+ case Opt_nobarrier:
+ btrfs_set_and_info(info, NOBARRIER,
+ "turning off barriers");
+ break;
+ case Opt_thread_pool:
+ ret = match_int(&args[0], &intarg);
+ if (ret) {
+ btrfs_err(info, "unrecognized thread_pool value %s",
+ args[0].from);
+ goto out;
+ } else if (intarg == 0) {
+ btrfs_err(info, "invalid value 0 for thread_pool");
+ ret = -EINVAL;
+ goto out;
+ }
+ info->thread_pool_size = intarg;
+ break;
+ case Opt_max_inline:
+ num = match_strdup(&args[0]);
+ if (num) {
+ info->max_inline = memparse(num, NULL);
+ kfree(num);
+
+ if (info->max_inline) {
+ info->max_inline = min_t(u64,
+ info->max_inline,
+ info->sectorsize);
+ }
+ btrfs_info(info, "max_inline at %llu",
+ info->max_inline);
+ } else {
+ ret = -ENOMEM;
+ goto out;
+ }
+ break;
+ case Opt_acl:
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+ info->sb->s_flags |= SB_POSIXACL;
+ break;
+#else
+ btrfs_err(info, "support for ACL not compiled in!");
+ ret = -EINVAL;
+ goto out;
+#endif
+ case Opt_noacl:
+ info->sb->s_flags &= ~SB_POSIXACL;
+ break;
+ case Opt_notreelog:
+ btrfs_set_and_info(info, NOTREELOG,
+ "disabling tree log");
+ break;
+ case Opt_treelog:
+ btrfs_clear_and_info(info, NOTREELOG,
+ "enabling tree log");
+ break;
+ case Opt_norecovery:
+ case Opt_nologreplay:
+ btrfs_warn(info,
+ "'nologreplay' is deprecated, use 'rescue=nologreplay' instead");
+ btrfs_set_and_info(info, NOLOGREPLAY,
+ "disabling log replay at mount time");
+ break;
+ case Opt_flushoncommit:
+ btrfs_set_and_info(info, FLUSHONCOMMIT,
+ "turning on flush-on-commit");
+ break;
+ case Opt_noflushoncommit:
+ btrfs_clear_and_info(info, FLUSHONCOMMIT,
+ "turning off flush-on-commit");
+ break;
+ case Opt_ratio:
+ ret = match_int(&args[0], &intarg);
+ if (ret) {
+ btrfs_err(info, "unrecognized metadata_ratio value %s",
+ args[0].from);
+ goto out;
+ }
+ info->metadata_ratio = intarg;
+ btrfs_info(info, "metadata ratio %u",
+ info->metadata_ratio);
+ break;
+ case Opt_discard:
+ case Opt_discard_mode:
+ if (token == Opt_discard ||
+ strcmp(args[0].from, "sync") == 0) {
+ btrfs_clear_opt(info->mount_opt, DISCARD_ASYNC);
+ btrfs_set_and_info(info, DISCARD_SYNC,
+ "turning on sync discard");
+ } else if (strcmp(args[0].from, "async") == 0) {
+ btrfs_clear_opt(info->mount_opt, DISCARD_SYNC);
+ btrfs_set_and_info(info, DISCARD_ASYNC,
+ "turning on async discard");
+ } else {
+ btrfs_err(info, "unrecognized discard mode value %s",
+ args[0].from);
+ ret = -EINVAL;
+ goto out;
+ }
+ break;
+ case Opt_nodiscard:
+ btrfs_clear_and_info(info, DISCARD_SYNC,
+ "turning off discard");
+ btrfs_clear_and_info(info, DISCARD_ASYNC,
+ "turning off async discard");
+ break;
+ case Opt_space_cache:
+ case Opt_space_cache_version:
+ /*
+ * We already set FREE_SPACE_TREE above because we have
+ * compat_ro(FREE_SPACE_TREE) set, and we aren't going
+ * to allow v1 to be set for extent tree v2, simply
+ * ignore this setting if we're extent tree v2.
+ */
+ if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
+ break;
+ if (token == Opt_space_cache ||
+ strcmp(args[0].from, "v1") == 0) {
+ btrfs_clear_opt(info->mount_opt,
+ FREE_SPACE_TREE);
+ btrfs_set_and_info(info, SPACE_CACHE,
+ "enabling disk space caching");
+ } else if (strcmp(args[0].from, "v2") == 0) {
+ btrfs_clear_opt(info->mount_opt,
+ SPACE_CACHE);
+ btrfs_set_and_info(info, FREE_SPACE_TREE,
+ "enabling free space tree");
+ } else {
+ btrfs_err(info, "unrecognized space_cache value %s",
+ args[0].from);
+ ret = -EINVAL;
+ goto out;
+ }
+ break;
+ case Opt_rescan_uuid_tree:
+ btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
+ break;
+ case Opt_no_space_cache:
+ /*
+ * We cannot operate without the free space tree with
+ * extent tree v2, ignore this option.
+ */
+ if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
+ break;
+ if (btrfs_test_opt(info, SPACE_CACHE)) {
+ btrfs_clear_and_info(info, SPACE_CACHE,
+ "disabling disk space caching");
+ }
+ if (btrfs_test_opt(info, FREE_SPACE_TREE)) {
+ btrfs_clear_and_info(info, FREE_SPACE_TREE,
+ "disabling free space tree");
+ }
+ break;
+ case Opt_inode_cache:
+ case Opt_noinode_cache:
+ btrfs_warn(info,
+ "the 'inode_cache' option is deprecated and has no effect since 5.11");
+ break;
+ case Opt_clear_cache:
+ /*
+ * We cannot clear the free space tree with extent tree
+ * v2, ignore this option.
+ */
+ if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
+ break;
+ btrfs_set_and_info(info, CLEAR_CACHE,
+ "force clearing of disk cache");
+ break;
+ case Opt_user_subvol_rm_allowed:
+ btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
+ break;
+ case Opt_enospc_debug:
+ btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
+ break;
+ case Opt_noenospc_debug:
+ btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG);
+ break;
+ case Opt_defrag:
+ btrfs_set_and_info(info, AUTO_DEFRAG,
+ "enabling auto defrag");
+ break;
+ case Opt_nodefrag:
+ btrfs_clear_and_info(info, AUTO_DEFRAG,
+ "disabling auto defrag");
+ break;
+ case Opt_recovery:
+ case Opt_usebackuproot:
+ btrfs_warn(info,
+ "'%s' is deprecated, use 'rescue=usebackuproot' instead",
+ token == Opt_recovery ? "recovery" :
+ "usebackuproot");
+ btrfs_info(info,
+ "trying to use backup root at mount time");
+ btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
+ break;
+ case Opt_skip_balance:
+ btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
+ break;
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ case Opt_check_integrity_including_extent_data:
+ btrfs_info(info,
+ "enabling check integrity including extent data");
+ btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY_DATA);
+ btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
+ break;
+ case Opt_check_integrity:
+ btrfs_info(info, "enabling check integrity");
+ btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
+ break;
+ case Opt_check_integrity_print_mask:
+ ret = match_int(&args[0], &intarg);
+ if (ret) {
+ btrfs_err(info,
+ "unrecognized check_integrity_print_mask value %s",
+ args[0].from);
+ goto out;
+ }
+ info->check_integrity_print_mask = intarg;
+ btrfs_info(info, "check_integrity_print_mask 0x%x",
+ info->check_integrity_print_mask);
+ break;
+#else
+ case Opt_check_integrity_including_extent_data:
+ case Opt_check_integrity:
+ case Opt_check_integrity_print_mask:
+ btrfs_err(info,
+ "support for check_integrity* not compiled in!");
+ ret = -EINVAL;
+ goto out;
+#endif
+ case Opt_fatal_errors:
+ if (strcmp(args[0].from, "panic") == 0) {
+ btrfs_set_opt(info->mount_opt,
+ PANIC_ON_FATAL_ERROR);
+ } else if (strcmp(args[0].from, "bug") == 0) {
+ btrfs_clear_opt(info->mount_opt,
+ PANIC_ON_FATAL_ERROR);
+ } else {
+ btrfs_err(info, "unrecognized fatal_errors value %s",
+ args[0].from);
+ ret = -EINVAL;
+ goto out;
+ }
+ break;
+ case Opt_commit_interval:
+ intarg = 0;
+ ret = match_int(&args[0], &intarg);
+ if (ret) {
+ btrfs_err(info, "unrecognized commit_interval value %s",
+ args[0].from);
+ ret = -EINVAL;
+ goto out;
+ }
+ if (intarg == 0) {
+ btrfs_info(info,
+ "using default commit interval %us",
+ BTRFS_DEFAULT_COMMIT_INTERVAL);
+ intarg = BTRFS_DEFAULT_COMMIT_INTERVAL;
+ } else if (intarg > 300) {
+ btrfs_warn(info, "excessive commit interval %d",
+ intarg);
+ }
+ info->commit_interval = intarg;
+ break;
+ case Opt_rescue:
+ ret = parse_rescue_options(info, args[0].from);
+ if (ret < 0) {
+ btrfs_err(info, "unrecognized rescue value %s",
+ args[0].from);
+ goto out;
+ }
+ break;
+#ifdef CONFIG_BTRFS_DEBUG
+ case Opt_fragment_all:
+ btrfs_info(info, "fragmenting all space");
+ btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
+ btrfs_set_opt(info->mount_opt, FRAGMENT_METADATA);
+ break;
+ case Opt_fragment_metadata:
+ btrfs_info(info, "fragmenting metadata");
+ btrfs_set_opt(info->mount_opt,
+ FRAGMENT_METADATA);
+ break;
+ case Opt_fragment_data:
+ btrfs_info(info, "fragmenting data");
+ btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
+ break;
+#endif
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ case Opt_ref_verify:
+ btrfs_info(info, "doing ref verification");
+ btrfs_set_opt(info->mount_opt, REF_VERIFY);
+ break;
+#endif
+ case Opt_err:
+ btrfs_err(info, "unrecognized mount option '%s'", p);
+ ret = -EINVAL;
+ goto out;
+ default:
+ break;
+ }
+ }
+check:
+ /* We're read-only, don't have to check. */
+ if (new_flags & SB_RDONLY)
+ goto out;
+
+ if (check_ro_option(info, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") ||
+ check_ro_option(info, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") ||
+ check_ro_option(info, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums"))
+ ret = -EINVAL;
+out:
+ if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) &&
+ !btrfs_test_opt(info, FREE_SPACE_TREE) &&
+ !btrfs_test_opt(info, CLEAR_CACHE)) {
+ btrfs_err(info, "cannot disable free space tree");
+ ret = -EINVAL;
+ }
+ if (btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE) &&
+ !btrfs_test_opt(info, FREE_SPACE_TREE)) {
+ btrfs_err(info, "cannot disable free space tree with block-group-tree feature");
+ ret = -EINVAL;
+ }
+ if (!ret)
+ ret = btrfs_check_mountopts_zoned(info);
+ if (!ret && !remounting) {
+ if (btrfs_test_opt(info, SPACE_CACHE))
+ btrfs_info(info, "disk space caching is enabled");
+ if (btrfs_test_opt(info, FREE_SPACE_TREE))
+ btrfs_info(info, "using free space tree");
+ }
+ return ret;
+}
+
+/*
+ * Parse mount options that are required early in the mount process.
+ *
+ * All other options will be parsed on much later in the mount process and
+ * only when we need to allocate a new super block.
+ */
+static int btrfs_parse_device_options(const char *options, fmode_t flags,
+ void *holder)
+{
+ substring_t args[MAX_OPT_ARGS];
+ char *device_name, *opts, *orig, *p;
+ struct btrfs_device *device = NULL;
+ int error = 0;
+
+ lockdep_assert_held(&uuid_mutex);
+
+ if (!options)
+ return 0;
+
+ /*
+ * strsep changes the string, duplicate it because btrfs_parse_options
+ * gets called later
+ */
+ opts = kstrdup(options, GFP_KERNEL);
+ if (!opts)
+ return -ENOMEM;
+ orig = opts;
+
+ while ((p = strsep(&opts, ",")) != NULL) {
+ int token;
+
+ if (!*p)
+ continue;
+
+ token = match_token(p, tokens, args);
+ if (token == Opt_device) {
+ device_name = match_strdup(&args[0]);
+ if (!device_name) {
+ error = -ENOMEM;
+ goto out;
+ }
+ device = btrfs_scan_one_device(device_name, flags,
+ holder);
+ kfree(device_name);
+ if (IS_ERR(device)) {
+ error = PTR_ERR(device);
+ goto out;
+ }
+ }
+ }
+
+out:
+ kfree(orig);
+ return error;
+}
+
+/*
+ * Parse mount options that are related to subvolume id
+ *
+ * The value is later passed to mount_subvol()
+ */
+static int btrfs_parse_subvol_options(const char *options, char **subvol_name,
+ u64 *subvol_objectid)
+{
+ substring_t args[MAX_OPT_ARGS];
+ char *opts, *orig, *p;
+ int error = 0;
+ u64 subvolid;
+
+ if (!options)
+ return 0;
+
+ /*
+ * strsep changes the string, duplicate it because
+ * btrfs_parse_device_options gets called later
+ */
+ opts = kstrdup(options, GFP_KERNEL);
+ if (!opts)
+ return -ENOMEM;
+ orig = opts;
+
+ while ((p = strsep(&opts, ",")) != NULL) {
+ int token;
+ if (!*p)
+ continue;
+
+ token = match_token(p, tokens, args);
+ switch (token) {
+ case Opt_subvol:
+ kfree(*subvol_name);
+ *subvol_name = match_strdup(&args[0]);
+ if (!*subvol_name) {
+ error = -ENOMEM;
+ goto out;
+ }
+ break;
+ case Opt_subvolid:
+ error = match_u64(&args[0], &subvolid);
+ if (error)
+ goto out;
+
+ /* we want the original fs_tree */
+ if (subvolid == 0)
+ subvolid = BTRFS_FS_TREE_OBJECTID;
+
+ *subvol_objectid = subvolid;
+ break;
+ default:
+ break;
+ }
+ }
+
+out:
+ kfree(orig);
+ return error;
+}
+
+char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
+ u64 subvol_objectid)
+{
+ struct btrfs_root *root = fs_info->tree_root;
+ struct btrfs_root *fs_root = NULL;
+ struct btrfs_root_ref *root_ref;
+ struct btrfs_inode_ref *inode_ref;
+ struct btrfs_key key;
+ struct btrfs_path *path = NULL;
+ char *name = NULL, *ptr;
+ u64 dirid;
+ int len;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ name = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!name) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ ptr = name + PATH_MAX - 1;
+ ptr[0] = '\0';
+
+ /*
+ * Walk up the subvolume trees in the tree of tree roots by root
+ * backrefs until we hit the top-level subvolume.
+ */
+ while (subvol_objectid != BTRFS_FS_TREE_OBJECTID) {
+ key.objectid = subvol_objectid;
+ key.type = BTRFS_ROOT_BACKREF_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_backwards(root, &key, path);
+ if (ret < 0) {
+ goto err;
+ } else if (ret > 0) {
+ ret = -ENOENT;
+ goto err;
+ }
+
+ subvol_objectid = key.offset;
+
+ root_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_root_ref);
+ len = btrfs_root_ref_name_len(path->nodes[0], root_ref);
+ ptr -= len + 1;
+ if (ptr < name) {
+ ret = -ENAMETOOLONG;
+ goto err;
+ }
+ read_extent_buffer(path->nodes[0], ptr + 1,
+ (unsigned long)(root_ref + 1), len);
+ ptr[0] = '/';
+ dirid = btrfs_root_ref_dirid(path->nodes[0], root_ref);
+ btrfs_release_path(path);
+
+ fs_root = btrfs_get_fs_root(fs_info, subvol_objectid, true);
+ if (IS_ERR(fs_root)) {
+ ret = PTR_ERR(fs_root);
+ fs_root = NULL;
+ goto err;
+ }
+
+ /*
+ * Walk up the filesystem tree by inode refs until we hit the
+ * root directory.
+ */
+ while (dirid != BTRFS_FIRST_FREE_OBJECTID) {
+ key.objectid = dirid;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_backwards(fs_root, &key, path);
+ if (ret < 0) {
+ goto err;
+ } else if (ret > 0) {
+ ret = -ENOENT;
+ goto err;
+ }
+
+ dirid = key.offset;
+
+ inode_ref = btrfs_item_ptr(path->nodes[0],
+ path->slots[0],
+ struct btrfs_inode_ref);
+ len = btrfs_inode_ref_name_len(path->nodes[0],
+ inode_ref);
+ ptr -= len + 1;
+ if (ptr < name) {
+ ret = -ENAMETOOLONG;
+ goto err;
+ }
+ read_extent_buffer(path->nodes[0], ptr + 1,
+ (unsigned long)(inode_ref + 1), len);
+ ptr[0] = '/';
+ btrfs_release_path(path);
+ }
+ btrfs_put_root(fs_root);
+ fs_root = NULL;
+ }
+
+ btrfs_free_path(path);
+ if (ptr == name + PATH_MAX - 1) {
+ name[0] = '/';
+ name[1] = '\0';
+ } else {
+ memmove(name, ptr, name + PATH_MAX - ptr);
+ }
+ return name;
+
+err:
+ btrfs_put_root(fs_root);
+ btrfs_free_path(path);
+ kfree(name);
+ return ERR_PTR(ret);
+}
+
+static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objectid)
+{
+ struct btrfs_root *root = fs_info->tree_root;
+ struct btrfs_dir_item *di;
+ struct btrfs_path *path;
+ struct btrfs_key location;
+ struct fscrypt_str name = FSTR_INIT("default", 7);
+ u64 dir_id;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /*
+ * Find the "default" dir item which points to the root item that we
+ * will mount by default if we haven't been given a specific subvolume
+ * to mount.
+ */
+ dir_id = btrfs_super_root_dir(fs_info->super_copy);
+ di = btrfs_lookup_dir_item(NULL, root, path, dir_id, &name, 0);
+ if (IS_ERR(di)) {
+ btrfs_free_path(path);
+ return PTR_ERR(di);
+ }
+ if (!di) {
+ /*
+ * Ok the default dir item isn't there. This is weird since
+ * it's always been there, but don't freak out, just try and
+ * mount the top-level subvolume.
+ */
+ btrfs_free_path(path);
+ *objectid = BTRFS_FS_TREE_OBJECTID;
+ return 0;
+ }
+
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
+ btrfs_free_path(path);
+ *objectid = location.objectid;
+ return 0;
+}
+
+static int btrfs_fill_super(struct super_block *sb,
+ struct btrfs_fs_devices *fs_devices,
+ void *data)
+{
+ struct inode *inode;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ int err;
+
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_magic = BTRFS_SUPER_MAGIC;
+ sb->s_op = &btrfs_super_ops;
+ sb->s_d_op = &btrfs_dentry_operations;
+ sb->s_export_op = &btrfs_export_ops;
+#ifdef CONFIG_FS_VERITY
+ sb->s_vop = &btrfs_verityops;
+#endif
+ sb->s_xattr = btrfs_xattr_handlers;
+ sb->s_time_gran = 1;
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+ sb->s_flags |= SB_POSIXACL;
+#endif
+ sb->s_flags |= SB_I_VERSION;
+ sb->s_iflags |= SB_I_CGROUPWB;
+
+ err = super_setup_bdi(sb);
+ if (err) {
+ btrfs_err(fs_info, "super_setup_bdi failed");
+ return err;
+ }
+
+ err = open_ctree(sb, fs_devices, (char *)data);
+ if (err) {
+ btrfs_err(fs_info, "open_ctree failed");
+ return err;
+ }
+
+ inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto fail_close;
+ }
+
+ sb->s_root = d_make_root(inode);
+ if (!sb->s_root) {
+ err = -ENOMEM;
+ goto fail_close;
+ }
+
+ sb->s_flags |= SB_ACTIVE;
+ return 0;
+
+fail_close:
+ close_ctree(fs_info);
+ return err;
+}
+
+int btrfs_sync_fs(struct super_block *sb, int wait)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_root *root = fs_info->tree_root;
+
+ trace_btrfs_sync_fs(fs_info, wait);
+
+ if (!wait) {
+ filemap_flush(fs_info->btree_inode->i_mapping);
+ return 0;
+ }
+
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+
+ trans = btrfs_attach_transaction_barrier(root);
+ if (IS_ERR(trans)) {
+ /* no transaction, don't bother */
+ if (PTR_ERR(trans) == -ENOENT) {
+ /*
+ * Exit unless we have some pending changes
+ * that need to go through commit
+ */
+ if (fs_info->pending_changes == 0)
+ return 0;
+ /*
+ * A non-blocking test if the fs is frozen. We must not
+ * start a new transaction here otherwise a deadlock
+ * happens. The pending operations are delayed to the
+ * next commit after thawing.
+ */
+ if (sb_start_write_trylock(sb))
+ sb_end_write(sb);
+ else
+ return 0;
+ trans = btrfs_start_transaction(root, 0);
+ }
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ }
+ return btrfs_commit_transaction(trans);
+}
+
+static void print_rescue_option(struct seq_file *seq, const char *s, bool *printed)
+{
+ seq_printf(seq, "%s%s", (*printed) ? ":" : ",rescue=", s);
+ *printed = true;
+}
+
+static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
+{
+ struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
+ const char *compress_type;
+ const char *subvol_name;
+ bool printed = false;
+
+ if (btrfs_test_opt(info, DEGRADED))
+ seq_puts(seq, ",degraded");
+ if (btrfs_test_opt(info, NODATASUM))
+ seq_puts(seq, ",nodatasum");
+ if (btrfs_test_opt(info, NODATACOW))
+ seq_puts(seq, ",nodatacow");
+ if (btrfs_test_opt(info, NOBARRIER))
+ seq_puts(seq, ",nobarrier");
+ if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE)
+ seq_printf(seq, ",max_inline=%llu", info->max_inline);
+ if (info->thread_pool_size != min_t(unsigned long,
+ num_online_cpus() + 2, 8))
+ seq_printf(seq, ",thread_pool=%u", info->thread_pool_size);
+ if (btrfs_test_opt(info, COMPRESS)) {
+ compress_type = btrfs_compress_type2str(info->compress_type);
+ if (btrfs_test_opt(info, FORCE_COMPRESS))
+ seq_printf(seq, ",compress-force=%s", compress_type);
+ else
+ seq_printf(seq, ",compress=%s", compress_type);
+ if (info->compress_level)
+ seq_printf(seq, ":%d", info->compress_level);
+ }
+ if (btrfs_test_opt(info, NOSSD))
+ seq_puts(seq, ",nossd");
+ if (btrfs_test_opt(info, SSD_SPREAD))
+ seq_puts(seq, ",ssd_spread");
+ else if (btrfs_test_opt(info, SSD))
+ seq_puts(seq, ",ssd");
+ if (btrfs_test_opt(info, NOTREELOG))
+ seq_puts(seq, ",notreelog");
+ if (btrfs_test_opt(info, NOLOGREPLAY))
+ print_rescue_option(seq, "nologreplay", &printed);
+ if (btrfs_test_opt(info, USEBACKUPROOT))
+ print_rescue_option(seq, "usebackuproot", &printed);
+ if (btrfs_test_opt(info, IGNOREBADROOTS))
+ print_rescue_option(seq, "ignorebadroots", &printed);
+ if (btrfs_test_opt(info, IGNOREDATACSUMS))
+ print_rescue_option(seq, "ignoredatacsums", &printed);
+ if (btrfs_test_opt(info, FLUSHONCOMMIT))
+ seq_puts(seq, ",flushoncommit");
+ if (btrfs_test_opt(info, DISCARD_SYNC))
+ seq_puts(seq, ",discard");
+ if (btrfs_test_opt(info, DISCARD_ASYNC))
+ seq_puts(seq, ",discard=async");
+ if (!(info->sb->s_flags & SB_POSIXACL))
+ seq_puts(seq, ",noacl");
+ if (btrfs_free_space_cache_v1_active(info))
+ seq_puts(seq, ",space_cache");
+ else if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE))
+ seq_puts(seq, ",space_cache=v2");
+ else
+ seq_puts(seq, ",nospace_cache");
+ if (btrfs_test_opt(info, RESCAN_UUID_TREE))
+ seq_puts(seq, ",rescan_uuid_tree");
+ if (btrfs_test_opt(info, CLEAR_CACHE))
+ seq_puts(seq, ",clear_cache");
+ if (btrfs_test_opt(info, USER_SUBVOL_RM_ALLOWED))
+ seq_puts(seq, ",user_subvol_rm_allowed");
+ if (btrfs_test_opt(info, ENOSPC_DEBUG))
+ seq_puts(seq, ",enospc_debug");
+ if (btrfs_test_opt(info, AUTO_DEFRAG))
+ seq_puts(seq, ",autodefrag");
+ if (btrfs_test_opt(info, SKIP_BALANCE))
+ seq_puts(seq, ",skip_balance");
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ if (btrfs_test_opt(info, CHECK_INTEGRITY_DATA))
+ seq_puts(seq, ",check_int_data");
+ else if (btrfs_test_opt(info, CHECK_INTEGRITY))
+ seq_puts(seq, ",check_int");
+ if (info->check_integrity_print_mask)
+ seq_printf(seq, ",check_int_print_mask=%d",
+ info->check_integrity_print_mask);
+#endif
+ if (info->metadata_ratio)
+ seq_printf(seq, ",metadata_ratio=%u", info->metadata_ratio);
+ if (btrfs_test_opt(info, PANIC_ON_FATAL_ERROR))
+ seq_puts(seq, ",fatal_errors=panic");
+ if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
+ seq_printf(seq, ",commit=%u", info->commit_interval);
+#ifdef CONFIG_BTRFS_DEBUG
+ if (btrfs_test_opt(info, FRAGMENT_DATA))
+ seq_puts(seq, ",fragment=data");
+ if (btrfs_test_opt(info, FRAGMENT_METADATA))
+ seq_puts(seq, ",fragment=metadata");
+#endif
+ if (btrfs_test_opt(info, REF_VERIFY))
+ seq_puts(seq, ",ref_verify");
+ seq_printf(seq, ",subvolid=%llu",
+ BTRFS_I(d_inode(dentry))->root->root_key.objectid);
+ subvol_name = btrfs_get_subvol_name_from_objectid(info,
+ BTRFS_I(d_inode(dentry))->root->root_key.objectid);
+ if (!IS_ERR(subvol_name)) {
+ seq_puts(seq, ",subvol=");
+ seq_escape(seq, subvol_name, " \t\n\\");
+ kfree(subvol_name);
+ }
+ return 0;
+}
+
+static int btrfs_test_super(struct super_block *s, void *data)
+{
+ struct btrfs_fs_info *p = data;
+ struct btrfs_fs_info *fs_info = btrfs_sb(s);
+
+ return fs_info->fs_devices == p->fs_devices;
+}
+
+static int btrfs_set_super(struct super_block *s, void *data)
+{
+ int err = set_anon_super(s, data);
+ if (!err)
+ s->s_fs_info = data;
+ return err;
+}
+
+/*
+ * subvolumes are identified by ino 256
+ */
+static inline int is_subvolume_inode(struct inode *inode)
+{
+ if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+ return 1;
+ return 0;
+}
+
+static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
+ struct vfsmount *mnt)
+{
+ struct dentry *root;
+ int ret;
+
+ if (!subvol_name) {
+ if (!subvol_objectid) {
+ ret = get_default_subvol_objectid(btrfs_sb(mnt->mnt_sb),
+ &subvol_objectid);
+ if (ret) {
+ root = ERR_PTR(ret);
+ goto out;
+ }
+ }
+ subvol_name = btrfs_get_subvol_name_from_objectid(
+ btrfs_sb(mnt->mnt_sb), subvol_objectid);
+ if (IS_ERR(subvol_name)) {
+ root = ERR_CAST(subvol_name);
+ subvol_name = NULL;
+ goto out;
+ }
+
+ }
+
+ root = mount_subtree(mnt, subvol_name);
+ /* mount_subtree() drops our reference on the vfsmount. */
+ mnt = NULL;
+
+ if (!IS_ERR(root)) {
+ struct super_block *s = root->d_sb;
+ struct btrfs_fs_info *fs_info = btrfs_sb(s);
+ struct inode *root_inode = d_inode(root);
+ u64 root_objectid = BTRFS_I(root_inode)->root->root_key.objectid;
+
+ ret = 0;
+ if (!is_subvolume_inode(root_inode)) {
+ btrfs_err(fs_info, "'%s' is not a valid subvolume",
+ subvol_name);
+ ret = -EINVAL;
+ }
+ if (subvol_objectid && root_objectid != subvol_objectid) {
+ /*
+ * This will also catch a race condition where a
+ * subvolume which was passed by ID is renamed and
+ * another subvolume is renamed over the old location.
+ */
+ btrfs_err(fs_info,
+ "subvol '%s' does not match subvolid %llu",
+ subvol_name, subvol_objectid);
+ ret = -EINVAL;
+ }
+ if (ret) {
+ dput(root);
+ root = ERR_PTR(ret);
+ deactivate_locked_super(s);
+ }
+ }
+
+out:
+ mntput(mnt);
+ kfree(subvol_name);
+ return root;
+}
+
+/*
+ * Find a superblock for the given device / mount point.
+ *
+ * Note: This is based on mount_bdev from fs/super.c with a few additions
+ * for multiple device setup. Make sure to keep it in sync.
+ */
+static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
+ int flags, const char *device_name, void *data)
+{
+ struct block_device *bdev = NULL;
+ struct super_block *s;
+ struct btrfs_device *device = NULL;
+ struct btrfs_fs_devices *fs_devices = NULL;
+ struct btrfs_fs_info *fs_info = NULL;
+ void *new_sec_opts = NULL;
+ fmode_t mode = FMODE_READ;
+ int error = 0;
+
+ if (!(flags & SB_RDONLY))
+ mode |= FMODE_WRITE;
+
+ if (data) {
+ error = security_sb_eat_lsm_opts(data, &new_sec_opts);
+ if (error)
+ return ERR_PTR(error);
+ }
+
+ /*
+ * Setup a dummy root and fs_info for test/set super. This is because
+ * we don't actually fill this stuff out until open_ctree, but we need
+ * then open_ctree will properly initialize the file system specific
+ * settings later. btrfs_init_fs_info initializes the static elements
+ * of the fs_info (locks and such) to make cleanup easier if we find a
+ * superblock with our given fs_devices later on at sget() time.
+ */
+ fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL);
+ if (!fs_info) {
+ error = -ENOMEM;
+ goto error_sec_opts;
+ }
+ btrfs_init_fs_info(fs_info);
+
+ fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
+ fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
+ if (!fs_info->super_copy || !fs_info->super_for_commit) {
+ error = -ENOMEM;
+ goto error_fs_info;
+ }
+
+ mutex_lock(&uuid_mutex);
+ error = btrfs_parse_device_options(data, mode, fs_type);
+ if (error) {
+ mutex_unlock(&uuid_mutex);
+ goto error_fs_info;
+ }
+
+ device = btrfs_scan_one_device(device_name, mode, fs_type);
+ if (IS_ERR(device)) {
+ mutex_unlock(&uuid_mutex);
+ error = PTR_ERR(device);
+ goto error_fs_info;
+ }
+
+ fs_devices = device->fs_devices;
+ fs_info->fs_devices = fs_devices;
+
+ error = btrfs_open_devices(fs_devices, mode, fs_type);
+ mutex_unlock(&uuid_mutex);
+ if (error)
+ goto error_fs_info;
+
+ if (!(flags & SB_RDONLY) && fs_devices->rw_devices == 0) {
+ error = -EACCES;
+ goto error_close_devices;
+ }
+
+ bdev = fs_devices->latest_dev->bdev;
+ s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC,
+ fs_info);
+ if (IS_ERR(s)) {
+ error = PTR_ERR(s);
+ goto error_close_devices;
+ }
+
+ if (s->s_root) {
+ btrfs_close_devices(fs_devices);
+ btrfs_free_fs_info(fs_info);
+ if ((flags ^ s->s_flags) & SB_RDONLY)
+ error = -EBUSY;
+ } else {
+ snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
+ shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fs_type->name,
+ s->s_id);
+ btrfs_sb(s)->bdev_holder = fs_type;
+ error = btrfs_fill_super(s, fs_devices, data);
+ }
+ if (!error)
+ error = security_sb_set_mnt_opts(s, new_sec_opts, 0, NULL);
+ security_free_mnt_opts(&new_sec_opts);
+ if (error) {
+ deactivate_locked_super(s);
+ return ERR_PTR(error);
+ }
+
+ return dget(s->s_root);
+
+error_close_devices:
+ btrfs_close_devices(fs_devices);
+error_fs_info:
+ btrfs_free_fs_info(fs_info);
+error_sec_opts:
+ security_free_mnt_opts(&new_sec_opts);
+ return ERR_PTR(error);
+}
+
+/*
+ * Mount function which is called by VFS layer.
+ *
+ * In order to allow mounting a subvolume directly, btrfs uses mount_subtree()
+ * which needs vfsmount* of device's root (/). This means device's root has to
+ * be mounted internally in any case.
+ *
+ * Operation flow:
+ * 1. Parse subvol id related options for later use in mount_subvol().
+ *
+ * 2. Mount device's root (/) by calling vfs_kern_mount().
+ *
+ * NOTE: vfs_kern_mount() is used by VFS to call btrfs_mount() in the
+ * first place. In order to avoid calling btrfs_mount() again, we use
+ * different file_system_type which is not registered to VFS by
+ * register_filesystem() (btrfs_root_fs_type). As a result,
+ * btrfs_mount_root() is called. The return value will be used by
+ * mount_subtree() in mount_subvol().
+ *
+ * 3. Call mount_subvol() to get the dentry of subvolume. Since there is
+ * "btrfs subvolume set-default", mount_subvol() is called always.
+ */
+static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
+ const char *device_name, void *data)
+{
+ struct vfsmount *mnt_root;
+ struct dentry *root;
+ char *subvol_name = NULL;
+ u64 subvol_objectid = 0;
+ int error = 0;
+
+ error = btrfs_parse_subvol_options(data, &subvol_name,
+ &subvol_objectid);
+ if (error) {
+ kfree(subvol_name);
+ return ERR_PTR(error);
+ }
+
+ /* mount device's root (/) */
+ mnt_root = vfs_kern_mount(&btrfs_root_fs_type, flags, device_name, data);
+ if (PTR_ERR_OR_ZERO(mnt_root) == -EBUSY) {
+ if (flags & SB_RDONLY) {
+ mnt_root = vfs_kern_mount(&btrfs_root_fs_type,
+ flags & ~SB_RDONLY, device_name, data);
+ } else {
+ mnt_root = vfs_kern_mount(&btrfs_root_fs_type,
+ flags | SB_RDONLY, device_name, data);
+ if (IS_ERR(mnt_root)) {
+ root = ERR_CAST(mnt_root);
+ kfree(subvol_name);
+ goto out;
+ }
+
+ down_write(&mnt_root->mnt_sb->s_umount);
+ error = btrfs_remount(mnt_root->mnt_sb, &flags, NULL);
+ up_write(&mnt_root->mnt_sb->s_umount);
+ if (error < 0) {
+ root = ERR_PTR(error);
+ mntput(mnt_root);
+ kfree(subvol_name);
+ goto out;
+ }
+ }
+ }
+ if (IS_ERR(mnt_root)) {
+ root = ERR_CAST(mnt_root);
+ kfree(subvol_name);
+ goto out;
+ }
+
+ /* mount_subvol() will free subvol_name and mnt_root */
+ root = mount_subvol(subvol_name, subvol_objectid, mnt_root);
+
+out:
+ return root;
+}
+
+static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
+ u32 new_pool_size, u32 old_pool_size)
+{
+ if (new_pool_size == old_pool_size)
+ return;
+
+ fs_info->thread_pool_size = new_pool_size;
+
+ btrfs_info(fs_info, "resize thread pool %d -> %d",
+ old_pool_size, new_pool_size);
+
+ btrfs_workqueue_set_max(fs_info->workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->hipri_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
+ workqueue_set_max_active(fs_info->endio_workers, new_pool_size);
+ workqueue_set_max_active(fs_info->endio_meta_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
+}
+
+static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info,
+ unsigned long old_opts, int flags)
+{
+ if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
+ (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
+ (flags & SB_RDONLY))) {
+ /* wait for any defraggers to finish */
+ wait_event(fs_info->transaction_wait,
+ (atomic_read(&fs_info->defrag_running) == 0));
+ if (flags & SB_RDONLY)
+ sync_filesystem(fs_info->sb);
+ }
+}
+
+static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
+ unsigned long old_opts)
+{
+ const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE);
+
+ /*
+ * We need to cleanup all defragable inodes if the autodefragment is
+ * close or the filesystem is read only.
+ */
+ if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
+ (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || sb_rdonly(fs_info->sb))) {
+ btrfs_cleanup_defrag_inodes(fs_info);
+ }
+
+ /* If we toggled discard async */
+ if (!btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) &&
+ btrfs_test_opt(fs_info, DISCARD_ASYNC))
+ btrfs_discard_resume(fs_info);
+ else if (btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) &&
+ !btrfs_test_opt(fs_info, DISCARD_ASYNC))
+ btrfs_discard_cleanup(fs_info);
+
+ /* If we toggled space cache */
+ if (cache_opt != btrfs_free_space_cache_v1_active(fs_info))
+ btrfs_set_free_space_cache_v1_active(fs_info, cache_opt);
+}
+
+static int btrfs_remount(struct super_block *sb, int *flags, char *data)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ unsigned old_flags = sb->s_flags;
+ unsigned long old_opts = fs_info->mount_opt;
+ unsigned long old_compress_type = fs_info->compress_type;
+ u64 old_max_inline = fs_info->max_inline;
+ u32 old_thread_pool_size = fs_info->thread_pool_size;
+ u32 old_metadata_ratio = fs_info->metadata_ratio;
+ int ret;
+
+ sync_filesystem(sb);
+ set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+
+ if (data) {
+ void *new_sec_opts = NULL;
+
+ ret = security_sb_eat_lsm_opts(data, &new_sec_opts);
+ if (!ret)
+ ret = security_sb_remount(sb, new_sec_opts);
+ security_free_mnt_opts(&new_sec_opts);
+ if (ret)
+ goto restore;
+ }
+
+ ret = btrfs_parse_options(fs_info, data, *flags);
+ if (ret)
+ goto restore;
+
+ ret = btrfs_check_features(fs_info, !(*flags & SB_RDONLY));
+ if (ret < 0)
+ goto restore;
+
+ btrfs_remount_begin(fs_info, old_opts, *flags);
+ btrfs_resize_thread_pool(fs_info,
+ fs_info->thread_pool_size, old_thread_pool_size);
+
+ if ((bool)btrfs_test_opt(fs_info, FREE_SPACE_TREE) !=
+ (bool)btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
+ (!sb_rdonly(sb) || (*flags & SB_RDONLY))) {
+ btrfs_warn(fs_info,
+ "remount supports changing free space tree only from ro to rw");
+ /* Make sure free space cache options match the state on disk */
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+ btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
+ }
+ if (btrfs_free_space_cache_v1_active(fs_info)) {
+ btrfs_clear_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+ btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE);
+ }
+ }
+
+ if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
+ goto out;
+
+ if (*flags & SB_RDONLY) {
+ /*
+ * this also happens on 'umount -rf' or on shutdown, when
+ * the filesystem is busy.
+ */
+ cancel_work_sync(&fs_info->async_reclaim_work);
+ cancel_work_sync(&fs_info->async_data_reclaim_work);
+
+ btrfs_discard_cleanup(fs_info);
+
+ /* wait for the uuid_scan task to finish */
+ down(&fs_info->uuid_tree_rescan_sem);
+ /* avoid complains from lockdep et al. */
+ up(&fs_info->uuid_tree_rescan_sem);
+
+ btrfs_set_sb_rdonly(sb);
+
+ /*
+ * Setting SB_RDONLY will put the cleaner thread to
+ * sleep at the next loop if it's already active.
+ * If it's already asleep, we'll leave unused block
+ * groups on disk until we're mounted read-write again
+ * unless we clean them up here.
+ */
+ btrfs_delete_unused_bgs(fs_info);
+
+ /*
+ * The cleaner task could be already running before we set the
+ * flag BTRFS_FS_STATE_RO (and SB_RDONLY in the superblock).
+ * We must make sure that after we finish the remount, i.e. after
+ * we call btrfs_commit_super(), the cleaner can no longer start
+ * a transaction - either because it was dropping a dead root,
+ * running delayed iputs or deleting an unused block group (the
+ * cleaner picked a block group from the list of unused block
+ * groups before we were able to in the previous call to
+ * btrfs_delete_unused_bgs()).
+ */
+ wait_on_bit(&fs_info->flags, BTRFS_FS_CLEANER_RUNNING,
+ TASK_UNINTERRUPTIBLE);
+
+ /*
+ * We've set the superblock to RO mode, so we might have made
+ * the cleaner task sleep without running all pending delayed
+ * iputs. Go through all the delayed iputs here, so that if an
+ * unmount happens without remounting RW we don't end up at
+ * finishing close_ctree() with a non-empty list of delayed
+ * iputs.
+ */
+ btrfs_run_delayed_iputs(fs_info);
+
+ btrfs_dev_replace_suspend_for_unmount(fs_info);
+ btrfs_scrub_cancel(fs_info);
+ btrfs_pause_balance(fs_info);
+
+ /*
+ * Pause the qgroup rescan worker if it is running. We don't want
+ * it to be still running after we are in RO mode, as after that,
+ * by the time we unmount, it might have left a transaction open,
+ * so we would leak the transaction and/or crash.
+ */
+ btrfs_qgroup_wait_for_completion(fs_info, false);
+
+ ret = btrfs_commit_super(fs_info);
+ if (ret)
+ goto restore;
+ } else {
+ if (BTRFS_FS_ERROR(fs_info)) {
+ btrfs_err(fs_info,
+ "Remounting read-write after error is not allowed");
+ ret = -EINVAL;
+ goto restore;
+ }
+ if (fs_info->fs_devices->rw_devices == 0) {
+ ret = -EACCES;
+ goto restore;
+ }
+
+ if (!btrfs_check_rw_degradable(fs_info, NULL)) {
+ btrfs_warn(fs_info,
+ "too many missing devices, writable remount is not allowed");
+ ret = -EACCES;
+ goto restore;
+ }
+
+ if (btrfs_super_log_root(fs_info->super_copy) != 0) {
+ btrfs_warn(fs_info,
+ "mount required to replay tree-log, cannot remount read-write");
+ ret = -EINVAL;
+ goto restore;
+ }
+
+ /*
+ * NOTE: when remounting with a change that does writes, don't
+ * put it anywhere above this point, as we are not sure to be
+ * safe to write until we pass the above checks.
+ */
+ ret = btrfs_start_pre_rw_mount(fs_info);
+ if (ret)
+ goto restore;
+
+ btrfs_clear_sb_rdonly(sb);
+
+ set_bit(BTRFS_FS_OPEN, &fs_info->flags);
+ }
+out:
+ /*
+ * We need to set SB_I_VERSION here otherwise it'll get cleared by VFS,
+ * since the absence of the flag means it can be toggled off by remount.
+ */
+ *flags |= SB_I_VERSION;
+
+ wake_up_process(fs_info->transaction_kthread);
+ btrfs_remount_cleanup(fs_info, old_opts);
+ btrfs_clear_oneshot_options(fs_info);
+ clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+
+ return 0;
+
+restore:
+ /* We've hit an error - don't reset SB_RDONLY */
+ if (sb_rdonly(sb))
+ old_flags |= SB_RDONLY;
+ if (!(old_flags & SB_RDONLY))
+ clear_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state);
+ sb->s_flags = old_flags;
+ fs_info->mount_opt = old_opts;
+ fs_info->compress_type = old_compress_type;
+ fs_info->max_inline = old_max_inline;
+ btrfs_resize_thread_pool(fs_info,
+ old_thread_pool_size, fs_info->thread_pool_size);
+ fs_info->metadata_ratio = old_metadata_ratio;
+ btrfs_remount_cleanup(fs_info, old_opts);
+ clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+
+ return ret;
+}
+
+/* Used to sort the devices by max_avail(descending sort) */
+static int btrfs_cmp_device_free_bytes(const void *a, const void *b)
+{
+ const struct btrfs_device_info *dev_info1 = a;
+ const struct btrfs_device_info *dev_info2 = b;
+
+ if (dev_info1->max_avail > dev_info2->max_avail)
+ return -1;
+ else if (dev_info1->max_avail < dev_info2->max_avail)
+ return 1;
+ return 0;
+}
+
+/*
+ * sort the devices by max_avail, in which max free extent size of each device
+ * is stored.(Descending Sort)
+ */
+static inline void btrfs_descending_sort_devices(
+ struct btrfs_device_info *devices,
+ size_t nr_devices)
+{
+ sort(devices, nr_devices, sizeof(struct btrfs_device_info),
+ btrfs_cmp_device_free_bytes, NULL);
+}
+
+/*
+ * The helper to calc the free space on the devices that can be used to store
+ * file data.
+ */
+static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
+ u64 *free_bytes)
+{
+ struct btrfs_device_info *devices_info;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
+ u64 type;
+ u64 avail_space;
+ u64 min_stripe_size;
+ int num_stripes = 1;
+ int i = 0, nr_devices;
+ const struct btrfs_raid_attr *rattr;
+
+ /*
+ * We aren't under the device list lock, so this is racy-ish, but good
+ * enough for our purposes.
+ */
+ nr_devices = fs_info->fs_devices->open_devices;
+ if (!nr_devices) {
+ smp_mb();
+ nr_devices = fs_info->fs_devices->open_devices;
+ ASSERT(nr_devices);
+ if (!nr_devices) {
+ *free_bytes = 0;
+ return 0;
+ }
+ }
+
+ devices_info = kmalloc_array(nr_devices, sizeof(*devices_info),
+ GFP_KERNEL);
+ if (!devices_info)
+ return -ENOMEM;
+
+ /* calc min stripe number for data space allocation */
+ type = btrfs_data_alloc_profile(fs_info);
+ rattr = &btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)];
+
+ if (type & BTRFS_BLOCK_GROUP_RAID0)
+ num_stripes = nr_devices;
+ else if (type & BTRFS_BLOCK_GROUP_RAID1_MASK)
+ num_stripes = rattr->ncopies;
+ else if (type & BTRFS_BLOCK_GROUP_RAID10)
+ num_stripes = 4;
+
+ /* Adjust for more than 1 stripe per device */
+ min_stripe_size = rattr->dev_stripes * BTRFS_STRIPE_LEN;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
+ if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+ &device->dev_state) ||
+ !device->bdev ||
+ test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
+ continue;
+
+ if (i >= nr_devices)
+ break;
+
+ avail_space = device->total_bytes - device->bytes_used;
+
+ /* align with stripe_len */
+ avail_space = rounddown(avail_space, BTRFS_STRIPE_LEN);
+
+ /*
+ * Ensure we have at least min_stripe_size on top of the
+ * reserved space on the device.
+ */
+ if (avail_space <= BTRFS_DEVICE_RANGE_RESERVED + min_stripe_size)
+ continue;
+
+ avail_space -= BTRFS_DEVICE_RANGE_RESERVED;
+
+ devices_info[i].dev = device;
+ devices_info[i].max_avail = avail_space;
+
+ i++;
+ }
+ rcu_read_unlock();
+
+ nr_devices = i;
+
+ btrfs_descending_sort_devices(devices_info, nr_devices);
+
+ i = nr_devices - 1;
+ avail_space = 0;
+ while (nr_devices >= rattr->devs_min) {
+ num_stripes = min(num_stripes, nr_devices);
+
+ if (devices_info[i].max_avail >= min_stripe_size) {
+ int j;
+ u64 alloc_size;
+
+ avail_space += devices_info[i].max_avail * num_stripes;
+ alloc_size = devices_info[i].max_avail;
+ for (j = i + 1 - num_stripes; j <= i; j++)
+ devices_info[j].max_avail -= alloc_size;
+ }
+ i--;
+ nr_devices--;
+ }
+
+ kfree(devices_info);
+ *free_bytes = avail_space;
+ return 0;
+}
+
+/*
+ * Calculate numbers for 'df', pessimistic in case of mixed raid profiles.
+ *
+ * If there's a redundant raid level at DATA block groups, use the respective
+ * multiplier to scale the sizes.
+ *
+ * Unused device space usage is based on simulating the chunk allocator
+ * algorithm that respects the device sizes and order of allocations. This is
+ * a close approximation of the actual use but there are other factors that may
+ * change the result (like a new metadata chunk).
+ *
+ * If metadata is exhausted, f_bavail will be 0.
+ */
+static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
+ struct btrfs_super_block *disk_super = fs_info->super_copy;
+ struct btrfs_space_info *found;
+ u64 total_used = 0;
+ u64 total_free_data = 0;
+ u64 total_free_meta = 0;
+ u32 bits = fs_info->sectorsize_bits;
+ __be32 *fsid = (__be32 *)fs_info->fs_devices->fsid;
+ unsigned factor = 1;
+ struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
+ int ret;
+ u64 thresh = 0;
+ int mixed = 0;
+
+ list_for_each_entry(found, &fs_info->space_info, list) {
+ if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
+ int i;
+
+ total_free_data += found->disk_total - found->disk_used;
+ total_free_data -=
+ btrfs_account_ro_block_groups_free_space(found);
+
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+ if (!list_empty(&found->block_groups[i]))
+ factor = btrfs_bg_type_to_factor(
+ btrfs_raid_array[i].bg_flag);
+ }
+ }
+
+ /*
+ * Metadata in mixed block goup profiles are accounted in data
+ */
+ if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) {
+ if (found->flags & BTRFS_BLOCK_GROUP_DATA)
+ mixed = 1;
+ else
+ total_free_meta += found->disk_total -
+ found->disk_used;
+ }
+
+ total_used += found->disk_used;
+ }
+
+ buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor);
+ buf->f_blocks >>= bits;
+ buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits);
+
+ /* Account global block reserve as used, it's in logical size already */
+ spin_lock(&block_rsv->lock);
+ /* Mixed block groups accounting is not byte-accurate, avoid overflow */
+ if (buf->f_bfree >= block_rsv->size >> bits)
+ buf->f_bfree -= block_rsv->size >> bits;
+ else
+ buf->f_bfree = 0;
+ spin_unlock(&block_rsv->lock);
+
+ buf->f_bavail = div_u64(total_free_data, factor);
+ ret = btrfs_calc_avail_data_space(fs_info, &total_free_data);
+ if (ret)
+ return ret;
+ buf->f_bavail += div_u64(total_free_data, factor);
+ buf->f_bavail = buf->f_bavail >> bits;
+
+ /*
+ * We calculate the remaining metadata space minus global reserve. If
+ * this is (supposedly) smaller than zero, there's no space. But this
+ * does not hold in practice, the exhausted state happens where's still
+ * some positive delta. So we apply some guesswork and compare the
+ * delta to a 4M threshold. (Practically observed delta was ~2M.)
+ *
+ * We probably cannot calculate the exact threshold value because this
+ * depends on the internal reservations requested by various
+ * operations, so some operations that consume a few metadata will
+ * succeed even if the Avail is zero. But this is better than the other
+ * way around.
+ */
+ thresh = SZ_4M;
+
+ /*
+ * We only want to claim there's no available space if we can no longer
+ * allocate chunks for our metadata profile and our global reserve will
+ * not fit in the free metadata space. If we aren't ->full then we
+ * still can allocate chunks and thus are fine using the currently
+ * calculated f_bavail.
+ */
+ if (!mixed && block_rsv->space_info->full &&
+ (total_free_meta < thresh || total_free_meta - thresh < block_rsv->size))
+ buf->f_bavail = 0;
+
+ buf->f_type = BTRFS_SUPER_MAGIC;
+ buf->f_bsize = dentry->d_sb->s_blocksize;
+ buf->f_namelen = BTRFS_NAME_LEN;
+
+ /* We treat it as constant endianness (it doesn't matter _which_)
+ because we want the fsid to come out the same whether mounted
+ on a big-endian or little-endian host */
+ buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
+ buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
+ /* Mask in the root object ID too, to disambiguate subvols */
+ buf->f_fsid.val[0] ^=
+ BTRFS_I(d_inode(dentry))->root->root_key.objectid >> 32;
+ buf->f_fsid.val[1] ^=
+ BTRFS_I(d_inode(dentry))->root->root_key.objectid;
+
+ return 0;
+}
+
+static void btrfs_kill_super(struct super_block *sb)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ kill_anon_super(sb);
+ btrfs_free_fs_info(fs_info);
+}
+
+static struct file_system_type btrfs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "btrfs",
+ .mount = btrfs_mount,
+ .kill_sb = btrfs_kill_super,
+ .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
+};
+
+static struct file_system_type btrfs_root_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "btrfs",
+ .mount = btrfs_mount_root,
+ .kill_sb = btrfs_kill_super,
+ .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP,
+};
+
+MODULE_ALIAS_FS("btrfs");
+
+static int btrfs_control_open(struct inode *inode, struct file *file)
+{
+ /*
+ * The control file's private_data is used to hold the
+ * transaction when it is started and is used to keep
+ * track of whether a transaction is already in progress.
+ */
+ file->private_data = NULL;
+ return 0;
+}
+
+/*
+ * Used by /dev/btrfs-control for devices ioctls.
+ */
+static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct btrfs_ioctl_vol_args *vol;
+ struct btrfs_device *device = NULL;
+ dev_t devt = 0;
+ int ret = -ENOTTY;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ vol = memdup_user((void __user *)arg, sizeof(*vol));
+ if (IS_ERR(vol))
+ return PTR_ERR(vol);
+ vol->name[BTRFS_PATH_NAME_MAX] = '\0';
+
+ switch (cmd) {
+ case BTRFS_IOC_SCAN_DEV:
+ mutex_lock(&uuid_mutex);
+ device = btrfs_scan_one_device(vol->name, FMODE_READ,
+ &btrfs_root_fs_type);
+ ret = PTR_ERR_OR_ZERO(device);
+ mutex_unlock(&uuid_mutex);
+ break;
+ case BTRFS_IOC_FORGET_DEV:
+ if (vol->name[0] != 0) {
+ ret = lookup_bdev(vol->name, &devt);
+ if (ret)
+ break;
+ }
+ ret = btrfs_forget_devices(devt);
+ break;
+ case BTRFS_IOC_DEVICES_READY:
+ mutex_lock(&uuid_mutex);
+ device = btrfs_scan_one_device(vol->name, FMODE_READ,
+ &btrfs_root_fs_type);
+ if (IS_ERR(device)) {
+ mutex_unlock(&uuid_mutex);
+ ret = PTR_ERR(device);
+ break;
+ }
+ ret = !(device->fs_devices->num_devices ==
+ device->fs_devices->total_devices);
+ mutex_unlock(&uuid_mutex);
+ break;
+ case BTRFS_IOC_GET_SUPPORTED_FEATURES:
+ ret = btrfs_ioctl_get_supported_features((void __user*)arg);
+ break;
+ }
+
+ kfree(vol);
+ return ret;
+}
+
+static int btrfs_freeze(struct super_block *sb)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_root *root = fs_info->tree_root;
+
+ set_bit(BTRFS_FS_FROZEN, &fs_info->flags);
+ /*
+ * We don't need a barrier here, we'll wait for any transaction that
+ * could be in progress on other threads (and do delayed iputs that
+ * we want to avoid on a frozen filesystem), or do the commit
+ * ourselves.
+ */
+ trans = btrfs_attach_transaction_barrier(root);
+ if (IS_ERR(trans)) {
+ /* no transaction, don't bother */
+ if (PTR_ERR(trans) == -ENOENT)
+ return 0;
+ return PTR_ERR(trans);
+ }
+ return btrfs_commit_transaction(trans);
+}
+
+static int check_dev_super(struct btrfs_device *dev)
+{
+ struct btrfs_fs_info *fs_info = dev->fs_info;
+ struct btrfs_super_block *sb;
+ u16 csum_type;
+ int ret = 0;
+
+ /* This should be called with fs still frozen. */
+ ASSERT(test_bit(BTRFS_FS_FROZEN, &fs_info->flags));
+
+ /* Missing dev, no need to check. */
+ if (!dev->bdev)
+ return 0;
+
+ /* Only need to check the primary super block. */
+ sb = btrfs_read_dev_one_super(dev->bdev, 0, true);
+ if (IS_ERR(sb))
+ return PTR_ERR(sb);
+
+ /* Verify the checksum. */
+ csum_type = btrfs_super_csum_type(sb);
+ if (csum_type != btrfs_super_csum_type(fs_info->super_copy)) {
+ btrfs_err(fs_info, "csum type changed, has %u expect %u",
+ csum_type, btrfs_super_csum_type(fs_info->super_copy));
+ ret = -EUCLEAN;
+ goto out;
+ }
+
+ if (btrfs_check_super_csum(fs_info, sb)) {
+ btrfs_err(fs_info, "csum for on-disk super block no longer matches");
+ ret = -EUCLEAN;
+ goto out;
+ }
+
+ /* Btrfs_validate_super() includes fsid check against super->fsid. */
+ ret = btrfs_validate_super(fs_info, sb, 0);
+ if (ret < 0)
+ goto out;
+
+ if (btrfs_super_generation(sb) != fs_info->last_trans_committed) {
+ btrfs_err(fs_info, "transid mismatch, has %llu expect %llu",
+ btrfs_super_generation(sb),
+ fs_info->last_trans_committed);
+ ret = -EUCLEAN;
+ goto out;
+ }
+out:
+ btrfs_release_disk_super(sb);
+ return ret;
+}
+
+static int btrfs_unfreeze(struct super_block *sb)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_device *device;
+ int ret = 0;
+
+ /*
+ * Make sure the fs is not changed by accident (like hibernation then
+ * modified by other OS).
+ * If we found anything wrong, we mark the fs error immediately.
+ *
+ * And since the fs is frozen, no one can modify the fs yet, thus
+ * we don't need to hold device_list_mutex.
+ */
+ list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
+ ret = check_dev_super(device);
+ if (ret < 0) {
+ btrfs_handle_fs_error(fs_info, ret,
+ "super block on devid %llu got modified unexpectedly",
+ device->devid);
+ break;
+ }
+ }
+ clear_bit(BTRFS_FS_FROZEN, &fs_info->flags);
+
+ /*
+ * We still return 0, to allow VFS layer to unfreeze the fs even the
+ * above checks failed. Since the fs is either fine or read-only, we're
+ * safe to continue, without causing further damage.
+ */
+ return 0;
+}
+
+static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
+
+ /*
+ * There should be always a valid pointer in latest_dev, it may be stale
+ * for a short moment in case it's being deleted but still valid until
+ * the end of RCU grace period.
+ */
+ rcu_read_lock();
+ seq_escape(m, rcu_str_deref(fs_info->fs_devices->latest_dev->name), " \t\n\\");
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static const struct super_operations btrfs_super_ops = {
+ .drop_inode = btrfs_drop_inode,
+ .evict_inode = btrfs_evict_inode,
+ .put_super = btrfs_put_super,
+ .sync_fs = btrfs_sync_fs,
+ .show_options = btrfs_show_options,
+ .show_devname = btrfs_show_devname,
+ .alloc_inode = btrfs_alloc_inode,
+ .destroy_inode = btrfs_destroy_inode,
+ .free_inode = btrfs_free_inode,
+ .statfs = btrfs_statfs,
+ .remount_fs = btrfs_remount,
+ .freeze_fs = btrfs_freeze,
+ .unfreeze_fs = btrfs_unfreeze,
+};
+
+static const struct file_operations btrfs_ctl_fops = {
+ .open = btrfs_control_open,
+ .unlocked_ioctl = btrfs_control_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
+ .owner = THIS_MODULE,
+ .llseek = noop_llseek,
+};
+
+static struct miscdevice btrfs_misc = {
+ .minor = BTRFS_MINOR,
+ .name = "btrfs-control",
+ .fops = &btrfs_ctl_fops
+};
+
+MODULE_ALIAS_MISCDEV(BTRFS_MINOR);
+MODULE_ALIAS("devname:btrfs-control");
+
+static int __init btrfs_interface_init(void)
+{
+ return misc_register(&btrfs_misc);
+}
+
+static __cold void btrfs_interface_exit(void)
+{
+ misc_deregister(&btrfs_misc);
+}
+
+static void __init btrfs_print_mod_info(void)
+{
+ static const char options[] = ""
+#ifdef CONFIG_BTRFS_DEBUG
+ ", debug=on"
+#endif
+#ifdef CONFIG_BTRFS_ASSERT
+ ", assert=on"
+#endif
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ ", integrity-checker=on"
+#endif
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ ", ref-verify=on"
+#endif
+#ifdef CONFIG_BLK_DEV_ZONED
+ ", zoned=yes"
+#else
+ ", zoned=no"
+#endif
+#ifdef CONFIG_FS_VERITY
+ ", fsverity=yes"
+#else
+ ", fsverity=no"
+#endif
+ ;
+ pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options);
+}
+
+static int __init init_btrfs_fs(void)
+{
+ int err;
+
+ btrfs_props_init();
+
+ err = btrfs_init_sysfs();
+ if (err)
+ return err;
+
+ btrfs_init_compress();
+
+ err = btrfs_init_cachep();
+ if (err)
+ goto free_compress;
+
+ err = extent_state_init_cachep();
+ if (err)
+ goto free_cachep;
+
+ err = extent_buffer_init_cachep();
+ if (err)
+ goto free_extent_cachep;
+
+ err = btrfs_bioset_init();
+ if (err)
+ goto free_eb_cachep;
+
+ err = extent_map_init();
+ if (err)
+ goto free_bioset;
+
+ err = ordered_data_init();
+ if (err)
+ goto free_extent_map;
+
+ err = btrfs_delayed_inode_init();
+ if (err)
+ goto free_ordered_data;
+
+ err = btrfs_auto_defrag_init();
+ if (err)
+ goto free_delayed_inode;
+
+ err = btrfs_delayed_ref_init();
+ if (err)
+ goto free_auto_defrag;
+
+ err = btrfs_prelim_ref_init();
+ if (err)
+ goto free_delayed_ref;
+
+ err = btrfs_interface_init();
+ if (err)
+ goto free_prelim_ref;
+
+ btrfs_print_mod_info();
+
+ err = btrfs_run_sanity_tests();
+ if (err)
+ goto unregister_ioctl;
+
+ err = register_filesystem(&btrfs_fs_type);
+ if (err)
+ goto unregister_ioctl;
+
+ return 0;
+
+unregister_ioctl:
+ btrfs_interface_exit();
+free_prelim_ref:
+ btrfs_prelim_ref_exit();
+free_delayed_ref:
+ btrfs_delayed_ref_exit();
+free_auto_defrag:
+ btrfs_auto_defrag_exit();
+free_delayed_inode:
+ btrfs_delayed_inode_exit();
+free_ordered_data:
+ ordered_data_exit();
+free_extent_map:
+ extent_map_exit();
+free_bioset:
+ btrfs_bioset_exit();
+free_eb_cachep:
+ extent_buffer_free_cachep();
+free_extent_cachep:
+ extent_state_free_cachep();
+free_cachep:
+ btrfs_destroy_cachep();
+free_compress:
+ btrfs_exit_compress();
+ btrfs_exit_sysfs();
+
+ return err;
+}
+
+static void __exit exit_btrfs_fs(void)
+{
+ btrfs_destroy_cachep();
+ btrfs_delayed_ref_exit();
+ btrfs_auto_defrag_exit();
+ btrfs_delayed_inode_exit();
+ btrfs_prelim_ref_exit();
+ ordered_data_exit();
+ extent_map_exit();
+ btrfs_bioset_exit();
+ extent_state_free_cachep();
+ extent_buffer_free_cachep();
+ btrfs_interface_exit();
+ unregister_filesystem(&btrfs_fs_type);
+ btrfs_exit_sysfs();
+ btrfs_cleanup_fs_uuids();
+ btrfs_exit_compress();
+}
+
+late_initcall(init_btrfs_fs);
+module_exit(exit_btrfs_fs)
+
+MODULE_LICENSE("GPL");
+MODULE_SOFTDEP("pre: crc32c");
+MODULE_SOFTDEP("pre: xxhash64");
+MODULE_SOFTDEP("pre: sha256");
+MODULE_SOFTDEP("pre: blake2b-256");
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
new file mode 100644
index 000000000..fc468d107
--- /dev/null
+++ b/fs/btrfs/sysfs.c
@@ -0,0 +1,2354 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ */
+
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/bug.h>
+#include <crypto/hash.h>
+
+#include "ctree.h"
+#include "discard.h"
+#include "disk-io.h"
+#include "send.h"
+#include "transaction.h"
+#include "sysfs.h"
+#include "volumes.h"
+#include "space-info.h"
+#include "block-group.h"
+#include "qgroup.h"
+#include "misc.h"
+
+/*
+ * Structure name Path
+ * --------------------------------------------------------------------------
+ * btrfs_supported_static_feature_attrs /sys/fs/btrfs/features
+ * btrfs_supported_feature_attrs /sys/fs/btrfs/features and
+ * /sys/fs/btrfs/<uuid>/features
+ * btrfs_attrs /sys/fs/btrfs/<uuid>
+ * devid_attrs /sys/fs/btrfs/<uuid>/devinfo/<devid>
+ * allocation_attrs /sys/fs/btrfs/<uuid>/allocation
+ * qgroup_attrs /sys/fs/btrfs/<uuid>/qgroups/<level>_<qgroupid>
+ * space_info_attrs /sys/fs/btrfs/<uuid>/allocation/<bg-type>
+ * raid_attrs /sys/fs/btrfs/<uuid>/allocation/<bg-type>/<bg-profile>
+ * discard_attrs /sys/fs/btrfs/<uuid>/discard
+ *
+ * When built with BTRFS_CONFIG_DEBUG:
+ *
+ * btrfs_debug_feature_attrs /sys/fs/btrfs/debug
+ * btrfs_debug_mount_attrs /sys/fs/btrfs/<uuid>/debug
+ */
+
+struct btrfs_feature_attr {
+ struct kobj_attribute kobj_attr;
+ enum btrfs_feature_set feature_set;
+ u64 feature_bit;
+};
+
+/* For raid type sysfs entries */
+struct raid_kobject {
+ u64 flags;
+ struct kobject kobj;
+};
+
+#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \
+{ \
+ .attr = { .name = __stringify(_name), .mode = _mode }, \
+ .show = _show, \
+ .store = _store, \
+}
+
+#define BTRFS_ATTR_W(_prefix, _name, _store) \
+ static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \
+ __INIT_KOBJ_ATTR(_name, 0200, NULL, _store)
+
+#define BTRFS_ATTR_RW(_prefix, _name, _show, _store) \
+ static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \
+ __INIT_KOBJ_ATTR(_name, 0644, _show, _store)
+
+#define BTRFS_ATTR(_prefix, _name, _show) \
+ static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \
+ __INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
+
+#define BTRFS_ATTR_PTR(_prefix, _name) \
+ (&btrfs_attr_##_prefix##_##_name.attr)
+
+#define BTRFS_FEAT_ATTR(_name, _feature_set, _feature_prefix, _feature_bit) \
+static struct btrfs_feature_attr btrfs_attr_features_##_name = { \
+ .kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, \
+ btrfs_feature_attr_show, \
+ btrfs_feature_attr_store), \
+ .feature_set = _feature_set, \
+ .feature_bit = _feature_prefix ##_## _feature_bit, \
+}
+#define BTRFS_FEAT_ATTR_PTR(_name) \
+ (&btrfs_attr_features_##_name.kobj_attr.attr)
+
+#define BTRFS_FEAT_ATTR_COMPAT(name, feature) \
+ BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature)
+#define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \
+ BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT_RO, feature)
+#define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \
+ BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature)
+
+static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj);
+static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj);
+static struct kobject *get_btrfs_kobj(struct kobject *kobj);
+
+static struct btrfs_feature_attr *to_btrfs_feature_attr(struct kobj_attribute *a)
+{
+ return container_of(a, struct btrfs_feature_attr, kobj_attr);
+}
+
+static struct kobj_attribute *attr_to_btrfs_attr(struct attribute *attr)
+{
+ return container_of(attr, struct kobj_attribute, attr);
+}
+
+static struct btrfs_feature_attr *attr_to_btrfs_feature_attr(
+ struct attribute *attr)
+{
+ return to_btrfs_feature_attr(attr_to_btrfs_attr(attr));
+}
+
+static u64 get_features(struct btrfs_fs_info *fs_info,
+ enum btrfs_feature_set set)
+{
+ struct btrfs_super_block *disk_super = fs_info->super_copy;
+ if (set == FEAT_COMPAT)
+ return btrfs_super_compat_flags(disk_super);
+ else if (set == FEAT_COMPAT_RO)
+ return btrfs_super_compat_ro_flags(disk_super);
+ else
+ return btrfs_super_incompat_flags(disk_super);
+}
+
+static void set_features(struct btrfs_fs_info *fs_info,
+ enum btrfs_feature_set set, u64 features)
+{
+ struct btrfs_super_block *disk_super = fs_info->super_copy;
+ if (set == FEAT_COMPAT)
+ btrfs_set_super_compat_flags(disk_super, features);
+ else if (set == FEAT_COMPAT_RO)
+ btrfs_set_super_compat_ro_flags(disk_super, features);
+ else
+ btrfs_set_super_incompat_flags(disk_super, features);
+}
+
+static int can_modify_feature(struct btrfs_feature_attr *fa)
+{
+ int val = 0;
+ u64 set, clear;
+ switch (fa->feature_set) {
+ case FEAT_COMPAT:
+ set = BTRFS_FEATURE_COMPAT_SAFE_SET;
+ clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR;
+ break;
+ case FEAT_COMPAT_RO:
+ set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET;
+ clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR;
+ break;
+ case FEAT_INCOMPAT:
+ set = BTRFS_FEATURE_INCOMPAT_SAFE_SET;
+ clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR;
+ break;
+ default:
+ pr_warn("btrfs: sysfs: unknown feature set %d\n",
+ fa->feature_set);
+ return 0;
+ }
+
+ if (set & fa->feature_bit)
+ val |= 1;
+ if (clear & fa->feature_bit)
+ val |= 2;
+
+ return val;
+}
+
+static ssize_t btrfs_feature_attr_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ int val = 0;
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a);
+ if (fs_info) {
+ u64 features = get_features(fs_info, fa->feature_set);
+ if (features & fa->feature_bit)
+ val = 1;
+ } else
+ val = can_modify_feature(fa);
+
+ return sysfs_emit(buf, "%d\n", val);
+}
+
+static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t count)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a);
+ u64 features, set, clear;
+ unsigned long val;
+ int ret;
+
+ fs_info = to_fs_info(kobj);
+ if (!fs_info)
+ return -EPERM;
+
+ if (sb_rdonly(fs_info->sb))
+ return -EROFS;
+
+ ret = kstrtoul(skip_spaces(buf), 0, &val);
+ if (ret)
+ return ret;
+
+ if (fa->feature_set == FEAT_COMPAT) {
+ set = BTRFS_FEATURE_COMPAT_SAFE_SET;
+ clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR;
+ } else if (fa->feature_set == FEAT_COMPAT_RO) {
+ set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET;
+ clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR;
+ } else {
+ set = BTRFS_FEATURE_INCOMPAT_SAFE_SET;
+ clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR;
+ }
+
+ features = get_features(fs_info, fa->feature_set);
+
+ /* Nothing to do */
+ if ((val && (features & fa->feature_bit)) ||
+ (!val && !(features & fa->feature_bit)))
+ return count;
+
+ if ((val && !(set & fa->feature_bit)) ||
+ (!val && !(clear & fa->feature_bit))) {
+ btrfs_info(fs_info,
+ "%sabling feature %s on mounted fs is not supported.",
+ val ? "En" : "Dis", fa->kobj_attr.attr.name);
+ return -EPERM;
+ }
+
+ btrfs_info(fs_info, "%s %s feature flag",
+ val ? "Setting" : "Clearing", fa->kobj_attr.attr.name);
+
+ spin_lock(&fs_info->super_lock);
+ features = get_features(fs_info, fa->feature_set);
+ if (val)
+ features |= fa->feature_bit;
+ else
+ features &= ~fa->feature_bit;
+ set_features(fs_info, fa->feature_set, features);
+ spin_unlock(&fs_info->super_lock);
+
+ /*
+ * We don't want to do full transaction commit from inside sysfs
+ */
+ btrfs_set_pending(fs_info, COMMIT);
+ wake_up_process(fs_info->transaction_kthread);
+
+ return count;
+}
+
+static umode_t btrfs_feature_visible(struct kobject *kobj,
+ struct attribute *attr, int unused)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ umode_t mode = attr->mode;
+
+ if (fs_info) {
+ struct btrfs_feature_attr *fa;
+ u64 features;
+
+ fa = attr_to_btrfs_feature_attr(attr);
+ features = get_features(fs_info, fa->feature_set);
+
+ if (can_modify_feature(fa))
+ mode |= S_IWUSR;
+ else if (!(features & fa->feature_bit))
+ mode = 0;
+ }
+
+ return mode;
+}
+
+BTRFS_FEAT_ATTR_INCOMPAT(default_subvol, DEFAULT_SUBVOL);
+BTRFS_FEAT_ATTR_INCOMPAT(mixed_groups, MIXED_GROUPS);
+BTRFS_FEAT_ATTR_INCOMPAT(compress_lzo, COMPRESS_LZO);
+BTRFS_FEAT_ATTR_INCOMPAT(compress_zstd, COMPRESS_ZSTD);
+BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF);
+BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56);
+BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA);
+BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES);
+BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID);
+BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
+BTRFS_FEAT_ATTR_COMPAT_RO(block_group_tree, BLOCK_GROUP_TREE);
+BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
+#ifdef CONFIG_BLK_DEV_ZONED
+BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
+#endif
+#ifdef CONFIG_BTRFS_DEBUG
+/* Remove once support for extent tree v2 is feature complete */
+BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2);
+#endif
+#ifdef CONFIG_FS_VERITY
+BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY);
+#endif
+
+/*
+ * Features which depend on feature bits and may differ between each fs.
+ *
+ * /sys/fs/btrfs/features - all available features implemented by this version
+ * /sys/fs/btrfs/UUID/features - features of the fs which are enabled or
+ * can be changed on a mounted filesystem.
+ */
+static struct attribute *btrfs_supported_feature_attrs[] = {
+ BTRFS_FEAT_ATTR_PTR(default_subvol),
+ BTRFS_FEAT_ATTR_PTR(mixed_groups),
+ BTRFS_FEAT_ATTR_PTR(compress_lzo),
+ BTRFS_FEAT_ATTR_PTR(compress_zstd),
+ BTRFS_FEAT_ATTR_PTR(extended_iref),
+ BTRFS_FEAT_ATTR_PTR(raid56),
+ BTRFS_FEAT_ATTR_PTR(skinny_metadata),
+ BTRFS_FEAT_ATTR_PTR(no_holes),
+ BTRFS_FEAT_ATTR_PTR(metadata_uuid),
+ BTRFS_FEAT_ATTR_PTR(free_space_tree),
+ BTRFS_FEAT_ATTR_PTR(raid1c34),
+ BTRFS_FEAT_ATTR_PTR(block_group_tree),
+#ifdef CONFIG_BLK_DEV_ZONED
+ BTRFS_FEAT_ATTR_PTR(zoned),
+#endif
+#ifdef CONFIG_BTRFS_DEBUG
+ BTRFS_FEAT_ATTR_PTR(extent_tree_v2),
+#endif
+#ifdef CONFIG_FS_VERITY
+ BTRFS_FEAT_ATTR_PTR(verity),
+#endif
+ NULL
+};
+
+static const struct attribute_group btrfs_feature_attr_group = {
+ .name = "features",
+ .is_visible = btrfs_feature_visible,
+ .attrs = btrfs_supported_feature_attrs,
+};
+
+static ssize_t rmdir_subvol_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ return sysfs_emit(buf, "0\n");
+}
+BTRFS_ATTR(static_feature, rmdir_subvol, rmdir_subvol_show);
+
+static ssize_t supported_checksums_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ ssize_t ret = 0;
+ int i;
+
+ for (i = 0; i < btrfs_get_num_csums(); i++) {
+ /*
+ * This "trick" only works as long as 'enum btrfs_csum_type' has
+ * no holes in it
+ */
+ ret += sysfs_emit_at(buf, ret, "%s%s", (i == 0 ? "" : " "),
+ btrfs_super_csum_name(i));
+
+ }
+
+ ret += sysfs_emit_at(buf, ret, "\n");
+ return ret;
+}
+BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show);
+
+static ssize_t send_stream_version_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", BTRFS_SEND_STREAM_VERSION);
+}
+BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show);
+
+static const char *rescue_opts[] = {
+ "usebackuproot",
+ "nologreplay",
+ "ignorebadroots",
+ "ignoredatacsums",
+ "all",
+};
+
+static ssize_t supported_rescue_options_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ ssize_t ret = 0;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(rescue_opts); i++)
+ ret += sysfs_emit_at(buf, ret, "%s%s", (i ? " " : ""), rescue_opts[i]);
+ ret += sysfs_emit_at(buf, ret, "\n");
+ return ret;
+}
+BTRFS_ATTR(static_feature, supported_rescue_options,
+ supported_rescue_options_show);
+
+static ssize_t supported_sectorsizes_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ ssize_t ret = 0;
+
+ /* An artificial limit to only support 4K and PAGE_SIZE */
+ if (PAGE_SIZE > SZ_4K)
+ ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K);
+ ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE);
+
+ return ret;
+}
+BTRFS_ATTR(static_feature, supported_sectorsizes,
+ supported_sectorsizes_show);
+
+/*
+ * Features which only depend on kernel version.
+ *
+ * These are listed in /sys/fs/btrfs/features along with
+ * btrfs_supported_feature_attrs.
+ */
+static struct attribute *btrfs_supported_static_feature_attrs[] = {
+ BTRFS_ATTR_PTR(static_feature, rmdir_subvol),
+ BTRFS_ATTR_PTR(static_feature, supported_checksums),
+ BTRFS_ATTR_PTR(static_feature, send_stream_version),
+ BTRFS_ATTR_PTR(static_feature, supported_rescue_options),
+ BTRFS_ATTR_PTR(static_feature, supported_sectorsizes),
+ NULL
+};
+
+static const struct attribute_group btrfs_static_feature_attr_group = {
+ .name = "features",
+ .attrs = btrfs_supported_static_feature_attrs,
+};
+
+/*
+ * Discard statistics and tunables
+ */
+#define discard_to_fs_info(_kobj) to_fs_info(get_btrfs_kobj(_kobj))
+
+static ssize_t btrfs_discardable_bytes_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
+
+ return sysfs_emit(buf, "%lld\n",
+ atomic64_read(&fs_info->discard_ctl.discardable_bytes));
+}
+BTRFS_ATTR(discard, discardable_bytes, btrfs_discardable_bytes_show);
+
+static ssize_t btrfs_discardable_extents_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
+
+ return sysfs_emit(buf, "%d\n",
+ atomic_read(&fs_info->discard_ctl.discardable_extents));
+}
+BTRFS_ATTR(discard, discardable_extents, btrfs_discardable_extents_show);
+
+static ssize_t btrfs_discard_bitmap_bytes_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
+
+ return sysfs_emit(buf, "%llu\n",
+ fs_info->discard_ctl.discard_bitmap_bytes);
+}
+BTRFS_ATTR(discard, discard_bitmap_bytes, btrfs_discard_bitmap_bytes_show);
+
+static ssize_t btrfs_discard_bytes_saved_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
+
+ return sysfs_emit(buf, "%lld\n",
+ atomic64_read(&fs_info->discard_ctl.discard_bytes_saved));
+}
+BTRFS_ATTR(discard, discard_bytes_saved, btrfs_discard_bytes_saved_show);
+
+static ssize_t btrfs_discard_extent_bytes_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
+
+ return sysfs_emit(buf, "%llu\n",
+ fs_info->discard_ctl.discard_extent_bytes);
+}
+BTRFS_ATTR(discard, discard_extent_bytes, btrfs_discard_extent_bytes_show);
+
+static ssize_t btrfs_discard_iops_limit_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
+
+ return sysfs_emit(buf, "%u\n",
+ READ_ONCE(fs_info->discard_ctl.iops_limit));
+}
+
+static ssize_t btrfs_discard_iops_limit_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
+ struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
+ u32 iops_limit;
+ int ret;
+
+ ret = kstrtou32(buf, 10, &iops_limit);
+ if (ret)
+ return -EINVAL;
+
+ WRITE_ONCE(discard_ctl->iops_limit, iops_limit);
+ btrfs_discard_calc_delay(discard_ctl);
+ btrfs_discard_schedule_work(discard_ctl, true);
+ return len;
+}
+BTRFS_ATTR_RW(discard, iops_limit, btrfs_discard_iops_limit_show,
+ btrfs_discard_iops_limit_store);
+
+static ssize_t btrfs_discard_kbps_limit_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
+
+ return sysfs_emit(buf, "%u\n",
+ READ_ONCE(fs_info->discard_ctl.kbps_limit));
+}
+
+static ssize_t btrfs_discard_kbps_limit_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
+ struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
+ u32 kbps_limit;
+ int ret;
+
+ ret = kstrtou32(buf, 10, &kbps_limit);
+ if (ret)
+ return -EINVAL;
+
+ WRITE_ONCE(discard_ctl->kbps_limit, kbps_limit);
+ btrfs_discard_schedule_work(discard_ctl, true);
+ return len;
+}
+BTRFS_ATTR_RW(discard, kbps_limit, btrfs_discard_kbps_limit_show,
+ btrfs_discard_kbps_limit_store);
+
+static ssize_t btrfs_discard_max_discard_size_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
+
+ return sysfs_emit(buf, "%llu\n",
+ READ_ONCE(fs_info->discard_ctl.max_discard_size));
+}
+
+static ssize_t btrfs_discard_max_discard_size_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
+ struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
+ u64 max_discard_size;
+ int ret;
+
+ ret = kstrtou64(buf, 10, &max_discard_size);
+ if (ret)
+ return -EINVAL;
+
+ WRITE_ONCE(discard_ctl->max_discard_size, max_discard_size);
+
+ return len;
+}
+BTRFS_ATTR_RW(discard, max_discard_size, btrfs_discard_max_discard_size_show,
+ btrfs_discard_max_discard_size_store);
+
+/*
+ * Per-filesystem stats for discard (when mounted with discard=async).
+ *
+ * Path: /sys/fs/btrfs/<uuid>/discard/
+ */
+static const struct attribute *discard_attrs[] = {
+ BTRFS_ATTR_PTR(discard, discardable_bytes),
+ BTRFS_ATTR_PTR(discard, discardable_extents),
+ BTRFS_ATTR_PTR(discard, discard_bitmap_bytes),
+ BTRFS_ATTR_PTR(discard, discard_bytes_saved),
+ BTRFS_ATTR_PTR(discard, discard_extent_bytes),
+ BTRFS_ATTR_PTR(discard, iops_limit),
+ BTRFS_ATTR_PTR(discard, kbps_limit),
+ BTRFS_ATTR_PTR(discard, max_discard_size),
+ NULL,
+};
+
+#ifdef CONFIG_BTRFS_DEBUG
+
+/*
+ * Per-filesystem runtime debugging exported via sysfs.
+ *
+ * Path: /sys/fs/btrfs/UUID/debug/
+ */
+static const struct attribute *btrfs_debug_mount_attrs[] = {
+ NULL,
+};
+
+/*
+ * Runtime debugging exported via sysfs, applies to all mounted filesystems.
+ *
+ * Path: /sys/fs/btrfs/debug
+ */
+static struct attribute *btrfs_debug_feature_attrs[] = {
+ NULL
+};
+
+static const struct attribute_group btrfs_debug_feature_attr_group = {
+ .name = "debug",
+ .attrs = btrfs_debug_feature_attrs,
+};
+
+#endif
+
+static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf)
+{
+ u64 val;
+ if (lock)
+ spin_lock(lock);
+ val = *value_ptr;
+ if (lock)
+ spin_unlock(lock);
+ return sysfs_emit(buf, "%llu\n", val);
+}
+
+static ssize_t global_rsv_size_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent);
+ struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
+ return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf);
+}
+BTRFS_ATTR(allocation, global_rsv_size, global_rsv_size_show);
+
+static ssize_t global_rsv_reserved_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent);
+ struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
+ return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf);
+}
+BTRFS_ATTR(allocation, global_rsv_reserved, global_rsv_reserved_show);
+
+#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj)
+#define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj)
+
+static ssize_t raid_bytes_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf);
+BTRFS_ATTR(raid, total_bytes, raid_bytes_show);
+BTRFS_ATTR(raid, used_bytes, raid_bytes_show);
+
+static ssize_t raid_bytes_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+
+{
+ struct btrfs_space_info *sinfo = to_space_info(kobj->parent);
+ struct btrfs_block_group *block_group;
+ int index = btrfs_bg_flags_to_raid_index(to_raid_kobj(kobj)->flags);
+ u64 val = 0;
+
+ down_read(&sinfo->groups_sem);
+ list_for_each_entry(block_group, &sinfo->block_groups[index], list) {
+ if (&attr->attr == BTRFS_ATTR_PTR(raid, total_bytes))
+ val += block_group->length;
+ else
+ val += block_group->used;
+ }
+ up_read(&sinfo->groups_sem);
+ return sysfs_emit(buf, "%llu\n", val);
+}
+
+/*
+ * Allocation information about block group profiles.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/allocation/<bg-type>/<bg-profile>/
+ */
+static struct attribute *raid_attrs[] = {
+ BTRFS_ATTR_PTR(raid, total_bytes),
+ BTRFS_ATTR_PTR(raid, used_bytes),
+ NULL
+};
+ATTRIBUTE_GROUPS(raid);
+
+static void release_raid_kobj(struct kobject *kobj)
+{
+ kfree(to_raid_kobj(kobj));
+}
+
+static struct kobj_type btrfs_raid_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .release = release_raid_kobj,
+ .default_groups = raid_groups,
+};
+
+#define SPACE_INFO_ATTR(field) \
+static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \
+ struct kobj_attribute *a, \
+ char *buf) \
+{ \
+ struct btrfs_space_info *sinfo = to_space_info(kobj); \
+ return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \
+} \
+BTRFS_ATTR(space_info, field, btrfs_space_info_show_##field)
+
+static ssize_t btrfs_chunk_size_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_space_info *sinfo = to_space_info(kobj);
+
+ return sysfs_emit(buf, "%llu\n", READ_ONCE(sinfo->chunk_size));
+}
+
+/*
+ * Store new chunk size in space info. Can be called on a read-only filesystem.
+ *
+ * If the new chunk size value is larger than 10% of free space it is reduced
+ * to match that limit. Alignment must be to 256M and the system chunk size
+ * cannot be set.
+ */
+static ssize_t btrfs_chunk_size_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+ struct btrfs_fs_info *fs_info = to_fs_info(get_btrfs_kobj(kobj));
+ char *retptr;
+ u64 val;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!fs_info->fs_devices)
+ return -EINVAL;
+
+ if (btrfs_is_zoned(fs_info))
+ return -EINVAL;
+
+ /* System block type must not be changed. */
+ if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ return -EPERM;
+
+ val = memparse(buf, &retptr);
+ /* There could be trailing '\n', also catch any typos after the value */
+ retptr = skip_spaces(retptr);
+ if (*retptr != 0 || val == 0)
+ return -EINVAL;
+
+ val = min(val, BTRFS_MAX_DATA_CHUNK_SIZE);
+
+ /* Limit stripe size to 10% of available space. */
+ val = min(div_factor(fs_info->fs_devices->total_rw_bytes, 1), val);
+
+ /* Must be multiple of 256M. */
+ val &= ~((u64)SZ_256M - 1);
+
+ /* Must be at least 256M. */
+ if (val < SZ_256M)
+ return -EINVAL;
+
+ btrfs_update_space_info_chunk_size(space_info, val);
+
+ return len;
+}
+
+#ifdef CONFIG_BTRFS_DEBUG
+/*
+ * Request chunk allocation with current chunk size.
+ */
+static ssize_t btrfs_force_chunk_alloc_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+ struct btrfs_fs_info *fs_info = to_fs_info(get_btrfs_kobj(kobj));
+ struct btrfs_trans_handle *trans;
+ bool val;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (sb_rdonly(fs_info->sb))
+ return -EROFS;
+
+ ret = kstrtobool(buf, &val);
+ if (ret)
+ return ret;
+
+ if (!val)
+ return -EINVAL;
+
+ /*
+ * This is unsafe to be called from sysfs context and may cause
+ * unexpected problems.
+ */
+ trans = btrfs_start_transaction(fs_info->tree_root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ ret = btrfs_force_chunk_alloc(trans, space_info->flags);
+ btrfs_end_transaction(trans);
+
+ if (ret == 1)
+ return len;
+
+ return -ENOSPC;
+}
+BTRFS_ATTR_W(space_info, force_chunk_alloc, btrfs_force_chunk_alloc_store);
+
+#endif
+
+SPACE_INFO_ATTR(flags);
+SPACE_INFO_ATTR(total_bytes);
+SPACE_INFO_ATTR(bytes_used);
+SPACE_INFO_ATTR(bytes_pinned);
+SPACE_INFO_ATTR(bytes_reserved);
+SPACE_INFO_ATTR(bytes_may_use);
+SPACE_INFO_ATTR(bytes_readonly);
+SPACE_INFO_ATTR(bytes_zone_unusable);
+SPACE_INFO_ATTR(disk_used);
+SPACE_INFO_ATTR(disk_total);
+BTRFS_ATTR_RW(space_info, chunk_size, btrfs_chunk_size_show, btrfs_chunk_size_store);
+
+static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+
+ return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold));
+}
+
+static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+ int thresh;
+ int ret;
+
+ ret = kstrtoint(buf, 10, &thresh);
+ if (ret)
+ return ret;
+
+ if (thresh < 0 || thresh > 100)
+ return -EINVAL;
+
+ WRITE_ONCE(space_info->bg_reclaim_threshold, thresh);
+
+ return len;
+}
+
+BTRFS_ATTR_RW(space_info, bg_reclaim_threshold,
+ btrfs_sinfo_bg_reclaim_threshold_show,
+ btrfs_sinfo_bg_reclaim_threshold_store);
+
+/*
+ * Allocation information about block group types.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/allocation/<bg-type>/
+ */
+static struct attribute *space_info_attrs[] = {
+ BTRFS_ATTR_PTR(space_info, flags),
+ BTRFS_ATTR_PTR(space_info, total_bytes),
+ BTRFS_ATTR_PTR(space_info, bytes_used),
+ BTRFS_ATTR_PTR(space_info, bytes_pinned),
+ BTRFS_ATTR_PTR(space_info, bytes_reserved),
+ BTRFS_ATTR_PTR(space_info, bytes_may_use),
+ BTRFS_ATTR_PTR(space_info, bytes_readonly),
+ BTRFS_ATTR_PTR(space_info, bytes_zone_unusable),
+ BTRFS_ATTR_PTR(space_info, disk_used),
+ BTRFS_ATTR_PTR(space_info, disk_total),
+ BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold),
+ BTRFS_ATTR_PTR(space_info, chunk_size),
+#ifdef CONFIG_BTRFS_DEBUG
+ BTRFS_ATTR_PTR(space_info, force_chunk_alloc),
+#endif
+ NULL,
+};
+ATTRIBUTE_GROUPS(space_info);
+
+static void space_info_release(struct kobject *kobj)
+{
+ struct btrfs_space_info *sinfo = to_space_info(kobj);
+ kfree(sinfo);
+}
+
+static struct kobj_type space_info_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .release = space_info_release,
+ .default_groups = space_info_groups,
+};
+
+/*
+ * Allocation information about block groups.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/allocation/
+ */
+static const struct attribute *allocation_attrs[] = {
+ BTRFS_ATTR_PTR(allocation, global_rsv_reserved),
+ BTRFS_ATTR_PTR(allocation, global_rsv_size),
+ NULL,
+};
+
+static ssize_t btrfs_label_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ char *label = fs_info->super_copy->label;
+ ssize_t ret;
+
+ spin_lock(&fs_info->super_lock);
+ ret = sysfs_emit(buf, label[0] ? "%s\n" : "%s", label);
+ spin_unlock(&fs_info->super_lock);
+
+ return ret;
+}
+
+static ssize_t btrfs_label_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ size_t p_len;
+
+ if (!fs_info)
+ return -EPERM;
+
+ if (sb_rdonly(fs_info->sb))
+ return -EROFS;
+
+ /*
+ * p_len is the len until the first occurrence of either
+ * '\n' or '\0'
+ */
+ p_len = strcspn(buf, "\n");
+
+ if (p_len >= BTRFS_LABEL_SIZE)
+ return -EINVAL;
+
+ spin_lock(&fs_info->super_lock);
+ memset(fs_info->super_copy->label, 0, BTRFS_LABEL_SIZE);
+ memcpy(fs_info->super_copy->label, buf, p_len);
+ spin_unlock(&fs_info->super_lock);
+
+ /*
+ * We don't want to do full transaction commit from inside sysfs
+ */
+ btrfs_set_pending(fs_info, COMMIT);
+ wake_up_process(fs_info->transaction_kthread);
+
+ return len;
+}
+BTRFS_ATTR_RW(, label, btrfs_label_show, btrfs_label_store);
+
+static ssize_t btrfs_nodesize_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return sysfs_emit(buf, "%u\n", fs_info->super_copy->nodesize);
+}
+
+BTRFS_ATTR(, nodesize, btrfs_nodesize_show);
+
+static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize);
+}
+
+BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show);
+
+static ssize_t btrfs_commit_stats_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return sysfs_emit(buf,
+ "commits %llu\n"
+ "last_commit_ms %llu\n"
+ "max_commit_ms %llu\n"
+ "total_commit_ms %llu\n",
+ fs_info->commit_stats.commit_count,
+ div_u64(fs_info->commit_stats.last_commit_dur, NSEC_PER_MSEC),
+ div_u64(fs_info->commit_stats.max_commit_dur, NSEC_PER_MSEC),
+ div_u64(fs_info->commit_stats.total_commit_dur, NSEC_PER_MSEC));
+}
+
+static ssize_t btrfs_commit_stats_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ unsigned long val;
+ int ret;
+
+ if (!fs_info)
+ return -EPERM;
+
+ if (!capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ ret = kstrtoul(buf, 10, &val);
+ if (ret)
+ return ret;
+ if (val)
+ return -EINVAL;
+
+ WRITE_ONCE(fs_info->commit_stats.max_commit_dur, 0);
+
+ return len;
+}
+BTRFS_ATTR_RW(, commit_stats, btrfs_commit_stats_show, btrfs_commit_stats_store);
+
+static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize);
+}
+
+BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show);
+
+static ssize_t quota_override_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ int quota_override;
+
+ quota_override = test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags);
+ return sysfs_emit(buf, "%d\n", quota_override);
+}
+
+static ssize_t quota_override_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ unsigned long knob;
+ int err;
+
+ if (!fs_info)
+ return -EPERM;
+
+ if (!capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ err = kstrtoul(buf, 10, &knob);
+ if (err)
+ return err;
+ if (knob > 1)
+ return -EINVAL;
+
+ if (knob)
+ set_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags);
+ else
+ clear_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags);
+
+ return len;
+}
+
+BTRFS_ATTR_RW(, quota_override, quota_override_show, quota_override_store);
+
+static ssize_t btrfs_metadata_uuid_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return sysfs_emit(buf, "%pU\n", fs_info->fs_devices->metadata_uuid);
+}
+
+BTRFS_ATTR(, metadata_uuid, btrfs_metadata_uuid_show);
+
+static ssize_t btrfs_checksum_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ u16 csum_type = btrfs_super_csum_type(fs_info->super_copy);
+
+ return sysfs_emit(buf, "%s (%s)\n",
+ btrfs_super_csum_name(csum_type),
+ crypto_shash_driver_name(fs_info->csum_shash));
+}
+
+BTRFS_ATTR(, checksum, btrfs_checksum_show);
+
+static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ const char *str;
+
+ switch (READ_ONCE(fs_info->exclusive_operation)) {
+ case BTRFS_EXCLOP_NONE:
+ str = "none\n";
+ break;
+ case BTRFS_EXCLOP_BALANCE:
+ str = "balance\n";
+ break;
+ case BTRFS_EXCLOP_BALANCE_PAUSED:
+ str = "balance paused\n";
+ break;
+ case BTRFS_EXCLOP_DEV_ADD:
+ str = "device add\n";
+ break;
+ case BTRFS_EXCLOP_DEV_REMOVE:
+ str = "device remove\n";
+ break;
+ case BTRFS_EXCLOP_DEV_REPLACE:
+ str = "device replace\n";
+ break;
+ case BTRFS_EXCLOP_RESIZE:
+ str = "resize\n";
+ break;
+ case BTRFS_EXCLOP_SWAP_ACTIVATE:
+ str = "swap activate\n";
+ break;
+ default:
+ str = "UNKNOWN\n";
+ break;
+ }
+ return sysfs_emit(buf, "%s", str);
+}
+BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show);
+
+static ssize_t btrfs_generation_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return sysfs_emit(buf, "%llu\n", fs_info->generation);
+}
+BTRFS_ATTR(, generation, btrfs_generation_show);
+
+static const char * const btrfs_read_policy_name[] = { "pid" };
+
+static ssize_t btrfs_read_policy_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
+ ssize_t ret = 0;
+ int i;
+
+ for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
+ if (fs_devices->read_policy == i)
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s[%s]",
+ (ret == 0 ? "" : " "),
+ btrfs_read_policy_name[i]);
+ else
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
+ (ret == 0 ? "" : " "),
+ btrfs_read_policy_name[i]);
+ }
+
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
+
+ return ret;
+}
+
+static ssize_t btrfs_read_policy_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
+ int i;
+
+ for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
+ if (sysfs_streq(buf, btrfs_read_policy_name[i])) {
+ if (i != fs_devices->read_policy) {
+ fs_devices->read_policy = i;
+ btrfs_info(fs_devices->fs_info,
+ "read policy set to '%s'",
+ btrfs_read_policy_name[i]);
+ }
+ return len;
+ }
+ }
+
+ return -EINVAL;
+}
+BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store);
+
+static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return sysfs_emit(buf, "%d\n", READ_ONCE(fs_info->bg_reclaim_threshold));
+}
+
+static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ int thresh;
+ int ret;
+
+ ret = kstrtoint(buf, 10, &thresh);
+ if (ret)
+ return ret;
+
+ if (thresh != 0 && (thresh <= 50 || thresh > 100))
+ return -EINVAL;
+
+ WRITE_ONCE(fs_info->bg_reclaim_threshold, thresh);
+
+ return len;
+}
+BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show,
+ btrfs_bg_reclaim_threshold_store);
+
+/*
+ * Per-filesystem information and stats.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/
+ */
+static const struct attribute *btrfs_attrs[] = {
+ BTRFS_ATTR_PTR(, label),
+ BTRFS_ATTR_PTR(, nodesize),
+ BTRFS_ATTR_PTR(, sectorsize),
+ BTRFS_ATTR_PTR(, clone_alignment),
+ BTRFS_ATTR_PTR(, quota_override),
+ BTRFS_ATTR_PTR(, metadata_uuid),
+ BTRFS_ATTR_PTR(, checksum),
+ BTRFS_ATTR_PTR(, exclusive_operation),
+ BTRFS_ATTR_PTR(, generation),
+ BTRFS_ATTR_PTR(, read_policy),
+ BTRFS_ATTR_PTR(, bg_reclaim_threshold),
+ BTRFS_ATTR_PTR(, commit_stats),
+ NULL,
+};
+
+static void btrfs_release_fsid_kobj(struct kobject *kobj)
+{
+ struct btrfs_fs_devices *fs_devs = to_fs_devs(kobj);
+
+ memset(&fs_devs->fsid_kobj, 0, sizeof(struct kobject));
+ complete(&fs_devs->kobj_unregister);
+}
+
+static struct kobj_type btrfs_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .release = btrfs_release_fsid_kobj,
+};
+
+static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj)
+{
+ if (kobj->ktype != &btrfs_ktype)
+ return NULL;
+ return container_of(kobj, struct btrfs_fs_devices, fsid_kobj);
+}
+
+static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
+{
+ if (kobj->ktype != &btrfs_ktype)
+ return NULL;
+ return to_fs_devs(kobj)->fs_info;
+}
+
+static struct kobject *get_btrfs_kobj(struct kobject *kobj)
+{
+ while (kobj) {
+ if (kobj->ktype == &btrfs_ktype)
+ return kobj;
+ kobj = kobj->parent;
+ }
+ return NULL;
+}
+
+#define NUM_FEATURE_BITS 64
+#define BTRFS_FEATURE_NAME_MAX 13
+static char btrfs_unknown_feature_names[FEAT_MAX][NUM_FEATURE_BITS][BTRFS_FEATURE_NAME_MAX];
+static struct btrfs_feature_attr btrfs_feature_attrs[FEAT_MAX][NUM_FEATURE_BITS];
+
+static_assert(ARRAY_SIZE(btrfs_unknown_feature_names) ==
+ ARRAY_SIZE(btrfs_feature_attrs));
+static_assert(ARRAY_SIZE(btrfs_unknown_feature_names[0]) ==
+ ARRAY_SIZE(btrfs_feature_attrs[0]));
+
+static const u64 supported_feature_masks[FEAT_MAX] = {
+ [FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP,
+ [FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP,
+ [FEAT_INCOMPAT] = BTRFS_FEATURE_INCOMPAT_SUPP,
+};
+
+static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add)
+{
+ int set;
+
+ for (set = 0; set < FEAT_MAX; set++) {
+ int i;
+ struct attribute *attrs[2];
+ struct attribute_group agroup = {
+ .name = "features",
+ .attrs = attrs,
+ };
+ u64 features = get_features(fs_info, set);
+ features &= ~supported_feature_masks[set];
+
+ if (!features)
+ continue;
+
+ attrs[1] = NULL;
+ for (i = 0; i < NUM_FEATURE_BITS; i++) {
+ struct btrfs_feature_attr *fa;
+
+ if (!(features & (1ULL << i)))
+ continue;
+
+ fa = &btrfs_feature_attrs[set][i];
+ attrs[0] = &fa->kobj_attr.attr;
+ if (add) {
+ int ret;
+ ret = sysfs_merge_group(&fs_info->fs_devices->fsid_kobj,
+ &agroup);
+ if (ret)
+ return ret;
+ } else
+ sysfs_unmerge_group(&fs_info->fs_devices->fsid_kobj,
+ &agroup);
+ }
+
+ }
+ return 0;
+}
+
+static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
+{
+ if (fs_devs->devinfo_kobj) {
+ kobject_del(fs_devs->devinfo_kobj);
+ kobject_put(fs_devs->devinfo_kobj);
+ fs_devs->devinfo_kobj = NULL;
+ }
+
+ if (fs_devs->devices_kobj) {
+ kobject_del(fs_devs->devices_kobj);
+ kobject_put(fs_devs->devices_kobj);
+ fs_devs->devices_kobj = NULL;
+ }
+
+ if (fs_devs->fsid_kobj.state_initialized) {
+ kobject_del(&fs_devs->fsid_kobj);
+ kobject_put(&fs_devs->fsid_kobj);
+ wait_for_completion(&fs_devs->kobj_unregister);
+ }
+}
+
+/* when fs_devs is NULL it will remove all fsid kobject */
+void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
+{
+ struct list_head *fs_uuids = btrfs_get_fs_uuids();
+
+ if (fs_devs) {
+ __btrfs_sysfs_remove_fsid(fs_devs);
+ return;
+ }
+
+ list_for_each_entry(fs_devs, fs_uuids, fs_list) {
+ __btrfs_sysfs_remove_fsid(fs_devs);
+ }
+}
+
+static void btrfs_sysfs_remove_fs_devices(struct btrfs_fs_devices *fs_devices)
+{
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *seed;
+
+ list_for_each_entry(device, &fs_devices->devices, dev_list)
+ btrfs_sysfs_remove_device(device);
+
+ list_for_each_entry(seed, &fs_devices->seed_list, seed_list) {
+ list_for_each_entry(device, &seed->devices, dev_list)
+ btrfs_sysfs_remove_device(device);
+ }
+}
+
+void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
+{
+ struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj;
+
+ sysfs_remove_link(fsid_kobj, "bdi");
+
+ if (fs_info->space_info_kobj) {
+ sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs);
+ kobject_del(fs_info->space_info_kobj);
+ kobject_put(fs_info->space_info_kobj);
+ }
+ if (fs_info->discard_kobj) {
+ sysfs_remove_files(fs_info->discard_kobj, discard_attrs);
+ kobject_del(fs_info->discard_kobj);
+ kobject_put(fs_info->discard_kobj);
+ }
+#ifdef CONFIG_BTRFS_DEBUG
+ if (fs_info->debug_kobj) {
+ sysfs_remove_files(fs_info->debug_kobj, btrfs_debug_mount_attrs);
+ kobject_del(fs_info->debug_kobj);
+ kobject_put(fs_info->debug_kobj);
+ }
+#endif
+ addrm_unknown_feature_attrs(fs_info, false);
+ sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group);
+ sysfs_remove_files(fsid_kobj, btrfs_attrs);
+ btrfs_sysfs_remove_fs_devices(fs_info->fs_devices);
+}
+
+static const char * const btrfs_feature_set_names[FEAT_MAX] = {
+ [FEAT_COMPAT] = "compat",
+ [FEAT_COMPAT_RO] = "compat_ro",
+ [FEAT_INCOMPAT] = "incompat",
+};
+
+const char *btrfs_feature_set_name(enum btrfs_feature_set set)
+{
+ return btrfs_feature_set_names[set];
+}
+
+char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags)
+{
+ size_t bufsize = 4096; /* safe max, 64 names * 64 bytes */
+ int len = 0;
+ int i;
+ char *str;
+
+ str = kmalloc(bufsize, GFP_KERNEL);
+ if (!str)
+ return str;
+
+ for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) {
+ const char *name;
+
+ if (!(flags & (1ULL << i)))
+ continue;
+
+ name = btrfs_feature_attrs[set][i].kobj_attr.attr.name;
+ len += scnprintf(str + len, bufsize - len, "%s%s",
+ len ? "," : "", name);
+ }
+
+ return str;
+}
+
+static void init_feature_attrs(void)
+{
+ struct btrfs_feature_attr *fa;
+ int set, i;
+
+ memset(btrfs_feature_attrs, 0, sizeof(btrfs_feature_attrs));
+ memset(btrfs_unknown_feature_names, 0,
+ sizeof(btrfs_unknown_feature_names));
+
+ for (i = 0; btrfs_supported_feature_attrs[i]; i++) {
+ struct btrfs_feature_attr *sfa;
+ struct attribute *a = btrfs_supported_feature_attrs[i];
+ int bit;
+ sfa = attr_to_btrfs_feature_attr(a);
+ bit = ilog2(sfa->feature_bit);
+ fa = &btrfs_feature_attrs[sfa->feature_set][bit];
+
+ fa->kobj_attr.attr.name = sfa->kobj_attr.attr.name;
+ }
+
+ for (set = 0; set < FEAT_MAX; set++) {
+ for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) {
+ char *name = btrfs_unknown_feature_names[set][i];
+ fa = &btrfs_feature_attrs[set][i];
+
+ if (fa->kobj_attr.attr.name)
+ continue;
+
+ snprintf(name, BTRFS_FEATURE_NAME_MAX, "%s:%u",
+ btrfs_feature_set_names[set], i);
+
+ fa->kobj_attr.attr.name = name;
+ fa->kobj_attr.attr.mode = S_IRUGO;
+ fa->feature_set = set;
+ fa->feature_bit = 1ULL << i;
+ }
+ }
+}
+
+/*
+ * Create a sysfs entry for a given block group type at path
+ * /sys/fs/btrfs/UUID/allocation/data/TYPE
+ */
+void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache)
+{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct btrfs_space_info *space_info = cache->space_info;
+ struct raid_kobject *rkobj;
+ const int index = btrfs_bg_flags_to_raid_index(cache->flags);
+ unsigned int nofs_flag;
+ int ret;
+
+ /*
+ * Setup a NOFS context because kobject_add(), deep in its call chain,
+ * does GFP_KERNEL allocations, and we are often called in a context
+ * where if reclaim is triggered we can deadlock (we are either holding
+ * a transaction handle or some lock required for a transaction
+ * commit).
+ */
+ nofs_flag = memalloc_nofs_save();
+
+ rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
+ if (!rkobj) {
+ memalloc_nofs_restore(nofs_flag);
+ btrfs_warn(cache->fs_info,
+ "couldn't alloc memory for raid level kobject");
+ return;
+ }
+
+ rkobj->flags = cache->flags;
+ kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
+
+ /*
+ * We call this either on mount, or if we've created a block group for a
+ * new index type while running (i.e. when restriping). The running
+ * case is tricky because we could race with other threads, so we need
+ * to have this check to make sure we didn't already init the kobject.
+ *
+ * We don't have to protect on the free side because it only happens on
+ * unmount.
+ */
+ spin_lock(&space_info->lock);
+ if (space_info->block_group_kobjs[index]) {
+ spin_unlock(&space_info->lock);
+ kobject_put(&rkobj->kobj);
+ return;
+ } else {
+ space_info->block_group_kobjs[index] = &rkobj->kobj;
+ }
+ spin_unlock(&space_info->lock);
+
+ ret = kobject_add(&rkobj->kobj, &space_info->kobj, "%s",
+ btrfs_bg_type_to_raid_name(rkobj->flags));
+ memalloc_nofs_restore(nofs_flag);
+ if (ret) {
+ spin_lock(&space_info->lock);
+ space_info->block_group_kobjs[index] = NULL;
+ spin_unlock(&space_info->lock);
+ kobject_put(&rkobj->kobj);
+ btrfs_warn(fs_info,
+ "failed to add kobject for block cache, ignoring");
+ return;
+ }
+}
+
+/*
+ * Remove sysfs directories for all block group types of a given space info and
+ * the space info as well
+ */
+void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info)
+{
+ int i;
+
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+ struct kobject *kobj;
+
+ kobj = space_info->block_group_kobjs[i];
+ space_info->block_group_kobjs[i] = NULL;
+ if (kobj) {
+ kobject_del(kobj);
+ kobject_put(kobj);
+ }
+ }
+ kobject_del(&space_info->kobj);
+ kobject_put(&space_info->kobj);
+}
+
+static const char *alloc_name(u64 flags)
+{
+ switch (flags) {
+ case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA:
+ return "mixed";
+ case BTRFS_BLOCK_GROUP_METADATA:
+ return "metadata";
+ case BTRFS_BLOCK_GROUP_DATA:
+ return "data";
+ case BTRFS_BLOCK_GROUP_SYSTEM:
+ return "system";
+ default:
+ WARN_ON(1);
+ return "invalid-combination";
+ }
+}
+
+/*
+ * Create a sysfs entry for a space info type at path
+ * /sys/fs/btrfs/UUID/allocation/TYPE
+ */
+int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info)
+{
+ int ret;
+
+ ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
+ fs_info->space_info_kobj, "%s",
+ alloc_name(space_info->flags));
+ if (ret) {
+ kobject_put(&space_info->kobj);
+ return ret;
+ }
+
+ return 0;
+}
+
+void btrfs_sysfs_remove_device(struct btrfs_device *device)
+{
+ struct kobject *devices_kobj;
+
+ /*
+ * Seed fs_devices devices_kobj aren't used, fetch kobject from the
+ * fs_info::fs_devices.
+ */
+ devices_kobj = device->fs_info->fs_devices->devices_kobj;
+ ASSERT(devices_kobj);
+
+ if (device->bdev)
+ sysfs_remove_link(devices_kobj, bdev_kobj(device->bdev)->name);
+
+ if (device->devid_kobj.state_initialized) {
+ kobject_del(&device->devid_kobj);
+ kobject_put(&device->devid_kobj);
+ wait_for_completion(&device->kobj_unregister);
+ }
+}
+
+static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ int val;
+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+ devid_kobj);
+
+ val = !!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
+
+ return sysfs_emit(buf, "%d\n", val);
+}
+BTRFS_ATTR(devid, in_fs_metadata, btrfs_devinfo_in_fs_metadata_show);
+
+static ssize_t btrfs_devinfo_missing_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ int val;
+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+ devid_kobj);
+
+ val = !!test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
+
+ return sysfs_emit(buf, "%d\n", val);
+}
+BTRFS_ATTR(devid, missing, btrfs_devinfo_missing_show);
+
+static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ int val;
+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+ devid_kobj);
+
+ val = !!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
+
+ return sysfs_emit(buf, "%d\n", val);
+}
+BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show);
+
+static ssize_t btrfs_devinfo_scrub_speed_max_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+ devid_kobj);
+
+ return sysfs_emit(buf, "%llu\n", READ_ONCE(device->scrub_speed_max));
+}
+
+static ssize_t btrfs_devinfo_scrub_speed_max_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+ devid_kobj);
+ char *endptr;
+ unsigned long long limit;
+
+ limit = memparse(buf, &endptr);
+ /* There could be trailing '\n', also catch any typos after the value. */
+ endptr = skip_spaces(endptr);
+ if (*endptr != 0)
+ return -EINVAL;
+ WRITE_ONCE(device->scrub_speed_max, limit);
+ return len;
+}
+BTRFS_ATTR_RW(devid, scrub_speed_max, btrfs_devinfo_scrub_speed_max_show,
+ btrfs_devinfo_scrub_speed_max_store);
+
+static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ int val;
+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+ devid_kobj);
+
+ val = !!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+
+ return sysfs_emit(buf, "%d\n", val);
+}
+BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show);
+
+static ssize_t btrfs_devinfo_fsid_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+ devid_kobj);
+
+ return sysfs_emit(buf, "%pU\n", device->fs_devices->fsid);
+}
+BTRFS_ATTR(devid, fsid, btrfs_devinfo_fsid_show);
+
+static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+ devid_kobj);
+
+ if (!device->dev_stats_valid)
+ return sysfs_emit(buf, "invalid\n");
+
+ /*
+ * Print all at once so we get a snapshot of all values from the same
+ * time. Keep them in sync and in order of definition of
+ * btrfs_dev_stat_values.
+ */
+ return sysfs_emit(buf,
+ "write_errs %d\n"
+ "read_errs %d\n"
+ "flush_errs %d\n"
+ "corruption_errs %d\n"
+ "generation_errs %d\n",
+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_WRITE_ERRS),
+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_READ_ERRS),
+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_FLUSH_ERRS),
+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_CORRUPTION_ERRS),
+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_GENERATION_ERRS));
+}
+BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show);
+
+/*
+ * Information about one device.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/devinfo/<devid>/
+ */
+static struct attribute *devid_attrs[] = {
+ BTRFS_ATTR_PTR(devid, error_stats),
+ BTRFS_ATTR_PTR(devid, fsid),
+ BTRFS_ATTR_PTR(devid, in_fs_metadata),
+ BTRFS_ATTR_PTR(devid, missing),
+ BTRFS_ATTR_PTR(devid, replace_target),
+ BTRFS_ATTR_PTR(devid, scrub_speed_max),
+ BTRFS_ATTR_PTR(devid, writeable),
+ NULL
+};
+ATTRIBUTE_GROUPS(devid);
+
+static void btrfs_release_devid_kobj(struct kobject *kobj)
+{
+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+ devid_kobj);
+
+ memset(&device->devid_kobj, 0, sizeof(struct kobject));
+ complete(&device->kobj_unregister);
+}
+
+static struct kobj_type devid_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = devid_groups,
+ .release = btrfs_release_devid_kobj,
+};
+
+int btrfs_sysfs_add_device(struct btrfs_device *device)
+{
+ int ret;
+ unsigned int nofs_flag;
+ struct kobject *devices_kobj;
+ struct kobject *devinfo_kobj;
+
+ /*
+ * Make sure we use the fs_info::fs_devices to fetch the kobjects even
+ * for the seed fs_devices
+ */
+ devices_kobj = device->fs_info->fs_devices->devices_kobj;
+ devinfo_kobj = device->fs_info->fs_devices->devinfo_kobj;
+ ASSERT(devices_kobj);
+ ASSERT(devinfo_kobj);
+
+ nofs_flag = memalloc_nofs_save();
+
+ if (device->bdev) {
+ struct kobject *disk_kobj = bdev_kobj(device->bdev);
+
+ ret = sysfs_create_link(devices_kobj, disk_kobj, disk_kobj->name);
+ if (ret) {
+ btrfs_warn(device->fs_info,
+ "creating sysfs device link for devid %llu failed: %d",
+ device->devid, ret);
+ goto out;
+ }
+ }
+
+ init_completion(&device->kobj_unregister);
+ ret = kobject_init_and_add(&device->devid_kobj, &devid_ktype,
+ devinfo_kobj, "%llu", device->devid);
+ if (ret) {
+ kobject_put(&device->devid_kobj);
+ btrfs_warn(device->fs_info,
+ "devinfo init for devid %llu failed: %d",
+ device->devid, ret);
+ }
+
+out:
+ memalloc_nofs_restore(nofs_flag);
+ return ret;
+}
+
+static int btrfs_sysfs_add_fs_devices(struct btrfs_fs_devices *fs_devices)
+{
+ int ret;
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *seed;
+
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ ret = btrfs_sysfs_add_device(device);
+ if (ret)
+ goto fail;
+ }
+
+ list_for_each_entry(seed, &fs_devices->seed_list, seed_list) {
+ list_for_each_entry(device, &seed->devices, dev_list) {
+ ret = btrfs_sysfs_add_device(device);
+ if (ret)
+ goto fail;
+ }
+ }
+
+ return 0;
+
+fail:
+ btrfs_sysfs_remove_fs_devices(fs_devices);
+ return ret;
+}
+
+void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action)
+{
+ int ret;
+
+ ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
+ if (ret)
+ pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
+ action, kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
+ &disk_to_dev(bdev->bd_disk)->kobj);
+}
+
+void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices)
+
+{
+ char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
+
+ /*
+ * Sprouting changes fsid of the mounted filesystem, rename the fsid
+ * directory
+ */
+ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fs_devices->fsid);
+ if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf))
+ btrfs_warn(fs_devices->fs_info,
+ "sysfs: failed to create fsid for sprout");
+}
+
+void btrfs_sysfs_update_devid(struct btrfs_device *device)
+{
+ char tmp[24];
+
+ snprintf(tmp, sizeof(tmp), "%llu", device->devid);
+
+ if (kobject_rename(&device->devid_kobj, tmp))
+ btrfs_warn(device->fs_devices->fs_info,
+ "sysfs: failed to update devid for %llu",
+ device->devid);
+}
+
+/* /sys/fs/btrfs/ entry */
+static struct kset *btrfs_kset;
+
+/*
+ * Creates:
+ * /sys/fs/btrfs/UUID
+ *
+ * Can be called by the device discovery thread.
+ */
+int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs)
+{
+ int error;
+
+ init_completion(&fs_devs->kobj_unregister);
+ fs_devs->fsid_kobj.kset = btrfs_kset;
+ error = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, NULL,
+ "%pU", fs_devs->fsid);
+ if (error) {
+ kobject_put(&fs_devs->fsid_kobj);
+ return error;
+ }
+
+ fs_devs->devices_kobj = kobject_create_and_add("devices",
+ &fs_devs->fsid_kobj);
+ if (!fs_devs->devices_kobj) {
+ btrfs_err(fs_devs->fs_info,
+ "failed to init sysfs device interface");
+ btrfs_sysfs_remove_fsid(fs_devs);
+ return -ENOMEM;
+ }
+
+ fs_devs->devinfo_kobj = kobject_create_and_add("devinfo",
+ &fs_devs->fsid_kobj);
+ if (!fs_devs->devinfo_kobj) {
+ btrfs_err(fs_devs->fs_info,
+ "failed to init sysfs devinfo kobject");
+ btrfs_sysfs_remove_fsid(fs_devs);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
+{
+ int error;
+ struct btrfs_fs_devices *fs_devs = fs_info->fs_devices;
+ struct kobject *fsid_kobj = &fs_devs->fsid_kobj;
+
+ error = btrfs_sysfs_add_fs_devices(fs_devs);
+ if (error)
+ return error;
+
+ error = sysfs_create_files(fsid_kobj, btrfs_attrs);
+ if (error) {
+ btrfs_sysfs_remove_fs_devices(fs_devs);
+ return error;
+ }
+
+ error = sysfs_create_group(fsid_kobj,
+ &btrfs_feature_attr_group);
+ if (error)
+ goto failure;
+
+#ifdef CONFIG_BTRFS_DEBUG
+ fs_info->debug_kobj = kobject_create_and_add("debug", fsid_kobj);
+ if (!fs_info->debug_kobj) {
+ error = -ENOMEM;
+ goto failure;
+ }
+
+ error = sysfs_create_files(fs_info->debug_kobj, btrfs_debug_mount_attrs);
+ if (error)
+ goto failure;
+#endif
+
+ /* Discard directory */
+ fs_info->discard_kobj = kobject_create_and_add("discard", fsid_kobj);
+ if (!fs_info->discard_kobj) {
+ error = -ENOMEM;
+ goto failure;
+ }
+
+ error = sysfs_create_files(fs_info->discard_kobj, discard_attrs);
+ if (error)
+ goto failure;
+
+ error = addrm_unknown_feature_attrs(fs_info, true);
+ if (error)
+ goto failure;
+
+ error = sysfs_create_link(fsid_kobj, &fs_info->sb->s_bdi->dev->kobj, "bdi");
+ if (error)
+ goto failure;
+
+ fs_info->space_info_kobj = kobject_create_and_add("allocation",
+ fsid_kobj);
+ if (!fs_info->space_info_kobj) {
+ error = -ENOMEM;
+ goto failure;
+ }
+
+ error = sysfs_create_files(fs_info->space_info_kobj, allocation_attrs);
+ if (error)
+ goto failure;
+
+ return 0;
+failure:
+ btrfs_sysfs_remove_mounted(fs_info);
+ return error;
+}
+
+static ssize_t qgroup_enabled_show(struct kobject *qgroups_kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent);
+ bool enabled;
+
+ spin_lock(&fs_info->qgroup_lock);
+ enabled = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON;
+ spin_unlock(&fs_info->qgroup_lock);
+
+ return sysfs_emit(buf, "%d\n", enabled);
+}
+BTRFS_ATTR(qgroups, enabled, qgroup_enabled_show);
+
+static ssize_t qgroup_inconsistent_show(struct kobject *qgroups_kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent);
+ bool inconsistent;
+
+ spin_lock(&fs_info->qgroup_lock);
+ inconsistent = (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT);
+ spin_unlock(&fs_info->qgroup_lock);
+
+ return sysfs_emit(buf, "%d\n", inconsistent);
+}
+BTRFS_ATTR(qgroups, inconsistent, qgroup_inconsistent_show);
+
+static ssize_t qgroup_drop_subtree_thres_show(struct kobject *qgroups_kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent);
+ u8 result;
+
+ spin_lock(&fs_info->qgroup_lock);
+ result = fs_info->qgroup_drop_subtree_thres;
+ spin_unlock(&fs_info->qgroup_lock);
+
+ return sysfs_emit(buf, "%d\n", result);
+}
+
+static ssize_t qgroup_drop_subtree_thres_store(struct kobject *qgroups_kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent);
+ u8 new_thres;
+ int ret;
+
+ ret = kstrtou8(buf, 10, &new_thres);
+ if (ret)
+ return -EINVAL;
+
+ if (new_thres > BTRFS_MAX_LEVEL)
+ return -EINVAL;
+
+ spin_lock(&fs_info->qgroup_lock);
+ fs_info->qgroup_drop_subtree_thres = new_thres;
+ spin_unlock(&fs_info->qgroup_lock);
+
+ return len;
+}
+BTRFS_ATTR_RW(qgroups, drop_subtree_threshold, qgroup_drop_subtree_thres_show,
+ qgroup_drop_subtree_thres_store);
+
+/*
+ * Qgroups global info
+ *
+ * Path: /sys/fs/btrfs/<uuid>/qgroups/
+ */
+static struct attribute *qgroups_attrs[] = {
+ BTRFS_ATTR_PTR(qgroups, enabled),
+ BTRFS_ATTR_PTR(qgroups, inconsistent),
+ BTRFS_ATTR_PTR(qgroups, drop_subtree_threshold),
+ NULL
+};
+ATTRIBUTE_GROUPS(qgroups);
+
+static void qgroups_release(struct kobject *kobj)
+{
+ kfree(kobj);
+}
+
+static struct kobj_type qgroups_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = qgroups_groups,
+ .release = qgroups_release,
+};
+
+static inline struct btrfs_fs_info *qgroup_kobj_to_fs_info(struct kobject *kobj)
+{
+ return to_fs_info(kobj->parent->parent);
+}
+
+#define QGROUP_ATTR(_member, _show_name) \
+static ssize_t btrfs_qgroup_show_##_member(struct kobject *qgroup_kobj, \
+ struct kobj_attribute *a, \
+ char *buf) \
+{ \
+ struct btrfs_fs_info *fs_info = qgroup_kobj_to_fs_info(qgroup_kobj); \
+ struct btrfs_qgroup *qgroup = container_of(qgroup_kobj, \
+ struct btrfs_qgroup, kobj); \
+ return btrfs_show_u64(&qgroup->_member, &fs_info->qgroup_lock, buf); \
+} \
+BTRFS_ATTR(qgroup, _show_name, btrfs_qgroup_show_##_member)
+
+#define QGROUP_RSV_ATTR(_name, _type) \
+static ssize_t btrfs_qgroup_rsv_show_##_name(struct kobject *qgroup_kobj, \
+ struct kobj_attribute *a, \
+ char *buf) \
+{ \
+ struct btrfs_fs_info *fs_info = qgroup_kobj_to_fs_info(qgroup_kobj); \
+ struct btrfs_qgroup *qgroup = container_of(qgroup_kobj, \
+ struct btrfs_qgroup, kobj); \
+ return btrfs_show_u64(&qgroup->rsv.values[_type], \
+ &fs_info->qgroup_lock, buf); \
+} \
+BTRFS_ATTR(qgroup, rsv_##_name, btrfs_qgroup_rsv_show_##_name)
+
+QGROUP_ATTR(rfer, referenced);
+QGROUP_ATTR(excl, exclusive);
+QGROUP_ATTR(max_rfer, max_referenced);
+QGROUP_ATTR(max_excl, max_exclusive);
+QGROUP_ATTR(lim_flags, limit_flags);
+QGROUP_RSV_ATTR(data, BTRFS_QGROUP_RSV_DATA);
+QGROUP_RSV_ATTR(meta_pertrans, BTRFS_QGROUP_RSV_META_PERTRANS);
+QGROUP_RSV_ATTR(meta_prealloc, BTRFS_QGROUP_RSV_META_PREALLOC);
+
+/*
+ * Qgroup information.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/qgroups/<level>_<qgroupid>/
+ */
+static struct attribute *qgroup_attrs[] = {
+ BTRFS_ATTR_PTR(qgroup, referenced),
+ BTRFS_ATTR_PTR(qgroup, exclusive),
+ BTRFS_ATTR_PTR(qgroup, max_referenced),
+ BTRFS_ATTR_PTR(qgroup, max_exclusive),
+ BTRFS_ATTR_PTR(qgroup, limit_flags),
+ BTRFS_ATTR_PTR(qgroup, rsv_data),
+ BTRFS_ATTR_PTR(qgroup, rsv_meta_pertrans),
+ BTRFS_ATTR_PTR(qgroup, rsv_meta_prealloc),
+ NULL
+};
+ATTRIBUTE_GROUPS(qgroup);
+
+static void qgroup_release(struct kobject *kobj)
+{
+ struct btrfs_qgroup *qgroup = container_of(kobj, struct btrfs_qgroup, kobj);
+
+ memset(&qgroup->kobj, 0, sizeof(*kobj));
+}
+
+static struct kobj_type qgroup_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .release = qgroup_release,
+ .default_groups = qgroup_groups,
+};
+
+int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup)
+{
+ struct kobject *qgroups_kobj = fs_info->qgroups_kobj;
+ int ret;
+
+ if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state))
+ return 0;
+ if (qgroup->kobj.state_initialized)
+ return 0;
+ if (!qgroups_kobj)
+ return -EINVAL;
+
+ ret = kobject_init_and_add(&qgroup->kobj, &qgroup_ktype, qgroups_kobj,
+ "%hu_%llu", btrfs_qgroup_level(qgroup->qgroupid),
+ btrfs_qgroup_subvolid(qgroup->qgroupid));
+ if (ret < 0)
+ kobject_put(&qgroup->kobj);
+
+ return ret;
+}
+
+void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_qgroup *qgroup;
+ struct btrfs_qgroup *next;
+
+ if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state))
+ return;
+
+ rbtree_postorder_for_each_entry_safe(qgroup, next,
+ &fs_info->qgroup_tree, node)
+ btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
+ if (fs_info->qgroups_kobj) {
+ kobject_del(fs_info->qgroups_kobj);
+ kobject_put(fs_info->qgroups_kobj);
+ fs_info->qgroups_kobj = NULL;
+ }
+}
+
+/* Called when qgroups get initialized, thus there is no need for locking */
+int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info)
+{
+ struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj;
+ struct btrfs_qgroup *qgroup;
+ struct btrfs_qgroup *next;
+ int ret = 0;
+
+ if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state))
+ return 0;
+
+ ASSERT(fsid_kobj);
+ if (fs_info->qgroups_kobj)
+ return 0;
+
+ fs_info->qgroups_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
+ if (!fs_info->qgroups_kobj)
+ return -ENOMEM;
+
+ ret = kobject_init_and_add(fs_info->qgroups_kobj, &qgroups_ktype,
+ fsid_kobj, "qgroups");
+ if (ret < 0)
+ goto out;
+
+ rbtree_postorder_for_each_entry_safe(qgroup, next,
+ &fs_info->qgroup_tree, node) {
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
+ if (ret < 0)
+ goto out;
+ }
+
+out:
+ if (ret < 0)
+ btrfs_sysfs_del_qgroups(fs_info);
+ return ret;
+}
+
+void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup)
+{
+ if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state))
+ return;
+
+ if (qgroup->kobj.state_initialized) {
+ kobject_del(&qgroup->kobj);
+ kobject_put(&qgroup->kobj);
+ }
+}
+
+/*
+ * Change per-fs features in /sys/fs/btrfs/UUID/features to match current
+ * values in superblock. Call after any changes to incompat/compat_ro flags
+ */
+void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
+ u64 bit, enum btrfs_feature_set set)
+{
+ struct btrfs_fs_devices *fs_devs;
+ struct kobject *fsid_kobj;
+ u64 __maybe_unused features;
+ int __maybe_unused ret;
+
+ if (!fs_info)
+ return;
+
+ /*
+ * See 14e46e04958df74 and e410e34fad913dd, feature bit updates are not
+ * safe when called from some contexts (eg. balance)
+ */
+ features = get_features(fs_info, set);
+ ASSERT(bit & supported_feature_masks[set]);
+
+ fs_devs = fs_info->fs_devices;
+ fsid_kobj = &fs_devs->fsid_kobj;
+
+ if (!fsid_kobj->state_initialized)
+ return;
+
+ /*
+ * FIXME: this is too heavy to update just one value, ideally we'd like
+ * to use sysfs_update_group but some refactoring is needed first.
+ */
+ sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group);
+ ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group);
+}
+
+int __init btrfs_init_sysfs(void)
+{
+ int ret;
+
+ btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
+ if (!btrfs_kset)
+ return -ENOMEM;
+
+ init_feature_attrs();
+ ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
+ if (ret)
+ goto out2;
+ ret = sysfs_merge_group(&btrfs_kset->kobj,
+ &btrfs_static_feature_attr_group);
+ if (ret)
+ goto out_remove_group;
+
+#ifdef CONFIG_BTRFS_DEBUG
+ ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group);
+ if (ret) {
+ sysfs_unmerge_group(&btrfs_kset->kobj,
+ &btrfs_static_feature_attr_group);
+ goto out_remove_group;
+ }
+#endif
+
+ return 0;
+
+out_remove_group:
+ sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
+out2:
+ kset_unregister(btrfs_kset);
+
+ return ret;
+}
+
+void __cold btrfs_exit_sysfs(void)
+{
+ sysfs_unmerge_group(&btrfs_kset->kobj,
+ &btrfs_static_feature_attr_group);
+ sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
+#ifdef CONFIG_BTRFS_DEBUG
+ sysfs_remove_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group);
+#endif
+ kset_unregister(btrfs_kset);
+}
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
new file mode 100644
index 000000000..bacef43f7
--- /dev/null
+++ b/fs/btrfs/sysfs.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_SYSFS_H
+#define BTRFS_SYSFS_H
+
+#include <linux/kobject.h>
+
+enum btrfs_feature_set {
+ FEAT_COMPAT,
+ FEAT_COMPAT_RO,
+ FEAT_INCOMPAT,
+ FEAT_MAX
+};
+
+char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
+const char *btrfs_feature_set_name(enum btrfs_feature_set set);
+int btrfs_sysfs_add_device(struct btrfs_device *device);
+void btrfs_sysfs_remove_device(struct btrfs_device *device);
+int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs);
+void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs);
+void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices);
+void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
+ u64 bit, enum btrfs_feature_set set);
+void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action);
+
+int __init btrfs_init_sysfs(void);
+void __cold btrfs_exit_sysfs(void);
+int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info);
+void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info);
+void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache);
+int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info);
+void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info);
+void btrfs_sysfs_update_devid(struct btrfs_device *device);
+
+int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup);
+void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info);
+int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info);
+void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup);
+
+#endif
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
new file mode 100644
index 000000000..d43cb5242
--- /dev/null
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -0,0 +1,303 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2013 Fusion IO. All rights reserved.
+ */
+
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/pseudo_fs.h>
+#include <linux/magic.h>
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../free-space-cache.h"
+#include "../free-space-tree.h"
+#include "../transaction.h"
+#include "../volumes.h"
+#include "../disk-io.h"
+#include "../qgroup.h"
+#include "../block-group.h"
+
+static struct vfsmount *test_mnt = NULL;
+
+const char *test_error[] = {
+ [TEST_ALLOC_FS_INFO] = "cannot allocate fs_info",
+ [TEST_ALLOC_ROOT] = "cannot allocate root",
+ [TEST_ALLOC_EXTENT_BUFFER] = "cannot extent buffer",
+ [TEST_ALLOC_PATH] = "cannot allocate path",
+ [TEST_ALLOC_INODE] = "cannot allocate inode",
+ [TEST_ALLOC_BLOCK_GROUP] = "cannot allocate block group",
+ [TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map",
+};
+
+static const struct super_operations btrfs_test_super_ops = {
+ .alloc_inode = btrfs_alloc_inode,
+ .destroy_inode = btrfs_test_destroy_inode,
+};
+
+
+static int btrfs_test_init_fs_context(struct fs_context *fc)
+{
+ struct pseudo_fs_context *ctx = init_pseudo(fc, BTRFS_TEST_MAGIC);
+ if (!ctx)
+ return -ENOMEM;
+ ctx->ops = &btrfs_test_super_ops;
+ return 0;
+}
+
+static struct file_system_type test_type = {
+ .name = "btrfs_test_fs",
+ .init_fs_context = btrfs_test_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
+
+struct inode *btrfs_new_test_inode(void)
+{
+ struct inode *inode;
+
+ inode = new_inode(test_mnt->mnt_sb);
+ if (!inode)
+ return NULL;
+
+ inode->i_mode = S_IFREG;
+ inode->i_ino = BTRFS_FIRST_FREE_OBJECTID;
+ BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+ BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
+ BTRFS_I(inode)->location.offset = 0;
+ inode_init_owner(&init_user_ns, inode, NULL, S_IFREG);
+
+ return inode;
+}
+
+static int btrfs_init_test_fs(void)
+{
+ int ret;
+
+ ret = register_filesystem(&test_type);
+ if (ret) {
+ printk(KERN_ERR "btrfs: cannot register test file system\n");
+ return ret;
+ }
+
+ test_mnt = kern_mount(&test_type);
+ if (IS_ERR(test_mnt)) {
+ printk(KERN_ERR "btrfs: cannot mount test file system\n");
+ unregister_filesystem(&test_type);
+ return PTR_ERR(test_mnt);
+ }
+ return 0;
+}
+
+static void btrfs_destroy_test_fs(void)
+{
+ kern_unmount(test_mnt);
+ unregister_filesystem(&test_type);
+}
+
+struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_device *dev;
+
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev)
+ return ERR_PTR(-ENOMEM);
+
+ extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL);
+ INIT_LIST_HEAD(&dev->dev_list);
+ list_add(&dev->dev_list, &fs_info->fs_devices->devices);
+
+ return dev;
+}
+
+static void btrfs_free_dummy_device(struct btrfs_device *dev)
+{
+ extent_io_tree_release(&dev->alloc_state);
+ kfree(dev);
+}
+
+struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
+{
+ struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info),
+ GFP_KERNEL);
+
+ if (!fs_info)
+ return fs_info;
+ fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices),
+ GFP_KERNEL);
+ if (!fs_info->fs_devices) {
+ kfree(fs_info);
+ return NULL;
+ }
+ INIT_LIST_HEAD(&fs_info->fs_devices->devices);
+
+ fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block),
+ GFP_KERNEL);
+ if (!fs_info->super_copy) {
+ kfree(fs_info->fs_devices);
+ kfree(fs_info);
+ return NULL;
+ }
+
+ btrfs_init_fs_info(fs_info);
+
+ fs_info->nodesize = nodesize;
+ fs_info->sectorsize = sectorsize;
+ fs_info->sectorsize_bits = ilog2(sectorsize);
+ set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
+
+ test_mnt->mnt_sb->s_fs_info = fs_info;
+
+ return fs_info;
+}
+
+void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
+{
+ struct radix_tree_iter iter;
+ void **slot;
+ struct btrfs_device *dev, *tmp;
+
+ if (!fs_info)
+ return;
+
+ if (WARN_ON(!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO,
+ &fs_info->fs_state)))
+ return;
+
+ test_mnt->mnt_sb->s_fs_info = NULL;
+
+ spin_lock(&fs_info->buffer_lock);
+ radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
+ struct extent_buffer *eb;
+
+ eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock);
+ if (!eb)
+ continue;
+ /* Shouldn't happen but that kind of thinking creates CVE's */
+ if (radix_tree_exception(eb)) {
+ if (radix_tree_deref_retry(eb))
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
+ slot = radix_tree_iter_resume(slot, &iter);
+ spin_unlock(&fs_info->buffer_lock);
+ free_extent_buffer_stale(eb);
+ spin_lock(&fs_info->buffer_lock);
+ }
+ spin_unlock(&fs_info->buffer_lock);
+
+ btrfs_mapping_tree_free(&fs_info->mapping_tree);
+ list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices,
+ dev_list) {
+ btrfs_free_dummy_device(dev);
+ }
+ btrfs_free_qgroup_config(fs_info);
+ btrfs_free_fs_roots(fs_info);
+ kfree(fs_info->super_copy);
+ btrfs_check_leaked_roots(fs_info);
+ btrfs_extent_buffer_leak_debug_check(fs_info);
+ kfree(fs_info->fs_devices);
+ kfree(fs_info);
+}
+
+void btrfs_free_dummy_root(struct btrfs_root *root)
+{
+ if (IS_ERR_OR_NULL(root))
+ return;
+ /* Will be freed by btrfs_free_fs_roots */
+ if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state)))
+ return;
+ btrfs_global_root_delete(root);
+ btrfs_put_root(root);
+}
+
+struct btrfs_block_group *
+btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info,
+ unsigned long length)
+{
+ struct btrfs_block_group *cache;
+
+ cache = kzalloc(sizeof(*cache), GFP_KERNEL);
+ if (!cache)
+ return NULL;
+ cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
+ GFP_KERNEL);
+ if (!cache->free_space_ctl) {
+ kfree(cache);
+ return NULL;
+ }
+
+ cache->start = 0;
+ cache->length = length;
+ cache->full_stripe_len = fs_info->sectorsize;
+ cache->fs_info = fs_info;
+
+ INIT_LIST_HEAD(&cache->list);
+ INIT_LIST_HEAD(&cache->cluster_list);
+ INIT_LIST_HEAD(&cache->bg_list);
+ btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
+ mutex_init(&cache->free_space_lock);
+
+ return cache;
+}
+
+void btrfs_free_dummy_block_group(struct btrfs_block_group *cache)
+{
+ if (!cache)
+ return;
+ btrfs_remove_free_space_cache(cache);
+ kfree(cache->free_space_ctl);
+ kfree(cache);
+}
+
+void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ memset(trans, 0, sizeof(*trans));
+ trans->transid = 1;
+ trans->type = __TRANS_DUMMY;
+ trans->fs_info = fs_info;
+}
+
+int btrfs_run_sanity_tests(void)
+{
+ int ret, i;
+ u32 sectorsize, nodesize;
+ u32 test_sectorsize[] = {
+ PAGE_SIZE,
+ };
+ ret = btrfs_init_test_fs();
+ if (ret)
+ return ret;
+ for (i = 0; i < ARRAY_SIZE(test_sectorsize); i++) {
+ sectorsize = test_sectorsize[i];
+ for (nodesize = sectorsize;
+ nodesize <= BTRFS_MAX_METADATA_BLOCKSIZE;
+ nodesize <<= 1) {
+ pr_info("BTRFS: selftest: sectorsize: %u nodesize: %u\n",
+ sectorsize, nodesize);
+ ret = btrfs_test_free_space_cache(sectorsize, nodesize);
+ if (ret)
+ goto out;
+ ret = btrfs_test_extent_buffer_operations(sectorsize,
+ nodesize);
+ if (ret)
+ goto out;
+ ret = btrfs_test_extent_io(sectorsize, nodesize);
+ if (ret)
+ goto out;
+ ret = btrfs_test_inodes(sectorsize, nodesize);
+ if (ret)
+ goto out;
+ ret = btrfs_test_qgroups(sectorsize, nodesize);
+ if (ret)
+ goto out;
+ ret = btrfs_test_free_space_tree(sectorsize, nodesize);
+ if (ret)
+ goto out;
+ }
+ }
+ ret = btrfs_test_extent_map();
+
+out:
+ btrfs_destroy_test_fs();
+ return ret;
+}
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
new file mode 100644
index 000000000..7a2d7ffbe
--- /dev/null
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2013 Fusion IO. All rights reserved.
+ */
+
+#ifndef BTRFS_TESTS_H
+#define BTRFS_TESTS_H
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+int btrfs_run_sanity_tests(void);
+
+#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt "\n", ##__VA_ARGS__)
+#define test_err(fmt, ...) pr_err("BTRFS: selftest: %s:%d " fmt "\n", \
+ __FILE__, __LINE__, ##__VA_ARGS__)
+
+#define test_std_err(index) test_err("%s", test_error[index])
+
+enum {
+ TEST_ALLOC_FS_INFO,
+ TEST_ALLOC_ROOT,
+ TEST_ALLOC_EXTENT_BUFFER,
+ TEST_ALLOC_PATH,
+ TEST_ALLOC_INODE,
+ TEST_ALLOC_BLOCK_GROUP,
+ TEST_ALLOC_EXTENT_MAP,
+};
+
+extern const char *test_error[];
+
+struct btrfs_root;
+struct btrfs_trans_handle;
+
+int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize);
+int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize);
+int btrfs_test_extent_io(u32 sectorsize, u32 nodesize);
+int btrfs_test_inodes(u32 sectorsize, u32 nodesize);
+int btrfs_test_qgroups(u32 sectorsize, u32 nodesize);
+int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize);
+int btrfs_test_extent_map(void);
+struct inode *btrfs_new_test_inode(void);
+struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize);
+void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info);
+void btrfs_free_dummy_root(struct btrfs_root *root);
+struct btrfs_block_group *
+btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long length);
+void btrfs_free_dummy_block_group(struct btrfs_block_group *cache);
+void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info);
+#else
+static inline int btrfs_run_sanity_tests(void)
+{
+ return 0;
+}
+#endif
+
+#endif
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
new file mode 100644
index 000000000..b7d181a08
--- /dev/null
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -0,0 +1,218 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2013 Fusion IO. All rights reserved.
+ */
+
+#include <linux/slab.h>
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../extent_io.h"
+#include "../disk-io.h"
+
+static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_path *path = NULL;
+ struct btrfs_root *root = NULL;
+ struct extent_buffer *eb;
+ char *value = "mary had a little lamb";
+ char *split1 = "mary had a little";
+ char *split2 = " lamb";
+ char *split3 = "mary";
+ char *split4 = " had a little";
+ char buf[32];
+ struct btrfs_key key;
+ u32 value_len = strlen(value);
+ int ret = 0;
+
+ test_msg("running btrfs_split_item tests");
+
+ fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
+ if (!fs_info) {
+ test_std_err(TEST_ALLOC_FS_INFO);
+ return -ENOMEM;
+ }
+
+ root = btrfs_alloc_dummy_root(fs_info);
+ if (IS_ERR(root)) {
+ test_std_err(TEST_ALLOC_ROOT);
+ ret = PTR_ERR(root);
+ goto out;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ test_std_err(TEST_ALLOC_PATH);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ eb = alloc_dummy_extent_buffer(fs_info, nodesize);
+ path->nodes[0] = eb;
+ if (!eb) {
+ test_std_err(TEST_ALLOC_EXTENT_BUFFER);
+ ret = -ENOMEM;
+ goto out;
+ }
+ path->slots[0] = 0;
+
+ key.objectid = 0;
+ key.type = BTRFS_EXTENT_CSUM_KEY;
+ key.offset = 0;
+
+ btrfs_setup_item_for_insert(root, path, &key, value_len);
+ write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0),
+ value_len);
+
+ key.offset = 3;
+
+ /*
+ * Passing NULL trans here should be safe because we have plenty of
+ * space in this leaf to split the item without having to split the
+ * leaf.
+ */
+ ret = btrfs_split_item(NULL, root, path, &key, 17);
+ if (ret) {
+ test_err("split item failed %d", ret);
+ goto out;
+ }
+
+ /*
+ * Read the first slot, it should have the original key and contain only
+ * 'mary had a little'
+ */
+ btrfs_item_key_to_cpu(eb, &key, 0);
+ if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+ key.offset != 0) {
+ test_err("invalid key at slot 0");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (btrfs_item_size(eb, 0) != strlen(split1)) {
+ test_err("invalid len in the first split");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0),
+ strlen(split1));
+ if (memcmp(buf, split1, strlen(split1))) {
+ test_err(
+"data in the buffer doesn't match what it should in the first split have='%.*s' want '%s'",
+ (int)strlen(split1), buf, split1);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ btrfs_item_key_to_cpu(eb, &key, 1);
+ if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+ key.offset != 3) {
+ test_err("invalid key at slot 1");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (btrfs_item_size(eb, 1) != strlen(split2)) {
+ test_err("invalid len in the second split");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1),
+ strlen(split2));
+ if (memcmp(buf, split2, strlen(split2))) {
+ test_err(
+ "data in the buffer doesn't match what it should in the second split");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ key.offset = 1;
+ /* Do it again so we test memmoving the other items in the leaf */
+ ret = btrfs_split_item(NULL, root, path, &key, 4);
+ if (ret) {
+ test_err("second split item failed %d", ret);
+ goto out;
+ }
+
+ btrfs_item_key_to_cpu(eb, &key, 0);
+ if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+ key.offset != 0) {
+ test_err("invalid key at slot 0");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (btrfs_item_size(eb, 0) != strlen(split3)) {
+ test_err("invalid len in the first split");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0),
+ strlen(split3));
+ if (memcmp(buf, split3, strlen(split3))) {
+ test_err(
+ "data in the buffer doesn't match what it should in the third split");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ btrfs_item_key_to_cpu(eb, &key, 1);
+ if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+ key.offset != 1) {
+ test_err("invalid key at slot 1");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (btrfs_item_size(eb, 1) != strlen(split4)) {
+ test_err("invalid len in the second split");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1),
+ strlen(split4));
+ if (memcmp(buf, split4, strlen(split4))) {
+ test_err(
+ "data in the buffer doesn't match what it should in the fourth split");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ btrfs_item_key_to_cpu(eb, &key, 2);
+ if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+ key.offset != 3) {
+ test_err("invalid key at slot 2");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (btrfs_item_size(eb, 2) != strlen(split2)) {
+ test_err("invalid len in the second split");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 2),
+ strlen(split2));
+ if (memcmp(buf, split2, strlen(split2))) {
+ test_err(
+ "data in the buffer doesn't match what it should in the last chunk");
+ ret = -EINVAL;
+ goto out;
+ }
+out:
+ btrfs_free_path(path);
+ btrfs_free_dummy_root(root);
+ btrfs_free_dummy_fs_info(fs_info);
+ return ret;
+}
+
+int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize)
+{
+ test_msg("running extent buffer operation tests");
+ return test_btrfs_split_item(sectorsize, nodesize);
+}
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
new file mode 100644
index 000000000..350da449d
--- /dev/null
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -0,0 +1,612 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2013 Fusion IO. All rights reserved.
+ */
+
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/sizes.h>
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../extent_io.h"
+#include "../btrfs_inode.h"
+
+#define PROCESS_UNLOCK (1 << 0)
+#define PROCESS_RELEASE (1 << 1)
+#define PROCESS_TEST_LOCKED (1 << 2)
+
+static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
+ unsigned long flags)
+{
+ int ret;
+ struct folio_batch fbatch;
+ unsigned long index = start >> PAGE_SHIFT;
+ unsigned long end_index = end >> PAGE_SHIFT;
+ int i;
+ int count = 0;
+ int loops = 0;
+
+ folio_batch_init(&fbatch);
+
+ while (index <= end_index) {
+ ret = filemap_get_folios_contig(inode->i_mapping, &index,
+ end_index, &fbatch);
+ for (i = 0; i < ret; i++) {
+ struct folio *folio = fbatch.folios[i];
+
+ if (flags & PROCESS_TEST_LOCKED &&
+ !folio_test_locked(folio))
+ count++;
+ if (flags & PROCESS_UNLOCK && folio_test_locked(folio))
+ folio_unlock(folio);
+ if (flags & PROCESS_RELEASE)
+ folio_put(folio);
+ }
+ folio_batch_release(&fbatch);
+ cond_resched();
+ loops++;
+ if (loops > 100000) {
+ printk(KERN_ERR
+ "stuck in a loop, start %llu, end %llu, ret %d\n",
+ start, end, ret);
+ break;
+ }
+ }
+
+ return count;
+}
+
+#define STATE_FLAG_STR_LEN 256
+
+#define PRINT_ONE_FLAG(state, dest, cur, name) \
+({ \
+ if (state->state & EXTENT_##name) \
+ cur += scnprintf(dest + cur, STATE_FLAG_STR_LEN - cur, \
+ "%s" #name, cur == 0 ? "" : "|"); \
+})
+
+static void extent_flag_to_str(const struct extent_state *state, char *dest)
+{
+ int cur = 0;
+
+ dest[0] = 0;
+ PRINT_ONE_FLAG(state, dest, cur, DIRTY);
+ PRINT_ONE_FLAG(state, dest, cur, UPTODATE);
+ PRINT_ONE_FLAG(state, dest, cur, LOCKED);
+ PRINT_ONE_FLAG(state, dest, cur, NEW);
+ PRINT_ONE_FLAG(state, dest, cur, DELALLOC);
+ PRINT_ONE_FLAG(state, dest, cur, DEFRAG);
+ PRINT_ONE_FLAG(state, dest, cur, BOUNDARY);
+ PRINT_ONE_FLAG(state, dest, cur, NODATASUM);
+ PRINT_ONE_FLAG(state, dest, cur, CLEAR_META_RESV);
+ PRINT_ONE_FLAG(state, dest, cur, NEED_WAIT);
+ PRINT_ONE_FLAG(state, dest, cur, NORESERVE);
+ PRINT_ONE_FLAG(state, dest, cur, QGROUP_RESERVED);
+ PRINT_ONE_FLAG(state, dest, cur, CLEAR_DATA_RESV);
+}
+
+static void dump_extent_io_tree(const struct extent_io_tree *tree)
+{
+ struct rb_node *node;
+ char flags_str[STATE_FLAG_STR_LEN];
+
+ node = rb_first(&tree->state);
+ test_msg("io tree content:");
+ while (node) {
+ struct extent_state *state;
+
+ state = rb_entry(node, struct extent_state, rb_node);
+ extent_flag_to_str(state, flags_str);
+ test_msg(" start=%llu len=%llu flags=%s", state->start,
+ state->end + 1 - state->start, flags_str);
+ node = rb_next(node);
+ }
+}
+
+static int test_find_delalloc(u32 sectorsize)
+{
+ struct inode *inode;
+ struct extent_io_tree *tmp;
+ struct page *page;
+ struct page *locked_page = NULL;
+ unsigned long index = 0;
+ /* In this test we need at least 2 file extents at its maximum size */
+ u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
+ u64 total_dirty = 2 * max_bytes;
+ u64 start, end, test_start;
+ bool found;
+ int ret = -EINVAL;
+
+ test_msg("running find delalloc tests");
+
+ inode = btrfs_new_test_inode();
+ if (!inode) {
+ test_std_err(TEST_ALLOC_INODE);
+ return -ENOMEM;
+ }
+ tmp = &BTRFS_I(inode)->io_tree;
+
+ /*
+ * Passing NULL as we don't have fs_info but tracepoints are not used
+ * at this point
+ */
+ extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST, NULL);
+
+ /*
+ * First go through and create and mark all of our pages dirty, we pin
+ * everything to make sure our pages don't get evicted and screw up our
+ * test.
+ */
+ for (index = 0; index < (total_dirty >> PAGE_SHIFT); index++) {
+ page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL);
+ if (!page) {
+ test_err("failed to allocate test page");
+ ret = -ENOMEM;
+ goto out;
+ }
+ SetPageDirty(page);
+ if (index) {
+ unlock_page(page);
+ } else {
+ get_page(page);
+ locked_page = page;
+ }
+ }
+
+ /* Test this scenario
+ * |--- delalloc ---|
+ * |--- search ---|
+ */
+ set_extent_delalloc(tmp, 0, sectorsize - 1, 0, NULL);
+ start = 0;
+ end = start + PAGE_SIZE - 1;
+ found = find_lock_delalloc_range(inode, locked_page, &start,
+ &end);
+ if (!found) {
+ test_err("should have found at least one delalloc");
+ goto out_bits;
+ }
+ if (start != 0 || end != (sectorsize - 1)) {
+ test_err("expected start 0 end %u, got start %llu end %llu",
+ sectorsize - 1, start, end);
+ goto out_bits;
+ }
+ unlock_extent(tmp, start, end, NULL);
+ unlock_page(locked_page);
+ put_page(locked_page);
+
+ /*
+ * Test this scenario
+ *
+ * |--- delalloc ---|
+ * |--- search ---|
+ */
+ test_start = SZ_64M;
+ locked_page = find_lock_page(inode->i_mapping,
+ test_start >> PAGE_SHIFT);
+ if (!locked_page) {
+ test_err("couldn't find the locked page");
+ goto out_bits;
+ }
+ set_extent_delalloc(tmp, sectorsize, max_bytes - 1, 0, NULL);
+ start = test_start;
+ end = start + PAGE_SIZE - 1;
+ found = find_lock_delalloc_range(inode, locked_page, &start,
+ &end);
+ if (!found) {
+ test_err("couldn't find delalloc in our range");
+ goto out_bits;
+ }
+ if (start != test_start || end != max_bytes - 1) {
+ test_err("expected start %llu end %llu, got start %llu, end %llu",
+ test_start, max_bytes - 1, start, end);
+ goto out_bits;
+ }
+ if (process_page_range(inode, start, end,
+ PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) {
+ test_err("there were unlocked pages in the range");
+ goto out_bits;
+ }
+ unlock_extent(tmp, start, end, NULL);
+ /* locked_page was unlocked above */
+ put_page(locked_page);
+
+ /*
+ * Test this scenario
+ * |--- delalloc ---|
+ * |--- search ---|
+ */
+ test_start = max_bytes + sectorsize;
+ locked_page = find_lock_page(inode->i_mapping, test_start >>
+ PAGE_SHIFT);
+ if (!locked_page) {
+ test_err("couldn't find the locked page");
+ goto out_bits;
+ }
+ start = test_start;
+ end = start + PAGE_SIZE - 1;
+ found = find_lock_delalloc_range(inode, locked_page, &start,
+ &end);
+ if (found) {
+ test_err("found range when we shouldn't have");
+ goto out_bits;
+ }
+ if (end != test_start + PAGE_SIZE - 1) {
+ test_err("did not return the proper end offset");
+ goto out_bits;
+ }
+
+ /*
+ * Test this scenario
+ * [------- delalloc -------|
+ * [max_bytes]|-- search--|
+ *
+ * We are re-using our test_start from above since it works out well.
+ */
+ set_extent_delalloc(tmp, max_bytes, total_dirty - 1, 0, NULL);
+ start = test_start;
+ end = start + PAGE_SIZE - 1;
+ found = find_lock_delalloc_range(inode, locked_page, &start,
+ &end);
+ if (!found) {
+ test_err("didn't find our range");
+ goto out_bits;
+ }
+ if (start != test_start || end != total_dirty - 1) {
+ test_err("expected start %llu end %llu, got start %llu end %llu",
+ test_start, total_dirty - 1, start, end);
+ goto out_bits;
+ }
+ if (process_page_range(inode, start, end,
+ PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) {
+ test_err("pages in range were not all locked");
+ goto out_bits;
+ }
+ unlock_extent(tmp, start, end, NULL);
+
+ /*
+ * Now to test where we run into a page that is no longer dirty in the
+ * range we want to find.
+ */
+ page = find_get_page(inode->i_mapping,
+ (max_bytes + SZ_1M) >> PAGE_SHIFT);
+ if (!page) {
+ test_err("couldn't find our page");
+ goto out_bits;
+ }
+ ClearPageDirty(page);
+ put_page(page);
+
+ /* We unlocked it in the previous test */
+ lock_page(locked_page);
+ start = test_start;
+ end = start + PAGE_SIZE - 1;
+ /*
+ * Currently if we fail to find dirty pages in the delalloc range we
+ * will adjust max_bytes down to PAGE_SIZE and then re-search. If
+ * this changes at any point in the future we will need to fix this
+ * tests expected behavior.
+ */
+ found = find_lock_delalloc_range(inode, locked_page, &start,
+ &end);
+ if (!found) {
+ test_err("didn't find our range");
+ goto out_bits;
+ }
+ if (start != test_start && end != test_start + PAGE_SIZE - 1) {
+ test_err("expected start %llu end %llu, got start %llu end %llu",
+ test_start, test_start + PAGE_SIZE - 1, start, end);
+ goto out_bits;
+ }
+ if (process_page_range(inode, start, end, PROCESS_TEST_LOCKED |
+ PROCESS_UNLOCK)) {
+ test_err("pages in range were not all locked");
+ goto out_bits;
+ }
+ ret = 0;
+out_bits:
+ if (ret)
+ dump_extent_io_tree(tmp);
+ clear_extent_bits(tmp, 0, total_dirty - 1, (unsigned)-1);
+out:
+ if (locked_page)
+ put_page(locked_page);
+ process_page_range(inode, 0, total_dirty - 1,
+ PROCESS_UNLOCK | PROCESS_RELEASE);
+ iput(inode);
+ return ret;
+}
+
+static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb,
+ unsigned long len)
+{
+ unsigned long i;
+
+ for (i = 0; i < len * BITS_PER_BYTE; i++) {
+ int bit, bit1;
+
+ bit = !!test_bit(i, bitmap);
+ bit1 = !!extent_buffer_test_bit(eb, 0, i);
+ if (bit1 != bit) {
+ test_err("bits do not match");
+ return -EINVAL;
+ }
+
+ bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE,
+ i % BITS_PER_BYTE);
+ if (bit1 != bit) {
+ test_err("offset bits do not match");
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
+ unsigned long len)
+{
+ unsigned long i, j;
+ u32 x;
+ int ret;
+
+ memset(bitmap, 0, len);
+ memzero_extent_buffer(eb, 0, len);
+ if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
+ test_err("bitmap was not zeroed");
+ return -EINVAL;
+ }
+
+ bitmap_set(bitmap, 0, len * BITS_PER_BYTE);
+ extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE);
+ ret = check_eb_bitmap(bitmap, eb, len);
+ if (ret) {
+ test_err("setting all bits failed");
+ return ret;
+ }
+
+ bitmap_clear(bitmap, 0, len * BITS_PER_BYTE);
+ extent_buffer_bitmap_clear(eb, 0, 0, len * BITS_PER_BYTE);
+ ret = check_eb_bitmap(bitmap, eb, len);
+ if (ret) {
+ test_err("clearing all bits failed");
+ return ret;
+ }
+
+ /* Straddling pages test */
+ if (len > PAGE_SIZE) {
+ bitmap_set(bitmap,
+ (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE,
+ sizeof(long) * BITS_PER_BYTE);
+ extent_buffer_bitmap_set(eb, PAGE_SIZE - sizeof(long) / 2, 0,
+ sizeof(long) * BITS_PER_BYTE);
+ ret = check_eb_bitmap(bitmap, eb, len);
+ if (ret) {
+ test_err("setting straddling pages failed");
+ return ret;
+ }
+
+ bitmap_set(bitmap, 0, len * BITS_PER_BYTE);
+ bitmap_clear(bitmap,
+ (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE,
+ sizeof(long) * BITS_PER_BYTE);
+ extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE);
+ extent_buffer_bitmap_clear(eb, PAGE_SIZE - sizeof(long) / 2, 0,
+ sizeof(long) * BITS_PER_BYTE);
+ ret = check_eb_bitmap(bitmap, eb, len);
+ if (ret) {
+ test_err("clearing straddling pages failed");
+ return ret;
+ }
+ }
+
+ /*
+ * Generate a wonky pseudo-random bit pattern for the sake of not using
+ * something repetitive that could miss some hypothetical off-by-n bug.
+ */
+ x = 0;
+ bitmap_clear(bitmap, 0, len * BITS_PER_BYTE);
+ extent_buffer_bitmap_clear(eb, 0, 0, len * BITS_PER_BYTE);
+ for (i = 0; i < len * BITS_PER_BYTE / 32; i++) {
+ x = (0x19660dULL * (u64)x + 0x3c6ef35fULL) & 0xffffffffU;
+ for (j = 0; j < 32; j++) {
+ if (x & (1U << j)) {
+ bitmap_set(bitmap, i * 32 + j, 1);
+ extent_buffer_bitmap_set(eb, 0, i * 32 + j, 1);
+ }
+ }
+ }
+
+ ret = check_eb_bitmap(bitmap, eb, len);
+ if (ret) {
+ test_err("random bit pattern failed");
+ return ret;
+ }
+
+ return 0;
+}
+
+static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
+{
+ struct btrfs_fs_info *fs_info;
+ unsigned long *bitmap = NULL;
+ struct extent_buffer *eb = NULL;
+ int ret;
+
+ test_msg("running extent buffer bitmap tests");
+
+ fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
+ if (!fs_info) {
+ test_std_err(TEST_ALLOC_FS_INFO);
+ return -ENOMEM;
+ }
+
+ bitmap = kmalloc(nodesize, GFP_KERNEL);
+ if (!bitmap) {
+ test_err("couldn't allocate test bitmap");
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ eb = __alloc_dummy_extent_buffer(fs_info, 0, nodesize);
+ if (!eb) {
+ test_std_err(TEST_ALLOC_ROOT);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = __test_eb_bitmaps(bitmap, eb, nodesize);
+ if (ret)
+ goto out;
+
+ free_extent_buffer(eb);
+
+ /*
+ * Test again for case where the tree block is sectorsize aligned but
+ * not nodesize aligned.
+ */
+ eb = __alloc_dummy_extent_buffer(fs_info, sectorsize, nodesize);
+ if (!eb) {
+ test_std_err(TEST_ALLOC_ROOT);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = __test_eb_bitmaps(bitmap, eb, nodesize);
+out:
+ free_extent_buffer(eb);
+ kfree(bitmap);
+ btrfs_free_dummy_fs_info(fs_info);
+ return ret;
+}
+
+static int test_find_first_clear_extent_bit(void)
+{
+ struct extent_io_tree tree;
+ u64 start, end;
+ int ret = -EINVAL;
+
+ test_msg("running find_first_clear_extent_bit test");
+
+ extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST, NULL);
+
+ /* Test correct handling of empty tree */
+ find_first_clear_extent_bit(&tree, 0, &start, &end, CHUNK_TRIMMED);
+ if (start != 0 || end != -1) {
+ test_err(
+ "error getting a range from completely empty tree: start %llu end %llu",
+ start, end);
+ goto out;
+ }
+ /*
+ * Set 1M-4M alloc/discard and 32M-64M thus leaving a hole between
+ * 4M-32M
+ */
+ set_extent_bits(&tree, SZ_1M, SZ_4M - 1,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED);
+
+ find_first_clear_extent_bit(&tree, SZ_512K, &start, &end,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED);
+
+ if (start != 0 || end != SZ_1M - 1) {
+ test_err("error finding beginning range: start %llu end %llu",
+ start, end);
+ goto out;
+ }
+
+ /* Now add 32M-64M so that we have a hole between 4M-32M */
+ set_extent_bits(&tree, SZ_32M, SZ_64M - 1,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED);
+
+ /*
+ * Request first hole starting at 12M, we should get 4M-32M
+ */
+ find_first_clear_extent_bit(&tree, 12 * SZ_1M, &start, &end,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED);
+
+ if (start != SZ_4M || end != SZ_32M - 1) {
+ test_err("error finding trimmed range: start %llu end %llu",
+ start, end);
+ goto out;
+ }
+
+ /*
+ * Search in the middle of allocated range, should get the next one
+ * available, which happens to be unallocated -> 4M-32M
+ */
+ find_first_clear_extent_bit(&tree, SZ_2M, &start, &end,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED);
+
+ if (start != SZ_4M || end != SZ_32M - 1) {
+ test_err("error finding next unalloc range: start %llu end %llu",
+ start, end);
+ goto out;
+ }
+
+ /*
+ * Set 64M-72M with CHUNK_ALLOC flag, then search for CHUNK_TRIMMED flag
+ * being unset in this range, we should get the entry in range 64M-72M
+ */
+ set_extent_bits(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED);
+ find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end,
+ CHUNK_TRIMMED);
+
+ if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) {
+ test_err("error finding exact range: start %llu end %llu",
+ start, end);
+ goto out;
+ }
+
+ find_first_clear_extent_bit(&tree, SZ_64M - SZ_8M, &start, &end,
+ CHUNK_TRIMMED);
+
+ /*
+ * Search in the middle of set range whose immediate neighbour doesn't
+ * have the bits set so it must be returned
+ */
+ if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) {
+ test_err("error finding next alloc range: start %llu end %llu",
+ start, end);
+ goto out;
+ }
+
+ /*
+ * Search beyond any known range, shall return after last known range
+ * and end should be -1
+ */
+ find_first_clear_extent_bit(&tree, -1, &start, &end, CHUNK_TRIMMED);
+ if (start != SZ_64M + SZ_8M || end != -1) {
+ test_err(
+ "error handling beyond end of range search: start %llu end %llu",
+ start, end);
+ goto out;
+ }
+
+ ret = 0;
+out:
+ if (ret)
+ dump_extent_io_tree(&tree);
+ clear_extent_bits(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED);
+
+ return ret;
+}
+
+int btrfs_test_extent_io(u32 sectorsize, u32 nodesize)
+{
+ int ret;
+
+ test_msg("running extent I/O tests");
+
+ ret = test_find_delalloc(sectorsize);
+ if (ret)
+ goto out;
+
+ ret = test_find_first_clear_extent_bit();
+ if (ret)
+ goto out;
+
+ ret = test_eb_bitmaps(sectorsize, nodesize);
+out:
+ return ret;
+}
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
new file mode 100644
index 000000000..c5b3a631b
--- /dev/null
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -0,0 +1,636 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2017 Oracle. All rights reserved.
+ */
+
+#include <linux/types.h>
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../volumes.h"
+#include "../disk-io.h"
+#include "../block-group.h"
+
+static void free_extent_map_tree(struct extent_map_tree *em_tree)
+{
+ struct extent_map *em;
+ struct rb_node *node;
+
+ write_lock(&em_tree->lock);
+ while (!RB_EMPTY_ROOT(&em_tree->map.rb_root)) {
+ node = rb_first_cached(&em_tree->map);
+ em = rb_entry(node, struct extent_map, rb_node);
+ remove_extent_mapping(em_tree, em);
+
+#ifdef CONFIG_BTRFS_DEBUG
+ if (refcount_read(&em->refs) != 1) {
+ test_err(
+"em leak: em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx) refs %d",
+ em->start, em->len, em->block_start,
+ em->block_len, refcount_read(&em->refs));
+
+ refcount_set(&em->refs, 1);
+ }
+#endif
+ free_extent_map(em);
+ }
+ write_unlock(&em_tree->lock);
+}
+
+/*
+ * Test scenario:
+ *
+ * Suppose that no extent map has been loaded into memory yet, there is a file
+ * extent [0, 16K), followed by another file extent [16K, 20K), two dio reads
+ * are entering btrfs_get_extent() concurrently, t1 is reading [8K, 16K), t2 is
+ * reading [0, 8K)
+ *
+ * t1 t2
+ * btrfs_get_extent() btrfs_get_extent()
+ * -> lookup_extent_mapping() ->lookup_extent_mapping()
+ * -> add_extent_mapping(0, 16K)
+ * -> return em
+ * ->add_extent_mapping(0, 16K)
+ * -> #handle -EEXIST
+ */
+static int test_case_1(struct btrfs_fs_info *fs_info,
+ struct extent_map_tree *em_tree)
+{
+ struct extent_map *em;
+ u64 start = 0;
+ u64 len = SZ_8K;
+ int ret;
+
+ em = alloc_extent_map();
+ if (!em) {
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ return -ENOMEM;
+ }
+
+ /* Add [0, 16K) */
+ em->start = 0;
+ em->len = SZ_16K;
+ em->block_start = 0;
+ em->block_len = SZ_16K;
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em, 0);
+ write_unlock(&em_tree->lock);
+ if (ret < 0) {
+ test_err("cannot add extent range [0, 16K)");
+ goto out;
+ }
+ free_extent_map(em);
+
+ /* Add [16K, 20K) following [0, 16K) */
+ em = alloc_extent_map();
+ if (!em) {
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ em->start = SZ_16K;
+ em->len = SZ_4K;
+ em->block_start = SZ_32K; /* avoid merging */
+ em->block_len = SZ_4K;
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em, 0);
+ write_unlock(&em_tree->lock);
+ if (ret < 0) {
+ test_err("cannot add extent range [16K, 20K)");
+ goto out;
+ }
+ free_extent_map(em);
+
+ em = alloc_extent_map();
+ if (!em) {
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* Add [0, 8K), should return [0, 16K) instead. */
+ em->start = start;
+ em->len = len;
+ em->block_start = start;
+ em->block_len = len;
+ write_lock(&em_tree->lock);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
+ write_unlock(&em_tree->lock);
+ if (ret) {
+ test_err("case1 [%llu %llu]: ret %d", start, start + len, ret);
+ goto out;
+ }
+ if (em &&
+ (em->start != 0 || extent_map_end(em) != SZ_16K ||
+ em->block_start != 0 || em->block_len != SZ_16K)) {
+ test_err(
+"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu",
+ start, start + len, ret, em->start, em->len,
+ em->block_start, em->block_len);
+ ret = -EINVAL;
+ }
+ free_extent_map(em);
+out:
+ free_extent_map_tree(em_tree);
+
+ return ret;
+}
+
+/*
+ * Test scenario:
+ *
+ * Reading the inline ending up with EEXIST, ie. read an inline
+ * extent and discard page cache and read it again.
+ */
+static int test_case_2(struct btrfs_fs_info *fs_info,
+ struct extent_map_tree *em_tree)
+{
+ struct extent_map *em;
+ int ret;
+
+ em = alloc_extent_map();
+ if (!em) {
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ return -ENOMEM;
+ }
+
+ /* Add [0, 1K) */
+ em->start = 0;
+ em->len = SZ_1K;
+ em->block_start = EXTENT_MAP_INLINE;
+ em->block_len = (u64)-1;
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em, 0);
+ write_unlock(&em_tree->lock);
+ if (ret < 0) {
+ test_err("cannot add extent range [0, 1K)");
+ goto out;
+ }
+ free_extent_map(em);
+
+ /* Add [4K, 8K) following [0, 1K) */
+ em = alloc_extent_map();
+ if (!em) {
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ em->start = SZ_4K;
+ em->len = SZ_4K;
+ em->block_start = SZ_4K;
+ em->block_len = SZ_4K;
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em, 0);
+ write_unlock(&em_tree->lock);
+ if (ret < 0) {
+ test_err("cannot add extent range [4K, 8K)");
+ goto out;
+ }
+ free_extent_map(em);
+
+ em = alloc_extent_map();
+ if (!em) {
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* Add [0, 1K) */
+ em->start = 0;
+ em->len = SZ_1K;
+ em->block_start = EXTENT_MAP_INLINE;
+ em->block_len = (u64)-1;
+ write_lock(&em_tree->lock);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
+ write_unlock(&em_tree->lock);
+ if (ret) {
+ test_err("case2 [0 1K]: ret %d", ret);
+ goto out;
+ }
+ if (em &&
+ (em->start != 0 || extent_map_end(em) != SZ_1K ||
+ em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1)) {
+ test_err(
+"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu",
+ ret, em->start, em->len, em->block_start,
+ em->block_len);
+ ret = -EINVAL;
+ }
+ free_extent_map(em);
+out:
+ free_extent_map_tree(em_tree);
+
+ return ret;
+}
+
+static int __test_case_3(struct btrfs_fs_info *fs_info,
+ struct extent_map_tree *em_tree, u64 start)
+{
+ struct extent_map *em;
+ u64 len = SZ_4K;
+ int ret;
+
+ em = alloc_extent_map();
+ if (!em) {
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ return -ENOMEM;
+ }
+
+ /* Add [4K, 8K) */
+ em->start = SZ_4K;
+ em->len = SZ_4K;
+ em->block_start = SZ_4K;
+ em->block_len = SZ_4K;
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em, 0);
+ write_unlock(&em_tree->lock);
+ if (ret < 0) {
+ test_err("cannot add extent range [4K, 8K)");
+ goto out;
+ }
+ free_extent_map(em);
+
+ em = alloc_extent_map();
+ if (!em) {
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* Add [0, 16K) */
+ em->start = 0;
+ em->len = SZ_16K;
+ em->block_start = 0;
+ em->block_len = SZ_16K;
+ write_lock(&em_tree->lock);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
+ write_unlock(&em_tree->lock);
+ if (ret) {
+ test_err("case3 [0x%llx 0x%llx): ret %d",
+ start, start + len, ret);
+ goto out;
+ }
+ /*
+ * Since bytes within em are contiguous, em->block_start is identical to
+ * em->start.
+ */
+ if (em &&
+ (start < em->start || start + len > extent_map_end(em) ||
+ em->start != em->block_start || em->len != em->block_len)) {
+ test_err(
+"case3 [0x%llx 0x%llx): ret %d em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)",
+ start, start + len, ret, em->start, em->len,
+ em->block_start, em->block_len);
+ ret = -EINVAL;
+ }
+ free_extent_map(em);
+out:
+ free_extent_map_tree(em_tree);
+
+ return ret;
+}
+
+/*
+ * Test scenario:
+ *
+ * Suppose that no extent map has been loaded into memory yet.
+ * There is a file extent [0, 16K), two jobs are running concurrently
+ * against it, t1 is buffered writing to [4K, 8K) and t2 is doing dio
+ * read from [0, 4K) or [8K, 12K) or [12K, 16K).
+ *
+ * t1 goes ahead of t2 and adds em [4K, 8K) into tree.
+ *
+ * t1 t2
+ * cow_file_range() btrfs_get_extent()
+ * -> lookup_extent_mapping()
+ * -> add_extent_mapping()
+ * -> add_extent_mapping()
+ */
+static int test_case_3(struct btrfs_fs_info *fs_info,
+ struct extent_map_tree *em_tree)
+{
+ int ret;
+
+ ret = __test_case_3(fs_info, em_tree, 0);
+ if (ret)
+ return ret;
+ ret = __test_case_3(fs_info, em_tree, SZ_8K);
+ if (ret)
+ return ret;
+ ret = __test_case_3(fs_info, em_tree, (12 * SZ_1K));
+
+ return ret;
+}
+
+static int __test_case_4(struct btrfs_fs_info *fs_info,
+ struct extent_map_tree *em_tree, u64 start)
+{
+ struct extent_map *em;
+ u64 len = SZ_4K;
+ int ret;
+
+ em = alloc_extent_map();
+ if (!em) {
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ return -ENOMEM;
+ }
+
+ /* Add [0K, 8K) */
+ em->start = 0;
+ em->len = SZ_8K;
+ em->block_start = 0;
+ em->block_len = SZ_8K;
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em, 0);
+ write_unlock(&em_tree->lock);
+ if (ret < 0) {
+ test_err("cannot add extent range [0, 8K)");
+ goto out;
+ }
+ free_extent_map(em);
+
+ em = alloc_extent_map();
+ if (!em) {
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* Add [8K, 32K) */
+ em->start = SZ_8K;
+ em->len = 24 * SZ_1K;
+ em->block_start = SZ_16K; /* avoid merging */
+ em->block_len = 24 * SZ_1K;
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em, 0);
+ write_unlock(&em_tree->lock);
+ if (ret < 0) {
+ test_err("cannot add extent range [8K, 32K)");
+ goto out;
+ }
+ free_extent_map(em);
+
+ em = alloc_extent_map();
+ if (!em) {
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ ret = -ENOMEM;
+ goto out;
+ }
+ /* Add [0K, 32K) */
+ em->start = 0;
+ em->len = SZ_32K;
+ em->block_start = 0;
+ em->block_len = SZ_32K;
+ write_lock(&em_tree->lock);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
+ write_unlock(&em_tree->lock);
+ if (ret) {
+ test_err("case4 [0x%llx 0x%llx): ret %d",
+ start, len, ret);
+ goto out;
+ }
+ if (em && (start < em->start || start + len > extent_map_end(em))) {
+ test_err(
+"case4 [0x%llx 0x%llx): ret %d, added wrong em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)",
+ start, len, ret, em->start, em->len, em->block_start,
+ em->block_len);
+ ret = -EINVAL;
+ }
+ free_extent_map(em);
+out:
+ free_extent_map_tree(em_tree);
+
+ return ret;
+}
+
+/*
+ * Test scenario:
+ *
+ * Suppose that no extent map has been loaded into memory yet.
+ * There is a file extent [0, 32K), two jobs are running concurrently
+ * against it, t1 is doing dio write to [8K, 32K) and t2 is doing dio
+ * read from [0, 4K) or [4K, 8K).
+ *
+ * t1 goes ahead of t2 and splits em [0, 32K) to em [0K, 8K) and [8K 32K).
+ *
+ * t1 t2
+ * btrfs_get_blocks_direct() btrfs_get_blocks_direct()
+ * -> btrfs_get_extent() -> btrfs_get_extent()
+ * -> lookup_extent_mapping()
+ * -> add_extent_mapping() -> lookup_extent_mapping()
+ * # load [0, 32K)
+ * -> btrfs_new_extent_direct()
+ * -> btrfs_drop_extent_cache()
+ * # split [0, 32K)
+ * -> add_extent_mapping()
+ * # add [8K, 32K)
+ * -> add_extent_mapping()
+ * # handle -EEXIST when adding
+ * # [0, 32K)
+ */
+static int test_case_4(struct btrfs_fs_info *fs_info,
+ struct extent_map_tree *em_tree)
+{
+ int ret;
+
+ ret = __test_case_4(fs_info, em_tree, 0);
+ if (ret)
+ return ret;
+ ret = __test_case_4(fs_info, em_tree, SZ_4K);
+
+ return ret;
+}
+
+struct rmap_test_vector {
+ u64 raid_type;
+ u64 physical_start;
+ u64 data_stripe_size;
+ u64 num_data_stripes;
+ u64 num_stripes;
+ /* Assume we won't have more than 5 physical stripes */
+ u64 data_stripe_phys_start[5];
+ bool expected_mapped_addr;
+ /* Physical to logical addresses */
+ u64 mapped_logical[5];
+};
+
+static int test_rmap_block(struct btrfs_fs_info *fs_info,
+ struct rmap_test_vector *test)
+{
+ struct extent_map *em;
+ struct map_lookup *map = NULL;
+ u64 *logical = NULL;
+ int i, out_ndaddrs, out_stripe_len;
+ int ret;
+
+ em = alloc_extent_map();
+ if (!em) {
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ return -ENOMEM;
+ }
+
+ map = kmalloc(map_lookup_size(test->num_stripes), GFP_KERNEL);
+ if (!map) {
+ kfree(em);
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ return -ENOMEM;
+ }
+
+ set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
+ /* Start at 4GiB logical address */
+ em->start = SZ_4G;
+ em->len = test->data_stripe_size * test->num_data_stripes;
+ em->block_len = em->len;
+ em->orig_block_len = test->data_stripe_size;
+ em->map_lookup = map;
+
+ map->num_stripes = test->num_stripes;
+ map->stripe_len = BTRFS_STRIPE_LEN;
+ map->type = test->raid_type;
+
+ for (i = 0; i < map->num_stripes; i++) {
+ struct btrfs_device *dev = btrfs_alloc_dummy_device(fs_info);
+
+ if (IS_ERR(dev)) {
+ test_err("cannot allocate device");
+ ret = PTR_ERR(dev);
+ goto out;
+ }
+ map->stripes[i].dev = dev;
+ map->stripes[i].physical = test->data_stripe_phys_start[i];
+ }
+
+ write_lock(&fs_info->mapping_tree.lock);
+ ret = add_extent_mapping(&fs_info->mapping_tree, em, 0);
+ write_unlock(&fs_info->mapping_tree.lock);
+ if (ret) {
+ test_err("error adding block group mapping to mapping tree");
+ goto out_free;
+ }
+
+ ret = btrfs_rmap_block(fs_info, em->start, NULL, btrfs_sb_offset(1),
+ &logical, &out_ndaddrs, &out_stripe_len);
+ if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) {
+ test_err("didn't rmap anything but expected %d",
+ test->expected_mapped_addr);
+ goto out;
+ }
+
+ if (out_stripe_len != BTRFS_STRIPE_LEN) {
+ test_err("calculated stripe length doesn't match");
+ goto out;
+ }
+
+ if (out_ndaddrs != test->expected_mapped_addr) {
+ for (i = 0; i < out_ndaddrs; i++)
+ test_msg("mapped %llu", logical[i]);
+ test_err("unexpected number of mapped addresses: %d", out_ndaddrs);
+ goto out;
+ }
+
+ for (i = 0; i < out_ndaddrs; i++) {
+ if (logical[i] != test->mapped_logical[i]) {
+ test_err("unexpected logical address mapped");
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ write_lock(&fs_info->mapping_tree.lock);
+ remove_extent_mapping(&fs_info->mapping_tree, em);
+ write_unlock(&fs_info->mapping_tree.lock);
+ /* For us */
+ free_extent_map(em);
+out_free:
+ /* For the tree */
+ free_extent_map(em);
+ kfree(logical);
+ return ret;
+}
+
+int btrfs_test_extent_map(void)
+{
+ struct btrfs_fs_info *fs_info = NULL;
+ struct extent_map_tree *em_tree;
+ int ret = 0, i;
+ struct rmap_test_vector rmap_tests[] = {
+ {
+ /*
+ * Test a chunk with 2 data stripes one of which
+ * intersects the physical address of the super block
+ * is correctly recognised.
+ */
+ .raid_type = BTRFS_BLOCK_GROUP_RAID1,
+ .physical_start = SZ_64M - SZ_4M,
+ .data_stripe_size = SZ_256M,
+ .num_data_stripes = 2,
+ .num_stripes = 2,
+ .data_stripe_phys_start =
+ {SZ_64M - SZ_4M, SZ_64M - SZ_4M + SZ_256M},
+ .expected_mapped_addr = true,
+ .mapped_logical= {SZ_4G + SZ_4M}
+ },
+ {
+ /*
+ * Test that out-of-range physical addresses are
+ * ignored
+ */
+
+ /* SINGLE chunk type */
+ .raid_type = 0,
+ .physical_start = SZ_4G,
+ .data_stripe_size = SZ_256M,
+ .num_data_stripes = 1,
+ .num_stripes = 1,
+ .data_stripe_phys_start = {SZ_256M},
+ .expected_mapped_addr = false,
+ .mapped_logical = {0}
+ }
+ };
+
+ test_msg("running extent_map tests");
+
+ /*
+ * Note: the fs_info is not set up completely, we only need
+ * fs_info::fsid for the tracepoint.
+ */
+ fs_info = btrfs_alloc_dummy_fs_info(PAGE_SIZE, PAGE_SIZE);
+ if (!fs_info) {
+ test_std_err(TEST_ALLOC_FS_INFO);
+ return -ENOMEM;
+ }
+
+ em_tree = kzalloc(sizeof(*em_tree), GFP_KERNEL);
+ if (!em_tree) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ extent_map_tree_init(em_tree);
+
+ ret = test_case_1(fs_info, em_tree);
+ if (ret)
+ goto out;
+ ret = test_case_2(fs_info, em_tree);
+ if (ret)
+ goto out;
+ ret = test_case_3(fs_info, em_tree);
+ if (ret)
+ goto out;
+ ret = test_case_4(fs_info, em_tree);
+
+ test_msg("running rmap tests");
+ for (i = 0; i < ARRAY_SIZE(rmap_tests); i++) {
+ ret = test_rmap_block(fs_info, &rmap_tests[i]);
+ if (ret)
+ goto out;
+ }
+
+out:
+ kfree(em_tree);
+ btrfs_free_dummy_fs_info(fs_info);
+
+ return ret;
+}
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
new file mode 100644
index 000000000..ebf68fcd2
--- /dev/null
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -0,0 +1,1063 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2013 Fusion IO. All rights reserved.
+ */
+
+#include <linux/slab.h>
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../disk-io.h"
+#include "../free-space-cache.h"
+#include "../block-group.h"
+
+#define BITS_PER_BITMAP (PAGE_SIZE * 8UL)
+
+/*
+ * This test just does basic sanity checking, making sure we can add an extent
+ * entry and remove space from either end and the middle, and make sure we can
+ * remove space that covers adjacent extent entries.
+ */
+static int test_extents(struct btrfs_block_group *cache)
+{
+ int ret = 0;
+
+ test_msg("running extent only tests");
+
+ /* First just make sure we can remove an entire entry */
+ ret = btrfs_add_free_space(cache, 0, SZ_4M);
+ if (ret) {
+ test_err("error adding initial extents %d", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 0, SZ_4M);
+ if (ret) {
+ test_err("error removing extent %d", ret);
+ return ret;
+ }
+
+ if (test_check_exists(cache, 0, SZ_4M)) {
+ test_err("full remove left some lingering space");
+ return -1;
+ }
+
+ /* Ok edge and middle cases now */
+ ret = btrfs_add_free_space(cache, 0, SZ_4M);
+ if (ret) {
+ test_err("error adding half extent %d", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 3 * SZ_1M, SZ_1M);
+ if (ret) {
+ test_err("error removing tail end %d", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 0, SZ_1M);
+ if (ret) {
+ test_err("error removing front end %d", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, SZ_2M, 4096);
+ if (ret) {
+ test_err("error removing middle piece %d", ret);
+ return ret;
+ }
+
+ if (test_check_exists(cache, 0, SZ_1M)) {
+ test_err("still have space at the front");
+ return -1;
+ }
+
+ if (test_check_exists(cache, SZ_2M, 4096)) {
+ test_err("still have space in the middle");
+ return -1;
+ }
+
+ if (test_check_exists(cache, 3 * SZ_1M, SZ_1M)) {
+ test_err("still have space at the end");
+ return -1;
+ }
+
+ /* Cleanup */
+ btrfs_remove_free_space_cache(cache);
+
+ return 0;
+}
+
+static int test_bitmaps(struct btrfs_block_group *cache, u32 sectorsize)
+{
+ u64 next_bitmap_offset;
+ int ret;
+
+ test_msg("running bitmap only tests");
+
+ ret = test_add_free_space_entry(cache, 0, SZ_4M, 1);
+ if (ret) {
+ test_err("couldn't create a bitmap entry %d", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 0, SZ_4M);
+ if (ret) {
+ test_err("error removing bitmap full range %d", ret);
+ return ret;
+ }
+
+ if (test_check_exists(cache, 0, SZ_4M)) {
+ test_err("left some space in bitmap");
+ return -1;
+ }
+
+ ret = test_add_free_space_entry(cache, 0, SZ_4M, 1);
+ if (ret) {
+ test_err("couldn't add to our bitmap entry %d", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, SZ_1M, SZ_2M);
+ if (ret) {
+ test_err("couldn't remove middle chunk %d", ret);
+ return ret;
+ }
+
+ /*
+ * The first bitmap we have starts at offset 0 so the next one is just
+ * at the end of the first bitmap.
+ */
+ next_bitmap_offset = (u64)(BITS_PER_BITMAP * sectorsize);
+
+ /* Test a bit straddling two bitmaps */
+ ret = test_add_free_space_entry(cache, next_bitmap_offset - SZ_2M,
+ SZ_4M, 1);
+ if (ret) {
+ test_err("couldn't add space that straddles two bitmaps %d",
+ ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, next_bitmap_offset - SZ_1M, SZ_2M);
+ if (ret) {
+ test_err("couldn't remove overlapping space %d", ret);
+ return ret;
+ }
+
+ if (test_check_exists(cache, next_bitmap_offset - SZ_1M, SZ_2M)) {
+ test_err("left some space when removing overlapping");
+ return -1;
+ }
+
+ btrfs_remove_free_space_cache(cache);
+
+ return 0;
+}
+
+/* This is the high grade jackassery */
+static int test_bitmaps_and_extents(struct btrfs_block_group *cache,
+ u32 sectorsize)
+{
+ u64 bitmap_offset = (u64)(BITS_PER_BITMAP * sectorsize);
+ int ret;
+
+ test_msg("running bitmap and extent tests");
+
+ /*
+ * First let's do something simple, an extent at the same offset as the
+ * bitmap, but the free space completely in the extent and then
+ * completely in the bitmap.
+ */
+ ret = test_add_free_space_entry(cache, SZ_4M, SZ_1M, 1);
+ if (ret) {
+ test_err("couldn't create bitmap entry %d", ret);
+ return ret;
+ }
+
+ ret = test_add_free_space_entry(cache, 0, SZ_1M, 0);
+ if (ret) {
+ test_err("couldn't add extent entry %d", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 0, SZ_1M);
+ if (ret) {
+ test_err("couldn't remove extent entry %d", ret);
+ return ret;
+ }
+
+ if (test_check_exists(cache, 0, SZ_1M)) {
+ test_err("left remnants after our remove");
+ return -1;
+ }
+
+ /* Now to add back the extent entry and remove from the bitmap */
+ ret = test_add_free_space_entry(cache, 0, SZ_1M, 0);
+ if (ret) {
+ test_err("couldn't re-add extent entry %d", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, SZ_4M, SZ_1M);
+ if (ret) {
+ test_err("couldn't remove from bitmap %d", ret);
+ return ret;
+ }
+
+ if (test_check_exists(cache, SZ_4M, SZ_1M)) {
+ test_err("left remnants in the bitmap");
+ return -1;
+ }
+
+ /*
+ * Ok so a little more evil, extent entry and bitmap at the same offset,
+ * removing an overlapping chunk.
+ */
+ ret = test_add_free_space_entry(cache, SZ_1M, SZ_4M, 1);
+ if (ret) {
+ test_err("couldn't add to a bitmap %d", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, SZ_512K, 3 * SZ_1M);
+ if (ret) {
+ test_err("couldn't remove overlapping space %d", ret);
+ return ret;
+ }
+
+ if (test_check_exists(cache, SZ_512K, 3 * SZ_1M)) {
+ test_err("left over pieces after removing overlapping");
+ return -1;
+ }
+
+ btrfs_remove_free_space_cache(cache);
+
+ /* Now with the extent entry offset into the bitmap */
+ ret = test_add_free_space_entry(cache, SZ_4M, SZ_4M, 1);
+ if (ret) {
+ test_err("couldn't add space to the bitmap %d", ret);
+ return ret;
+ }
+
+ ret = test_add_free_space_entry(cache, SZ_2M, SZ_2M, 0);
+ if (ret) {
+ test_err("couldn't add extent to the cache %d", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 3 * SZ_1M, SZ_4M);
+ if (ret) {
+ test_err("problem removing overlapping space %d", ret);
+ return ret;
+ }
+
+ if (test_check_exists(cache, 3 * SZ_1M, SZ_4M)) {
+ test_err("left something behind when removing space");
+ return -1;
+ }
+
+ /*
+ * This has blown up in the past, the extent entry starts before the
+ * bitmap entry, but we're trying to remove an offset that falls
+ * completely within the bitmap range and is in both the extent entry
+ * and the bitmap entry, looks like this
+ *
+ * [ extent ]
+ * [ bitmap ]
+ * [ del ]
+ */
+ btrfs_remove_free_space_cache(cache);
+ ret = test_add_free_space_entry(cache, bitmap_offset + SZ_4M, SZ_4M, 1);
+ if (ret) {
+ test_err("couldn't add bitmap %d", ret);
+ return ret;
+ }
+
+ ret = test_add_free_space_entry(cache, bitmap_offset - SZ_1M,
+ 5 * SZ_1M, 0);
+ if (ret) {
+ test_err("couldn't add extent entry %d", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, bitmap_offset + SZ_1M, 5 * SZ_1M);
+ if (ret) {
+ test_err("failed to free our space %d", ret);
+ return ret;
+ }
+
+ if (test_check_exists(cache, bitmap_offset + SZ_1M, 5 * SZ_1M)) {
+ test_err("left stuff over");
+ return -1;
+ }
+
+ btrfs_remove_free_space_cache(cache);
+
+ /*
+ * This blew up before, we have part of the free space in a bitmap and
+ * then the entirety of the rest of the space in an extent. This used
+ * to return -EAGAIN back from btrfs_remove_extent, make sure this
+ * doesn't happen.
+ */
+ ret = test_add_free_space_entry(cache, SZ_1M, SZ_2M, 1);
+ if (ret) {
+ test_err("couldn't add bitmap entry %d", ret);
+ return ret;
+ }
+
+ ret = test_add_free_space_entry(cache, 3 * SZ_1M, SZ_1M, 0);
+ if (ret) {
+ test_err("couldn't add extent entry %d", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, SZ_1M, 3 * SZ_1M);
+ if (ret) {
+ test_err("error removing bitmap and extent overlapping %d", ret);
+ return ret;
+ }
+
+ btrfs_remove_free_space_cache(cache);
+ return 0;
+}
+
+/* Used by test_steal_space_from_bitmap_to_extent(). */
+static bool test_use_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info)
+{
+ return ctl->free_extents > 0;
+}
+
+/* Used by test_steal_space_from_bitmap_to_extent(). */
+static int
+check_num_extents_and_bitmaps(const struct btrfs_block_group *cache,
+ const int num_extents,
+ const int num_bitmaps)
+{
+ if (cache->free_space_ctl->free_extents != num_extents) {
+ test_err(
+ "incorrect # of extent entries in the cache: %d, expected %d",
+ cache->free_space_ctl->free_extents, num_extents);
+ return -EINVAL;
+ }
+ if (cache->free_space_ctl->total_bitmaps != num_bitmaps) {
+ test_err(
+ "incorrect # of extent entries in the cache: %d, expected %d",
+ cache->free_space_ctl->total_bitmaps, num_bitmaps);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/* Used by test_steal_space_from_bitmap_to_extent(). */
+static int check_cache_empty(struct btrfs_block_group *cache)
+{
+ u64 offset;
+ u64 max_extent_size;
+
+ /*
+ * Now lets confirm that there's absolutely no free space left to
+ * allocate.
+ */
+ if (cache->free_space_ctl->free_space != 0) {
+ test_err("cache free space is not 0");
+ return -EINVAL;
+ }
+
+ /* And any allocation request, no matter how small, should fail now. */
+ offset = btrfs_find_space_for_alloc(cache, 0, 4096, 0,
+ &max_extent_size);
+ if (offset != 0) {
+ test_err("space allocation did not fail, returned offset: %llu",
+ offset);
+ return -EINVAL;
+ }
+
+ /* And no extent nor bitmap entries in the cache anymore. */
+ return check_num_extents_and_bitmaps(cache, 0, 0);
+}
+
+/*
+ * Before we were able to steal free space from a bitmap entry to an extent
+ * entry, we could end up with 2 entries representing a contiguous free space.
+ * One would be an extent entry and the other a bitmap entry. Since in order
+ * to allocate space to a caller we use only 1 entry, we couldn't return that
+ * whole range to the caller if it was requested. This forced the caller to
+ * either assume ENOSPC or perform several smaller space allocations, which
+ * wasn't optimal as they could be spread all over the block group while under
+ * concurrency (extra overhead and fragmentation).
+ *
+ * This stealing approach is beneficial, since we always prefer to allocate
+ * from extent entries, both for clustered and non-clustered allocation
+ * requests.
+ */
+static int
+test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache,
+ u32 sectorsize)
+{
+ int ret;
+ u64 offset;
+ u64 max_extent_size;
+ const struct btrfs_free_space_op test_free_space_ops = {
+ .use_bitmap = test_use_bitmap,
+ };
+ const struct btrfs_free_space_op *orig_free_space_ops;
+
+ test_msg("running space stealing from bitmap to extent tests");
+
+ /*
+ * For this test, we want to ensure we end up with an extent entry
+ * immediately adjacent to a bitmap entry, where the bitmap starts
+ * at an offset where the extent entry ends. We keep adding and
+ * removing free space to reach into this state, but to get there
+ * we need to reach a point where marking new free space doesn't
+ * result in adding new extent entries or merging the new space
+ * with existing extent entries - the space ends up being marked
+ * in an existing bitmap that covers the new free space range.
+ *
+ * To get there, we need to reach the threshold defined set at
+ * cache->free_space_ctl->extents_thresh, which currently is
+ * 256 extents on a x86_64 system at least, and a few other
+ * conditions (check free_space_cache.c). Instead of making the
+ * test much longer and complicated, use a "use_bitmap" operation
+ * that forces use of bitmaps as soon as we have at least 1
+ * extent entry.
+ */
+ orig_free_space_ops = cache->free_space_ctl->op;
+ cache->free_space_ctl->op = &test_free_space_ops;
+
+ /*
+ * Extent entry covering free space range [128Mb - 256Kb, 128Mb - 128Kb[
+ */
+ ret = test_add_free_space_entry(cache, SZ_128M - SZ_256K, SZ_128K, 0);
+ if (ret) {
+ test_err("couldn't add extent entry %d", ret);
+ return ret;
+ }
+
+ /* Bitmap entry covering free space range [128Mb + 512Kb, 256Mb[ */
+ ret = test_add_free_space_entry(cache, SZ_128M + SZ_512K,
+ SZ_128M - SZ_512K, 1);
+ if (ret) {
+ test_err("couldn't add bitmap entry %d", ret);
+ return ret;
+ }
+
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * Now make only the first 256Kb of the bitmap marked as free, so that
+ * we end up with only the following ranges marked as free space:
+ *
+ * [128Mb - 256Kb, 128Mb - 128Kb[
+ * [128Mb + 512Kb, 128Mb + 768Kb[
+ */
+ ret = btrfs_remove_free_space(cache,
+ SZ_128M + 768 * SZ_1K,
+ SZ_128M - 768 * SZ_1K);
+ if (ret) {
+ test_err("failed to free part of bitmap space %d", ret);
+ return ret;
+ }
+
+ /* Confirm that only those 2 ranges are marked as free. */
+ if (!test_check_exists(cache, SZ_128M - SZ_256K, SZ_128K)) {
+ test_err("free space range missing");
+ return -ENOENT;
+ }
+ if (!test_check_exists(cache, SZ_128M + SZ_512K, SZ_256K)) {
+ test_err("free space range missing");
+ return -ENOENT;
+ }
+
+ /*
+ * Confirm that the bitmap range [128Mb + 768Kb, 256Mb[ isn't marked
+ * as free anymore.
+ */
+ if (test_check_exists(cache, SZ_128M + 768 * SZ_1K,
+ SZ_128M - 768 * SZ_1K)) {
+ test_err("bitmap region not removed from space cache");
+ return -EINVAL;
+ }
+
+ /*
+ * Confirm that the region [128Mb + 256Kb, 128Mb + 512Kb[, which is
+ * covered by the bitmap, isn't marked as free.
+ */
+ if (test_check_exists(cache, SZ_128M + SZ_256K, SZ_256K)) {
+ test_err("invalid bitmap region marked as free");
+ return -EINVAL;
+ }
+
+ /*
+ * Confirm that the region [128Mb, 128Mb + 256Kb[, which is covered
+ * by the bitmap too, isn't marked as free either.
+ */
+ if (test_check_exists(cache, SZ_128M, SZ_256K)) {
+ test_err("invalid bitmap region marked as free");
+ return -EINVAL;
+ }
+
+ /*
+ * Now lets mark the region [128Mb, 128Mb + 512Kb[ as free too. But,
+ * lets make sure the free space cache marks it as free in the bitmap,
+ * and doesn't insert a new extent entry to represent this region.
+ */
+ ret = btrfs_add_free_space(cache, SZ_128M, SZ_512K);
+ if (ret) {
+ test_err("error adding free space: %d", ret);
+ return ret;
+ }
+ /* Confirm the region is marked as free. */
+ if (!test_check_exists(cache, SZ_128M, SZ_512K)) {
+ test_err("bitmap region not marked as free");
+ return -ENOENT;
+ }
+
+ /*
+ * Confirm that no new extent entries or bitmap entries were added to
+ * the cache after adding that free space region.
+ */
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * Now lets add a small free space region to the right of the previous
+ * one, which is not contiguous with it and is part of the bitmap too.
+ * The goal is to test that the bitmap entry space stealing doesn't
+ * steal this space region.
+ */
+ ret = btrfs_add_free_space(cache, SZ_128M + SZ_16M, sectorsize);
+ if (ret) {
+ test_err("error adding free space: %d", ret);
+ return ret;
+ }
+
+ /*
+ * Confirm that no new extent entries or bitmap entries were added to
+ * the cache after adding that free space region.
+ */
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * Now mark the region [128Mb - 128Kb, 128Mb[ as free too. This will
+ * expand the range covered by the existing extent entry that represents
+ * the free space [128Mb - 256Kb, 128Mb - 128Kb[.
+ */
+ ret = btrfs_add_free_space(cache, SZ_128M - SZ_128K, SZ_128K);
+ if (ret) {
+ test_err("error adding free space: %d", ret);
+ return ret;
+ }
+ /* Confirm the region is marked as free. */
+ if (!test_check_exists(cache, SZ_128M - SZ_128K, SZ_128K)) {
+ test_err("extent region not marked as free");
+ return -ENOENT;
+ }
+
+ /*
+ * Confirm that our extent entry didn't stole all free space from the
+ * bitmap, because of the small 4Kb free space region.
+ */
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * So now we have the range [128Mb - 256Kb, 128Mb + 768Kb[ as free
+ * space. Without stealing bitmap free space into extent entry space,
+ * we would have all this free space represented by 2 entries in the
+ * cache:
+ *
+ * extent entry covering range: [128Mb - 256Kb, 128Mb[
+ * bitmap entry covering range: [128Mb, 128Mb + 768Kb[
+ *
+ * Attempting to allocate the whole free space (1Mb) would fail, because
+ * we can't allocate from multiple entries.
+ * With the bitmap free space stealing, we get a single extent entry
+ * that represents the 1Mb free space, and therefore we're able to
+ * allocate the whole free space at once.
+ */
+ if (!test_check_exists(cache, SZ_128M - SZ_256K, SZ_1M)) {
+ test_err("expected region not marked as free");
+ return -ENOENT;
+ }
+
+ if (cache->free_space_ctl->free_space != (SZ_1M + sectorsize)) {
+ test_err("cache free space is not 1Mb + %u", sectorsize);
+ return -EINVAL;
+ }
+
+ offset = btrfs_find_space_for_alloc(cache,
+ 0, SZ_1M, 0,
+ &max_extent_size);
+ if (offset != (SZ_128M - SZ_256K)) {
+ test_err(
+ "failed to allocate 1Mb from space cache, returned offset is: %llu",
+ offset);
+ return -EINVAL;
+ }
+
+ /*
+ * All that remains is a sectorsize free space region in a bitmap.
+ * Confirm.
+ */
+ ret = check_num_extents_and_bitmaps(cache, 1, 1);
+ if (ret)
+ return ret;
+
+ if (cache->free_space_ctl->free_space != sectorsize) {
+ test_err("cache free space is not %u", sectorsize);
+ return -EINVAL;
+ }
+
+ offset = btrfs_find_space_for_alloc(cache,
+ 0, sectorsize, 0,
+ &max_extent_size);
+ if (offset != (SZ_128M + SZ_16M)) {
+ test_err("failed to allocate %u, returned offset : %llu",
+ sectorsize, offset);
+ return -EINVAL;
+ }
+
+ ret = check_cache_empty(cache);
+ if (ret)
+ return ret;
+
+ btrfs_remove_free_space_cache(cache);
+
+ /*
+ * Now test a similar scenario, but where our extent entry is located
+ * to the right of the bitmap entry, so that we can check that stealing
+ * space from a bitmap to the front of an extent entry works.
+ */
+
+ /*
+ * Extent entry covering free space range [128Mb + 128Kb, 128Mb + 256Kb[
+ */
+ ret = test_add_free_space_entry(cache, SZ_128M + SZ_128K, SZ_128K, 0);
+ if (ret) {
+ test_err("couldn't add extent entry %d", ret);
+ return ret;
+ }
+
+ /* Bitmap entry covering free space range [0, 128Mb - 512Kb[ */
+ ret = test_add_free_space_entry(cache, 0, SZ_128M - SZ_512K, 1);
+ if (ret) {
+ test_err("couldn't add bitmap entry %d", ret);
+ return ret;
+ }
+
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * Now make only the last 256Kb of the bitmap marked as free, so that
+ * we end up with only the following ranges marked as free space:
+ *
+ * [128Mb + 128b, 128Mb + 256Kb[
+ * [128Mb - 768Kb, 128Mb - 512Kb[
+ */
+ ret = btrfs_remove_free_space(cache, 0, SZ_128M - 768 * SZ_1K);
+ if (ret) {
+ test_err("failed to free part of bitmap space %d", ret);
+ return ret;
+ }
+
+ /* Confirm that only those 2 ranges are marked as free. */
+ if (!test_check_exists(cache, SZ_128M + SZ_128K, SZ_128K)) {
+ test_err("free space range missing");
+ return -ENOENT;
+ }
+ if (!test_check_exists(cache, SZ_128M - 768 * SZ_1K, SZ_256K)) {
+ test_err("free space range missing");
+ return -ENOENT;
+ }
+
+ /*
+ * Confirm that the bitmap range [0, 128Mb - 768Kb[ isn't marked
+ * as free anymore.
+ */
+ if (test_check_exists(cache, 0, SZ_128M - 768 * SZ_1K)) {
+ test_err("bitmap region not removed from space cache");
+ return -EINVAL;
+ }
+
+ /*
+ * Confirm that the region [128Mb - 512Kb, 128Mb[, which is
+ * covered by the bitmap, isn't marked as free.
+ */
+ if (test_check_exists(cache, SZ_128M - SZ_512K, SZ_512K)) {
+ test_err("invalid bitmap region marked as free");
+ return -EINVAL;
+ }
+
+ /*
+ * Now lets mark the region [128Mb - 512Kb, 128Mb[ as free too. But,
+ * lets make sure the free space cache marks it as free in the bitmap,
+ * and doesn't insert a new extent entry to represent this region.
+ */
+ ret = btrfs_add_free_space(cache, SZ_128M - SZ_512K, SZ_512K);
+ if (ret) {
+ test_err("error adding free space: %d", ret);
+ return ret;
+ }
+ /* Confirm the region is marked as free. */
+ if (!test_check_exists(cache, SZ_128M - SZ_512K, SZ_512K)) {
+ test_err("bitmap region not marked as free");
+ return -ENOENT;
+ }
+
+ /*
+ * Confirm that no new extent entries or bitmap entries were added to
+ * the cache after adding that free space region.
+ */
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * Now lets add a small free space region to the left of the previous
+ * one, which is not contiguous with it and is part of the bitmap too.
+ * The goal is to test that the bitmap entry space stealing doesn't
+ * steal this space region.
+ */
+ ret = btrfs_add_free_space(cache, SZ_32M, 2 * sectorsize);
+ if (ret) {
+ test_err("error adding free space: %d", ret);
+ return ret;
+ }
+
+ /*
+ * Now mark the region [128Mb, 128Mb + 128Kb[ as free too. This will
+ * expand the range covered by the existing extent entry that represents
+ * the free space [128Mb + 128Kb, 128Mb + 256Kb[.
+ */
+ ret = btrfs_add_free_space(cache, SZ_128M, SZ_128K);
+ if (ret) {
+ test_err("error adding free space: %d", ret);
+ return ret;
+ }
+ /* Confirm the region is marked as free. */
+ if (!test_check_exists(cache, SZ_128M, SZ_128K)) {
+ test_err("extent region not marked as free");
+ return -ENOENT;
+ }
+
+ /*
+ * Confirm that our extent entry didn't stole all free space from the
+ * bitmap, because of the small 2 * sectorsize free space region.
+ */
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * So now we have the range [128Mb - 768Kb, 128Mb + 256Kb[ as free
+ * space. Without stealing bitmap free space into extent entry space,
+ * we would have all this free space represented by 2 entries in the
+ * cache:
+ *
+ * extent entry covering range: [128Mb, 128Mb + 256Kb[
+ * bitmap entry covering range: [128Mb - 768Kb, 128Mb[
+ *
+ * Attempting to allocate the whole free space (1Mb) would fail, because
+ * we can't allocate from multiple entries.
+ * With the bitmap free space stealing, we get a single extent entry
+ * that represents the 1Mb free space, and therefore we're able to
+ * allocate the whole free space at once.
+ */
+ if (!test_check_exists(cache, SZ_128M - 768 * SZ_1K, SZ_1M)) {
+ test_err("expected region not marked as free");
+ return -ENOENT;
+ }
+
+ if (cache->free_space_ctl->free_space != (SZ_1M + 2 * sectorsize)) {
+ test_err("cache free space is not 1Mb + %u", 2 * sectorsize);
+ return -EINVAL;
+ }
+
+ offset = btrfs_find_space_for_alloc(cache, 0, SZ_1M, 0,
+ &max_extent_size);
+ if (offset != (SZ_128M - 768 * SZ_1K)) {
+ test_err(
+ "failed to allocate 1Mb from space cache, returned offset is: %llu",
+ offset);
+ return -EINVAL;
+ }
+
+ /*
+ * All that remains is 2 * sectorsize free space region
+ * in a bitmap. Confirm.
+ */
+ ret = check_num_extents_and_bitmaps(cache, 1, 1);
+ if (ret)
+ return ret;
+
+ if (cache->free_space_ctl->free_space != 2 * sectorsize) {
+ test_err("cache free space is not %u", 2 * sectorsize);
+ return -EINVAL;
+ }
+
+ offset = btrfs_find_space_for_alloc(cache,
+ 0, 2 * sectorsize, 0,
+ &max_extent_size);
+ if (offset != SZ_32M) {
+ test_err("failed to allocate %u, offset: %llu",
+ 2 * sectorsize, offset);
+ return -EINVAL;
+ }
+
+ ret = check_cache_empty(cache);
+ if (ret)
+ return ret;
+
+ cache->free_space_ctl->op = orig_free_space_ops;
+ btrfs_remove_free_space_cache(cache);
+
+ return 0;
+}
+
+static bool bytes_index_use_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info)
+{
+ return true;
+}
+
+static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize)
+{
+ const struct btrfs_free_space_op test_free_space_ops = {
+ .use_bitmap = bytes_index_use_bitmap,
+ };
+ const struct btrfs_free_space_op *orig_free_space_ops;
+ struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
+ struct btrfs_free_space *entry;
+ struct rb_node *node;
+ u64 offset, max_extent_size, bytes;
+ int ret, i;
+
+ test_msg("running bytes index tests");
+
+ /* First just validate that it does everything in order. */
+ offset = 0;
+ for (i = 0; i < 10; i++) {
+ bytes = (i + 1) * SZ_1M;
+ ret = test_add_free_space_entry(cache, offset, bytes, 0);
+ if (ret) {
+ test_err("couldn't add extent entry %d\n", ret);
+ return ret;
+ }
+ offset += bytes + sectorsize;
+ }
+
+ for (node = rb_first_cached(&ctl->free_space_bytes), i = 9; node;
+ node = rb_next(node), i--) {
+ entry = rb_entry(node, struct btrfs_free_space, bytes_index);
+ bytes = (i + 1) * SZ_1M;
+ if (entry->bytes != bytes) {
+ test_err("invalid bytes index order, found %llu expected %llu",
+ entry->bytes, bytes);
+ return -EINVAL;
+ }
+ }
+
+ /* Now validate bitmaps do the correct thing. */
+ btrfs_remove_free_space_cache(cache);
+ for (i = 0; i < 2; i++) {
+ offset = i * BITS_PER_BITMAP * sectorsize;
+ bytes = (i + 1) * SZ_1M;
+ ret = test_add_free_space_entry(cache, offset, bytes, 1);
+ if (ret) {
+ test_err("couldn't add bitmap entry");
+ return ret;
+ }
+ }
+
+ for (node = rb_first_cached(&ctl->free_space_bytes), i = 1; node;
+ node = rb_next(node), i--) {
+ entry = rb_entry(node, struct btrfs_free_space, bytes_index);
+ bytes = (i + 1) * SZ_1M;
+ if (entry->bytes != bytes) {
+ test_err("invalid bytes index order, found %llu expected %llu",
+ entry->bytes, bytes);
+ return -EINVAL;
+ }
+ }
+
+ /* Now validate bitmaps with different ->max_extent_size. */
+ btrfs_remove_free_space_cache(cache);
+ orig_free_space_ops = cache->free_space_ctl->op;
+ cache->free_space_ctl->op = &test_free_space_ops;
+
+ ret = test_add_free_space_entry(cache, 0, sectorsize, 1);
+ if (ret) {
+ test_err("couldn't add bitmap entry");
+ return ret;
+ }
+
+ offset = BITS_PER_BITMAP * sectorsize;
+ ret = test_add_free_space_entry(cache, offset, sectorsize, 1);
+ if (ret) {
+ test_err("couldn't add bitmap_entry");
+ return ret;
+ }
+
+ /*
+ * Now set a bunch of sectorsize extents in the first entry so it's
+ * ->bytes is large.
+ */
+ for (i = 2; i < 20; i += 2) {
+ offset = sectorsize * i;
+ ret = btrfs_add_free_space(cache, offset, sectorsize);
+ if (ret) {
+ test_err("error populating sparse bitmap %d", ret);
+ return ret;
+ }
+ }
+
+ /*
+ * Now set a contiguous extent in the second bitmap so its
+ * ->max_extent_size is larger than the first bitmaps.
+ */
+ offset = (BITS_PER_BITMAP * sectorsize) + sectorsize;
+ ret = btrfs_add_free_space(cache, offset, sectorsize);
+ if (ret) {
+ test_err("error adding contiguous extent %d", ret);
+ return ret;
+ }
+
+ /*
+ * Since we don't set ->max_extent_size unless we search everything
+ * should be indexed on bytes.
+ */
+ entry = rb_entry(rb_first_cached(&ctl->free_space_bytes),
+ struct btrfs_free_space, bytes_index);
+ if (entry->bytes != (10 * sectorsize)) {
+ test_err("error, wrong entry in the first slot in bytes_index");
+ return -EINVAL;
+ }
+
+ max_extent_size = 0;
+ offset = btrfs_find_space_for_alloc(cache, cache->start, sectorsize * 3,
+ 0, &max_extent_size);
+ if (offset != 0) {
+ test_err("found space to alloc even though we don't have enough space");
+ return -EINVAL;
+ }
+
+ if (max_extent_size != (2 * sectorsize)) {
+ test_err("got the wrong max_extent size %llu expected %llu",
+ max_extent_size, (unsigned long long)(2 * sectorsize));
+ return -EINVAL;
+ }
+
+ /*
+ * The search should have re-arranged the bytes index to use the
+ * ->max_extent_size, validate it's now what we expect it to be.
+ */
+ entry = rb_entry(rb_first_cached(&ctl->free_space_bytes),
+ struct btrfs_free_space, bytes_index);
+ if (entry->bytes != (2 * sectorsize)) {
+ test_err("error, the bytes index wasn't recalculated properly");
+ return -EINVAL;
+ }
+
+ /* Add another sectorsize to re-arrange the tree back to ->bytes. */
+ offset = (BITS_PER_BITMAP * sectorsize) - sectorsize;
+ ret = btrfs_add_free_space(cache, offset, sectorsize);
+ if (ret) {
+ test_err("error adding extent to the sparse entry %d", ret);
+ return ret;
+ }
+
+ entry = rb_entry(rb_first_cached(&ctl->free_space_bytes),
+ struct btrfs_free_space, bytes_index);
+ if (entry->bytes != (11 * sectorsize)) {
+ test_err("error, wrong entry in the first slot in bytes_index");
+ return -EINVAL;
+ }
+
+ /*
+ * Now make sure we find our correct entry after searching that will
+ * result in a re-arranging of the tree.
+ */
+ max_extent_size = 0;
+ offset = btrfs_find_space_for_alloc(cache, cache->start, sectorsize * 2,
+ 0, &max_extent_size);
+ if (offset != (BITS_PER_BITMAP * sectorsize)) {
+ test_err("error, found %llu instead of %llu for our alloc",
+ offset,
+ (unsigned long long)(BITS_PER_BITMAP * sectorsize));
+ return -EINVAL;
+ }
+
+ cache->free_space_ctl->op = orig_free_space_ops;
+ btrfs_remove_free_space_cache(cache);
+ return 0;
+}
+
+int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_block_group *cache;
+ struct btrfs_root *root = NULL;
+ int ret = -ENOMEM;
+
+ test_msg("running btrfs free space cache tests");
+ fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
+ if (!fs_info) {
+ test_std_err(TEST_ALLOC_FS_INFO);
+ return -ENOMEM;
+ }
+
+ /*
+ * For ppc64 (with 64k page size), bytes per bitmap might be
+ * larger than 1G. To make bitmap test available in ppc64,
+ * alloc dummy block group whose size cross bitmaps.
+ */
+ cache = btrfs_alloc_dummy_block_group(fs_info,
+ BITS_PER_BITMAP * sectorsize + PAGE_SIZE);
+ if (!cache) {
+ test_std_err(TEST_ALLOC_BLOCK_GROUP);
+ btrfs_free_dummy_fs_info(fs_info);
+ return 0;
+ }
+
+ root = btrfs_alloc_dummy_root(fs_info);
+ if (IS_ERR(root)) {
+ test_std_err(TEST_ALLOC_ROOT);
+ ret = PTR_ERR(root);
+ goto out;
+ }
+
+ root->root_key.objectid = BTRFS_EXTENT_TREE_OBJECTID;
+ root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root->root_key.offset = 0;
+ btrfs_global_root_insert(root);
+
+ ret = test_extents(cache);
+ if (ret)
+ goto out;
+ ret = test_bitmaps(cache, sectorsize);
+ if (ret)
+ goto out;
+ ret = test_bitmaps_and_extents(cache, sectorsize);
+ if (ret)
+ goto out;
+
+ ret = test_steal_space_from_bitmap_to_extent(cache, sectorsize);
+ if (ret)
+ goto out;
+ ret = test_bytes_index(cache, sectorsize);
+out:
+ btrfs_free_dummy_block_group(cache);
+ btrfs_free_dummy_root(root);
+ btrfs_free_dummy_fs_info(fs_info);
+ return ret;
+}
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
new file mode 100644
index 000000000..766117a76
--- /dev/null
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -0,0 +1,588 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2015 Facebook. All rights reserved.
+ */
+
+#include <linux/types.h>
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../disk-io.h"
+#include "../free-space-tree.h"
+#include "../transaction.h"
+#include "../block-group.h"
+
+struct free_space_extent {
+ u64 start;
+ u64 length;
+};
+
+static int __check_free_space_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group *cache,
+ struct btrfs_path *path,
+ const struct free_space_extent * const extents,
+ unsigned int num_extents)
+{
+ struct btrfs_free_space_info *info;
+ struct btrfs_key key;
+ int prev_bit = 0, bit;
+ u64 extent_start = 0, offset, end;
+ u32 flags, extent_count;
+ unsigned int i;
+ int ret;
+
+ info = search_free_space_info(trans, cache, path, 0);
+ if (IS_ERR(info)) {
+ test_err("could not find free space info");
+ ret = PTR_ERR(info);
+ goto out;
+ }
+ flags = btrfs_free_space_flags(path->nodes[0], info);
+ extent_count = btrfs_free_space_extent_count(path->nodes[0], info);
+
+ if (extent_count != num_extents) {
+ test_err("extent count is wrong");
+ ret = -EINVAL;
+ goto out;
+ }
+ if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
+ if (path->slots[0] != 0)
+ goto invalid;
+ end = cache->start + cache->length;
+ i = 0;
+ while (++path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.type != BTRFS_FREE_SPACE_BITMAP_KEY)
+ goto invalid;
+ offset = key.objectid;
+ while (offset < key.objectid + key.offset) {
+ bit = free_space_test_bit(cache, path, offset);
+ if (prev_bit == 0 && bit == 1) {
+ extent_start = offset;
+ } else if (prev_bit == 1 && bit == 0) {
+ if (i >= num_extents ||
+ extent_start != extents[i].start ||
+ offset - extent_start != extents[i].length)
+ goto invalid;
+ i++;
+ }
+ prev_bit = bit;
+ offset += fs_info->sectorsize;
+ }
+ }
+ if (prev_bit == 1) {
+ if (i >= num_extents ||
+ extent_start != extents[i].start ||
+ end - extent_start != extents[i].length)
+ goto invalid;
+ i++;
+ }
+ if (i != num_extents)
+ goto invalid;
+ } else {
+ if (btrfs_header_nritems(path->nodes[0]) != num_extents + 1 ||
+ path->slots[0] != 0)
+ goto invalid;
+ for (i = 0; i < num_extents; i++) {
+ path->slots[0]++;
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY ||
+ key.objectid != extents[i].start ||
+ key.offset != extents[i].length)
+ goto invalid;
+ }
+ }
+
+ ret = 0;
+out:
+ btrfs_release_path(path);
+ return ret;
+invalid:
+ test_err("free space tree is invalid");
+ ret = -EINVAL;
+ goto out;
+}
+
+static int check_free_space_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group *cache,
+ struct btrfs_path *path,
+ const struct free_space_extent * const extents,
+ unsigned int num_extents)
+{
+ struct btrfs_free_space_info *info;
+ u32 flags;
+ int ret;
+
+ info = search_free_space_info(trans, cache, path, 0);
+ if (IS_ERR(info)) {
+ test_err("could not find free space info");
+ btrfs_release_path(path);
+ return PTR_ERR(info);
+ }
+ flags = btrfs_free_space_flags(path->nodes[0], info);
+ btrfs_release_path(path);
+
+ ret = __check_free_space_extents(trans, fs_info, cache, path, extents,
+ num_extents);
+ if (ret)
+ return ret;
+
+ /* Flip it to the other format and check that for good measure. */
+ if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
+ ret = convert_free_space_to_extents(trans, cache, path);
+ if (ret) {
+ test_err("could not convert to extents");
+ return ret;
+ }
+ } else {
+ ret = convert_free_space_to_bitmaps(trans, cache, path);
+ if (ret) {
+ test_err("could not convert to bitmaps");
+ return ret;
+ }
+ }
+ return __check_free_space_extents(trans, fs_info, cache, path, extents,
+ num_extents);
+}
+
+static int test_empty_block_group(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group *cache,
+ struct btrfs_path *path,
+ u32 alignment)
+{
+ const struct free_space_extent extents[] = {
+ {cache->start, cache->length},
+ };
+
+ return check_free_space_extents(trans, fs_info, cache, path,
+ extents, ARRAY_SIZE(extents));
+}
+
+static int test_remove_all(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group *cache,
+ struct btrfs_path *path,
+ u32 alignment)
+{
+ const struct free_space_extent extents[] = {};
+ int ret;
+
+ ret = __remove_from_free_space_tree(trans, cache, path,
+ cache->start,
+ cache->length);
+ if (ret) {
+ test_err("could not remove free space");
+ return ret;
+ }
+
+ return check_free_space_extents(trans, fs_info, cache, path,
+ extents, ARRAY_SIZE(extents));
+}
+
+static int test_remove_beginning(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group *cache,
+ struct btrfs_path *path,
+ u32 alignment)
+{
+ const struct free_space_extent extents[] = {
+ {cache->start + alignment, cache->length - alignment},
+ };
+ int ret;
+
+ ret = __remove_from_free_space_tree(trans, cache, path,
+ cache->start, alignment);
+ if (ret) {
+ test_err("could not remove free space");
+ return ret;
+ }
+
+ return check_free_space_extents(trans, fs_info, cache, path,
+ extents, ARRAY_SIZE(extents));
+
+}
+
+static int test_remove_end(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group *cache,
+ struct btrfs_path *path,
+ u32 alignment)
+{
+ const struct free_space_extent extents[] = {
+ {cache->start, cache->length - alignment},
+ };
+ int ret;
+
+ ret = __remove_from_free_space_tree(trans, cache, path,
+ cache->start + cache->length - alignment,
+ alignment);
+ if (ret) {
+ test_err("could not remove free space");
+ return ret;
+ }
+
+ return check_free_space_extents(trans, fs_info, cache, path,
+ extents, ARRAY_SIZE(extents));
+}
+
+static int test_remove_middle(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group *cache,
+ struct btrfs_path *path,
+ u32 alignment)
+{
+ const struct free_space_extent extents[] = {
+ {cache->start, alignment},
+ {cache->start + 2 * alignment, cache->length - 2 * alignment},
+ };
+ int ret;
+
+ ret = __remove_from_free_space_tree(trans, cache, path,
+ cache->start + alignment,
+ alignment);
+ if (ret) {
+ test_err("could not remove free space");
+ return ret;
+ }
+
+ return check_free_space_extents(trans, fs_info, cache, path,
+ extents, ARRAY_SIZE(extents));
+}
+
+static int test_merge_left(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group *cache,
+ struct btrfs_path *path,
+ u32 alignment)
+{
+ const struct free_space_extent extents[] = {
+ {cache->start, 2 * alignment},
+ };
+ int ret;
+
+ ret = __remove_from_free_space_tree(trans, cache, path,
+ cache->start, cache->length);
+ if (ret) {
+ test_err("could not remove free space");
+ return ret;
+ }
+
+ ret = __add_to_free_space_tree(trans, cache, path, cache->start,
+ alignment);
+ if (ret) {
+ test_err("could not add free space");
+ return ret;
+ }
+
+ ret = __add_to_free_space_tree(trans, cache, path,
+ cache->start + alignment,
+ alignment);
+ if (ret) {
+ test_err("could not add free space");
+ return ret;
+ }
+
+ return check_free_space_extents(trans, fs_info, cache, path,
+ extents, ARRAY_SIZE(extents));
+}
+
+static int test_merge_right(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group *cache,
+ struct btrfs_path *path,
+ u32 alignment)
+{
+ const struct free_space_extent extents[] = {
+ {cache->start + alignment, 2 * alignment},
+ };
+ int ret;
+
+ ret = __remove_from_free_space_tree(trans, cache, path,
+ cache->start, cache->length);
+ if (ret) {
+ test_err("could not remove free space");
+ return ret;
+ }
+
+ ret = __add_to_free_space_tree(trans, cache, path,
+ cache->start + 2 * alignment,
+ alignment);
+ if (ret) {
+ test_err("could not add free space");
+ return ret;
+ }
+
+ ret = __add_to_free_space_tree(trans, cache, path,
+ cache->start + alignment,
+ alignment);
+ if (ret) {
+ test_err("could not add free space");
+ return ret;
+ }
+
+ return check_free_space_extents(trans, fs_info, cache, path,
+ extents, ARRAY_SIZE(extents));
+}
+
+static int test_merge_both(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group *cache,
+ struct btrfs_path *path,
+ u32 alignment)
+{
+ const struct free_space_extent extents[] = {
+ {cache->start, 3 * alignment},
+ };
+ int ret;
+
+ ret = __remove_from_free_space_tree(trans, cache, path,
+ cache->start, cache->length);
+ if (ret) {
+ test_err("could not remove free space");
+ return ret;
+ }
+
+ ret = __add_to_free_space_tree(trans, cache, path, cache->start,
+ alignment);
+ if (ret) {
+ test_err("could not add free space");
+ return ret;
+ }
+
+ ret = __add_to_free_space_tree(trans, cache, path,
+ cache->start + 2 * alignment, alignment);
+ if (ret) {
+ test_err("could not add free space");
+ return ret;
+ }
+
+ ret = __add_to_free_space_tree(trans, cache, path,
+ cache->start + alignment, alignment);
+ if (ret) {
+ test_err("could not add free space");
+ return ret;
+ }
+
+ return check_free_space_extents(trans, fs_info, cache, path,
+ extents, ARRAY_SIZE(extents));
+}
+
+static int test_merge_none(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group *cache,
+ struct btrfs_path *path,
+ u32 alignment)
+{
+ const struct free_space_extent extents[] = {
+ {cache->start, alignment},
+ {cache->start + 2 * alignment, alignment},
+ {cache->start + 4 * alignment, alignment},
+ };
+ int ret;
+
+ ret = __remove_from_free_space_tree(trans, cache, path,
+ cache->start, cache->length);
+ if (ret) {
+ test_err("could not remove free space");
+ return ret;
+ }
+
+ ret = __add_to_free_space_tree(trans, cache, path, cache->start,
+ alignment);
+ if (ret) {
+ test_err("could not add free space");
+ return ret;
+ }
+
+ ret = __add_to_free_space_tree(trans, cache, path,
+ cache->start + 4 * alignment, alignment);
+ if (ret) {
+ test_err("could not add free space");
+ return ret;
+ }
+
+ ret = __add_to_free_space_tree(trans, cache, path,
+ cache->start + 2 * alignment, alignment);
+ if (ret) {
+ test_err("could not add free space");
+ return ret;
+ }
+
+ return check_free_space_extents(trans, fs_info, cache, path,
+ extents, ARRAY_SIZE(extents));
+}
+
+typedef int (*test_func_t)(struct btrfs_trans_handle *,
+ struct btrfs_fs_info *,
+ struct btrfs_block_group *,
+ struct btrfs_path *,
+ u32 alignment);
+
+static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
+ u32 nodesize, u32 alignment)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_root *root = NULL;
+ struct btrfs_block_group *cache = NULL;
+ struct btrfs_trans_handle trans;
+ struct btrfs_path *path = NULL;
+ int ret;
+
+ fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
+ if (!fs_info) {
+ test_std_err(TEST_ALLOC_FS_INFO);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ root = btrfs_alloc_dummy_root(fs_info);
+ if (IS_ERR(root)) {
+ test_std_err(TEST_ALLOC_ROOT);
+ ret = PTR_ERR(root);
+ goto out;
+ }
+
+ btrfs_set_super_compat_ro_flags(root->fs_info->super_copy,
+ BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE);
+ root->root_key.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
+ root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root->root_key.offset = 0;
+ btrfs_global_root_insert(root);
+ root->fs_info->tree_root = root;
+
+ root->node = alloc_test_extent_buffer(root->fs_info, nodesize);
+ if (IS_ERR(root->node)) {
+ test_std_err(TEST_ALLOC_EXTENT_BUFFER);
+ ret = PTR_ERR(root->node);
+ goto out;
+ }
+ btrfs_set_header_level(root->node, 0);
+ btrfs_set_header_nritems(root->node, 0);
+ root->alloc_bytenr += 2 * nodesize;
+
+ cache = btrfs_alloc_dummy_block_group(fs_info, 8 * alignment);
+ if (!cache) {
+ test_std_err(TEST_ALLOC_BLOCK_GROUP);
+ ret = -ENOMEM;
+ goto out;
+ }
+ cache->bitmap_low_thresh = 0;
+ cache->bitmap_high_thresh = (u32)-1;
+ set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags);
+ cache->fs_info = root->fs_info;
+
+ btrfs_init_dummy_trans(&trans, root->fs_info);
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ test_std_err(TEST_ALLOC_ROOT);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = add_block_group_free_space(&trans, cache);
+ if (ret) {
+ test_err("could not add block group free space");
+ goto out;
+ }
+
+ if (bitmaps) {
+ ret = convert_free_space_to_bitmaps(&trans, cache, path);
+ if (ret) {
+ test_err("could not convert block group to bitmaps");
+ goto out;
+ }
+ }
+
+ ret = test_func(&trans, root->fs_info, cache, path, alignment);
+ if (ret)
+ goto out;
+
+ ret = remove_block_group_free_space(&trans, cache);
+ if (ret) {
+ test_err("could not remove block group free space");
+ goto out;
+ }
+
+ if (btrfs_header_nritems(root->node) != 0) {
+ test_err("free space tree has leftover items");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ btrfs_free_dummy_block_group(cache);
+ btrfs_free_dummy_root(root);
+ btrfs_free_dummy_fs_info(fs_info);
+ return ret;
+}
+
+static int run_test_both_formats(test_func_t test_func, u32 sectorsize,
+ u32 nodesize, u32 alignment)
+{
+ int test_ret = 0;
+ int ret;
+
+ ret = run_test(test_func, 0, sectorsize, nodesize, alignment);
+ if (ret) {
+ test_err(
+ "%ps failed with extents, sectorsize=%u, nodesize=%u, alignment=%u",
+ test_func, sectorsize, nodesize, alignment);
+ test_ret = ret;
+ }
+
+ ret = run_test(test_func, 1, sectorsize, nodesize, alignment);
+ if (ret) {
+ test_err(
+ "%ps failed with bitmaps, sectorsize=%u, nodesize=%u, alignment=%u",
+ test_func, sectorsize, nodesize, alignment);
+ test_ret = ret;
+ }
+
+ return test_ret;
+}
+
+int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize)
+{
+ test_func_t tests[] = {
+ test_empty_block_group,
+ test_remove_all,
+ test_remove_beginning,
+ test_remove_end,
+ test_remove_middle,
+ test_merge_left,
+ test_merge_right,
+ test_merge_both,
+ test_merge_none,
+ };
+ u32 bitmap_alignment;
+ int test_ret = 0;
+ int i;
+
+ /*
+ * Align some operations to a page to flush out bugs in the extent
+ * buffer bitmap handling of highmem.
+ */
+ bitmap_alignment = BTRFS_FREE_SPACE_BITMAP_BITS * PAGE_SIZE;
+
+ test_msg("running free space tree tests");
+ for (i = 0; i < ARRAY_SIZE(tests); i++) {
+ int ret;
+
+ ret = run_test_both_formats(tests[i], sectorsize, nodesize,
+ sectorsize);
+ if (ret)
+ test_ret = ret;
+
+ ret = run_test_both_formats(tests[i], sectorsize, nodesize,
+ bitmap_alignment);
+ if (ret)
+ test_ret = ret;
+ }
+
+ return test_ret;
+}
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
new file mode 100644
index 000000000..625f7d398
--- /dev/null
+++ b/fs/btrfs/tests/inode-tests.c
@@ -0,0 +1,1118 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2013 Fusion IO. All rights reserved.
+ */
+
+#include <linux/types.h>
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../btrfs_inode.h"
+#include "../disk-io.h"
+#include "../extent_io.h"
+#include "../volumes.h"
+#include "../compression.h"
+
+static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
+ u64 ram_bytes, u64 offset, u64 disk_bytenr,
+ u64 disk_len, u32 type, u8 compression, int slot)
+{
+ struct btrfs_path path;
+ struct btrfs_file_extent_item *fi;
+ struct extent_buffer *leaf = root->node;
+ struct btrfs_key key;
+ u32 value_len = sizeof(struct btrfs_file_extent_item);
+
+ if (type == BTRFS_FILE_EXTENT_INLINE)
+ value_len += len;
+ memset(&path, 0, sizeof(path));
+
+ path.nodes[0] = leaf;
+ path.slots[0] = slot;
+
+ key.objectid = BTRFS_FIRST_FREE_OBJECTID;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = start;
+
+ btrfs_setup_item_for_insert(root, &path, &key, value_len);
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi, 1);
+ btrfs_set_file_extent_type(leaf, fi, type);
+ btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
+ btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_len);
+ btrfs_set_file_extent_offset(leaf, fi, offset);
+ btrfs_set_file_extent_num_bytes(leaf, fi, len);
+ btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
+ btrfs_set_file_extent_compression(leaf, fi, compression);
+ btrfs_set_file_extent_encryption(leaf, fi, 0);
+ btrfs_set_file_extent_other_encoding(leaf, fi, 0);
+}
+
+static void insert_inode_item_key(struct btrfs_root *root)
+{
+ struct btrfs_path path;
+ struct extent_buffer *leaf = root->node;
+ struct btrfs_key key;
+ u32 value_len = 0;
+
+ memset(&path, 0, sizeof(path));
+
+ path.nodes[0] = leaf;
+ path.slots[0] = 0;
+
+ key.objectid = BTRFS_INODE_ITEM_KEY;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ btrfs_setup_item_for_insert(root, &path, &key, value_len);
+}
+
+/*
+ * Build the most complicated map of extents the earth has ever seen. We want
+ * this so we can test all of the corner cases of btrfs_get_extent. Here is a
+ * diagram of how the extents will look though this may not be possible we still
+ * want to make sure everything acts normally (the last number is not inclusive)
+ *
+ * [0 - 5][5 - 6][ 6 - 4096 ][ 4096 - 4100][4100 - 8195][8195 - 12291]
+ * [hole ][inline][hole but no extent][ hole ][ regular ][regular1 split]
+ *
+ * [12291 - 16387][16387 - 24579][24579 - 28675][ 28675 - 32771][32771 - 36867 ]
+ * [ hole ][regular1 split][ prealloc ][ prealloc1 ][prealloc1 written]
+ *
+ * [36867 - 45059][45059 - 53251][53251 - 57347][57347 - 61443][61443- 69635]
+ * [ prealloc1 ][ compressed ][ compressed1 ][ regular ][ compressed1]
+ *
+ * [69635-73731][ 73731 - 86019 ][86019-90115]
+ * [ regular ][ hole but no extent][ regular ]
+ */
+static void setup_file_extents(struct btrfs_root *root, u32 sectorsize)
+{
+ int slot = 0;
+ u64 disk_bytenr = SZ_1M;
+ u64 offset = 0;
+
+ /* First we want a hole */
+ insert_extent(root, offset, 5, 5, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0,
+ slot);
+ slot++;
+ offset += 5;
+
+ /*
+ * Now we want an inline extent, I don't think this is possible but hey
+ * why not? Also keep in mind if we have an inline extent it counts as
+ * the whole first page. If we were to expand it we would have to cow
+ * and we wouldn't have an inline extent anymore.
+ */
+ insert_extent(root, offset, 1, 1, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0,
+ slot);
+ slot++;
+ offset = sectorsize;
+
+ /* Now another hole */
+ insert_extent(root, offset, 4, 4, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0,
+ slot);
+ slot++;
+ offset += 4;
+
+ /* Now for a regular extent */
+ insert_extent(root, offset, sectorsize - 1, sectorsize - 1, 0,
+ disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot);
+ slot++;
+ disk_bytenr += sectorsize;
+ offset += sectorsize - 1;
+
+ /*
+ * Now for 3 extents that were split from a hole punch so we test
+ * offsets properly.
+ */
+ insert_extent(root, offset, sectorsize, 4 * sectorsize, 0, disk_bytenr,
+ 4 * sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot);
+ slot++;
+ offset += sectorsize;
+ insert_extent(root, offset, sectorsize, sectorsize, 0, 0, 0,
+ BTRFS_FILE_EXTENT_REG, 0, slot);
+ slot++;
+ offset += sectorsize;
+ insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize,
+ 2 * sectorsize, disk_bytenr, 4 * sectorsize,
+ BTRFS_FILE_EXTENT_REG, 0, slot);
+ slot++;
+ offset += 2 * sectorsize;
+ disk_bytenr += 4 * sectorsize;
+
+ /* Now for a unwritten prealloc extent */
+ insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr,
+ sectorsize, BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
+ slot++;
+ offset += sectorsize;
+
+ /*
+ * We want to jack up disk_bytenr a little more so the em stuff doesn't
+ * merge our records.
+ */
+ disk_bytenr += 2 * sectorsize;
+
+ /*
+ * Now for a partially written prealloc extent, basically the same as
+ * the hole punch example above. Ram_bytes never changes when you mark
+ * extents written btw.
+ */
+ insert_extent(root, offset, sectorsize, 4 * sectorsize, 0, disk_bytenr,
+ 4 * sectorsize, BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
+ slot++;
+ offset += sectorsize;
+ insert_extent(root, offset, sectorsize, 4 * sectorsize, sectorsize,
+ disk_bytenr, 4 * sectorsize, BTRFS_FILE_EXTENT_REG, 0,
+ slot);
+ slot++;
+ offset += sectorsize;
+ insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize,
+ 2 * sectorsize, disk_bytenr, 4 * sectorsize,
+ BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
+ slot++;
+ offset += 2 * sectorsize;
+ disk_bytenr += 4 * sectorsize;
+
+ /* Now a normal compressed extent */
+ insert_extent(root, offset, 2 * sectorsize, 2 * sectorsize, 0,
+ disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG,
+ BTRFS_COMPRESS_ZLIB, slot);
+ slot++;
+ offset += 2 * sectorsize;
+ /* No merges */
+ disk_bytenr += 2 * sectorsize;
+
+ /* Now a split compressed extent */
+ insert_extent(root, offset, sectorsize, 4 * sectorsize, 0, disk_bytenr,
+ sectorsize, BTRFS_FILE_EXTENT_REG,
+ BTRFS_COMPRESS_ZLIB, slot);
+ slot++;
+ offset += sectorsize;
+ insert_extent(root, offset, sectorsize, sectorsize, 0,
+ disk_bytenr + sectorsize, sectorsize,
+ BTRFS_FILE_EXTENT_REG, 0, slot);
+ slot++;
+ offset += sectorsize;
+ insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize,
+ 2 * sectorsize, disk_bytenr, sectorsize,
+ BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot);
+ slot++;
+ offset += 2 * sectorsize;
+ disk_bytenr += 2 * sectorsize;
+
+ /* Now extents that have a hole but no hole extent */
+ insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr,
+ sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot);
+ slot++;
+ offset += 4 * sectorsize;
+ disk_bytenr += sectorsize;
+ insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr,
+ sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot);
+}
+
+static unsigned long prealloc_only = 0;
+static unsigned long compressed_only = 0;
+static unsigned long vacancy_only = 0;
+
+static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
+{
+ struct btrfs_fs_info *fs_info = NULL;
+ struct inode *inode = NULL;
+ struct btrfs_root *root = NULL;
+ struct extent_map *em = NULL;
+ u64 orig_start;
+ u64 disk_bytenr;
+ u64 offset;
+ int ret = -ENOMEM;
+
+ test_msg("running btrfs_get_extent tests");
+
+ inode = btrfs_new_test_inode();
+ if (!inode) {
+ test_std_err(TEST_ALLOC_INODE);
+ return ret;
+ }
+
+ fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
+ if (!fs_info) {
+ test_std_err(TEST_ALLOC_FS_INFO);
+ goto out;
+ }
+
+ root = btrfs_alloc_dummy_root(fs_info);
+ if (IS_ERR(root)) {
+ test_std_err(TEST_ALLOC_ROOT);
+ goto out;
+ }
+
+ root->node = alloc_dummy_extent_buffer(fs_info, nodesize);
+ if (!root->node) {
+ test_std_err(TEST_ALLOC_ROOT);
+ goto out;
+ }
+
+ btrfs_set_header_nritems(root->node, 0);
+ btrfs_set_header_level(root->node, 0);
+ ret = -EINVAL;
+
+ /* First with no extents */
+ BTRFS_I(inode)->root = root;
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize);
+ if (IS_ERR(em)) {
+ em = NULL;
+ test_err("got an error when we shouldn't have");
+ goto out;
+ }
+ if (em->block_start != EXTENT_MAP_HOLE) {
+ test_err("expected a hole, got %llu", em->block_start);
+ goto out;
+ }
+ free_extent_map(em);
+ btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
+
+ /*
+ * All of the magic numbers are based on the mapping setup in
+ * setup_file_extents, so if you change anything there you need to
+ * update the comment and update the expected values below.
+ */
+ setup_file_extents(root, sectorsize);
+
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1);
+ if (IS_ERR(em)) {
+ test_err("got an error when we shouldn't have");
+ goto out;
+ }
+ if (em->block_start != EXTENT_MAP_HOLE) {
+ test_err("expected a hole, got %llu", em->block_start);
+ goto out;
+ }
+ if (em->start != 0 || em->len != 5) {
+ test_err(
+ "unexpected extent wanted start 0 len 5, got start %llu len %llu",
+ em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_err("unexpected flags set, want 0 have %lu", em->flags);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ if (IS_ERR(em)) {
+ test_err("got an error when we shouldn't have");
+ goto out;
+ }
+ if (em->block_start != EXTENT_MAP_INLINE) {
+ test_err("expected an inline, got %llu", em->block_start);
+ goto out;
+ }
+
+ if (em->start != offset || em->len != (sectorsize - 5)) {
+ test_err(
+ "unexpected extent wanted start %llu len 1, got start %llu len %llu",
+ offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_err("unexpected flags set, want 0 have %lu", em->flags);
+ goto out;
+ }
+ /*
+ * We don't test anything else for inline since it doesn't get set
+ * unless we have a page for it to write into. Maybe we should change
+ * this?
+ */
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ if (IS_ERR(em)) {
+ test_err("got an error when we shouldn't have");
+ goto out;
+ }
+ if (em->block_start != EXTENT_MAP_HOLE) {
+ test_err("expected a hole, got %llu", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4) {
+ test_err(
+ "unexpected extent wanted start %llu len 4, got start %llu len %llu",
+ offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_err("unexpected flags set, want 0 have %lu", em->flags);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ /* Regular extent */
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ if (IS_ERR(em)) {
+ test_err("got an error when we shouldn't have");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != sectorsize - 1) {
+ test_err(
+ "unexpected extent wanted start %llu len 4095, got start %llu len %llu",
+ offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_err("unexpected flags set, want 0 have %lu", em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_err("wrong orig offset, want %llu, have %llu", em->start,
+ em->orig_start);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ /* The next 3 are split extents */
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ if (IS_ERR(em)) {
+ test_err("got an error when we shouldn't have");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != sectorsize) {
+ test_err(
+ "unexpected extent start %llu len %u, got start %llu len %llu",
+ offset, sectorsize, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_err("unexpected flags set, want 0 have %lu", em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_err("wrong orig offset, want %llu, have %llu", em->start,
+ em->orig_start);
+ goto out;
+ }
+ disk_bytenr = em->block_start;
+ orig_start = em->start;
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ if (IS_ERR(em)) {
+ test_err("got an error when we shouldn't have");
+ goto out;
+ }
+ if (em->block_start != EXTENT_MAP_HOLE) {
+ test_err("expected a hole, got %llu", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != sectorsize) {
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
+ offset, sectorsize, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_err("unexpected flags set, want 0 have %lu", em->flags);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ if (IS_ERR(em)) {
+ test_err("got an error when we shouldn't have");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 2 * sectorsize) {
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
+ offset, 2 * sectorsize, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_err("unexpected flags set, want 0 have %lu", em->flags);
+ goto out;
+ }
+ if (em->orig_start != orig_start) {
+ test_err("wrong orig offset, want %llu, have %llu",
+ orig_start, em->orig_start);
+ goto out;
+ }
+ disk_bytenr += (em->start - orig_start);
+ if (em->block_start != disk_bytenr) {
+ test_err("wrong block start, want %llu, have %llu",
+ disk_bytenr, em->block_start);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ /* Prealloc extent */
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ if (IS_ERR(em)) {
+ test_err("got an error when we shouldn't have");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != sectorsize) {
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
+ offset, sectorsize, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != prealloc_only) {
+ test_err("unexpected flags set, want %lu have %lu",
+ prealloc_only, em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_err("wrong orig offset, want %llu, have %llu", em->start,
+ em->orig_start);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ /* The next 3 are a half written prealloc extent */
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ if (IS_ERR(em)) {
+ test_err("got an error when we shouldn't have");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != sectorsize) {
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
+ offset, sectorsize, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != prealloc_only) {
+ test_err("unexpected flags set, want %lu have %lu",
+ prealloc_only, em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_err("wrong orig offset, want %llu, have %llu", em->start,
+ em->orig_start);
+ goto out;
+ }
+ disk_bytenr = em->block_start;
+ orig_start = em->start;
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ if (IS_ERR(em)) {
+ test_err("got an error when we shouldn't have");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_HOLE) {
+ test_err("expected a real extent, got %llu", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != sectorsize) {
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
+ offset, sectorsize, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_err("unexpected flags set, want 0 have %lu", em->flags);
+ goto out;
+ }
+ if (em->orig_start != orig_start) {
+ test_err("unexpected orig offset, wanted %llu, have %llu",
+ orig_start, em->orig_start);
+ goto out;
+ }
+ if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) {
+ test_err("unexpected block start, wanted %llu, have %llu",
+ disk_bytenr + (em->start - em->orig_start),
+ em->block_start);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ if (IS_ERR(em)) {
+ test_err("got an error when we shouldn't have");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 2 * sectorsize) {
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
+ offset, 2 * sectorsize, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != prealloc_only) {
+ test_err("unexpected flags set, want %lu have %lu",
+ prealloc_only, em->flags);
+ goto out;
+ }
+ if (em->orig_start != orig_start) {
+ test_err("wrong orig offset, want %llu, have %llu", orig_start,
+ em->orig_start);
+ goto out;
+ }
+ if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) {
+ test_err("unexpected block start, wanted %llu, have %llu",
+ disk_bytenr + (em->start - em->orig_start),
+ em->block_start);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ /* Now for the compressed extent */
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ if (IS_ERR(em)) {
+ test_err("got an error when we shouldn't have");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 2 * sectorsize) {
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
+ offset, 2 * sectorsize, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != compressed_only) {
+ test_err("unexpected flags set, want %lu have %lu",
+ compressed_only, em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_err("wrong orig offset, want %llu, have %llu",
+ em->start, em->orig_start);
+ goto out;
+ }
+ if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+ test_err("unexpected compress type, wanted %d, got %d",
+ BTRFS_COMPRESS_ZLIB, em->compress_type);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ /* Split compressed extent */
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ if (IS_ERR(em)) {
+ test_err("got an error when we shouldn't have");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != sectorsize) {
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
+ offset, sectorsize, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != compressed_only) {
+ test_err("unexpected flags set, want %lu have %lu",
+ compressed_only, em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_err("wrong orig offset, want %llu, have %llu",
+ em->start, em->orig_start);
+ goto out;
+ }
+ if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+ test_err("unexpected compress type, wanted %d, got %d",
+ BTRFS_COMPRESS_ZLIB, em->compress_type);
+ goto out;
+ }
+ disk_bytenr = em->block_start;
+ orig_start = em->start;
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ if (IS_ERR(em)) {
+ test_err("got an error when we shouldn't have");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != sectorsize) {
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
+ offset, sectorsize, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_err("unexpected flags set, want 0 have %lu", em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_err("wrong orig offset, want %llu, have %llu", em->start,
+ em->orig_start);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ if (IS_ERR(em)) {
+ test_err("got an error when we shouldn't have");
+ goto out;
+ }
+ if (em->block_start != disk_bytenr) {
+ test_err("block start does not match, want %llu got %llu",
+ disk_bytenr, em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 2 * sectorsize) {
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
+ offset, 2 * sectorsize, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != compressed_only) {
+ test_err("unexpected flags set, want %lu have %lu",
+ compressed_only, em->flags);
+ goto out;
+ }
+ if (em->orig_start != orig_start) {
+ test_err("wrong orig offset, want %llu, have %llu",
+ em->start, orig_start);
+ goto out;
+ }
+ if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+ test_err("unexpected compress type, wanted %d, got %d",
+ BTRFS_COMPRESS_ZLIB, em->compress_type);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ /* A hole between regular extents but no hole extent */
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6, sectorsize);
+ if (IS_ERR(em)) {
+ test_err("got an error when we shouldn't have");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != sectorsize) {
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
+ offset, sectorsize, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_err("unexpected flags set, want 0 have %lu", em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_err("wrong orig offset, want %llu, have %llu", em->start,
+ em->orig_start);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M);
+ if (IS_ERR(em)) {
+ test_err("got an error when we shouldn't have");
+ goto out;
+ }
+ if (em->block_start != EXTENT_MAP_HOLE) {
+ test_err("expected a hole extent, got %llu", em->block_start);
+ goto out;
+ }
+ /*
+ * Currently we just return a length that we requested rather than the
+ * length of the actual hole, if this changes we'll have to change this
+ * test.
+ */
+ if (em->start != offset || em->len != 3 * sectorsize) {
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
+ offset, 3 * sectorsize, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != vacancy_only) {
+ test_err("unexpected flags set, want %lu have %lu",
+ vacancy_only, em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_err("wrong orig offset, want %llu, have %llu", em->start,
+ em->orig_start);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ if (IS_ERR(em)) {
+ test_err("got an error when we shouldn't have");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != sectorsize) {
+ test_err(
+ "unexpected extent wanted start %llu len %u, got start %llu len %llu",
+ offset, sectorsize, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_err("unexpected flags set, want 0 have %lu", em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_err("wrong orig offset, want %llu, have %llu", em->start,
+ em->orig_start);
+ goto out;
+ }
+ ret = 0;
+out:
+ if (!IS_ERR(em))
+ free_extent_map(em);
+ iput(inode);
+ btrfs_free_dummy_root(root);
+ btrfs_free_dummy_fs_info(fs_info);
+ return ret;
+}
+
+static int test_hole_first(u32 sectorsize, u32 nodesize)
+{
+ struct btrfs_fs_info *fs_info = NULL;
+ struct inode *inode = NULL;
+ struct btrfs_root *root = NULL;
+ struct extent_map *em = NULL;
+ int ret = -ENOMEM;
+
+ test_msg("running hole first btrfs_get_extent test");
+
+ inode = btrfs_new_test_inode();
+ if (!inode) {
+ test_std_err(TEST_ALLOC_INODE);
+ return ret;
+ }
+
+ fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
+ if (!fs_info) {
+ test_std_err(TEST_ALLOC_FS_INFO);
+ goto out;
+ }
+
+ root = btrfs_alloc_dummy_root(fs_info);
+ if (IS_ERR(root)) {
+ test_std_err(TEST_ALLOC_ROOT);
+ goto out;
+ }
+
+ root->node = alloc_dummy_extent_buffer(fs_info, nodesize);
+ if (!root->node) {
+ test_std_err(TEST_ALLOC_ROOT);
+ goto out;
+ }
+
+ btrfs_set_header_nritems(root->node, 0);
+ btrfs_set_header_level(root->node, 0);
+ BTRFS_I(inode)->root = root;
+ ret = -EINVAL;
+
+ /*
+ * Need a blank inode item here just so we don't confuse
+ * btrfs_get_extent.
+ */
+ insert_inode_item_key(root);
+ insert_extent(root, sectorsize, sectorsize, sectorsize, 0, sectorsize,
+ sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize);
+ if (IS_ERR(em)) {
+ test_err("got an error when we shouldn't have");
+ goto out;
+ }
+ if (em->block_start != EXTENT_MAP_HOLE) {
+ test_err("expected a hole, got %llu", em->block_start);
+ goto out;
+ }
+ if (em->start != 0 || em->len != sectorsize) {
+ test_err(
+ "unexpected extent wanted start 0 len %u, got start %llu len %llu",
+ sectorsize, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != vacancy_only) {
+ test_err("wrong flags, wanted %lu, have %lu", vacancy_only,
+ em->flags);
+ goto out;
+ }
+ free_extent_map(em);
+
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize, 2 * sectorsize);
+ if (IS_ERR(em)) {
+ test_err("got an error when we shouldn't have");
+ goto out;
+ }
+ if (em->block_start != sectorsize) {
+ test_err("expected a real extent, got %llu", em->block_start);
+ goto out;
+ }
+ if (em->start != sectorsize || em->len != sectorsize) {
+ test_err(
+ "unexpected extent wanted start %u len %u, got start %llu len %llu",
+ sectorsize, sectorsize, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_err("unexpected flags set, wanted 0 got %lu",
+ em->flags);
+ goto out;
+ }
+ ret = 0;
+out:
+ if (!IS_ERR(em))
+ free_extent_map(em);
+ iput(inode);
+ btrfs_free_dummy_root(root);
+ btrfs_free_dummy_fs_info(fs_info);
+ return ret;
+}
+
+static int test_extent_accounting(u32 sectorsize, u32 nodesize)
+{
+ struct btrfs_fs_info *fs_info = NULL;
+ struct inode *inode = NULL;
+ struct btrfs_root *root = NULL;
+ int ret = -ENOMEM;
+
+ test_msg("running outstanding_extents tests");
+
+ inode = btrfs_new_test_inode();
+ if (!inode) {
+ test_std_err(TEST_ALLOC_INODE);
+ return ret;
+ }
+
+ fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
+ if (!fs_info) {
+ test_std_err(TEST_ALLOC_FS_INFO);
+ goto out;
+ }
+
+ root = btrfs_alloc_dummy_root(fs_info);
+ if (IS_ERR(root)) {
+ test_std_err(TEST_ALLOC_ROOT);
+ goto out;
+ }
+
+ BTRFS_I(inode)->root = root;
+
+ /* [BTRFS_MAX_EXTENT_SIZE] */
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode), 0,
+ BTRFS_MAX_EXTENT_SIZE - 1, 0, NULL);
+ if (ret) {
+ test_err("btrfs_set_extent_delalloc returned %d", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents != 1) {
+ ret = -EINVAL;
+ test_err("miscount, wanted 1, got %u",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+
+ /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode), BTRFS_MAX_EXTENT_SIZE,
+ BTRFS_MAX_EXTENT_SIZE + sectorsize - 1,
+ 0, NULL);
+ if (ret) {
+ test_err("btrfs_set_extent_delalloc returned %d", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents != 2) {
+ ret = -EINVAL;
+ test_err("miscount, wanted 2, got %u",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+
+ /* [BTRFS_MAX_EXTENT_SIZE/2][sectorsize HOLE][the rest] */
+ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
+ BTRFS_MAX_EXTENT_SIZE >> 1,
+ (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1,
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_UPTODATE, NULL);
+ if (ret) {
+ test_err("clear_extent_bit returned %d", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents != 2) {
+ ret = -EINVAL;
+ test_err("miscount, wanted 2, got %u",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+
+ /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode), BTRFS_MAX_EXTENT_SIZE >> 1,
+ (BTRFS_MAX_EXTENT_SIZE >> 1)
+ + sectorsize - 1,
+ 0, NULL);
+ if (ret) {
+ test_err("btrfs_set_extent_delalloc returned %d", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents != 2) {
+ ret = -EINVAL;
+ test_err("miscount, wanted 2, got %u",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+
+ /*
+ * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize HOLE][BTRFS_MAX_EXTENT_SIZE+sectorsize]
+ */
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode),
+ BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize,
+ (BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1,
+ 0, NULL);
+ if (ret) {
+ test_err("btrfs_set_extent_delalloc returned %d", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents != 4) {
+ ret = -EINVAL;
+ test_err("miscount, wanted 4, got %u",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+
+ /*
+ * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize][BTRFS_MAX_EXTENT_SIZE+sectorsize]
+ */
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode),
+ BTRFS_MAX_EXTENT_SIZE + sectorsize,
+ BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL);
+ if (ret) {
+ test_err("btrfs_set_extent_delalloc returned %d", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents != 3) {
+ ret = -EINVAL;
+ test_err("miscount, wanted 3, got %u",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+
+ /* [BTRFS_MAX_EXTENT_SIZE+4k][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4k] */
+ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
+ BTRFS_MAX_EXTENT_SIZE + sectorsize,
+ BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1,
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_UPTODATE, NULL);
+ if (ret) {
+ test_err("clear_extent_bit returned %d", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents != 4) {
+ ret = -EINVAL;
+ test_err("miscount, wanted 4, got %u",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+
+ /*
+ * Refill the hole again just for good measure, because I thought it
+ * might fail and I'd rather satisfy my paranoia at this point.
+ */
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode),
+ BTRFS_MAX_EXTENT_SIZE + sectorsize,
+ BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL);
+ if (ret) {
+ test_err("btrfs_set_extent_delalloc returned %d", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents != 3) {
+ ret = -EINVAL;
+ test_err("miscount, wanted 3, got %u",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+
+ /* Empty */
+ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_UPTODATE, NULL);
+ if (ret) {
+ test_err("clear_extent_bit returned %d", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents) {
+ ret = -EINVAL;
+ test_err("miscount, wanted 0, got %u",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+ ret = 0;
+out:
+ if (ret)
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_UPTODATE, NULL);
+ iput(inode);
+ btrfs_free_dummy_root(root);
+ btrfs_free_dummy_fs_info(fs_info);
+ return ret;
+}
+
+int btrfs_test_inodes(u32 sectorsize, u32 nodesize)
+{
+ int ret;
+
+ test_msg("running inode tests");
+
+ set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only);
+ set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only);
+
+ ret = test_btrfs_get_extent(sectorsize, nodesize);
+ if (ret)
+ return ret;
+ ret = test_hole_first(sectorsize, nodesize);
+ if (ret)
+ return ret;
+ return test_extent_accounting(sectorsize, nodesize);
+}
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
new file mode 100644
index 000000000..63676ea19
--- /dev/null
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -0,0 +1,527 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2013 Facebook. All rights reserved.
+ */
+
+#include <linux/types.h>
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../transaction.h"
+#include "../disk-io.h"
+#include "../qgroup.h"
+#include "../backref.h"
+
+static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u64 parent, u64 root_objectid)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_extent_item *item;
+ struct btrfs_extent_inline_ref *iref;
+ struct btrfs_tree_block_info *block_info;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_key ins;
+ u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info);
+ int ret;
+
+ btrfs_init_dummy_trans(&trans, NULL);
+
+ ins.objectid = bytenr;
+ ins.type = BTRFS_EXTENT_ITEM_KEY;
+ ins.offset = num_bytes;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ test_std_err(TEST_ALLOC_ROOT);
+ return -ENOMEM;
+ }
+
+ ret = btrfs_insert_empty_item(&trans, root, path, &ins, size);
+ if (ret) {
+ test_err("couldn't insert ref %d", ret);
+ btrfs_free_path(path);
+ return ret;
+ }
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ btrfs_set_extent_refs(leaf, item, 1);
+ btrfs_set_extent_generation(leaf, item, 1);
+ btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_TREE_BLOCK);
+ block_info = (struct btrfs_tree_block_info *)(item + 1);
+ btrfs_set_tree_block_level(leaf, block_info, 0);
+ iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
+ if (parent > 0) {
+ btrfs_set_extent_inline_ref_type(leaf, iref,
+ BTRFS_SHARED_BLOCK_REF_KEY);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+ } else {
+ btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_TREE_BLOCK_REF_KEY);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
+ }
+ btrfs_free_path(path);
+ return 0;
+}
+
+static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
+ u64 parent, u64 root_objectid)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_extent_item *item;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ u64 refs;
+ int ret;
+
+ btrfs_init_dummy_trans(&trans, NULL);
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ test_std_err(TEST_ALLOC_ROOT);
+ return -ENOMEM;
+ }
+
+ ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
+ if (ret) {
+ test_err("couldn't find extent ref");
+ btrfs_free_path(path);
+ return ret;
+ }
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_extent_item);
+ refs = btrfs_extent_refs(path->nodes[0], item);
+ btrfs_set_extent_refs(path->nodes[0], item, refs + 1);
+ btrfs_release_path(path);
+
+ key.objectid = bytenr;
+ if (parent) {
+ key.type = BTRFS_SHARED_BLOCK_REF_KEY;
+ key.offset = parent;
+ } else {
+ key.type = BTRFS_TREE_BLOCK_REF_KEY;
+ key.offset = root_objectid;
+ }
+
+ ret = btrfs_insert_empty_item(&trans, root, path, &key, 0);
+ if (ret)
+ test_err("failed to insert backref");
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int remove_extent_item(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ int ret;
+
+ btrfs_init_dummy_trans(&trans, NULL);
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ test_std_err(TEST_ALLOC_ROOT);
+ return -ENOMEM;
+ }
+
+ ret = btrfs_search_slot(&trans, root, &key, path, -1, 1);
+ if (ret) {
+ test_err("didn't find our key %d", ret);
+ btrfs_free_path(path);
+ return ret;
+ }
+ btrfs_del_item(&trans, root, path);
+ btrfs_free_path(path);
+ return 0;
+}
+
+static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u64 parent, u64 root_objectid)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_extent_item *item;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ u64 refs;
+ int ret;
+
+ btrfs_init_dummy_trans(&trans, NULL);
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ test_std_err(TEST_ALLOC_ROOT);
+ return -ENOMEM;
+ }
+
+ ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
+ if (ret) {
+ test_err("couldn't find extent ref");
+ btrfs_free_path(path);
+ return ret;
+ }
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_extent_item);
+ refs = btrfs_extent_refs(path->nodes[0], item);
+ btrfs_set_extent_refs(path->nodes[0], item, refs - 1);
+ btrfs_release_path(path);
+
+ key.objectid = bytenr;
+ if (parent) {
+ key.type = BTRFS_SHARED_BLOCK_REF_KEY;
+ key.offset = parent;
+ } else {
+ key.type = BTRFS_TREE_BLOCK_REF_KEY;
+ key.offset = root_objectid;
+ }
+
+ ret = btrfs_search_slot(&trans, root, &key, path, -1, 1);
+ if (ret) {
+ test_err("couldn't find backref %d", ret);
+ btrfs_free_path(path);
+ return ret;
+ }
+ btrfs_del_item(&trans, root, path);
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int test_no_shared_qgroup(struct btrfs_root *root,
+ u32 sectorsize, u32 nodesize)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct ulist *old_roots = NULL;
+ struct ulist *new_roots = NULL;
+ int ret;
+
+ btrfs_init_dummy_trans(&trans, fs_info);
+
+ test_msg("running qgroup add/remove tests");
+ ret = btrfs_create_qgroup(&trans, BTRFS_FS_TREE_OBJECTID);
+ if (ret) {
+ test_err("couldn't create a qgroup %d", ret);
+ return ret;
+ }
+
+ /*
+ * Since the test trans doesn't have the complicated delayed refs,
+ * we can only call btrfs_qgroup_account_extent() directly to test
+ * quota.
+ */
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
+ if (ret) {
+ test_err("couldn't find old roots: %d", ret);
+ return ret;
+ }
+
+ ret = insert_normal_tree_ref(root, nodesize, nodesize, 0,
+ BTRFS_FS_TREE_OBJECTID);
+ if (ret) {
+ ulist_free(old_roots);
+ return ret;
+ }
+
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
+ if (ret) {
+ ulist_free(old_roots);
+ test_err("couldn't find old roots: %d", ret);
+ return ret;
+ }
+
+ ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots,
+ new_roots);
+ if (ret) {
+ test_err("couldn't account space for a qgroup %d", ret);
+ return ret;
+ }
+
+ /* btrfs_qgroup_account_extent() always frees the ulists passed to it. */
+ old_roots = NULL;
+ new_roots = NULL;
+
+ if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
+ nodesize, nodesize)) {
+ test_err("qgroup counts didn't match expected values");
+ return -EINVAL;
+ }
+
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
+ if (ret) {
+ test_err("couldn't find old roots: %d", ret);
+ return ret;
+ }
+
+ ret = remove_extent_item(root, nodesize, nodesize);
+ if (ret) {
+ ulist_free(old_roots);
+ return -EINVAL;
+ }
+
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
+ if (ret) {
+ ulist_free(old_roots);
+ test_err("couldn't find old roots: %d", ret);
+ return ret;
+ }
+
+ ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots,
+ new_roots);
+ if (ret) {
+ test_err("couldn't account space for a qgroup %d", ret);
+ return -EINVAL;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, 0, 0)) {
+ test_err("qgroup counts didn't match expected values");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * Add a ref for two different roots to make sure the shared value comes out
+ * right, also remove one of the roots and make sure the exclusive count is
+ * adjusted properly.
+ */
+static int test_multiple_refs(struct btrfs_root *root,
+ u32 sectorsize, u32 nodesize)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct ulist *old_roots = NULL;
+ struct ulist *new_roots = NULL;
+ int ret;
+
+ btrfs_init_dummy_trans(&trans, fs_info);
+
+ test_msg("running qgroup multiple refs test");
+
+ /*
+ * We have BTRFS_FS_TREE_OBJECTID created already from the
+ * previous test.
+ */
+ ret = btrfs_create_qgroup(&trans, BTRFS_FIRST_FREE_OBJECTID);
+ if (ret) {
+ test_err("couldn't create a qgroup %d", ret);
+ return ret;
+ }
+
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
+ if (ret) {
+ test_err("couldn't find old roots: %d", ret);
+ return ret;
+ }
+
+ ret = insert_normal_tree_ref(root, nodesize, nodesize, 0,
+ BTRFS_FS_TREE_OBJECTID);
+ if (ret) {
+ ulist_free(old_roots);
+ return ret;
+ }
+
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
+ if (ret) {
+ ulist_free(old_roots);
+ test_err("couldn't find old roots: %d", ret);
+ return ret;
+ }
+
+ ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots,
+ new_roots);
+ if (ret) {
+ test_err("couldn't account space for a qgroup %d", ret);
+ return ret;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
+ nodesize, nodesize)) {
+ test_err("qgroup counts didn't match expected values");
+ return -EINVAL;
+ }
+
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
+ if (ret) {
+ test_err("couldn't find old roots: %d", ret);
+ return ret;
+ }
+
+ ret = add_tree_ref(root, nodesize, nodesize, 0,
+ BTRFS_FIRST_FREE_OBJECTID);
+ if (ret) {
+ ulist_free(old_roots);
+ return ret;
+ }
+
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
+ if (ret) {
+ ulist_free(old_roots);
+ test_err("couldn't find old roots: %d", ret);
+ return ret;
+ }
+
+ ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots,
+ new_roots);
+ if (ret) {
+ test_err("couldn't account space for a qgroup %d", ret);
+ return ret;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
+ nodesize, 0)) {
+ test_err("qgroup counts didn't match expected values");
+ return -EINVAL;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FIRST_FREE_OBJECTID,
+ nodesize, 0)) {
+ test_err("qgroup counts didn't match expected values");
+ return -EINVAL;
+ }
+
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
+ if (ret) {
+ test_err("couldn't find old roots: %d", ret);
+ return ret;
+ }
+
+ ret = remove_extent_ref(root, nodesize, nodesize, 0,
+ BTRFS_FIRST_FREE_OBJECTID);
+ if (ret) {
+ ulist_free(old_roots);
+ return ret;
+ }
+
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
+ if (ret) {
+ ulist_free(old_roots);
+ test_err("couldn't find old roots: %d", ret);
+ return ret;
+ }
+
+ ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots,
+ new_roots);
+ if (ret) {
+ test_err("couldn't account space for a qgroup %d", ret);
+ return ret;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FIRST_FREE_OBJECTID,
+ 0, 0)) {
+ test_err("qgroup counts didn't match expected values");
+ return -EINVAL;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
+ nodesize, nodesize)) {
+ test_err("qgroup counts didn't match expected values");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
+{
+ struct btrfs_fs_info *fs_info = NULL;
+ struct btrfs_root *root;
+ struct btrfs_root *tmp_root;
+ int ret = 0;
+
+ fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
+ if (!fs_info) {
+ test_std_err(TEST_ALLOC_FS_INFO);
+ return -ENOMEM;
+ }
+
+ root = btrfs_alloc_dummy_root(fs_info);
+ if (IS_ERR(root)) {
+ test_std_err(TEST_ALLOC_ROOT);
+ ret = PTR_ERR(root);
+ goto out;
+ }
+
+ /* We are using this root as our extent root */
+ root->root_key.objectid = BTRFS_EXTENT_TREE_OBJECTID;
+ root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root->root_key.offset = 0;
+ btrfs_global_root_insert(root);
+
+ /*
+ * Some of the paths we test assume we have a filled out fs_info, so we
+ * just need to add the root in there so we don't panic.
+ */
+ root->fs_info->tree_root = root;
+ root->fs_info->quota_root = root;
+ set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+
+ /*
+ * Can't use bytenr 0, some things freak out
+ * *cough*backref walking code*cough*
+ */
+ root->node = alloc_test_extent_buffer(root->fs_info, nodesize);
+ if (IS_ERR(root->node)) {
+ test_err("couldn't allocate dummy buffer");
+ ret = PTR_ERR(root->node);
+ goto out;
+ }
+ btrfs_set_header_level(root->node, 0);
+ btrfs_set_header_nritems(root->node, 0);
+ root->alloc_bytenr += 2 * nodesize;
+
+ tmp_root = btrfs_alloc_dummy_root(fs_info);
+ if (IS_ERR(tmp_root)) {
+ test_std_err(TEST_ALLOC_ROOT);
+ ret = PTR_ERR(tmp_root);
+ goto out;
+ }
+
+ tmp_root->root_key.objectid = BTRFS_FS_TREE_OBJECTID;
+ root->fs_info->fs_root = tmp_root;
+ ret = btrfs_insert_fs_root(root->fs_info, tmp_root);
+ if (ret) {
+ test_err("couldn't insert fs root %d", ret);
+ goto out;
+ }
+ btrfs_put_root(tmp_root);
+
+ tmp_root = btrfs_alloc_dummy_root(fs_info);
+ if (IS_ERR(tmp_root)) {
+ test_std_err(TEST_ALLOC_ROOT);
+ ret = PTR_ERR(tmp_root);
+ goto out;
+ }
+
+ tmp_root->root_key.objectid = BTRFS_FIRST_FREE_OBJECTID;
+ ret = btrfs_insert_fs_root(root->fs_info, tmp_root);
+ if (ret) {
+ test_err("couldn't insert fs root %d", ret);
+ goto out;
+ }
+ btrfs_put_root(tmp_root);
+
+ test_msg("running qgroup tests");
+ ret = test_no_shared_qgroup(root, sectorsize, nodesize);
+ if (ret)
+ goto out;
+ ret = test_multiple_refs(root, sectorsize, nodesize);
+out:
+ btrfs_free_dummy_root(root);
+ btrfs_free_dummy_fs_info(fs_info);
+ return ret;
+}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
new file mode 100644
index 000000000..60db4c3b8
--- /dev/null
+++ b/fs/btrfs/transaction.c
@@ -0,0 +1,2649 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/writeback.h>
+#include <linux/pagemap.h>
+#include <linux/blkdev.h>
+#include <linux/uuid.h>
+#include <linux/timekeeping.h>
+#include "misc.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "locking.h"
+#include "tree-log.h"
+#include "volumes.h"
+#include "dev-replace.h"
+#include "qgroup.h"
+#include "block-group.h"
+#include "space-info.h"
+#include "zoned.h"
+
+#define BTRFS_ROOT_TRANS_TAG 0
+
+/*
+ * Transaction states and transitions
+ *
+ * No running transaction (fs tree blocks are not modified)
+ * |
+ * | To next stage:
+ * | Call start_transaction() variants. Except btrfs_join_transaction_nostart().
+ * V
+ * Transaction N [[TRANS_STATE_RUNNING]]
+ * |
+ * | New trans handles can be attached to transaction N by calling all
+ * | start_transaction() variants.
+ * |
+ * | To next stage:
+ * | Call btrfs_commit_transaction() on any trans handle attached to
+ * | transaction N
+ * V
+ * Transaction N [[TRANS_STATE_COMMIT_START]]
+ * |
+ * | Will wait for previous running transaction to completely finish if there
+ * | is one
+ * |
+ * | Then one of the following happes:
+ * | - Wait for all other trans handle holders to release.
+ * | The btrfs_commit_transaction() caller will do the commit work.
+ * | - Wait for current transaction to be committed by others.
+ * | Other btrfs_commit_transaction() caller will do the commit work.
+ * |
+ * | At this stage, only btrfs_join_transaction*() variants can attach
+ * | to this running transaction.
+ * | All other variants will wait for current one to finish and attach to
+ * | transaction N+1.
+ * |
+ * | To next stage:
+ * | Caller is chosen to commit transaction N, and all other trans handle
+ * | haven been released.
+ * V
+ * Transaction N [[TRANS_STATE_COMMIT_DOING]]
+ * |
+ * | The heavy lifting transaction work is started.
+ * | From running delayed refs (modifying extent tree) to creating pending
+ * | snapshots, running qgroups.
+ * | In short, modify supporting trees to reflect modifications of subvolume
+ * | trees.
+ * |
+ * | At this stage, all start_transaction() calls will wait for this
+ * | transaction to finish and attach to transaction N+1.
+ * |
+ * | To next stage:
+ * | Until all supporting trees are updated.
+ * V
+ * Transaction N [[TRANS_STATE_UNBLOCKED]]
+ * | Transaction N+1
+ * | All needed trees are modified, thus we only [[TRANS_STATE_RUNNING]]
+ * | need to write them back to disk and update |
+ * | super blocks. |
+ * | |
+ * | At this stage, new transaction is allowed to |
+ * | start. |
+ * | All new start_transaction() calls will be |
+ * | attached to transid N+1. |
+ * | |
+ * | To next stage: |
+ * | Until all tree blocks are super blocks are |
+ * | written to block devices |
+ * V |
+ * Transaction N [[TRANS_STATE_COMPLETED]] V
+ * All tree blocks and super blocks are written. Transaction N+1
+ * This transaction is finished and all its [[TRANS_STATE_COMMIT_START]]
+ * data structures will be cleaned up. | Life goes on
+ */
+static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
+ [TRANS_STATE_RUNNING] = 0U,
+ [TRANS_STATE_COMMIT_START] = (__TRANS_START | __TRANS_ATTACH),
+ [TRANS_STATE_COMMIT_DOING] = (__TRANS_START |
+ __TRANS_ATTACH |
+ __TRANS_JOIN |
+ __TRANS_JOIN_NOSTART),
+ [TRANS_STATE_UNBLOCKED] = (__TRANS_START |
+ __TRANS_ATTACH |
+ __TRANS_JOIN |
+ __TRANS_JOIN_NOLOCK |
+ __TRANS_JOIN_NOSTART),
+ [TRANS_STATE_SUPER_COMMITTED] = (__TRANS_START |
+ __TRANS_ATTACH |
+ __TRANS_JOIN |
+ __TRANS_JOIN_NOLOCK |
+ __TRANS_JOIN_NOSTART),
+ [TRANS_STATE_COMPLETED] = (__TRANS_START |
+ __TRANS_ATTACH |
+ __TRANS_JOIN |
+ __TRANS_JOIN_NOLOCK |
+ __TRANS_JOIN_NOSTART),
+};
+
+void btrfs_put_transaction(struct btrfs_transaction *transaction)
+{
+ WARN_ON(refcount_read(&transaction->use_count) == 0);
+ if (refcount_dec_and_test(&transaction->use_count)) {
+ BUG_ON(!list_empty(&transaction->list));
+ WARN_ON(!RB_EMPTY_ROOT(
+ &transaction->delayed_refs.href_root.rb_root));
+ WARN_ON(!RB_EMPTY_ROOT(
+ &transaction->delayed_refs.dirty_extent_root));
+ if (transaction->delayed_refs.pending_csums)
+ btrfs_err(transaction->fs_info,
+ "pending csums is %llu",
+ transaction->delayed_refs.pending_csums);
+ /*
+ * If any block groups are found in ->deleted_bgs then it's
+ * because the transaction was aborted and a commit did not
+ * happen (things failed before writing the new superblock
+ * and calling btrfs_finish_extent_commit()), so we can not
+ * discard the physical locations of the block groups.
+ */
+ while (!list_empty(&transaction->deleted_bgs)) {
+ struct btrfs_block_group *cache;
+
+ cache = list_first_entry(&transaction->deleted_bgs,
+ struct btrfs_block_group,
+ bg_list);
+ list_del_init(&cache->bg_list);
+ btrfs_unfreeze_block_group(cache);
+ btrfs_put_block_group(cache);
+ }
+ WARN_ON(!list_empty(&transaction->dev_update_list));
+ kfree(transaction);
+ }
+}
+
+static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_transaction *cur_trans = trans->transaction;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *root, *tmp;
+
+ /*
+ * At this point no one can be using this transaction to modify any tree
+ * and no one can start another transaction to modify any tree either.
+ */
+ ASSERT(cur_trans->state == TRANS_STATE_COMMIT_DOING);
+
+ down_write(&fs_info->commit_root_sem);
+
+ if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags))
+ fs_info->last_reloc_trans = trans->transid;
+
+ list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits,
+ dirty_list) {
+ list_del_init(&root->dirty_list);
+ free_extent_buffer(root->commit_root);
+ root->commit_root = btrfs_root_node(root);
+ extent_io_tree_release(&root->dirty_log_pages);
+ btrfs_qgroup_clean_swapped_blocks(root);
+ }
+
+ /* We can free old roots now. */
+ spin_lock(&cur_trans->dropped_roots_lock);
+ while (!list_empty(&cur_trans->dropped_roots)) {
+ root = list_first_entry(&cur_trans->dropped_roots,
+ struct btrfs_root, root_list);
+ list_del_init(&root->root_list);
+ spin_unlock(&cur_trans->dropped_roots_lock);
+ btrfs_free_log(trans, root);
+ btrfs_drop_and_free_fs_root(fs_info, root);
+ spin_lock(&cur_trans->dropped_roots_lock);
+ }
+ spin_unlock(&cur_trans->dropped_roots_lock);
+
+ up_write(&fs_info->commit_root_sem);
+}
+
+static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
+ unsigned int type)
+{
+ if (type & TRANS_EXTWRITERS)
+ atomic_inc(&trans->num_extwriters);
+}
+
+static inline void extwriter_counter_dec(struct btrfs_transaction *trans,
+ unsigned int type)
+{
+ if (type & TRANS_EXTWRITERS)
+ atomic_dec(&trans->num_extwriters);
+}
+
+static inline void extwriter_counter_init(struct btrfs_transaction *trans,
+ unsigned int type)
+{
+ atomic_set(&trans->num_extwriters, ((type & TRANS_EXTWRITERS) ? 1 : 0));
+}
+
+static inline int extwriter_counter_read(struct btrfs_transaction *trans)
+{
+ return atomic_read(&trans->num_extwriters);
+}
+
+/*
+ * To be called after doing the chunk btree updates right after allocating a new
+ * chunk (after btrfs_chunk_alloc_add_chunk_item() is called), when removing a
+ * chunk after all chunk btree updates and after finishing the second phase of
+ * chunk allocation (btrfs_create_pending_block_groups()) in case some block
+ * group had its chunk item insertion delayed to the second phase.
+ */
+void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+
+ if (!trans->chunk_bytes_reserved)
+ return;
+
+ btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv,
+ trans->chunk_bytes_reserved, NULL);
+ trans->chunk_bytes_reserved = 0;
+}
+
+/*
+ * either allocate a new transaction or hop into the existing one
+ */
+static noinline int join_transaction(struct btrfs_fs_info *fs_info,
+ unsigned int type)
+{
+ struct btrfs_transaction *cur_trans;
+
+ spin_lock(&fs_info->trans_lock);
+loop:
+ /* The file system has been taken offline. No new transactions. */
+ if (BTRFS_FS_ERROR(fs_info)) {
+ spin_unlock(&fs_info->trans_lock);
+ return -EROFS;
+ }
+
+ cur_trans = fs_info->running_transaction;
+ if (cur_trans) {
+ if (TRANS_ABORTED(cur_trans)) {
+ spin_unlock(&fs_info->trans_lock);
+ return cur_trans->aborted;
+ }
+ if (btrfs_blocked_trans_types[cur_trans->state] & type) {
+ spin_unlock(&fs_info->trans_lock);
+ return -EBUSY;
+ }
+ refcount_inc(&cur_trans->use_count);
+ atomic_inc(&cur_trans->num_writers);
+ extwriter_counter_inc(cur_trans, type);
+ spin_unlock(&fs_info->trans_lock);
+ btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers);
+ btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters);
+ return 0;
+ }
+ spin_unlock(&fs_info->trans_lock);
+
+ /*
+ * If we are ATTACH or TRANS_JOIN_NOSTART, we just want to catch the
+ * current transaction, and commit it. If there is no transaction, just
+ * return ENOENT.
+ */
+ if (type == TRANS_ATTACH || type == TRANS_JOIN_NOSTART)
+ return -ENOENT;
+
+ /*
+ * JOIN_NOLOCK only happens during the transaction commit, so
+ * it is impossible that ->running_transaction is NULL
+ */
+ BUG_ON(type == TRANS_JOIN_NOLOCK);
+
+ cur_trans = kmalloc(sizeof(*cur_trans), GFP_NOFS);
+ if (!cur_trans)
+ return -ENOMEM;
+
+ btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers);
+ btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters);
+
+ spin_lock(&fs_info->trans_lock);
+ if (fs_info->running_transaction) {
+ /*
+ * someone started a transaction after we unlocked. Make sure
+ * to redo the checks above
+ */
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
+ kfree(cur_trans);
+ goto loop;
+ } else if (BTRFS_FS_ERROR(fs_info)) {
+ spin_unlock(&fs_info->trans_lock);
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
+ kfree(cur_trans);
+ return -EROFS;
+ }
+
+ cur_trans->fs_info = fs_info;
+ atomic_set(&cur_trans->pending_ordered, 0);
+ init_waitqueue_head(&cur_trans->pending_wait);
+ atomic_set(&cur_trans->num_writers, 1);
+ extwriter_counter_init(cur_trans, type);
+ init_waitqueue_head(&cur_trans->writer_wait);
+ init_waitqueue_head(&cur_trans->commit_wait);
+ cur_trans->state = TRANS_STATE_RUNNING;
+ /*
+ * One for this trans handle, one so it will live on until we
+ * commit the transaction.
+ */
+ refcount_set(&cur_trans->use_count, 2);
+ cur_trans->flags = 0;
+ cur_trans->start_time = ktime_get_seconds();
+
+ memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));
+
+ cur_trans->delayed_refs.href_root = RB_ROOT_CACHED;
+ cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
+ atomic_set(&cur_trans->delayed_refs.num_entries, 0);
+
+ /*
+ * although the tree mod log is per file system and not per transaction,
+ * the log must never go across transaction boundaries.
+ */
+ smp_mb();
+ if (!list_empty(&fs_info->tree_mod_seq_list))
+ WARN(1, KERN_ERR "BTRFS: tree_mod_seq_list not empty when creating a fresh transaction\n");
+ if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
+ WARN(1, KERN_ERR "BTRFS: tree_mod_log rb tree not empty when creating a fresh transaction\n");
+ atomic64_set(&fs_info->tree_mod_seq, 0);
+
+ spin_lock_init(&cur_trans->delayed_refs.lock);
+
+ INIT_LIST_HEAD(&cur_trans->pending_snapshots);
+ INIT_LIST_HEAD(&cur_trans->dev_update_list);
+ INIT_LIST_HEAD(&cur_trans->switch_commits);
+ INIT_LIST_HEAD(&cur_trans->dirty_bgs);
+ INIT_LIST_HEAD(&cur_trans->io_bgs);
+ INIT_LIST_HEAD(&cur_trans->dropped_roots);
+ mutex_init(&cur_trans->cache_write_mutex);
+ spin_lock_init(&cur_trans->dirty_bgs_lock);
+ INIT_LIST_HEAD(&cur_trans->deleted_bgs);
+ spin_lock_init(&cur_trans->dropped_roots_lock);
+ INIT_LIST_HEAD(&cur_trans->releasing_ebs);
+ spin_lock_init(&cur_trans->releasing_ebs_lock);
+ list_add_tail(&cur_trans->list, &fs_info->trans_list);
+ extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
+ IO_TREE_TRANS_DIRTY_PAGES, NULL);
+ extent_io_tree_init(fs_info, &cur_trans->pinned_extents,
+ IO_TREE_FS_PINNED_EXTENTS, NULL);
+ fs_info->generation++;
+ cur_trans->transid = fs_info->generation;
+ fs_info->running_transaction = cur_trans;
+ cur_trans->aborted = 0;
+ spin_unlock(&fs_info->trans_lock);
+
+ return 0;
+}
+
+/*
+ * This does all the record keeping required to make sure that a shareable root
+ * is properly recorded in a given transaction. This is required to make sure
+ * the old root from before we joined the transaction is deleted when the
+ * transaction commits.
+ */
+static int record_root_in_trans(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ int force)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret = 0;
+
+ if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
+ root->last_trans < trans->transid) || force) {
+ WARN_ON(!force && root->commit_root != root->node);
+
+ /*
+ * see below for IN_TRANS_SETUP usage rules
+ * we have the reloc mutex held now, so there
+ * is only one writer in this function
+ */
+ set_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
+
+ /* make sure readers find IN_TRANS_SETUP before
+ * they find our root->last_trans update
+ */
+ smp_wmb();
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ if (root->last_trans == trans->transid && !force) {
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ return 0;
+ }
+ radix_tree_tag_set(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ BTRFS_ROOT_TRANS_TAG);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ root->last_trans = trans->transid;
+
+ /* this is pretty tricky. We don't want to
+ * take the relocation lock in btrfs_record_root_in_trans
+ * unless we're really doing the first setup for this root in
+ * this transaction.
+ *
+ * Normally we'd use root->last_trans as a flag to decide
+ * if we want to take the expensive mutex.
+ *
+ * But, we have to set root->last_trans before we
+ * init the relocation root, otherwise, we trip over warnings
+ * in ctree.c. The solution used here is to flag ourselves
+ * with root IN_TRANS_SETUP. When this is 1, we're still
+ * fixing up the reloc trees and everyone must wait.
+ *
+ * When this is zero, they can trust root->last_trans and fly
+ * through btrfs_record_root_in_trans without having to take the
+ * lock. smp_wmb() makes sure that all the writes above are
+ * done before we pop in the zero below
+ */
+ ret = btrfs_init_reloc_root(trans, root);
+ smp_mb__before_atomic();
+ clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
+ }
+ return ret;
+}
+
+
+void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_transaction *cur_trans = trans->transaction;
+
+ /* Add ourselves to the transaction dropped list */
+ spin_lock(&cur_trans->dropped_roots_lock);
+ list_add_tail(&root->root_list, &cur_trans->dropped_roots);
+ spin_unlock(&cur_trans->dropped_roots_lock);
+
+ /* Make sure we don't try to update the root at commit time */
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ radix_tree_tag_clear(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ BTRFS_ROOT_TRANS_TAG);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+}
+
+int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret;
+
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
+ return 0;
+
+ /*
+ * see record_root_in_trans for comments about IN_TRANS_SETUP usage
+ * and barriers
+ */
+ smp_rmb();
+ if (root->last_trans == trans->transid &&
+ !test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state))
+ return 0;
+
+ mutex_lock(&fs_info->reloc_mutex);
+ ret = record_root_in_trans(trans, root, 0);
+ mutex_unlock(&fs_info->reloc_mutex);
+
+ return ret;
+}
+
+static inline int is_transaction_blocked(struct btrfs_transaction *trans)
+{
+ return (trans->state >= TRANS_STATE_COMMIT_START &&
+ trans->state < TRANS_STATE_UNBLOCKED &&
+ !TRANS_ABORTED(trans));
+}
+
+/* wait for commit against the current transaction to become unblocked
+ * when this is done, it is safe to start a new transaction, but the current
+ * transaction might not be fully on disk.
+ */
+static void wait_current_trans(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_transaction *cur_trans;
+
+ spin_lock(&fs_info->trans_lock);
+ cur_trans = fs_info->running_transaction;
+ if (cur_trans && is_transaction_blocked(cur_trans)) {
+ refcount_inc(&cur_trans->use_count);
+ spin_unlock(&fs_info->trans_lock);
+
+ btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
+ wait_event(fs_info->transaction_wait,
+ cur_trans->state >= TRANS_STATE_UNBLOCKED ||
+ TRANS_ABORTED(cur_trans));
+ btrfs_put_transaction(cur_trans);
+ } else {
+ spin_unlock(&fs_info->trans_lock);
+ }
+}
+
+static int may_wait_transaction(struct btrfs_fs_info *fs_info, int type)
+{
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
+ return 0;
+
+ if (type == TRANS_START)
+ return 1;
+
+ return 0;
+}
+
+static inline bool need_reserve_reloc_root(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ if (!fs_info->reloc_ctl ||
+ !test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
+ root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
+ root->reloc_root)
+ return false;
+
+ return true;
+}
+
+static struct btrfs_trans_handle *
+start_transaction(struct btrfs_root *root, unsigned int num_items,
+ unsigned int type, enum btrfs_reserve_flush_enum flush,
+ bool enforce_qgroups)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+ struct btrfs_trans_handle *h;
+ struct btrfs_transaction *cur_trans;
+ u64 num_bytes = 0;
+ u64 qgroup_reserved = 0;
+ bool reloc_reserved = false;
+ bool do_chunk_alloc = false;
+ int ret;
+
+ if (BTRFS_FS_ERROR(fs_info))
+ return ERR_PTR(-EROFS);
+
+ if (current->journal_info) {
+ WARN_ON(type & TRANS_EXTWRITERS);
+ h = current->journal_info;
+ refcount_inc(&h->use_count);
+ WARN_ON(refcount_read(&h->use_count) > 2);
+ h->orig_rsv = h->block_rsv;
+ h->block_rsv = NULL;
+ goto got_it;
+ }
+
+ /*
+ * Do the reservation before we join the transaction so we can do all
+ * the appropriate flushing if need be.
+ */
+ if (num_items && root != fs_info->chunk_root) {
+ struct btrfs_block_rsv *rsv = &fs_info->trans_block_rsv;
+ u64 delayed_refs_bytes = 0;
+
+ qgroup_reserved = num_items * fs_info->nodesize;
+ /*
+ * Use prealloc for now, as there might be a currently running
+ * transaction that could free this reserved space prematurely
+ * by committing.
+ */
+ ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserved,
+ enforce_qgroups, false);
+ if (ret)
+ return ERR_PTR(ret);
+
+ /*
+ * We want to reserve all the bytes we may need all at once, so
+ * we only do 1 enospc flushing cycle per transaction start. We
+ * accomplish this by simply assuming we'll do 2 x num_items
+ * worth of delayed refs updates in this trans handle, and
+ * refill that amount for whatever is missing in the reserve.
+ */
+ num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
+ if (flush == BTRFS_RESERVE_FLUSH_ALL &&
+ btrfs_block_rsv_full(delayed_refs_rsv) == 0) {
+ delayed_refs_bytes = num_bytes;
+ num_bytes <<= 1;
+ }
+
+ /*
+ * Do the reservation for the relocation root creation
+ */
+ if (need_reserve_reloc_root(root)) {
+ num_bytes += fs_info->nodesize;
+ reloc_reserved = true;
+ }
+
+ ret = btrfs_reserve_metadata_bytes(fs_info, rsv, num_bytes, flush);
+ if (ret)
+ goto reserve_fail;
+ if (delayed_refs_bytes) {
+ btrfs_migrate_to_delayed_refs_rsv(fs_info, delayed_refs_bytes);
+ num_bytes -= delayed_refs_bytes;
+ }
+ btrfs_block_rsv_add_bytes(rsv, num_bytes, true);
+
+ if (rsv->space_info->force_alloc)
+ do_chunk_alloc = true;
+ } else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL &&
+ !btrfs_block_rsv_full(delayed_refs_rsv)) {
+ /*
+ * Some people call with btrfs_start_transaction(root, 0)
+ * because they can be throttled, but have some other mechanism
+ * for reserving space. We still want these guys to refill the
+ * delayed block_rsv so just add 1 items worth of reservation
+ * here.
+ */
+ ret = btrfs_delayed_refs_rsv_refill(fs_info, flush);
+ if (ret)
+ goto reserve_fail;
+ }
+again:
+ h = kmem_cache_zalloc(btrfs_trans_handle_cachep, GFP_NOFS);
+ if (!h) {
+ ret = -ENOMEM;
+ goto alloc_fail;
+ }
+
+ /*
+ * If we are JOIN_NOLOCK we're already committing a transaction and
+ * waiting on this guy, so we don't need to do the sb_start_intwrite
+ * because we're already holding a ref. We need this because we could
+ * have raced in and did an fsync() on a file which can kick a commit
+ * and then we deadlock with somebody doing a freeze.
+ *
+ * If we are ATTACH, it means we just want to catch the current
+ * transaction and commit it, so we needn't do sb_start_intwrite().
+ */
+ if (type & __TRANS_FREEZABLE)
+ sb_start_intwrite(fs_info->sb);
+
+ if (may_wait_transaction(fs_info, type))
+ wait_current_trans(fs_info);
+
+ do {
+ ret = join_transaction(fs_info, type);
+ if (ret == -EBUSY) {
+ wait_current_trans(fs_info);
+ if (unlikely(type == TRANS_ATTACH ||
+ type == TRANS_JOIN_NOSTART))
+ ret = -ENOENT;
+ }
+ } while (ret == -EBUSY);
+
+ if (ret < 0)
+ goto join_fail;
+
+ cur_trans = fs_info->running_transaction;
+
+ h->transid = cur_trans->transid;
+ h->transaction = cur_trans;
+ refcount_set(&h->use_count, 1);
+ h->fs_info = root->fs_info;
+
+ h->type = type;
+ INIT_LIST_HEAD(&h->new_bgs);
+
+ smp_mb();
+ if (cur_trans->state >= TRANS_STATE_COMMIT_START &&
+ may_wait_transaction(fs_info, type)) {
+ current->journal_info = h;
+ btrfs_commit_transaction(h);
+ goto again;
+ }
+
+ if (num_bytes) {
+ trace_btrfs_space_reservation(fs_info, "transaction",
+ h->transid, num_bytes, 1);
+ h->block_rsv = &fs_info->trans_block_rsv;
+ h->bytes_reserved = num_bytes;
+ h->reloc_reserved = reloc_reserved;
+ }
+
+ /*
+ * Now that we have found a transaction to be a part of, convert the
+ * qgroup reservation from prealloc to pertrans. A different transaction
+ * can't race in and free our pertrans out from under us.
+ */
+ if (qgroup_reserved)
+ btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
+
+got_it:
+ if (!current->journal_info)
+ current->journal_info = h;
+
+ /*
+ * If the space_info is marked ALLOC_FORCE then we'll get upgraded to
+ * ALLOC_FORCE the first run through, and then we won't allocate for
+ * anybody else who races in later. We don't care about the return
+ * value here.
+ */
+ if (do_chunk_alloc && num_bytes) {
+ u64 flags = h->block_rsv->space_info->flags;
+
+ btrfs_chunk_alloc(h, btrfs_get_alloc_profile(fs_info, flags),
+ CHUNK_ALLOC_NO_FORCE);
+ }
+
+ /*
+ * btrfs_record_root_in_trans() needs to alloc new extents, and may
+ * call btrfs_join_transaction() while we're also starting a
+ * transaction.
+ *
+ * Thus it need to be called after current->journal_info initialized,
+ * or we can deadlock.
+ */
+ ret = btrfs_record_root_in_trans(h, root);
+ if (ret) {
+ /*
+ * The transaction handle is fully initialized and linked with
+ * other structures so it needs to be ended in case of errors,
+ * not just freed.
+ */
+ btrfs_end_transaction(h);
+ return ERR_PTR(ret);
+ }
+
+ return h;
+
+join_fail:
+ if (type & __TRANS_FREEZABLE)
+ sb_end_intwrite(fs_info->sb);
+ kmem_cache_free(btrfs_trans_handle_cachep, h);
+alloc_fail:
+ if (num_bytes)
+ btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv,
+ num_bytes, NULL);
+reserve_fail:
+ btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
+ return ERR_PTR(ret);
+}
+
+struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
+ unsigned int num_items)
+{
+ return start_transaction(root, num_items, TRANS_START,
+ BTRFS_RESERVE_FLUSH_ALL, true);
+}
+
+struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
+ struct btrfs_root *root,
+ unsigned int num_items)
+{
+ return start_transaction(root, num_items, TRANS_START,
+ BTRFS_RESERVE_FLUSH_ALL_STEAL, false);
+}
+
+struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
+{
+ return start_transaction(root, 0, TRANS_JOIN, BTRFS_RESERVE_NO_FLUSH,
+ true);
+}
+
+struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root *root)
+{
+ return start_transaction(root, 0, TRANS_JOIN_NOLOCK,
+ BTRFS_RESERVE_NO_FLUSH, true);
+}
+
+/*
+ * Similar to regular join but it never starts a transaction when none is
+ * running or after waiting for the current one to finish.
+ */
+struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root)
+{
+ return start_transaction(root, 0, TRANS_JOIN_NOSTART,
+ BTRFS_RESERVE_NO_FLUSH, true);
+}
+
+/*
+ * btrfs_attach_transaction() - catch the running transaction
+ *
+ * It is used when we want to commit the current the transaction, but
+ * don't want to start a new one.
+ *
+ * Note: If this function return -ENOENT, it just means there is no
+ * running transaction. But it is possible that the inactive transaction
+ * is still in the memory, not fully on disk. If you hope there is no
+ * inactive transaction in the fs when -ENOENT is returned, you should
+ * invoke
+ * btrfs_attach_transaction_barrier()
+ */
+struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
+{
+ return start_transaction(root, 0, TRANS_ATTACH,
+ BTRFS_RESERVE_NO_FLUSH, true);
+}
+
+/*
+ * btrfs_attach_transaction_barrier() - catch the running transaction
+ *
+ * It is similar to the above function, the difference is this one
+ * will wait for all the inactive transactions until they fully
+ * complete.
+ */
+struct btrfs_trans_handle *
+btrfs_attach_transaction_barrier(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle *trans;
+
+ trans = start_transaction(root, 0, TRANS_ATTACH,
+ BTRFS_RESERVE_NO_FLUSH, true);
+ if (trans == ERR_PTR(-ENOENT)) {
+ int ret;
+
+ ret = btrfs_wait_for_commit(root->fs_info, 0);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+
+ return trans;
+}
+
+/* Wait for a transaction commit to reach at least the given state. */
+static noinline void wait_for_commit(struct btrfs_transaction *commit,
+ const enum btrfs_trans_state min_state)
+{
+ struct btrfs_fs_info *fs_info = commit->fs_info;
+ u64 transid = commit->transid;
+ bool put = false;
+
+ /*
+ * At the moment this function is called with min_state either being
+ * TRANS_STATE_COMPLETED or TRANS_STATE_SUPER_COMMITTED.
+ */
+ if (min_state == TRANS_STATE_COMPLETED)
+ btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
+ else
+ btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
+
+ while (1) {
+ wait_event(commit->commit_wait, commit->state >= min_state);
+ if (put)
+ btrfs_put_transaction(commit);
+
+ if (min_state < TRANS_STATE_COMPLETED)
+ break;
+
+ /*
+ * A transaction isn't really completed until all of the
+ * previous transactions are completed, but with fsync we can
+ * end up with SUPER_COMMITTED transactions before a COMPLETED
+ * transaction. Wait for those.
+ */
+
+ spin_lock(&fs_info->trans_lock);
+ commit = list_first_entry_or_null(&fs_info->trans_list,
+ struct btrfs_transaction,
+ list);
+ if (!commit || commit->transid > transid) {
+ spin_unlock(&fs_info->trans_lock);
+ break;
+ }
+ refcount_inc(&commit->use_count);
+ put = true;
+ spin_unlock(&fs_info->trans_lock);
+ }
+}
+
+int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
+{
+ struct btrfs_transaction *cur_trans = NULL, *t;
+ int ret = 0;
+
+ if (transid) {
+ if (transid <= fs_info->last_trans_committed)
+ goto out;
+
+ /* find specified transaction */
+ spin_lock(&fs_info->trans_lock);
+ list_for_each_entry(t, &fs_info->trans_list, list) {
+ if (t->transid == transid) {
+ cur_trans = t;
+ refcount_inc(&cur_trans->use_count);
+ ret = 0;
+ break;
+ }
+ if (t->transid > transid) {
+ ret = 0;
+ break;
+ }
+ }
+ spin_unlock(&fs_info->trans_lock);
+
+ /*
+ * The specified transaction doesn't exist, or we
+ * raced with btrfs_commit_transaction
+ */
+ if (!cur_trans) {
+ if (transid > fs_info->last_trans_committed)
+ ret = -EINVAL;
+ goto out;
+ }
+ } else {
+ /* find newest transaction that is committing | committed */
+ spin_lock(&fs_info->trans_lock);
+ list_for_each_entry_reverse(t, &fs_info->trans_list,
+ list) {
+ if (t->state >= TRANS_STATE_COMMIT_START) {
+ if (t->state == TRANS_STATE_COMPLETED)
+ break;
+ cur_trans = t;
+ refcount_inc(&cur_trans->use_count);
+ break;
+ }
+ }
+ spin_unlock(&fs_info->trans_lock);
+ if (!cur_trans)
+ goto out; /* nothing committing|committed */
+ }
+
+ wait_for_commit(cur_trans, TRANS_STATE_COMPLETED);
+ ret = cur_trans->aborted;
+ btrfs_put_transaction(cur_trans);
+out:
+ return ret;
+}
+
+void btrfs_throttle(struct btrfs_fs_info *fs_info)
+{
+ wait_current_trans(fs_info);
+}
+
+static bool should_end_transaction(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+
+ if (btrfs_check_space_for_delayed_refs(fs_info))
+ return true;
+
+ return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5);
+}
+
+bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_transaction *cur_trans = trans->transaction;
+
+ if (cur_trans->state >= TRANS_STATE_COMMIT_START ||
+ test_bit(BTRFS_DELAYED_REFS_FLUSHING, &cur_trans->delayed_refs.flags))
+ return true;
+
+ return should_end_transaction(trans);
+}
+
+static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)
+
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+
+ if (!trans->block_rsv) {
+ ASSERT(!trans->bytes_reserved);
+ return;
+ }
+
+ if (!trans->bytes_reserved)
+ return;
+
+ ASSERT(trans->block_rsv == &fs_info->trans_block_rsv);
+ trace_btrfs_space_reservation(fs_info, "transaction",
+ trans->transid, trans->bytes_reserved, 0);
+ btrfs_block_rsv_release(fs_info, trans->block_rsv,
+ trans->bytes_reserved, NULL);
+ trans->bytes_reserved = 0;
+}
+
+static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
+ int throttle)
+{
+ struct btrfs_fs_info *info = trans->fs_info;
+ struct btrfs_transaction *cur_trans = trans->transaction;
+ int err = 0;
+
+ if (refcount_read(&trans->use_count) > 1) {
+ refcount_dec(&trans->use_count);
+ trans->block_rsv = trans->orig_rsv;
+ return 0;
+ }
+
+ btrfs_trans_release_metadata(trans);
+ trans->block_rsv = NULL;
+
+ btrfs_create_pending_block_groups(trans);
+
+ btrfs_trans_release_chunk_metadata(trans);
+
+ if (trans->type & __TRANS_FREEZABLE)
+ sb_end_intwrite(info->sb);
+
+ WARN_ON(cur_trans != info->running_transaction);
+ WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
+ atomic_dec(&cur_trans->num_writers);
+ extwriter_counter_dec(cur_trans, trans->type);
+
+ cond_wake_up(&cur_trans->writer_wait);
+
+ btrfs_lockdep_release(info, btrfs_trans_num_extwriters);
+ btrfs_lockdep_release(info, btrfs_trans_num_writers);
+
+ btrfs_put_transaction(cur_trans);
+
+ if (current->journal_info == trans)
+ current->journal_info = NULL;
+
+ if (throttle)
+ btrfs_run_delayed_iputs(info);
+
+ if (TRANS_ABORTED(trans) || BTRFS_FS_ERROR(info)) {
+ wake_up_process(info->transaction_kthread);
+ if (TRANS_ABORTED(trans))
+ err = trans->aborted;
+ else
+ err = -EROFS;
+ }
+
+ kmem_cache_free(btrfs_trans_handle_cachep, trans);
+ return err;
+}
+
+int btrfs_end_transaction(struct btrfs_trans_handle *trans)
+{
+ return __btrfs_end_transaction(trans, 0);
+}
+
+int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans)
+{
+ return __btrfs_end_transaction(trans, 1);
+}
+
+/*
+ * when btree blocks are allocated, they have some corresponding bits set for
+ * them in one of two extent_io trees. This is used to make sure all of
+ * those extents are sent to disk but does not wait on them
+ */
+int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *dirty_pages, int mark)
+{
+ int err = 0;
+ int werr = 0;
+ struct address_space *mapping = fs_info->btree_inode->i_mapping;
+ struct extent_state *cached_state = NULL;
+ u64 start = 0;
+ u64 end;
+
+ atomic_inc(&BTRFS_I(fs_info->btree_inode)->sync_writers);
+ while (!find_first_extent_bit(dirty_pages, start, &start, &end,
+ mark, &cached_state)) {
+ bool wait_writeback = false;
+
+ err = convert_extent_bit(dirty_pages, start, end,
+ EXTENT_NEED_WAIT,
+ mark, &cached_state);
+ /*
+ * convert_extent_bit can return -ENOMEM, which is most of the
+ * time a temporary error. So when it happens, ignore the error
+ * and wait for writeback of this range to finish - because we
+ * failed to set the bit EXTENT_NEED_WAIT for the range, a call
+ * to __btrfs_wait_marked_extents() would not know that
+ * writeback for this range started and therefore wouldn't
+ * wait for it to finish - we don't want to commit a
+ * superblock that points to btree nodes/leafs for which
+ * writeback hasn't finished yet (and without errors).
+ * We cleanup any entries left in the io tree when committing
+ * the transaction (through extent_io_tree_release()).
+ */
+ if (err == -ENOMEM) {
+ err = 0;
+ wait_writeback = true;
+ }
+ if (!err)
+ err = filemap_fdatawrite_range(mapping, start, end);
+ if (err)
+ werr = err;
+ else if (wait_writeback)
+ werr = filemap_fdatawait_range(mapping, start, end);
+ free_extent_state(cached_state);
+ cached_state = NULL;
+ cond_resched();
+ start = end + 1;
+ }
+ atomic_dec(&BTRFS_I(fs_info->btree_inode)->sync_writers);
+ return werr;
+}
+
+/*
+ * when btree blocks are allocated, they have some corresponding bits set for
+ * them in one of two extent_io trees. This is used to make sure all of
+ * those extents are on disk for transaction or log commit. We wait
+ * on all the pages and clear them from the dirty pages state tree
+ */
+static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *dirty_pages)
+{
+ int err = 0;
+ int werr = 0;
+ struct address_space *mapping = fs_info->btree_inode->i_mapping;
+ struct extent_state *cached_state = NULL;
+ u64 start = 0;
+ u64 end;
+
+ while (!find_first_extent_bit(dirty_pages, start, &start, &end,
+ EXTENT_NEED_WAIT, &cached_state)) {
+ /*
+ * Ignore -ENOMEM errors returned by clear_extent_bit().
+ * When committing the transaction, we'll remove any entries
+ * left in the io tree. For a log commit, we don't remove them
+ * after committing the log because the tree can be accessed
+ * concurrently - we do it only at transaction commit time when
+ * it's safe to do it (through extent_io_tree_release()).
+ */
+ err = clear_extent_bit(dirty_pages, start, end,
+ EXTENT_NEED_WAIT, &cached_state);
+ if (err == -ENOMEM)
+ err = 0;
+ if (!err)
+ err = filemap_fdatawait_range(mapping, start, end);
+ if (err)
+ werr = err;
+ free_extent_state(cached_state);
+ cached_state = NULL;
+ cond_resched();
+ start = end + 1;
+ }
+ if (err)
+ werr = err;
+ return werr;
+}
+
+static int btrfs_wait_extents(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *dirty_pages)
+{
+ bool errors = false;
+ int err;
+
+ err = __btrfs_wait_marked_extents(fs_info, dirty_pages);
+ if (test_and_clear_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags))
+ errors = true;
+
+ if (errors && !err)
+ err = -EIO;
+ return err;
+}
+
+int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark)
+{
+ struct btrfs_fs_info *fs_info = log_root->fs_info;
+ struct extent_io_tree *dirty_pages = &log_root->dirty_log_pages;
+ bool errors = false;
+ int err;
+
+ ASSERT(log_root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
+
+ err = __btrfs_wait_marked_extents(fs_info, dirty_pages);
+ if ((mark & EXTENT_DIRTY) &&
+ test_and_clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags))
+ errors = true;
+
+ if ((mark & EXTENT_NEW) &&
+ test_and_clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags))
+ errors = true;
+
+ if (errors && !err)
+ err = -EIO;
+ return err;
+}
+
+/*
+ * When btree blocks are allocated the corresponding extents are marked dirty.
+ * This function ensures such extents are persisted on disk for transaction or
+ * log commit.
+ *
+ * @trans: transaction whose dirty pages we'd like to write
+ */
+static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans)
+{
+ int ret;
+ int ret2;
+ struct extent_io_tree *dirty_pages = &trans->transaction->dirty_pages;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct blk_plug plug;
+
+ blk_start_plug(&plug);
+ ret = btrfs_write_marked_extents(fs_info, dirty_pages, EXTENT_DIRTY);
+ blk_finish_plug(&plug);
+ ret2 = btrfs_wait_extents(fs_info, dirty_pages);
+
+ extent_io_tree_release(&trans->transaction->dirty_pages);
+
+ if (ret)
+ return ret;
+ else if (ret2)
+ return ret2;
+ else
+ return 0;
+}
+
+/*
+ * this is used to update the root pointer in the tree of tree roots.
+ *
+ * But, in the case of the extent allocation tree, updating the root
+ * pointer may allocate blocks which may change the root of the extent
+ * allocation tree.
+ *
+ * So, this loops and repeats and makes sure the cowonly root didn't
+ * change while the root pointer was being updated in the metadata.
+ */
+static int update_cowonly_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ int ret;
+ u64 old_root_bytenr;
+ u64 old_root_used;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+
+ old_root_used = btrfs_root_used(&root->root_item);
+
+ while (1) {
+ old_root_bytenr = btrfs_root_bytenr(&root->root_item);
+ if (old_root_bytenr == root->node->start &&
+ old_root_used == btrfs_root_used(&root->root_item))
+ break;
+
+ btrfs_set_root_node(&root->root_item, root->node);
+ ret = btrfs_update_root(trans, tree_root,
+ &root->root_key,
+ &root->root_item);
+ if (ret)
+ return ret;
+
+ old_root_used = btrfs_root_used(&root->root_item);
+ }
+
+ return 0;
+}
+
+/*
+ * update all the cowonly tree roots on disk
+ *
+ * The error handling in this function may not be obvious. Any of the
+ * failures will cause the file system to go offline. We still need
+ * to clean up the delayed refs.
+ */
+static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct list_head *dirty_bgs = &trans->transaction->dirty_bgs;
+ struct list_head *io_bgs = &trans->transaction->io_bgs;
+ struct list_head *next;
+ struct extent_buffer *eb;
+ int ret;
+
+ /*
+ * At this point no one can be using this transaction to modify any tree
+ * and no one can start another transaction to modify any tree either.
+ */
+ ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING);
+
+ eb = btrfs_lock_root_node(fs_info->tree_root);
+ ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
+ 0, &eb, BTRFS_NESTING_COW);
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+
+ if (ret)
+ return ret;
+
+ ret = btrfs_run_dev_stats(trans);
+ if (ret)
+ return ret;
+ ret = btrfs_run_dev_replace(trans);
+ if (ret)
+ return ret;
+ ret = btrfs_run_qgroups(trans);
+ if (ret)
+ return ret;
+
+ ret = btrfs_setup_space_cache(trans);
+ if (ret)
+ return ret;
+
+again:
+ while (!list_empty(&fs_info->dirty_cowonly_roots)) {
+ struct btrfs_root *root;
+ next = fs_info->dirty_cowonly_roots.next;
+ list_del_init(next);
+ root = list_entry(next, struct btrfs_root, dirty_list);
+ clear_bit(BTRFS_ROOT_DIRTY, &root->state);
+
+ list_add_tail(&root->dirty_list,
+ &trans->transaction->switch_commits);
+ ret = update_cowonly_root(trans, root);
+ if (ret)
+ return ret;
+ }
+
+ /* Now flush any delayed refs generated by updating all of the roots */
+ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+ if (ret)
+ return ret;
+
+ while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) {
+ ret = btrfs_write_dirty_block_groups(trans);
+ if (ret)
+ return ret;
+
+ /*
+ * We're writing the dirty block groups, which could generate
+ * delayed refs, which could generate more dirty block groups,
+ * so we want to keep this flushing in this loop to make sure
+ * everything gets run.
+ */
+ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+ if (ret)
+ return ret;
+ }
+
+ if (!list_empty(&fs_info->dirty_cowonly_roots))
+ goto again;
+
+ /* Update dev-replace pointer once everything is committed */
+ fs_info->dev_replace.committed_cursor_left =
+ fs_info->dev_replace.cursor_left_last_write_of_item;
+
+ return 0;
+}
+
+/*
+ * If we had a pending drop we need to see if there are any others left in our
+ * dead roots list, and if not clear our bit and wake any waiters.
+ */
+void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
+{
+ /*
+ * We put the drop in progress roots at the front of the list, so if the
+ * first entry doesn't have UNFINISHED_DROP set we can wake everybody
+ * up.
+ */
+ spin_lock(&fs_info->trans_lock);
+ if (!list_empty(&fs_info->dead_roots)) {
+ struct btrfs_root *root = list_first_entry(&fs_info->dead_roots,
+ struct btrfs_root,
+ root_list);
+ if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) {
+ spin_unlock(&fs_info->trans_lock);
+ return;
+ }
+ }
+ spin_unlock(&fs_info->trans_lock);
+
+ btrfs_wake_unfinished_drop(fs_info);
+}
+
+/*
+ * dead roots are old snapshots that need to be deleted. This allocates
+ * a dirty root struct and adds it into the list of dead roots that need to
+ * be deleted
+ */
+void btrfs_add_dead_root(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ spin_lock(&fs_info->trans_lock);
+ if (list_empty(&root->root_list)) {
+ btrfs_grab_root(root);
+
+ /* We want to process the partially complete drops first. */
+ if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state))
+ list_add(&root->root_list, &fs_info->dead_roots);
+ else
+ list_add_tail(&root->root_list, &fs_info->dead_roots);
+ }
+ spin_unlock(&fs_info->trans_lock);
+}
+
+/*
+ * Update each subvolume root and its relocation root, if it exists, in the tree
+ * of tree roots. Also free log roots if they exist.
+ */
+static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *gang[8];
+ int i;
+ int ret;
+
+ /*
+ * At this point no one can be using this transaction to modify any tree
+ * and no one can start another transaction to modify any tree either.
+ */
+ ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING);
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ while (1) {
+ ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
+ (void **)gang, 0,
+ ARRAY_SIZE(gang),
+ BTRFS_ROOT_TRANS_TAG);
+ if (ret == 0)
+ break;
+ for (i = 0; i < ret; i++) {
+ struct btrfs_root *root = gang[i];
+ int ret2;
+
+ /*
+ * At this point we can neither have tasks logging inodes
+ * from a root nor trying to commit a log tree.
+ */
+ ASSERT(atomic_read(&root->log_writers) == 0);
+ ASSERT(atomic_read(&root->log_commit[0]) == 0);
+ ASSERT(atomic_read(&root->log_commit[1]) == 0);
+
+ radix_tree_tag_clear(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ BTRFS_ROOT_TRANS_TAG);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+
+ btrfs_free_log(trans, root);
+ ret2 = btrfs_update_reloc_root(trans, root);
+ if (ret2)
+ return ret2;
+
+ /* see comments in should_cow_block() */
+ clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
+ smp_mb__after_atomic();
+
+ if (root->commit_root != root->node) {
+ list_add_tail(&root->dirty_list,
+ &trans->transaction->switch_commits);
+ btrfs_set_root_node(&root->root_item,
+ root->node);
+ }
+
+ ret2 = btrfs_update_root(trans, fs_info->tree_root,
+ &root->root_key,
+ &root->root_item);
+ if (ret2)
+ return ret2;
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ btrfs_qgroup_free_meta_all_pertrans(root);
+ }
+ }
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ return 0;
+}
+
+/*
+ * defrag a given btree.
+ * Every leaf in the btree is read and defragged.
+ */
+int btrfs_defrag_root(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *info = root->fs_info;
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
+ return 0;
+
+ while (1) {
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+
+ ret = btrfs_defrag_leaves(trans, root);
+
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty(info);
+ cond_resched();
+
+ if (btrfs_fs_closing(info) || ret != -EAGAIN)
+ break;
+
+ if (btrfs_defrag_cancelled(info)) {
+ btrfs_debug(info, "defrag_root cancelled");
+ ret = -EAGAIN;
+ break;
+ }
+ }
+ clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
+ return ret;
+}
+
+/*
+ * Do all special snapshot related qgroup dirty hack.
+ *
+ * Will do all needed qgroup inherit and dirty hack like switch commit
+ * roots inside one transaction and write all btree into disk, to make
+ * qgroup works.
+ */
+static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_root *src,
+ struct btrfs_root *parent,
+ struct btrfs_qgroup_inherit *inherit,
+ u64 dst_objectid)
+{
+ struct btrfs_fs_info *fs_info = src->fs_info;
+ int ret;
+
+ /*
+ * Save some performance in the case that qgroups are not
+ * enabled. If this check races with the ioctl, rescan will
+ * kick in anyway.
+ */
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return 0;
+
+ /*
+ * Ensure dirty @src will be committed. Or, after coming
+ * commit_fs_roots() and switch_commit_roots(), any dirty but not
+ * recorded root will never be updated again, causing an outdated root
+ * item.
+ */
+ ret = record_root_in_trans(trans, src, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * btrfs_qgroup_inherit relies on a consistent view of the usage for the
+ * src root, so we must run the delayed refs here.
+ *
+ * However this isn't particularly fool proof, because there's no
+ * synchronization keeping us from changing the tree after this point
+ * before we do the qgroup_inherit, or even from making changes while
+ * we're doing the qgroup_inherit. But that's a problem for the future,
+ * for now flush the delayed refs to narrow the race window where the
+ * qgroup counters could end up wrong.
+ */
+ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+
+ ret = commit_fs_roots(trans);
+ if (ret)
+ goto out;
+ ret = btrfs_qgroup_account_extents(trans);
+ if (ret < 0)
+ goto out;
+
+ /* Now qgroup are all updated, we can inherit it to new qgroups */
+ ret = btrfs_qgroup_inherit(trans, src->root_key.objectid, dst_objectid,
+ inherit);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * Now we do a simplified commit transaction, which will:
+ * 1) commit all subvolume and extent tree
+ * To ensure all subvolume and extent tree have a valid
+ * commit_root to accounting later insert_dir_item()
+ * 2) write all btree blocks onto disk
+ * This is to make sure later btree modification will be cowed
+ * Or commit_root can be populated and cause wrong qgroup numbers
+ * In this simplified commit, we don't really care about other trees
+ * like chunk and root tree, as they won't affect qgroup.
+ * And we don't write super to avoid half committed status.
+ */
+ ret = commit_cowonly_roots(trans);
+ if (ret)
+ goto out;
+ switch_commit_roots(trans);
+ ret = btrfs_write_and_wait_transaction(trans);
+ if (ret)
+ btrfs_handle_fs_error(fs_info, ret,
+ "Error while writing out transaction for qgroup");
+
+out:
+ /*
+ * Force parent root to be updated, as we recorded it before so its
+ * last_trans == cur_transid.
+ * Or it won't be committed again onto disk after later
+ * insert_dir_item()
+ */
+ if (!ret)
+ ret = record_root_in_trans(trans, parent, 1);
+ return ret;
+}
+
+/*
+ * new snapshots need to be created at a very specific time in the
+ * transaction commit. This does the actual creation.
+ *
+ * Note:
+ * If the error which may affect the commitment of the current transaction
+ * happens, we should return the error number. If the error which just affect
+ * the creation of the pending snapshots, just return 0.
+ */
+static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending)
+{
+
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_key key;
+ struct btrfs_root_item *new_root_item;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_root *root = pending->root;
+ struct btrfs_root *parent_root;
+ struct btrfs_block_rsv *rsv;
+ struct inode *parent_inode = pending->dir;
+ struct btrfs_path *path;
+ struct btrfs_dir_item *dir_item;
+ struct extent_buffer *tmp;
+ struct extent_buffer *old;
+ struct timespec64 cur_time;
+ int ret = 0;
+ u64 to_reserve = 0;
+ u64 index = 0;
+ u64 objectid;
+ u64 root_flags;
+ unsigned int nofs_flags;
+ struct fscrypt_name fname;
+
+ ASSERT(pending->path);
+ path = pending->path;
+
+ ASSERT(pending->root_item);
+ new_root_item = pending->root_item;
+
+ /*
+ * We're inside a transaction and must make sure that any potential
+ * allocations with GFP_KERNEL in fscrypt won't recurse back to
+ * filesystem.
+ */
+ nofs_flags = memalloc_nofs_save();
+ pending->error = fscrypt_setup_filename(parent_inode,
+ &pending->dentry->d_name, 0,
+ &fname);
+ memalloc_nofs_restore(nofs_flags);
+ if (pending->error)
+ goto free_pending;
+
+ pending->error = btrfs_get_free_objectid(tree_root, &objectid);
+ if (pending->error)
+ goto free_fname;
+
+ /*
+ * Make qgroup to skip current new snapshot's qgroupid, as it is
+ * accounted by later btrfs_qgroup_inherit().
+ */
+ btrfs_set_skip_qgroup(trans, objectid);
+
+ btrfs_reloc_pre_snapshot(pending, &to_reserve);
+
+ if (to_reserve > 0) {
+ pending->error = btrfs_block_rsv_add(fs_info,
+ &pending->block_rsv,
+ to_reserve,
+ BTRFS_RESERVE_NO_FLUSH);
+ if (pending->error)
+ goto clear_skip_qgroup;
+ }
+
+ key.objectid = objectid;
+ key.offset = (u64)-1;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+
+ rsv = trans->block_rsv;
+ trans->block_rsv = &pending->block_rsv;
+ trans->bytes_reserved = trans->block_rsv->reserved;
+ trace_btrfs_space_reservation(fs_info, "transaction",
+ trans->transid,
+ trans->bytes_reserved, 1);
+ parent_root = BTRFS_I(parent_inode)->root;
+ ret = record_root_in_trans(trans, parent_root, 0);
+ if (ret)
+ goto fail;
+ cur_time = current_time(parent_inode);
+
+ /*
+ * insert the directory item
+ */
+ ret = btrfs_set_inode_index(BTRFS_I(parent_inode), &index);
+ BUG_ON(ret); /* -ENOMEM */
+
+ /* check if there is a file/dir which has the same name. */
+ dir_item = btrfs_lookup_dir_item(NULL, parent_root, path,
+ btrfs_ino(BTRFS_I(parent_inode)),
+ &fname.disk_name, 0);
+ if (dir_item != NULL && !IS_ERR(dir_item)) {
+ pending->error = -EEXIST;
+ goto dir_item_existed;
+ } else if (IS_ERR(dir_item)) {
+ ret = PTR_ERR(dir_item);
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
+ btrfs_release_path(path);
+
+ /*
+ * pull in the delayed directory update
+ * and the delayed inode item
+ * otherwise we corrupt the FS during
+ * snapshot
+ */
+ ret = btrfs_run_delayed_items(trans);
+ if (ret) { /* Transaction aborted */
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
+
+ ret = record_root_in_trans(trans, root, 0);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
+ btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
+ memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
+ btrfs_check_and_init_root_item(new_root_item);
+
+ root_flags = btrfs_root_flags(new_root_item);
+ if (pending->readonly)
+ root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
+ else
+ root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
+ btrfs_set_root_flags(new_root_item, root_flags);
+
+ btrfs_set_root_generation_v2(new_root_item,
+ trans->transid);
+ generate_random_guid(new_root_item->uuid);
+ memcpy(new_root_item->parent_uuid, root->root_item.uuid,
+ BTRFS_UUID_SIZE);
+ if (!(root_flags & BTRFS_ROOT_SUBVOL_RDONLY)) {
+ memset(new_root_item->received_uuid, 0,
+ sizeof(new_root_item->received_uuid));
+ memset(&new_root_item->stime, 0, sizeof(new_root_item->stime));
+ memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime));
+ btrfs_set_root_stransid(new_root_item, 0);
+ btrfs_set_root_rtransid(new_root_item, 0);
+ }
+ btrfs_set_stack_timespec_sec(&new_root_item->otime, cur_time.tv_sec);
+ btrfs_set_stack_timespec_nsec(&new_root_item->otime, cur_time.tv_nsec);
+ btrfs_set_root_otransid(new_root_item, trans->transid);
+
+ old = btrfs_lock_root_node(root);
+ ret = btrfs_cow_block(trans, root, old, NULL, 0, &old,
+ BTRFS_NESTING_COW);
+ if (ret) {
+ btrfs_tree_unlock(old);
+ free_extent_buffer(old);
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
+
+ ret = btrfs_copy_root(trans, root, old, &tmp, objectid);
+ /* clean up in any case */
+ btrfs_tree_unlock(old);
+ free_extent_buffer(old);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
+ /* see comments in should_cow_block() */
+ set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
+ smp_wmb();
+
+ btrfs_set_root_node(new_root_item, tmp);
+ /* record when the snapshot was created in key.offset */
+ key.offset = trans->transid;
+ ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
+ btrfs_tree_unlock(tmp);
+ free_extent_buffer(tmp);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
+
+ /*
+ * insert root back/forward references
+ */
+ ret = btrfs_add_root_ref(trans, objectid,
+ parent_root->root_key.objectid,
+ btrfs_ino(BTRFS_I(parent_inode)), index,
+ &fname.disk_name);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
+
+ key.offset = (u64)-1;
+ pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev);
+ if (IS_ERR(pending->snap)) {
+ ret = PTR_ERR(pending->snap);
+ pending->snap = NULL;
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
+
+ ret = btrfs_reloc_post_snapshot(trans, pending);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
+
+ /*
+ * Do special qgroup accounting for snapshot, as we do some qgroup
+ * snapshot hack to do fast snapshot.
+ * To co-operate with that hack, we do hack again.
+ * Or snapshot will be greatly slowed down by a subtree qgroup rescan
+ */
+ ret = qgroup_account_snapshot(trans, root, parent_root,
+ pending->inherit, objectid);
+ if (ret < 0)
+ goto fail;
+
+ ret = btrfs_insert_dir_item(trans, &fname.disk_name,
+ BTRFS_I(parent_inode), &key, BTRFS_FT_DIR,
+ index);
+ /* We have check then name at the beginning, so it is impossible. */
+ BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
+
+ btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size +
+ fname.disk_name.len * 2);
+ parent_inode->i_mtime = current_time(parent_inode);
+ parent_inode->i_ctime = parent_inode->i_mtime;
+ ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode));
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
+ ret = btrfs_uuid_tree_add(trans, new_root_item->uuid,
+ BTRFS_UUID_KEY_SUBVOL,
+ objectid);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
+ if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) {
+ ret = btrfs_uuid_tree_add(trans, new_root_item->received_uuid,
+ BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+ objectid);
+ if (ret && ret != -EEXIST) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
+ }
+
+fail:
+ pending->error = ret;
+dir_item_existed:
+ trans->block_rsv = rsv;
+ trans->bytes_reserved = 0;
+clear_skip_qgroup:
+ btrfs_clear_skip_qgroup(trans);
+free_fname:
+ fscrypt_free_filename(&fname);
+free_pending:
+ kfree(new_root_item);
+ pending->root_item = NULL;
+ btrfs_free_path(path);
+ pending->path = NULL;
+
+ return ret;
+}
+
+/*
+ * create all the snapshots we've scheduled for creation
+ */
+static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_pending_snapshot *pending, *next;
+ struct list_head *head = &trans->transaction->pending_snapshots;
+ int ret = 0;
+
+ list_for_each_entry_safe(pending, next, head, list) {
+ list_del(&pending->list);
+ ret = create_pending_snapshot(trans, pending);
+ if (ret)
+ break;
+ }
+ return ret;
+}
+
+static void update_super_roots(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root_item *root_item;
+ struct btrfs_super_block *super;
+
+ super = fs_info->super_copy;
+
+ root_item = &fs_info->chunk_root->root_item;
+ super->chunk_root = root_item->bytenr;
+ super->chunk_root_generation = root_item->generation;
+ super->chunk_root_level = root_item->level;
+
+ root_item = &fs_info->tree_root->root_item;
+ super->root = root_item->bytenr;
+ super->generation = root_item->generation;
+ super->root_level = root_item->level;
+ if (btrfs_test_opt(fs_info, SPACE_CACHE))
+ super->cache_generation = root_item->generation;
+ else if (test_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags))
+ super->cache_generation = 0;
+ if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags))
+ super->uuid_tree_generation = root_item->generation;
+}
+
+int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
+{
+ struct btrfs_transaction *trans;
+ int ret = 0;
+
+ spin_lock(&info->trans_lock);
+ trans = info->running_transaction;
+ if (trans)
+ ret = (trans->state >= TRANS_STATE_COMMIT_START);
+ spin_unlock(&info->trans_lock);
+ return ret;
+}
+
+int btrfs_transaction_blocked(struct btrfs_fs_info *info)
+{
+ struct btrfs_transaction *trans;
+ int ret = 0;
+
+ spin_lock(&info->trans_lock);
+ trans = info->running_transaction;
+ if (trans)
+ ret = is_transaction_blocked(trans);
+ spin_unlock(&info->trans_lock);
+ return ret;
+}
+
+void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_transaction *cur_trans;
+
+ /* Kick the transaction kthread. */
+ set_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags);
+ wake_up_process(fs_info->transaction_kthread);
+
+ /* take transaction reference */
+ cur_trans = trans->transaction;
+ refcount_inc(&cur_trans->use_count);
+
+ btrfs_end_transaction(trans);
+
+ /*
+ * Wait for the current transaction commit to start and block
+ * subsequent transaction joins
+ */
+ btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
+ wait_event(fs_info->transaction_blocked_wait,
+ cur_trans->state >= TRANS_STATE_COMMIT_START ||
+ TRANS_ABORTED(cur_trans));
+ btrfs_put_transaction(cur_trans);
+}
+
+static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_transaction *cur_trans = trans->transaction;
+
+ WARN_ON(refcount_read(&trans->use_count) > 1);
+
+ btrfs_abort_transaction(trans, err);
+
+ spin_lock(&fs_info->trans_lock);
+
+ /*
+ * If the transaction is removed from the list, it means this
+ * transaction has been committed successfully, so it is impossible
+ * to call the cleanup function.
+ */
+ BUG_ON(list_empty(&cur_trans->list));
+
+ if (cur_trans == fs_info->running_transaction) {
+ cur_trans->state = TRANS_STATE_COMMIT_DOING;
+ spin_unlock(&fs_info->trans_lock);
+
+ /*
+ * The thread has already released the lockdep map as reader
+ * already in btrfs_commit_transaction().
+ */
+ btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers);
+ wait_event(cur_trans->writer_wait,
+ atomic_read(&cur_trans->num_writers) == 1);
+
+ spin_lock(&fs_info->trans_lock);
+ }
+
+ /*
+ * Now that we know no one else is still using the transaction we can
+ * remove the transaction from the list of transactions. This avoids
+ * the transaction kthread from cleaning up the transaction while some
+ * other task is still using it, which could result in a use-after-free
+ * on things like log trees, as it forces the transaction kthread to
+ * wait for this transaction to be cleaned up by us.
+ */
+ list_del_init(&cur_trans->list);
+
+ spin_unlock(&fs_info->trans_lock);
+
+ btrfs_cleanup_one_transaction(trans->transaction, fs_info);
+
+ spin_lock(&fs_info->trans_lock);
+ if (cur_trans == fs_info->running_transaction)
+ fs_info->running_transaction = NULL;
+ spin_unlock(&fs_info->trans_lock);
+
+ if (trans->type & __TRANS_FREEZABLE)
+ sb_end_intwrite(fs_info->sb);
+ btrfs_put_transaction(cur_trans);
+ btrfs_put_transaction(cur_trans);
+
+ trace_btrfs_transaction_commit(fs_info);
+
+ if (current->journal_info == trans)
+ current->journal_info = NULL;
+
+ /*
+ * If relocation is running, we can't cancel scrub because that will
+ * result in a deadlock. Before relocating a block group, relocation
+ * pauses scrub, then starts and commits a transaction before unpausing
+ * scrub. If the transaction commit is being done by the relocation
+ * task or triggered by another task and the relocation task is waiting
+ * for the commit, and we end up here due to an error in the commit
+ * path, then calling btrfs_scrub_cancel() will deadlock, as we are
+ * asking for scrub to stop while having it asked to be paused higher
+ * above in relocation code.
+ */
+ if (!test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags))
+ btrfs_scrub_cancel(fs_info);
+
+ kmem_cache_free(btrfs_trans_handle_cachep, trans);
+}
+
+/*
+ * Release reserved delayed ref space of all pending block groups of the
+ * transaction and remove them from the list
+ */
+static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_block_group *block_group, *tmp;
+
+ list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
+ btrfs_delayed_refs_rsv_release(fs_info, 1);
+ list_del_init(&block_group->bg_list);
+ }
+}
+
+static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
+{
+ /*
+ * We use try_to_writeback_inodes_sb() here because if we used
+ * btrfs_start_delalloc_roots we would deadlock with fs freeze.
+ * Currently are holding the fs freeze lock, if we do an async flush
+ * we'll do btrfs_join_transaction() and deadlock because we need to
+ * wait for the fs freeze lock. Using the direct flushing we benefit
+ * from already being in a transaction and our join_transaction doesn't
+ * have to re-take the fs freeze lock.
+ *
+ * Note that try_to_writeback_inodes_sb() will only trigger writeback
+ * if it can read lock sb->s_umount. It will always be able to lock it,
+ * except when the filesystem is being unmounted or being frozen, but in
+ * those cases sync_filesystem() is called, which results in calling
+ * writeback_inodes_sb() while holding a write lock on sb->s_umount.
+ * Note that we don't call writeback_inodes_sb() directly, because it
+ * will emit a warning if sb->s_umount is not locked.
+ */
+ if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
+ try_to_writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC);
+ return 0;
+}
+
+static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
+{
+ if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+}
+
+/*
+ * Add a pending snapshot associated with the given transaction handle to the
+ * respective handle. This must be called after the transaction commit started
+ * and while holding fs_info->trans_lock.
+ * This serves to guarantee a caller of btrfs_commit_transaction() that it can
+ * safely free the pending snapshot pointer in case btrfs_commit_transaction()
+ * returns an error.
+ */
+static void add_pending_snapshot(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_transaction *cur_trans = trans->transaction;
+
+ if (!trans->pending_snapshot)
+ return;
+
+ lockdep_assert_held(&trans->fs_info->trans_lock);
+ ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_START);
+
+ list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots);
+}
+
+static void update_commit_stats(struct btrfs_fs_info *fs_info, ktime_t interval)
+{
+ fs_info->commit_stats.commit_count++;
+ fs_info->commit_stats.last_commit_dur = interval;
+ fs_info->commit_stats.max_commit_dur =
+ max_t(u64, fs_info->commit_stats.max_commit_dur, interval);
+ fs_info->commit_stats.total_commit_dur += interval;
+}
+
+int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_transaction *cur_trans = trans->transaction;
+ struct btrfs_transaction *prev_trans = NULL;
+ int ret;
+ ktime_t start_time;
+ ktime_t interval;
+
+ ASSERT(refcount_read(&trans->use_count) == 1);
+ btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
+
+ /* Stop the commit early if ->aborted is set */
+ if (TRANS_ABORTED(cur_trans)) {
+ ret = cur_trans->aborted;
+ goto lockdep_trans_commit_start_release;
+ }
+
+ btrfs_trans_release_metadata(trans);
+ trans->block_rsv = NULL;
+
+ /*
+ * We only want one transaction commit doing the flushing so we do not
+ * waste a bunch of time on lock contention on the extent root node.
+ */
+ if (!test_and_set_bit(BTRFS_DELAYED_REFS_FLUSHING,
+ &cur_trans->delayed_refs.flags)) {
+ /*
+ * Make a pass through all the delayed refs we have so far.
+ * Any running threads may add more while we are here.
+ */
+ ret = btrfs_run_delayed_refs(trans, 0);
+ if (ret)
+ goto lockdep_trans_commit_start_release;
+ }
+
+ btrfs_create_pending_block_groups(trans);
+
+ if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) {
+ int run_it = 0;
+
+ /* this mutex is also taken before trying to set
+ * block groups readonly. We need to make sure
+ * that nobody has set a block group readonly
+ * after a extents from that block group have been
+ * allocated for cache files. btrfs_set_block_group_ro
+ * will wait for the transaction to commit if it
+ * finds BTRFS_TRANS_DIRTY_BG_RUN set.
+ *
+ * The BTRFS_TRANS_DIRTY_BG_RUN flag is also used to make sure
+ * only one process starts all the block group IO. It wouldn't
+ * hurt to have more than one go through, but there's no
+ * real advantage to it either.
+ */
+ mutex_lock(&fs_info->ro_block_group_mutex);
+ if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN,
+ &cur_trans->flags))
+ run_it = 1;
+ mutex_unlock(&fs_info->ro_block_group_mutex);
+
+ if (run_it) {
+ ret = btrfs_start_dirty_block_groups(trans);
+ if (ret)
+ goto lockdep_trans_commit_start_release;
+ }
+ }
+
+ spin_lock(&fs_info->trans_lock);
+ if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
+ enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;
+
+ add_pending_snapshot(trans);
+
+ spin_unlock(&fs_info->trans_lock);
+ refcount_inc(&cur_trans->use_count);
+
+ if (trans->in_fsync)
+ want_state = TRANS_STATE_SUPER_COMMITTED;
+
+ btrfs_trans_state_lockdep_release(fs_info,
+ BTRFS_LOCKDEP_TRANS_COMMIT_START);
+ ret = btrfs_end_transaction(trans);
+ wait_for_commit(cur_trans, want_state);
+
+ if (TRANS_ABORTED(cur_trans))
+ ret = cur_trans->aborted;
+
+ btrfs_put_transaction(cur_trans);
+
+ return ret;
+ }
+
+ cur_trans->state = TRANS_STATE_COMMIT_START;
+ wake_up(&fs_info->transaction_blocked_wait);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
+
+ if (cur_trans->list.prev != &fs_info->trans_list) {
+ enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;
+
+ if (trans->in_fsync)
+ want_state = TRANS_STATE_SUPER_COMMITTED;
+
+ prev_trans = list_entry(cur_trans->list.prev,
+ struct btrfs_transaction, list);
+ if (prev_trans->state < want_state) {
+ refcount_inc(&prev_trans->use_count);
+ spin_unlock(&fs_info->trans_lock);
+
+ wait_for_commit(prev_trans, want_state);
+
+ ret = READ_ONCE(prev_trans->aborted);
+
+ btrfs_put_transaction(prev_trans);
+ if (ret)
+ goto lockdep_release;
+ } else {
+ spin_unlock(&fs_info->trans_lock);
+ }
+ } else {
+ spin_unlock(&fs_info->trans_lock);
+ /*
+ * The previous transaction was aborted and was already removed
+ * from the list of transactions at fs_info->trans_list. So we
+ * abort to prevent writing a new superblock that reflects a
+ * corrupt state (pointing to trees with unwritten nodes/leafs).
+ */
+ if (BTRFS_FS_ERROR(fs_info)) {
+ ret = -EROFS;
+ goto lockdep_release;
+ }
+ }
+
+ /*
+ * Get the time spent on the work done by the commit thread and not
+ * the time spent waiting on a previous commit
+ */
+ start_time = ktime_get_ns();
+
+ extwriter_counter_dec(cur_trans, trans->type);
+
+ ret = btrfs_start_delalloc_flush(fs_info);
+ if (ret)
+ goto lockdep_release;
+
+ ret = btrfs_run_delayed_items(trans);
+ if (ret)
+ goto lockdep_release;
+
+ /*
+ * The thread has started/joined the transaction thus it holds the
+ * lockdep map as a reader. It has to release it before acquiring the
+ * lockdep map as a writer.
+ */
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
+ btrfs_might_wait_for_event(fs_info, btrfs_trans_num_extwriters);
+ wait_event(cur_trans->writer_wait,
+ extwriter_counter_read(cur_trans) == 0);
+
+ /* some pending stuffs might be added after the previous flush. */
+ ret = btrfs_run_delayed_items(trans);
+ if (ret) {
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
+ goto cleanup_transaction;
+ }
+
+ btrfs_wait_delalloc_flush(fs_info);
+
+ /*
+ * Wait for all ordered extents started by a fast fsync that joined this
+ * transaction. Otherwise if this transaction commits before the ordered
+ * extents complete we lose logged data after a power failure.
+ */
+ btrfs_might_wait_for_event(fs_info, btrfs_trans_pending_ordered);
+ wait_event(cur_trans->pending_wait,
+ atomic_read(&cur_trans->pending_ordered) == 0);
+
+ btrfs_scrub_pause(fs_info);
+ /*
+ * Ok now we need to make sure to block out any other joins while we
+ * commit the transaction. We could have started a join before setting
+ * COMMIT_DOING so make sure to wait for num_writers to == 1 again.
+ */
+ spin_lock(&fs_info->trans_lock);
+ add_pending_snapshot(trans);
+ cur_trans->state = TRANS_STATE_COMMIT_DOING;
+ spin_unlock(&fs_info->trans_lock);
+
+ /*
+ * The thread has started/joined the transaction thus it holds the
+ * lockdep map as a reader. It has to release it before acquiring the
+ * lockdep map as a writer.
+ */
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
+ btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers);
+ wait_event(cur_trans->writer_wait,
+ atomic_read(&cur_trans->num_writers) == 1);
+
+ /*
+ * Make lockdep happy by acquiring the state locks after
+ * btrfs_trans_num_writers is released. If we acquired the state locks
+ * before releasing the btrfs_trans_num_writers lock then lockdep would
+ * complain because we did not follow the reverse order unlocking rule.
+ */
+ btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
+ btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
+ btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
+
+ /*
+ * We've started the commit, clear the flag in case we were triggered to
+ * do an async commit but somebody else started before the transaction
+ * kthread could do the work.
+ */
+ clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags);
+
+ if (TRANS_ABORTED(cur_trans)) {
+ ret = cur_trans->aborted;
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
+ goto scrub_continue;
+ }
+ /*
+ * the reloc mutex makes sure that we stop
+ * the balancing code from coming in and moving
+ * extents around in the middle of the commit
+ */
+ mutex_lock(&fs_info->reloc_mutex);
+
+ /*
+ * We needn't worry about the delayed items because we will
+ * deal with them in create_pending_snapshot(), which is the
+ * core function of the snapshot creation.
+ */
+ ret = create_pending_snapshots(trans);
+ if (ret)
+ goto unlock_reloc;
+
+ /*
+ * We insert the dir indexes of the snapshots and update the inode
+ * of the snapshots' parents after the snapshot creation, so there
+ * are some delayed items which are not dealt with. Now deal with
+ * them.
+ *
+ * We needn't worry that this operation will corrupt the snapshots,
+ * because all the tree which are snapshoted will be forced to COW
+ * the nodes and leaves.
+ */
+ ret = btrfs_run_delayed_items(trans);
+ if (ret)
+ goto unlock_reloc;
+
+ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+ if (ret)
+ goto unlock_reloc;
+
+ /*
+ * make sure none of the code above managed to slip in a
+ * delayed item
+ */
+ btrfs_assert_delayed_root_empty(fs_info);
+
+ WARN_ON(cur_trans != trans->transaction);
+
+ ret = commit_fs_roots(trans);
+ if (ret)
+ goto unlock_reloc;
+
+ /*
+ * Since the transaction is done, we can apply the pending changes
+ * before the next transaction.
+ */
+ btrfs_apply_pending_changes(fs_info);
+
+ /* commit_fs_roots gets rid of all the tree log roots, it is now
+ * safe to free the root of tree log roots
+ */
+ btrfs_free_log_root_tree(trans, fs_info);
+
+ /*
+ * Since fs roots are all committed, we can get a quite accurate
+ * new_roots. So let's do quota accounting.
+ */
+ ret = btrfs_qgroup_account_extents(trans);
+ if (ret < 0)
+ goto unlock_reloc;
+
+ ret = commit_cowonly_roots(trans);
+ if (ret)
+ goto unlock_reloc;
+
+ /*
+ * The tasks which save the space cache and inode cache may also
+ * update ->aborted, check it.
+ */
+ if (TRANS_ABORTED(cur_trans)) {
+ ret = cur_trans->aborted;
+ goto unlock_reloc;
+ }
+
+ cur_trans = fs_info->running_transaction;
+
+ btrfs_set_root_node(&fs_info->tree_root->root_item,
+ fs_info->tree_root->node);
+ list_add_tail(&fs_info->tree_root->dirty_list,
+ &cur_trans->switch_commits);
+
+ btrfs_set_root_node(&fs_info->chunk_root->root_item,
+ fs_info->chunk_root->node);
+ list_add_tail(&fs_info->chunk_root->dirty_list,
+ &cur_trans->switch_commits);
+
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_set_root_node(&fs_info->block_group_root->root_item,
+ fs_info->block_group_root->node);
+ list_add_tail(&fs_info->block_group_root->dirty_list,
+ &cur_trans->switch_commits);
+ }
+
+ switch_commit_roots(trans);
+
+ ASSERT(list_empty(&cur_trans->dirty_bgs));
+ ASSERT(list_empty(&cur_trans->io_bgs));
+ update_super_roots(fs_info);
+
+ btrfs_set_super_log_root(fs_info->super_copy, 0);
+ btrfs_set_super_log_root_level(fs_info->super_copy, 0);
+ memcpy(fs_info->super_for_commit, fs_info->super_copy,
+ sizeof(*fs_info->super_copy));
+
+ btrfs_commit_device_sizes(cur_trans);
+
+ clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
+ clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
+
+ btrfs_trans_release_chunk_metadata(trans);
+
+ /*
+ * Before changing the transaction state to TRANS_STATE_UNBLOCKED and
+ * setting fs_info->running_transaction to NULL, lock tree_log_mutex to
+ * make sure that before we commit our superblock, no other task can
+ * start a new transaction and commit a log tree before we commit our
+ * superblock. Anyone trying to commit a log tree locks this mutex before
+ * writing its superblock.
+ */
+ mutex_lock(&fs_info->tree_log_mutex);
+
+ spin_lock(&fs_info->trans_lock);
+ cur_trans->state = TRANS_STATE_UNBLOCKED;
+ fs_info->running_transaction = NULL;
+ spin_unlock(&fs_info->trans_lock);
+ mutex_unlock(&fs_info->reloc_mutex);
+
+ wake_up(&fs_info->transaction_wait);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
+
+ ret = btrfs_write_and_wait_transaction(trans);
+ if (ret) {
+ btrfs_handle_fs_error(fs_info, ret,
+ "Error while writing out transaction");
+ mutex_unlock(&fs_info->tree_log_mutex);
+ goto scrub_continue;
+ }
+
+ /*
+ * At this point, we should have written all the tree blocks allocated
+ * in this transaction. So it's now safe to free the redirtyied extent
+ * buffers.
+ */
+ btrfs_free_redirty_list(cur_trans);
+
+ ret = write_all_supers(fs_info, 0);
+ /*
+ * the super is written, we can safely allow the tree-loggers
+ * to go about their business
+ */
+ mutex_unlock(&fs_info->tree_log_mutex);
+ if (ret)
+ goto scrub_continue;
+
+ /*
+ * We needn't acquire the lock here because there is no other task
+ * which can change it.
+ */
+ cur_trans->state = TRANS_STATE_SUPER_COMMITTED;
+ wake_up(&cur_trans->commit_wait);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
+
+ btrfs_finish_extent_commit(trans);
+
+ if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
+ btrfs_clear_space_info_full(fs_info);
+
+ fs_info->last_trans_committed = cur_trans->transid;
+ /*
+ * We needn't acquire the lock here because there is no other task
+ * which can change it.
+ */
+ cur_trans->state = TRANS_STATE_COMPLETED;
+ wake_up(&cur_trans->commit_wait);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
+
+ spin_lock(&fs_info->trans_lock);
+ list_del_init(&cur_trans->list);
+ spin_unlock(&fs_info->trans_lock);
+
+ btrfs_put_transaction(cur_trans);
+ btrfs_put_transaction(cur_trans);
+
+ if (trans->type & __TRANS_FREEZABLE)
+ sb_end_intwrite(fs_info->sb);
+
+ trace_btrfs_transaction_commit(fs_info);
+
+ interval = ktime_get_ns() - start_time;
+
+ btrfs_scrub_continue(fs_info);
+
+ if (current->journal_info == trans)
+ current->journal_info = NULL;
+
+ kmem_cache_free(btrfs_trans_handle_cachep, trans);
+
+ update_commit_stats(fs_info, interval);
+
+ return ret;
+
+unlock_reloc:
+ mutex_unlock(&fs_info->reloc_mutex);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
+scrub_continue:
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
+ btrfs_scrub_continue(fs_info);
+cleanup_transaction:
+ btrfs_trans_release_metadata(trans);
+ btrfs_cleanup_pending_block_groups(trans);
+ btrfs_trans_release_chunk_metadata(trans);
+ trans->block_rsv = NULL;
+ btrfs_warn(fs_info, "Skipping commit of aborted transaction.");
+ if (current->journal_info == trans)
+ current->journal_info = NULL;
+ cleanup_transaction(trans, ret);
+
+ return ret;
+
+lockdep_release:
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
+ goto cleanup_transaction;
+
+lockdep_trans_commit_start_release:
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
+ btrfs_end_transaction(trans);
+ return ret;
+}
+
+/*
+ * return < 0 if error
+ * 0 if there are no more dead_roots at the time of call
+ * 1 there are more to be processed, call me again
+ *
+ * The return value indicates there are certainly more snapshots to delete, but
+ * if there comes a new one during processing, it may return 0. We don't mind,
+ * because btrfs_commit_super will poke cleaner thread and it will process it a
+ * few seconds later.
+ */
+int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root;
+ int ret;
+
+ spin_lock(&fs_info->trans_lock);
+ if (list_empty(&fs_info->dead_roots)) {
+ spin_unlock(&fs_info->trans_lock);
+ return 0;
+ }
+ root = list_first_entry(&fs_info->dead_roots,
+ struct btrfs_root, root_list);
+ list_del_init(&root->root_list);
+ spin_unlock(&fs_info->trans_lock);
+
+ btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid);
+
+ btrfs_kill_all_delayed_nodes(root);
+
+ if (btrfs_header_backref_rev(root->node) <
+ BTRFS_MIXED_BACKREF_REV)
+ ret = btrfs_drop_snapshot(root, 0, 0);
+ else
+ ret = btrfs_drop_snapshot(root, 1, 0);
+
+ btrfs_put_root(root);
+ return (ret < 0) ? 0 : 1;
+}
+
+void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info)
+{
+ unsigned long prev;
+ unsigned long bit;
+
+ prev = xchg(&fs_info->pending_changes, 0);
+ if (!prev)
+ return;
+
+ bit = 1 << BTRFS_PENDING_COMMIT;
+ if (prev & bit)
+ btrfs_debug(fs_info, "pending commit done");
+ prev &= ~bit;
+
+ if (prev)
+ btrfs_warn(fs_info,
+ "unknown pending changes left 0x%lx, ignoring", prev);
+}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
new file mode 100644
index 000000000..970ff3160
--- /dev/null
+++ b/fs/btrfs/transaction.h
@@ -0,0 +1,239 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ */
+
+#ifndef BTRFS_TRANSACTION_H
+#define BTRFS_TRANSACTION_H
+
+#include <linux/refcount.h>
+#include "btrfs_inode.h"
+#include "delayed-ref.h"
+#include "ctree.h"
+
+enum btrfs_trans_state {
+ TRANS_STATE_RUNNING,
+ TRANS_STATE_COMMIT_START,
+ TRANS_STATE_COMMIT_DOING,
+ TRANS_STATE_UNBLOCKED,
+ TRANS_STATE_SUPER_COMMITTED,
+ TRANS_STATE_COMPLETED,
+ TRANS_STATE_MAX,
+};
+
+#define BTRFS_TRANS_HAVE_FREE_BGS 0
+#define BTRFS_TRANS_DIRTY_BG_RUN 1
+#define BTRFS_TRANS_CACHE_ENOSPC 2
+
+struct btrfs_transaction {
+ u64 transid;
+ /*
+ * total external writers(USERSPACE/START/ATTACH) in this
+ * transaction, it must be zero before the transaction is
+ * being committed
+ */
+ atomic_t num_extwriters;
+ /*
+ * total writers in this transaction, it must be zero before the
+ * transaction can end
+ */
+ atomic_t num_writers;
+ refcount_t use_count;
+
+ unsigned long flags;
+
+ /* Be protected by fs_info->trans_lock when we want to change it. */
+ enum btrfs_trans_state state;
+ int aborted;
+ struct list_head list;
+ struct extent_io_tree dirty_pages;
+ time64_t start_time;
+ wait_queue_head_t writer_wait;
+ wait_queue_head_t commit_wait;
+ struct list_head pending_snapshots;
+ struct list_head dev_update_list;
+ struct list_head switch_commits;
+ struct list_head dirty_bgs;
+
+ /*
+ * There is no explicit lock which protects io_bgs, rather its
+ * consistency is implied by the fact that all the sites which modify
+ * it do so under some form of transaction critical section, namely:
+ *
+ * - btrfs_start_dirty_block_groups - This function can only ever be
+ * run by one of the transaction committers. Refer to
+ * BTRFS_TRANS_DIRTY_BG_RUN usage in btrfs_commit_transaction
+ *
+ * - btrfs_write_dirty_blockgroups - this is called by
+ * commit_cowonly_roots from transaction critical section
+ * (TRANS_STATE_COMMIT_DOING)
+ *
+ * - btrfs_cleanup_dirty_bgs - called on transaction abort
+ */
+ struct list_head io_bgs;
+ struct list_head dropped_roots;
+ struct extent_io_tree pinned_extents;
+
+ /*
+ * we need to make sure block group deletion doesn't race with
+ * free space cache writeout. This mutex keeps them from stomping
+ * on each other
+ */
+ struct mutex cache_write_mutex;
+ spinlock_t dirty_bgs_lock;
+ /* Protected by spin lock fs_info->unused_bgs_lock. */
+ struct list_head deleted_bgs;
+ spinlock_t dropped_roots_lock;
+ struct btrfs_delayed_ref_root delayed_refs;
+ struct btrfs_fs_info *fs_info;
+
+ /*
+ * Number of ordered extents the transaction must wait for before
+ * committing. These are ordered extents started by a fast fsync.
+ */
+ atomic_t pending_ordered;
+ wait_queue_head_t pending_wait;
+
+ spinlock_t releasing_ebs_lock;
+ struct list_head releasing_ebs;
+};
+
+#define __TRANS_FREEZABLE (1U << 0)
+
+#define __TRANS_START (1U << 9)
+#define __TRANS_ATTACH (1U << 10)
+#define __TRANS_JOIN (1U << 11)
+#define __TRANS_JOIN_NOLOCK (1U << 12)
+#define __TRANS_DUMMY (1U << 13)
+#define __TRANS_JOIN_NOSTART (1U << 14)
+
+#define TRANS_START (__TRANS_START | __TRANS_FREEZABLE)
+#define TRANS_ATTACH (__TRANS_ATTACH)
+#define TRANS_JOIN (__TRANS_JOIN | __TRANS_FREEZABLE)
+#define TRANS_JOIN_NOLOCK (__TRANS_JOIN_NOLOCK)
+#define TRANS_JOIN_NOSTART (__TRANS_JOIN_NOSTART)
+
+#define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH)
+
+struct btrfs_trans_handle {
+ u64 transid;
+ u64 bytes_reserved;
+ u64 chunk_bytes_reserved;
+ unsigned long delayed_ref_updates;
+ struct btrfs_transaction *transaction;
+ struct btrfs_block_rsv *block_rsv;
+ struct btrfs_block_rsv *orig_rsv;
+ /* Set by a task that wants to create a snapshot. */
+ struct btrfs_pending_snapshot *pending_snapshot;
+ refcount_t use_count;
+ unsigned int type;
+ /*
+ * Error code of transaction abort, set outside of locks and must use
+ * the READ_ONCE/WRITE_ONCE access
+ */
+ short aborted;
+ bool adding_csums;
+ bool allocating_chunk;
+ bool removing_chunk;
+ bool reloc_reserved;
+ bool in_fsync;
+ struct btrfs_fs_info *fs_info;
+ struct list_head new_bgs;
+};
+
+/*
+ * The abort status can be changed between calls and is not protected by locks.
+ * This accepts btrfs_transaction and btrfs_trans_handle as types. Once it's
+ * set to a non-zero value it does not change, so the macro should be in checks
+ * but is not necessary for further reads of the value.
+ */
+#define TRANS_ABORTED(trans) (unlikely(READ_ONCE((trans)->aborted)))
+
+struct btrfs_pending_snapshot {
+ struct dentry *dentry;
+ struct inode *dir;
+ struct btrfs_root *root;
+ struct btrfs_root_item *root_item;
+ struct btrfs_root *snap;
+ struct btrfs_qgroup_inherit *inherit;
+ struct btrfs_path *path;
+ /* block reservation for the operation */
+ struct btrfs_block_rsv block_rsv;
+ /* extra metadata reservation for relocation */
+ int error;
+ /* Preallocated anonymous block device number */
+ dev_t anon_dev;
+ bool readonly;
+ struct list_head list;
+};
+
+static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode)
+{
+ spin_lock(&inode->lock);
+ inode->last_trans = trans->transaction->transid;
+ inode->last_sub_trans = inode->root->log_transid;
+ inode->last_log_commit = inode->last_sub_trans - 1;
+ spin_unlock(&inode->lock);
+}
+
+/*
+ * Make qgroup codes to skip given qgroupid, means the old/new_roots for
+ * qgroup won't contain the qgroupid in it.
+ */
+static inline void btrfs_set_skip_qgroup(struct btrfs_trans_handle *trans,
+ u64 qgroupid)
+{
+ struct btrfs_delayed_ref_root *delayed_refs;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ WARN_ON(delayed_refs->qgroup_to_skip);
+ delayed_refs->qgroup_to_skip = qgroupid;
+}
+
+static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_delayed_ref_root *delayed_refs;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ WARN_ON(!delayed_refs->qgroup_to_skip);
+ delayed_refs->qgroup_to_skip = 0;
+}
+
+int btrfs_end_transaction(struct btrfs_trans_handle *trans);
+struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
+ unsigned int num_items);
+struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
+ struct btrfs_root *root,
+ unsigned int num_items);
+struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
+ struct btrfs_root *root);
+int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid);
+
+void btrfs_add_dead_root(struct btrfs_root *root);
+int btrfs_defrag_root(struct btrfs_root *root);
+void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info);
+int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info);
+int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
+void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans);
+int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans);
+bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans);
+void btrfs_throttle(struct btrfs_fs_info *fs_info);
+int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *dirty_pages, int mark);
+int btrfs_wait_tree_log_extents(struct btrfs_root *root, int mark);
+int btrfs_transaction_blocked(struct btrfs_fs_info *info);
+int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
+void btrfs_put_transaction(struct btrfs_transaction *transaction);
+void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info);
+void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
+
+#endif
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
new file mode 100644
index 000000000..02e839824
--- /dev/null
+++ b/fs/btrfs/tree-checker.c
@@ -0,0 +1,1947 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) Qu Wenruo 2017. All rights reserved.
+ */
+
+/*
+ * The module is used to catch unexpected/corrupted tree block data.
+ * Such behavior can be caused either by a fuzzed image or bugs.
+ *
+ * The objective is to do leaf/node validation checks when tree block is read
+ * from disk, and check *every* possible member, so other code won't
+ * need to checking them again.
+ *
+ * Due to the potential and unwanted damage, every checker needs to be
+ * carefully reviewed otherwise so it does not prevent mount of valid images.
+ */
+
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <linux/error-injection.h>
+#include "ctree.h"
+#include "tree-checker.h"
+#include "disk-io.h"
+#include "compression.h"
+#include "volumes.h"
+#include "misc.h"
+#include "btrfs_inode.h"
+
+/*
+ * Error message should follow the following format:
+ * corrupt <type>: <identifier>, <reason>[, <bad_value>]
+ *
+ * @type: leaf or node
+ * @identifier: the necessary info to locate the leaf/node.
+ * It's recommended to decode key.objecitd/offset if it's
+ * meaningful.
+ * @reason: describe the error
+ * @bad_value: optional, it's recommended to output bad value and its
+ * expected value (range).
+ *
+ * Since comma is used to separate the components, only space is allowed
+ * inside each component.
+ */
+
+/*
+ * Append generic "corrupt leaf/node root=%llu block=%llu slot=%d: " to @fmt.
+ * Allows callers to customize the output.
+ */
+__printf(3, 4)
+__cold
+static void generic_err(const struct extent_buffer *eb, int slot,
+ const char *fmt, ...)
+{
+ const struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_crit(fs_info,
+ "corrupt %s: root=%llu block=%llu slot=%d, %pV",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, &vaf);
+ va_end(args);
+}
+
+/*
+ * Customized reporter for extent data item, since its key objectid and
+ * offset has its own meaning.
+ */
+__printf(3, 4)
+__cold
+static void file_extent_err(const struct extent_buffer *eb, int slot,
+ const char *fmt, ...)
+{
+ const struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct btrfs_key key;
+ struct va_format vaf;
+ va_list args;
+
+ btrfs_item_key_to_cpu(eb, &key, slot);
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_crit(fs_info,
+ "corrupt %s: root=%llu block=%llu slot=%d ino=%llu file_offset=%llu, %pV",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot,
+ key.objectid, key.offset, &vaf);
+ va_end(args);
+}
+
+/*
+ * Return 0 if the btrfs_file_extent_##name is aligned to @alignment
+ * Else return 1
+ */
+#define CHECK_FE_ALIGNED(leaf, slot, fi, name, alignment) \
+({ \
+ if (unlikely(!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), \
+ (alignment)))) \
+ file_extent_err((leaf), (slot), \
+ "invalid %s for file extent, have %llu, should be aligned to %u", \
+ (#name), btrfs_file_extent_##name((leaf), (fi)), \
+ (alignment)); \
+ (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))); \
+})
+
+static u64 file_extent_end(struct extent_buffer *leaf,
+ struct btrfs_key *key,
+ struct btrfs_file_extent_item *extent)
+{
+ u64 end;
+ u64 len;
+
+ if (btrfs_file_extent_type(leaf, extent) == BTRFS_FILE_EXTENT_INLINE) {
+ len = btrfs_file_extent_ram_bytes(leaf, extent);
+ end = ALIGN(key->offset + len, leaf->fs_info->sectorsize);
+ } else {
+ len = btrfs_file_extent_num_bytes(leaf, extent);
+ end = key->offset + len;
+ }
+ return end;
+}
+
+/*
+ * Customized report for dir_item, the only new important information is
+ * key->objectid, which represents inode number
+ */
+__printf(3, 4)
+__cold
+static void dir_item_err(const struct extent_buffer *eb, int slot,
+ const char *fmt, ...)
+{
+ const struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct btrfs_key key;
+ struct va_format vaf;
+ va_list args;
+
+ btrfs_item_key_to_cpu(eb, &key, slot);
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_crit(fs_info,
+ "corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot,
+ key.objectid, &vaf);
+ va_end(args);
+}
+
+/*
+ * This functions checks prev_key->objectid, to ensure current key and prev_key
+ * share the same objectid as inode number.
+ *
+ * This is to detect missing INODE_ITEM in subvolume trees.
+ *
+ * Return true if everything is OK or we don't need to check.
+ * Return false if anything is wrong.
+ */
+static bool check_prev_ino(struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot,
+ struct btrfs_key *prev_key)
+{
+ /* No prev key, skip check */
+ if (slot == 0)
+ return true;
+
+ /* Only these key->types needs to be checked */
+ ASSERT(key->type == BTRFS_XATTR_ITEM_KEY ||
+ key->type == BTRFS_INODE_REF_KEY ||
+ key->type == BTRFS_DIR_INDEX_KEY ||
+ key->type == BTRFS_DIR_ITEM_KEY ||
+ key->type == BTRFS_EXTENT_DATA_KEY);
+
+ /*
+ * Only subvolume trees along with their reloc trees need this check.
+ * Things like log tree doesn't follow this ino requirement.
+ */
+ if (!is_fstree(btrfs_header_owner(leaf)))
+ return true;
+
+ if (key->objectid == prev_key->objectid)
+ return true;
+
+ /* Error found */
+ dir_item_err(leaf, slot,
+ "invalid previous key objectid, have %llu expect %llu",
+ prev_key->objectid, key->objectid);
+ return false;
+}
+static int check_extent_data_item(struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot,
+ struct btrfs_key *prev_key)
+{
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
+ struct btrfs_file_extent_item *fi;
+ u32 sectorsize = fs_info->sectorsize;
+ u32 item_size = btrfs_item_size(leaf, slot);
+ u64 extent_end;
+
+ if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) {
+ file_extent_err(leaf, slot,
+"unaligned file_offset for file extent, have %llu should be aligned to %u",
+ key->offset, sectorsize);
+ return -EUCLEAN;
+ }
+
+ /*
+ * Previous key must have the same key->objectid (ino).
+ * It can be XATTR_ITEM, INODE_ITEM or just another EXTENT_DATA.
+ * But if objectids mismatch, it means we have a missing
+ * INODE_ITEM.
+ */
+ if (unlikely(!check_prev_ino(leaf, key, slot, prev_key)))
+ return -EUCLEAN;
+
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+ /*
+ * Make sure the item contains at least inline header, so the file
+ * extent type is not some garbage.
+ */
+ if (unlikely(item_size < BTRFS_FILE_EXTENT_INLINE_DATA_START)) {
+ file_extent_err(leaf, slot,
+ "invalid item size, have %u expect [%zu, %u)",
+ item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START,
+ SZ_4K);
+ return -EUCLEAN;
+ }
+ if (unlikely(btrfs_file_extent_type(leaf, fi) >=
+ BTRFS_NR_FILE_EXTENT_TYPES)) {
+ file_extent_err(leaf, slot,
+ "invalid type for file extent, have %u expect range [0, %u]",
+ btrfs_file_extent_type(leaf, fi),
+ BTRFS_NR_FILE_EXTENT_TYPES - 1);
+ return -EUCLEAN;
+ }
+
+ /*
+ * Support for new compression/encryption must introduce incompat flag,
+ * and must be caught in open_ctree().
+ */
+ if (unlikely(btrfs_file_extent_compression(leaf, fi) >=
+ BTRFS_NR_COMPRESS_TYPES)) {
+ file_extent_err(leaf, slot,
+ "invalid compression for file extent, have %u expect range [0, %u]",
+ btrfs_file_extent_compression(leaf, fi),
+ BTRFS_NR_COMPRESS_TYPES - 1);
+ return -EUCLEAN;
+ }
+ if (unlikely(btrfs_file_extent_encryption(leaf, fi))) {
+ file_extent_err(leaf, slot,
+ "invalid encryption for file extent, have %u expect 0",
+ btrfs_file_extent_encryption(leaf, fi));
+ return -EUCLEAN;
+ }
+ if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) {
+ /* Inline extent must have 0 as key offset */
+ if (unlikely(key->offset)) {
+ file_extent_err(leaf, slot,
+ "invalid file_offset for inline file extent, have %llu expect 0",
+ key->offset);
+ return -EUCLEAN;
+ }
+
+ /* Compressed inline extent has no on-disk size, skip it */
+ if (btrfs_file_extent_compression(leaf, fi) !=
+ BTRFS_COMPRESS_NONE)
+ return 0;
+
+ /* Uncompressed inline extent size must match item size */
+ if (unlikely(item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START +
+ btrfs_file_extent_ram_bytes(leaf, fi))) {
+ file_extent_err(leaf, slot,
+ "invalid ram_bytes for uncompressed inline extent, have %u expect %llu",
+ item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START +
+ btrfs_file_extent_ram_bytes(leaf, fi));
+ return -EUCLEAN;
+ }
+ return 0;
+ }
+
+ /* Regular or preallocated extent has fixed item size */
+ if (unlikely(item_size != sizeof(*fi))) {
+ file_extent_err(leaf, slot,
+ "invalid item size for reg/prealloc file extent, have %u expect %zu",
+ item_size, sizeof(*fi));
+ return -EUCLEAN;
+ }
+ if (unlikely(CHECK_FE_ALIGNED(leaf, slot, fi, ram_bytes, sectorsize) ||
+ CHECK_FE_ALIGNED(leaf, slot, fi, disk_bytenr, sectorsize) ||
+ CHECK_FE_ALIGNED(leaf, slot, fi, disk_num_bytes, sectorsize) ||
+ CHECK_FE_ALIGNED(leaf, slot, fi, offset, sectorsize) ||
+ CHECK_FE_ALIGNED(leaf, slot, fi, num_bytes, sectorsize)))
+ return -EUCLEAN;
+
+ /* Catch extent end overflow */
+ if (unlikely(check_add_overflow(btrfs_file_extent_num_bytes(leaf, fi),
+ key->offset, &extent_end))) {
+ file_extent_err(leaf, slot,
+ "extent end overflow, have file offset %llu extent num bytes %llu",
+ key->offset,
+ btrfs_file_extent_num_bytes(leaf, fi));
+ return -EUCLEAN;
+ }
+
+ /*
+ * Check that no two consecutive file extent items, in the same leaf,
+ * present ranges that overlap each other.
+ */
+ if (slot > 0 &&
+ prev_key->objectid == key->objectid &&
+ prev_key->type == BTRFS_EXTENT_DATA_KEY) {
+ struct btrfs_file_extent_item *prev_fi;
+ u64 prev_end;
+
+ prev_fi = btrfs_item_ptr(leaf, slot - 1,
+ struct btrfs_file_extent_item);
+ prev_end = file_extent_end(leaf, prev_key, prev_fi);
+ if (unlikely(prev_end > key->offset)) {
+ file_extent_err(leaf, slot - 1,
+"file extent end range (%llu) goes beyond start offset (%llu) of the next file extent",
+ prev_end, key->offset);
+ return -EUCLEAN;
+ }
+ }
+
+ return 0;
+}
+
+static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key,
+ int slot, struct btrfs_key *prev_key)
+{
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
+ u32 sectorsize = fs_info->sectorsize;
+ const u32 csumsize = fs_info->csum_size;
+
+ if (unlikely(key->objectid != BTRFS_EXTENT_CSUM_OBJECTID)) {
+ generic_err(leaf, slot,
+ "invalid key objectid for csum item, have %llu expect %llu",
+ key->objectid, BTRFS_EXTENT_CSUM_OBJECTID);
+ return -EUCLEAN;
+ }
+ if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) {
+ generic_err(leaf, slot,
+ "unaligned key offset for csum item, have %llu should be aligned to %u",
+ key->offset, sectorsize);
+ return -EUCLEAN;
+ }
+ if (unlikely(!IS_ALIGNED(btrfs_item_size(leaf, slot), csumsize))) {
+ generic_err(leaf, slot,
+ "unaligned item size for csum item, have %u should be aligned to %u",
+ btrfs_item_size(leaf, slot), csumsize);
+ return -EUCLEAN;
+ }
+ if (slot > 0 && prev_key->type == BTRFS_EXTENT_CSUM_KEY) {
+ u64 prev_csum_end;
+ u32 prev_item_size;
+
+ prev_item_size = btrfs_item_size(leaf, slot - 1);
+ prev_csum_end = (prev_item_size / csumsize) * sectorsize;
+ prev_csum_end += prev_key->offset;
+ if (unlikely(prev_csum_end > key->offset)) {
+ generic_err(leaf, slot - 1,
+"csum end range (%llu) goes beyond the start range (%llu) of the next csum item",
+ prev_csum_end, key->offset);
+ return -EUCLEAN;
+ }
+ }
+ return 0;
+}
+
+/* Inode item error output has the same format as dir_item_err() */
+#define inode_item_err(eb, slot, fmt, ...) \
+ dir_item_err(eb, slot, fmt, __VA_ARGS__)
+
+static int check_inode_key(struct extent_buffer *leaf, struct btrfs_key *key,
+ int slot)
+{
+ struct btrfs_key item_key;
+ bool is_inode_item;
+
+ btrfs_item_key_to_cpu(leaf, &item_key, slot);
+ is_inode_item = (item_key.type == BTRFS_INODE_ITEM_KEY);
+
+ /* For XATTR_ITEM, location key should be all 0 */
+ if (item_key.type == BTRFS_XATTR_ITEM_KEY) {
+ if (unlikely(key->objectid != 0 || key->type != 0 ||
+ key->offset != 0))
+ return -EUCLEAN;
+ return 0;
+ }
+
+ if (unlikely((key->objectid < BTRFS_FIRST_FREE_OBJECTID ||
+ key->objectid > BTRFS_LAST_FREE_OBJECTID) &&
+ key->objectid != BTRFS_ROOT_TREE_DIR_OBJECTID &&
+ key->objectid != BTRFS_FREE_INO_OBJECTID)) {
+ if (is_inode_item) {
+ generic_err(leaf, slot,
+ "invalid key objectid: has %llu expect %llu or [%llu, %llu] or %llu",
+ key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID,
+ BTRFS_FIRST_FREE_OBJECTID,
+ BTRFS_LAST_FREE_OBJECTID,
+ BTRFS_FREE_INO_OBJECTID);
+ } else {
+ dir_item_err(leaf, slot,
+"invalid location key objectid: has %llu expect %llu or [%llu, %llu] or %llu",
+ key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID,
+ BTRFS_FIRST_FREE_OBJECTID,
+ BTRFS_LAST_FREE_OBJECTID,
+ BTRFS_FREE_INO_OBJECTID);
+ }
+ return -EUCLEAN;
+ }
+ if (unlikely(key->offset != 0)) {
+ if (is_inode_item)
+ inode_item_err(leaf, slot,
+ "invalid key offset: has %llu expect 0",
+ key->offset);
+ else
+ dir_item_err(leaf, slot,
+ "invalid location key offset:has %llu expect 0",
+ key->offset);
+ return -EUCLEAN;
+ }
+ return 0;
+}
+
+static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key,
+ int slot)
+{
+ struct btrfs_key item_key;
+ bool is_root_item;
+
+ btrfs_item_key_to_cpu(leaf, &item_key, slot);
+ is_root_item = (item_key.type == BTRFS_ROOT_ITEM_KEY);
+
+ /*
+ * Bad rootid for reloc trees.
+ *
+ * Reloc trees are only for subvolume trees, other trees only need
+ * to be COWed to be relocated.
+ */
+ if (unlikely(is_root_item && key->objectid == BTRFS_TREE_RELOC_OBJECTID &&
+ !is_fstree(key->offset))) {
+ generic_err(leaf, slot,
+ "invalid reloc tree for root %lld, root id is not a subvolume tree",
+ key->offset);
+ return -EUCLEAN;
+ }
+
+ /* No such tree id */
+ if (unlikely(key->objectid == 0)) {
+ if (is_root_item)
+ generic_err(leaf, slot, "invalid root id 0");
+ else
+ dir_item_err(leaf, slot,
+ "invalid location key root id 0");
+ return -EUCLEAN;
+ }
+
+ /* DIR_ITEM/INDEX/INODE_REF is not allowed to point to non-fs trees */
+ if (unlikely(!is_fstree(key->objectid) && !is_root_item)) {
+ dir_item_err(leaf, slot,
+ "invalid location key objectid, have %llu expect [%llu, %llu]",
+ key->objectid, BTRFS_FIRST_FREE_OBJECTID,
+ BTRFS_LAST_FREE_OBJECTID);
+ return -EUCLEAN;
+ }
+
+ /*
+ * ROOT_ITEM with non-zero offset means this is a snapshot, created at
+ * @offset transid.
+ * Furthermore, for location key in DIR_ITEM, its offset is always -1.
+ *
+ * So here we only check offset for reloc tree whose key->offset must
+ * be a valid tree.
+ */
+ if (unlikely(key->objectid == BTRFS_TREE_RELOC_OBJECTID &&
+ key->offset == 0)) {
+ generic_err(leaf, slot, "invalid root id 0 for reloc tree");
+ return -EUCLEAN;
+ }
+ return 0;
+}
+
+static int check_dir_item(struct extent_buffer *leaf,
+ struct btrfs_key *key, struct btrfs_key *prev_key,
+ int slot)
+{
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
+ struct btrfs_dir_item *di;
+ u32 item_size = btrfs_item_size(leaf, slot);
+ u32 cur = 0;
+
+ if (unlikely(!check_prev_ino(leaf, key, slot, prev_key)))
+ return -EUCLEAN;
+
+ di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+ while (cur < item_size) {
+ struct btrfs_key location_key;
+ u32 name_len;
+ u32 data_len;
+ u32 max_name_len;
+ u32 total_size;
+ u32 name_hash;
+ u8 dir_type;
+ int ret;
+
+ /* header itself should not cross item boundary */
+ if (unlikely(cur + sizeof(*di) > item_size)) {
+ dir_item_err(leaf, slot,
+ "dir item header crosses item boundary, have %zu boundary %u",
+ cur + sizeof(*di), item_size);
+ return -EUCLEAN;
+ }
+
+ /* Location key check */
+ btrfs_dir_item_key_to_cpu(leaf, di, &location_key);
+ if (location_key.type == BTRFS_ROOT_ITEM_KEY) {
+ ret = check_root_key(leaf, &location_key, slot);
+ if (unlikely(ret < 0))
+ return ret;
+ } else if (location_key.type == BTRFS_INODE_ITEM_KEY ||
+ location_key.type == 0) {
+ ret = check_inode_key(leaf, &location_key, slot);
+ if (unlikely(ret < 0))
+ return ret;
+ } else {
+ dir_item_err(leaf, slot,
+ "invalid location key type, have %u, expect %u or %u",
+ location_key.type, BTRFS_ROOT_ITEM_KEY,
+ BTRFS_INODE_ITEM_KEY);
+ return -EUCLEAN;
+ }
+
+ /* dir type check */
+ dir_type = btrfs_dir_type(leaf, di);
+ if (unlikely(dir_type >= BTRFS_FT_MAX)) {
+ dir_item_err(leaf, slot,
+ "invalid dir item type, have %u expect [0, %u)",
+ dir_type, BTRFS_FT_MAX);
+ return -EUCLEAN;
+ }
+
+ if (unlikely(key->type == BTRFS_XATTR_ITEM_KEY &&
+ dir_type != BTRFS_FT_XATTR)) {
+ dir_item_err(leaf, slot,
+ "invalid dir item type for XATTR key, have %u expect %u",
+ dir_type, BTRFS_FT_XATTR);
+ return -EUCLEAN;
+ }
+ if (unlikely(dir_type == BTRFS_FT_XATTR &&
+ key->type != BTRFS_XATTR_ITEM_KEY)) {
+ dir_item_err(leaf, slot,
+ "xattr dir type found for non-XATTR key");
+ return -EUCLEAN;
+ }
+ if (dir_type == BTRFS_FT_XATTR)
+ max_name_len = XATTR_NAME_MAX;
+ else
+ max_name_len = BTRFS_NAME_LEN;
+
+ /* Name/data length check */
+ name_len = btrfs_dir_name_len(leaf, di);
+ data_len = btrfs_dir_data_len(leaf, di);
+ if (unlikely(name_len > max_name_len)) {
+ dir_item_err(leaf, slot,
+ "dir item name len too long, have %u max %u",
+ name_len, max_name_len);
+ return -EUCLEAN;
+ }
+ if (unlikely(name_len + data_len > BTRFS_MAX_XATTR_SIZE(fs_info))) {
+ dir_item_err(leaf, slot,
+ "dir item name and data len too long, have %u max %u",
+ name_len + data_len,
+ BTRFS_MAX_XATTR_SIZE(fs_info));
+ return -EUCLEAN;
+ }
+
+ if (unlikely(data_len && dir_type != BTRFS_FT_XATTR)) {
+ dir_item_err(leaf, slot,
+ "dir item with invalid data len, have %u expect 0",
+ data_len);
+ return -EUCLEAN;
+ }
+
+ total_size = sizeof(*di) + name_len + data_len;
+
+ /* header and name/data should not cross item boundary */
+ if (unlikely(cur + total_size > item_size)) {
+ dir_item_err(leaf, slot,
+ "dir item data crosses item boundary, have %u boundary %u",
+ cur + total_size, item_size);
+ return -EUCLEAN;
+ }
+
+ /*
+ * Special check for XATTR/DIR_ITEM, as key->offset is name
+ * hash, should match its name
+ */
+ if (key->type == BTRFS_DIR_ITEM_KEY ||
+ key->type == BTRFS_XATTR_ITEM_KEY) {
+ char namebuf[max(BTRFS_NAME_LEN, XATTR_NAME_MAX)];
+
+ read_extent_buffer(leaf, namebuf,
+ (unsigned long)(di + 1), name_len);
+ name_hash = btrfs_name_hash(namebuf, name_len);
+ if (unlikely(key->offset != name_hash)) {
+ dir_item_err(leaf, slot,
+ "name hash mismatch with key, have 0x%016x expect 0x%016llx",
+ name_hash, key->offset);
+ return -EUCLEAN;
+ }
+ }
+ cur += total_size;
+ di = (struct btrfs_dir_item *)((void *)di + total_size);
+ }
+ return 0;
+}
+
+__printf(3, 4)
+__cold
+static void block_group_err(const struct extent_buffer *eb, int slot,
+ const char *fmt, ...)
+{
+ const struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct btrfs_key key;
+ struct va_format vaf;
+ va_list args;
+
+ btrfs_item_key_to_cpu(eb, &key, slot);
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_crit(fs_info,
+ "corrupt %s: root=%llu block=%llu slot=%d bg_start=%llu bg_len=%llu, %pV",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot,
+ key.objectid, key.offset, &vaf);
+ va_end(args);
+}
+
+static int check_block_group_item(struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot)
+{
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
+ struct btrfs_block_group_item bgi;
+ u32 item_size = btrfs_item_size(leaf, slot);
+ u64 chunk_objectid;
+ u64 flags;
+ u64 type;
+
+ /*
+ * Here we don't really care about alignment since extent allocator can
+ * handle it. We care more about the size.
+ */
+ if (unlikely(key->offset == 0)) {
+ block_group_err(leaf, slot,
+ "invalid block group size 0");
+ return -EUCLEAN;
+ }
+
+ if (unlikely(item_size != sizeof(bgi))) {
+ block_group_err(leaf, slot,
+ "invalid item size, have %u expect %zu",
+ item_size, sizeof(bgi));
+ return -EUCLEAN;
+ }
+
+ read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
+ sizeof(bgi));
+ chunk_objectid = btrfs_stack_block_group_chunk_objectid(&bgi);
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ /*
+ * We don't init the nr_global_roots until we load the global
+ * roots, so this could be 0 at mount time. If it's 0 we'll
+ * just assume we're fine, and later we'll check against our
+ * actual value.
+ */
+ if (unlikely(fs_info->nr_global_roots &&
+ chunk_objectid >= fs_info->nr_global_roots)) {
+ block_group_err(leaf, slot,
+ "invalid block group global root id, have %llu, needs to be <= %llu",
+ chunk_objectid,
+ fs_info->nr_global_roots);
+ return -EUCLEAN;
+ }
+ } else if (unlikely(chunk_objectid != BTRFS_FIRST_CHUNK_TREE_OBJECTID)) {
+ block_group_err(leaf, slot,
+ "invalid block group chunk objectid, have %llu expect %llu",
+ btrfs_stack_block_group_chunk_objectid(&bgi),
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID);
+ return -EUCLEAN;
+ }
+
+ if (unlikely(btrfs_stack_block_group_used(&bgi) > key->offset)) {
+ block_group_err(leaf, slot,
+ "invalid block group used, have %llu expect [0, %llu)",
+ btrfs_stack_block_group_used(&bgi), key->offset);
+ return -EUCLEAN;
+ }
+
+ flags = btrfs_stack_block_group_flags(&bgi);
+ if (unlikely(hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) > 1)) {
+ block_group_err(leaf, slot,
+"invalid profile flags, have 0x%llx (%lu bits set) expect no more than 1 bit set",
+ flags & BTRFS_BLOCK_GROUP_PROFILE_MASK,
+ hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK));
+ return -EUCLEAN;
+ }
+
+ type = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
+ if (unlikely(type != BTRFS_BLOCK_GROUP_DATA &&
+ type != BTRFS_BLOCK_GROUP_METADATA &&
+ type != BTRFS_BLOCK_GROUP_SYSTEM &&
+ type != (BTRFS_BLOCK_GROUP_METADATA |
+ BTRFS_BLOCK_GROUP_DATA))) {
+ block_group_err(leaf, slot,
+"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx",
+ type, hweight64(type),
+ BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA,
+ BTRFS_BLOCK_GROUP_SYSTEM,
+ BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA);
+ return -EUCLEAN;
+ }
+ return 0;
+}
+
+__printf(4, 5)
+__cold
+static void chunk_err(const struct extent_buffer *leaf,
+ const struct btrfs_chunk *chunk, u64 logical,
+ const char *fmt, ...)
+{
+ const struct btrfs_fs_info *fs_info = leaf->fs_info;
+ bool is_sb;
+ struct va_format vaf;
+ va_list args;
+ int i;
+ int slot = -1;
+
+ /* Only superblock eb is able to have such small offset */
+ is_sb = (leaf->start == BTRFS_SUPER_INFO_OFFSET);
+
+ if (!is_sb) {
+ /*
+ * Get the slot number by iterating through all slots, this
+ * would provide better readability.
+ */
+ for (i = 0; i < btrfs_header_nritems(leaf); i++) {
+ if (btrfs_item_ptr_offset(leaf, i) ==
+ (unsigned long)chunk) {
+ slot = i;
+ break;
+ }
+ }
+ }
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ if (is_sb)
+ btrfs_crit(fs_info,
+ "corrupt superblock syschunk array: chunk_start=%llu, %pV",
+ logical, &vaf);
+ else
+ btrfs_crit(fs_info,
+ "corrupt leaf: root=%llu block=%llu slot=%d chunk_start=%llu, %pV",
+ BTRFS_CHUNK_TREE_OBJECTID, leaf->start, slot,
+ logical, &vaf);
+ va_end(args);
+}
+
+/*
+ * The common chunk check which could also work on super block sys chunk array.
+ *
+ * Return -EUCLEAN if anything is corrupted.
+ * Return 0 if everything is OK.
+ */
+int btrfs_check_chunk_valid(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk, u64 logical)
+{
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
+ u64 length;
+ u64 chunk_end;
+ u64 stripe_len;
+ u16 num_stripes;
+ u16 sub_stripes;
+ u64 type;
+ u64 features;
+ bool mixed = false;
+ int raid_index;
+ int nparity;
+ int ncopies;
+
+ length = btrfs_chunk_length(leaf, chunk);
+ stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
+ num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+ sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
+ type = btrfs_chunk_type(leaf, chunk);
+ raid_index = btrfs_bg_flags_to_raid_index(type);
+ ncopies = btrfs_raid_array[raid_index].ncopies;
+ nparity = btrfs_raid_array[raid_index].nparity;
+
+ if (unlikely(!num_stripes)) {
+ chunk_err(leaf, chunk, logical,
+ "invalid chunk num_stripes, have %u", num_stripes);
+ return -EUCLEAN;
+ }
+ if (unlikely(num_stripes < ncopies)) {
+ chunk_err(leaf, chunk, logical,
+ "invalid chunk num_stripes < ncopies, have %u < %d",
+ num_stripes, ncopies);
+ return -EUCLEAN;
+ }
+ if (unlikely(nparity && num_stripes == nparity)) {
+ chunk_err(leaf, chunk, logical,
+ "invalid chunk num_stripes == nparity, have %u == %d",
+ num_stripes, nparity);
+ return -EUCLEAN;
+ }
+ if (unlikely(!IS_ALIGNED(logical, fs_info->sectorsize))) {
+ chunk_err(leaf, chunk, logical,
+ "invalid chunk logical, have %llu should aligned to %u",
+ logical, fs_info->sectorsize);
+ return -EUCLEAN;
+ }
+ if (unlikely(btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize)) {
+ chunk_err(leaf, chunk, logical,
+ "invalid chunk sectorsize, have %u expect %u",
+ btrfs_chunk_sector_size(leaf, chunk),
+ fs_info->sectorsize);
+ return -EUCLEAN;
+ }
+ if (unlikely(!length || !IS_ALIGNED(length, fs_info->sectorsize))) {
+ chunk_err(leaf, chunk, logical,
+ "invalid chunk length, have %llu", length);
+ return -EUCLEAN;
+ }
+ if (unlikely(check_add_overflow(logical, length, &chunk_end))) {
+ chunk_err(leaf, chunk, logical,
+"invalid chunk logical start and length, have logical start %llu length %llu",
+ logical, length);
+ return -EUCLEAN;
+ }
+ if (unlikely(!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN)) {
+ chunk_err(leaf, chunk, logical,
+ "invalid chunk stripe length: %llu",
+ stripe_len);
+ return -EUCLEAN;
+ }
+ if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
+ BTRFS_BLOCK_GROUP_PROFILE_MASK))) {
+ chunk_err(leaf, chunk, logical,
+ "unrecognized chunk type: 0x%llx",
+ ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
+ BTRFS_BLOCK_GROUP_PROFILE_MASK) &
+ btrfs_chunk_type(leaf, chunk));
+ return -EUCLEAN;
+ }
+
+ if (unlikely(!has_single_bit_set(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&
+ (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0)) {
+ chunk_err(leaf, chunk, logical,
+ "invalid chunk profile flag: 0x%llx, expect 0 or 1 bit set",
+ type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
+ return -EUCLEAN;
+ }
+ if (unlikely((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0)) {
+ chunk_err(leaf, chunk, logical,
+ "missing chunk type flag, have 0x%llx one bit must be set in 0x%llx",
+ type, BTRFS_BLOCK_GROUP_TYPE_MASK);
+ return -EUCLEAN;
+ }
+
+ if (unlikely((type & BTRFS_BLOCK_GROUP_SYSTEM) &&
+ (type & (BTRFS_BLOCK_GROUP_METADATA |
+ BTRFS_BLOCK_GROUP_DATA)))) {
+ chunk_err(leaf, chunk, logical,
+ "system chunk with data or metadata type: 0x%llx",
+ type);
+ return -EUCLEAN;
+ }
+
+ features = btrfs_super_incompat_flags(fs_info->super_copy);
+ if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+ mixed = true;
+
+ if (!mixed) {
+ if (unlikely((type & BTRFS_BLOCK_GROUP_METADATA) &&
+ (type & BTRFS_BLOCK_GROUP_DATA))) {
+ chunk_err(leaf, chunk, logical,
+ "mixed chunk type in non-mixed mode: 0x%llx", type);
+ return -EUCLEAN;
+ }
+ }
+
+ if (unlikely((type & BTRFS_BLOCK_GROUP_RAID10 &&
+ sub_stripes != btrfs_raid_array[BTRFS_RAID_RAID10].sub_stripes) ||
+ (type & BTRFS_BLOCK_GROUP_RAID1 &&
+ num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1].devs_min) ||
+ (type & BTRFS_BLOCK_GROUP_RAID1C3 &&
+ num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C3].devs_min) ||
+ (type & BTRFS_BLOCK_GROUP_RAID1C4 &&
+ num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C4].devs_min) ||
+ (type & BTRFS_BLOCK_GROUP_RAID5 &&
+ num_stripes < btrfs_raid_array[BTRFS_RAID_RAID5].devs_min) ||
+ (type & BTRFS_BLOCK_GROUP_RAID6 &&
+ num_stripes < btrfs_raid_array[BTRFS_RAID_RAID6].devs_min) ||
+ (type & BTRFS_BLOCK_GROUP_DUP &&
+ num_stripes != btrfs_raid_array[BTRFS_RAID_DUP].dev_stripes) ||
+ ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
+ num_stripes != btrfs_raid_array[BTRFS_RAID_SINGLE].dev_stripes))) {
+ chunk_err(leaf, chunk, logical,
+ "invalid num_stripes:sub_stripes %u:%u for profile %llu",
+ num_stripes, sub_stripes,
+ type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
+ return -EUCLEAN;
+ }
+
+ return 0;
+}
+
+/*
+ * Enhanced version of chunk item checker.
+ *
+ * The common btrfs_check_chunk_valid() doesn't check item size since it needs
+ * to work on super block sys_chunk_array which doesn't have full item ptr.
+ */
+static int check_leaf_chunk_item(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk,
+ struct btrfs_key *key, int slot)
+{
+ int num_stripes;
+
+ if (unlikely(btrfs_item_size(leaf, slot) < sizeof(struct btrfs_chunk))) {
+ chunk_err(leaf, chunk, key->offset,
+ "invalid chunk item size: have %u expect [%zu, %u)",
+ btrfs_item_size(leaf, slot),
+ sizeof(struct btrfs_chunk),
+ BTRFS_LEAF_DATA_SIZE(leaf->fs_info));
+ return -EUCLEAN;
+ }
+
+ num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+ /* Let btrfs_check_chunk_valid() handle this error type */
+ if (num_stripes == 0)
+ goto out;
+
+ if (unlikely(btrfs_chunk_item_size(num_stripes) !=
+ btrfs_item_size(leaf, slot))) {
+ chunk_err(leaf, chunk, key->offset,
+ "invalid chunk item size: have %u expect %lu",
+ btrfs_item_size(leaf, slot),
+ btrfs_chunk_item_size(num_stripes));
+ return -EUCLEAN;
+ }
+out:
+ return btrfs_check_chunk_valid(leaf, chunk, key->offset);
+}
+
+__printf(3, 4)
+__cold
+static void dev_item_err(const struct extent_buffer *eb, int slot,
+ const char *fmt, ...)
+{
+ struct btrfs_key key;
+ struct va_format vaf;
+ va_list args;
+
+ btrfs_item_key_to_cpu(eb, &key, slot);
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_crit(eb->fs_info,
+ "corrupt %s: root=%llu block=%llu slot=%d devid=%llu %pV",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot,
+ key.objectid, &vaf);
+ va_end(args);
+}
+
+static int check_dev_item(struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot)
+{
+ struct btrfs_dev_item *ditem;
+ const u32 item_size = btrfs_item_size(leaf, slot);
+
+ if (unlikely(key->objectid != BTRFS_DEV_ITEMS_OBJECTID)) {
+ dev_item_err(leaf, slot,
+ "invalid objectid: has=%llu expect=%llu",
+ key->objectid, BTRFS_DEV_ITEMS_OBJECTID);
+ return -EUCLEAN;
+ }
+
+ if (unlikely(item_size != sizeof(*ditem))) {
+ dev_item_err(leaf, slot, "invalid item size: has %u expect %zu",
+ item_size, sizeof(*ditem));
+ return -EUCLEAN;
+ }
+
+ ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item);
+ if (unlikely(btrfs_device_id(leaf, ditem) != key->offset)) {
+ dev_item_err(leaf, slot,
+ "devid mismatch: key has=%llu item has=%llu",
+ key->offset, btrfs_device_id(leaf, ditem));
+ return -EUCLEAN;
+ }
+
+ /*
+ * For device total_bytes, we don't have reliable way to check it, as
+ * it can be 0 for device removal. Device size check can only be done
+ * by dev extents check.
+ */
+ if (unlikely(btrfs_device_bytes_used(leaf, ditem) >
+ btrfs_device_total_bytes(leaf, ditem))) {
+ dev_item_err(leaf, slot,
+ "invalid bytes used: have %llu expect [0, %llu]",
+ btrfs_device_bytes_used(leaf, ditem),
+ btrfs_device_total_bytes(leaf, ditem));
+ return -EUCLEAN;
+ }
+ /*
+ * Remaining members like io_align/type/gen/dev_group aren't really
+ * utilized. Skip them to make later usage of them easier.
+ */
+ return 0;
+}
+
+static int check_inode_item(struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot)
+{
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
+ struct btrfs_inode_item *iitem;
+ u64 super_gen = btrfs_super_generation(fs_info->super_copy);
+ u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777);
+ const u32 item_size = btrfs_item_size(leaf, slot);
+ u32 mode;
+ int ret;
+ u32 flags;
+ u32 ro_flags;
+
+ ret = check_inode_key(leaf, key, slot);
+ if (unlikely(ret < 0))
+ return ret;
+
+ if (unlikely(item_size != sizeof(*iitem))) {
+ generic_err(leaf, slot, "invalid item size: has %u expect %zu",
+ item_size, sizeof(*iitem));
+ return -EUCLEAN;
+ }
+
+ iitem = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item);
+
+ /* Here we use super block generation + 1 to handle log tree */
+ if (unlikely(btrfs_inode_generation(leaf, iitem) > super_gen + 1)) {
+ inode_item_err(leaf, slot,
+ "invalid inode generation: has %llu expect (0, %llu]",
+ btrfs_inode_generation(leaf, iitem),
+ super_gen + 1);
+ return -EUCLEAN;
+ }
+ /* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */
+ if (unlikely(btrfs_inode_transid(leaf, iitem) > super_gen + 1)) {
+ inode_item_err(leaf, slot,
+ "invalid inode transid: has %llu expect [0, %llu]",
+ btrfs_inode_transid(leaf, iitem), super_gen + 1);
+ return -EUCLEAN;
+ }
+
+ /*
+ * For size and nbytes it's better not to be too strict, as for dir
+ * item its size/nbytes can easily get wrong, but doesn't affect
+ * anything in the fs. So here we skip the check.
+ */
+ mode = btrfs_inode_mode(leaf, iitem);
+ if (unlikely(mode & ~valid_mask)) {
+ inode_item_err(leaf, slot,
+ "unknown mode bit detected: 0x%x",
+ mode & ~valid_mask);
+ return -EUCLEAN;
+ }
+
+ /*
+ * S_IFMT is not bit mapped so we can't completely rely on
+ * is_power_of_2/has_single_bit_set, but it can save us from checking
+ * FIFO/CHR/DIR/REG. Only needs to check BLK, LNK and SOCKS
+ */
+ if (!has_single_bit_set(mode & S_IFMT)) {
+ if (unlikely(!S_ISLNK(mode) && !S_ISBLK(mode) && !S_ISSOCK(mode))) {
+ inode_item_err(leaf, slot,
+ "invalid mode: has 0%o expect valid S_IF* bit(s)",
+ mode & S_IFMT);
+ return -EUCLEAN;
+ }
+ }
+ if (unlikely(S_ISDIR(mode) && btrfs_inode_nlink(leaf, iitem) > 1)) {
+ inode_item_err(leaf, slot,
+ "invalid nlink: has %u expect no more than 1 for dir",
+ btrfs_inode_nlink(leaf, iitem));
+ return -EUCLEAN;
+ }
+ btrfs_inode_split_flags(btrfs_inode_flags(leaf, iitem), &flags, &ro_flags);
+ if (unlikely(flags & ~BTRFS_INODE_FLAG_MASK)) {
+ inode_item_err(leaf, slot,
+ "unknown incompat flags detected: 0x%x", flags);
+ return -EUCLEAN;
+ }
+ if (unlikely(!sb_rdonly(fs_info->sb) &&
+ (ro_flags & ~BTRFS_INODE_RO_FLAG_MASK))) {
+ inode_item_err(leaf, slot,
+ "unknown ro-compat flags detected on writeable mount: 0x%x",
+ ro_flags);
+ return -EUCLEAN;
+ }
+ return 0;
+}
+
+static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
+ int slot)
+{
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
+ struct btrfs_root_item ri = { 0 };
+ const u64 valid_root_flags = BTRFS_ROOT_SUBVOL_RDONLY |
+ BTRFS_ROOT_SUBVOL_DEAD;
+ int ret;
+
+ ret = check_root_key(leaf, key, slot);
+ if (unlikely(ret < 0))
+ return ret;
+
+ if (unlikely(btrfs_item_size(leaf, slot) != sizeof(ri) &&
+ btrfs_item_size(leaf, slot) !=
+ btrfs_legacy_root_item_size())) {
+ generic_err(leaf, slot,
+ "invalid root item size, have %u expect %zu or %u",
+ btrfs_item_size(leaf, slot), sizeof(ri),
+ btrfs_legacy_root_item_size());
+ return -EUCLEAN;
+ }
+
+ /*
+ * For legacy root item, the members starting at generation_v2 will be
+ * all filled with 0.
+ * And since we allow geneartion_v2 as 0, it will still pass the check.
+ */
+ read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot),
+ btrfs_item_size(leaf, slot));
+
+ /* Generation related */
+ if (unlikely(btrfs_root_generation(&ri) >
+ btrfs_super_generation(fs_info->super_copy) + 1)) {
+ generic_err(leaf, slot,
+ "invalid root generation, have %llu expect (0, %llu]",
+ btrfs_root_generation(&ri),
+ btrfs_super_generation(fs_info->super_copy) + 1);
+ return -EUCLEAN;
+ }
+ if (unlikely(btrfs_root_generation_v2(&ri) >
+ btrfs_super_generation(fs_info->super_copy) + 1)) {
+ generic_err(leaf, slot,
+ "invalid root v2 generation, have %llu expect (0, %llu]",
+ btrfs_root_generation_v2(&ri),
+ btrfs_super_generation(fs_info->super_copy) + 1);
+ return -EUCLEAN;
+ }
+ if (unlikely(btrfs_root_last_snapshot(&ri) >
+ btrfs_super_generation(fs_info->super_copy) + 1)) {
+ generic_err(leaf, slot,
+ "invalid root last_snapshot, have %llu expect (0, %llu]",
+ btrfs_root_last_snapshot(&ri),
+ btrfs_super_generation(fs_info->super_copy) + 1);
+ return -EUCLEAN;
+ }
+
+ /* Alignment and level check */
+ if (unlikely(!IS_ALIGNED(btrfs_root_bytenr(&ri), fs_info->sectorsize))) {
+ generic_err(leaf, slot,
+ "invalid root bytenr, have %llu expect to be aligned to %u",
+ btrfs_root_bytenr(&ri), fs_info->sectorsize);
+ return -EUCLEAN;
+ }
+ if (unlikely(btrfs_root_level(&ri) >= BTRFS_MAX_LEVEL)) {
+ generic_err(leaf, slot,
+ "invalid root level, have %u expect [0, %u]",
+ btrfs_root_level(&ri), BTRFS_MAX_LEVEL - 1);
+ return -EUCLEAN;
+ }
+ if (unlikely(btrfs_root_drop_level(&ri) >= BTRFS_MAX_LEVEL)) {
+ generic_err(leaf, slot,
+ "invalid root level, have %u expect [0, %u]",
+ btrfs_root_drop_level(&ri), BTRFS_MAX_LEVEL - 1);
+ return -EUCLEAN;
+ }
+
+ /* Flags check */
+ if (unlikely(btrfs_root_flags(&ri) & ~valid_root_flags)) {
+ generic_err(leaf, slot,
+ "invalid root flags, have 0x%llx expect mask 0x%llx",
+ btrfs_root_flags(&ri), valid_root_flags);
+ return -EUCLEAN;
+ }
+ return 0;
+}
+
+__printf(3,4)
+__cold
+static void extent_err(const struct extent_buffer *eb, int slot,
+ const char *fmt, ...)
+{
+ struct btrfs_key key;
+ struct va_format vaf;
+ va_list args;
+ u64 bytenr;
+ u64 len;
+
+ btrfs_item_key_to_cpu(eb, &key, slot);
+ bytenr = key.objectid;
+ if (key.type == BTRFS_METADATA_ITEM_KEY ||
+ key.type == BTRFS_TREE_BLOCK_REF_KEY ||
+ key.type == BTRFS_SHARED_BLOCK_REF_KEY)
+ len = eb->fs_info->nodesize;
+ else
+ len = key.offset;
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_crit(eb->fs_info,
+ "corrupt %s: block=%llu slot=%d extent bytenr=%llu len=%llu %pV",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ eb->start, slot, bytenr, len, &vaf);
+ va_end(args);
+}
+
+static int check_extent_item(struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot,
+ struct btrfs_key *prev_key)
+{
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
+ struct btrfs_extent_item *ei;
+ bool is_tree_block = false;
+ unsigned long ptr; /* Current pointer inside inline refs */
+ unsigned long end; /* Extent item end */
+ const u32 item_size = btrfs_item_size(leaf, slot);
+ u64 flags;
+ u64 generation;
+ u64 total_refs; /* Total refs in btrfs_extent_item */
+ u64 inline_refs = 0; /* found total inline refs */
+
+ if (unlikely(key->type == BTRFS_METADATA_ITEM_KEY &&
+ !btrfs_fs_incompat(fs_info, SKINNY_METADATA))) {
+ generic_err(leaf, slot,
+"invalid key type, METADATA_ITEM type invalid when SKINNY_METADATA feature disabled");
+ return -EUCLEAN;
+ }
+ /* key->objectid is the bytenr for both key types */
+ if (unlikely(!IS_ALIGNED(key->objectid, fs_info->sectorsize))) {
+ generic_err(leaf, slot,
+ "invalid key objectid, have %llu expect to be aligned to %u",
+ key->objectid, fs_info->sectorsize);
+ return -EUCLEAN;
+ }
+
+ /* key->offset is tree level for METADATA_ITEM_KEY */
+ if (unlikely(key->type == BTRFS_METADATA_ITEM_KEY &&
+ key->offset >= BTRFS_MAX_LEVEL)) {
+ extent_err(leaf, slot,
+ "invalid tree level, have %llu expect [0, %u]",
+ key->offset, BTRFS_MAX_LEVEL - 1);
+ return -EUCLEAN;
+ }
+
+ /*
+ * EXTENT/METADATA_ITEM consists of:
+ * 1) One btrfs_extent_item
+ * Records the total refs, type and generation of the extent.
+ *
+ * 2) One btrfs_tree_block_info (for EXTENT_ITEM and tree backref only)
+ * Records the first key and level of the tree block.
+ *
+ * 2) Zero or more btrfs_extent_inline_ref(s)
+ * Each inline ref has one btrfs_extent_inline_ref shows:
+ * 2.1) The ref type, one of the 4
+ * TREE_BLOCK_REF Tree block only
+ * SHARED_BLOCK_REF Tree block only
+ * EXTENT_DATA_REF Data only
+ * SHARED_DATA_REF Data only
+ * 2.2) Ref type specific data
+ * Either using btrfs_extent_inline_ref::offset, or specific
+ * data structure.
+ */
+ if (unlikely(item_size < sizeof(*ei))) {
+ extent_err(leaf, slot,
+ "invalid item size, have %u expect [%zu, %u)",
+ item_size, sizeof(*ei),
+ BTRFS_LEAF_DATA_SIZE(fs_info));
+ return -EUCLEAN;
+ }
+ end = item_size + btrfs_item_ptr_offset(leaf, slot);
+
+ /* Checks against extent_item */
+ ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+ flags = btrfs_extent_flags(leaf, ei);
+ total_refs = btrfs_extent_refs(leaf, ei);
+ generation = btrfs_extent_generation(leaf, ei);
+ if (unlikely(generation >
+ btrfs_super_generation(fs_info->super_copy) + 1)) {
+ extent_err(leaf, slot,
+ "invalid generation, have %llu expect (0, %llu]",
+ generation,
+ btrfs_super_generation(fs_info->super_copy) + 1);
+ return -EUCLEAN;
+ }
+ if (unlikely(!has_single_bit_set(flags & (BTRFS_EXTENT_FLAG_DATA |
+ BTRFS_EXTENT_FLAG_TREE_BLOCK)))) {
+ extent_err(leaf, slot,
+ "invalid extent flag, have 0x%llx expect 1 bit set in 0x%llx",
+ flags, BTRFS_EXTENT_FLAG_DATA |
+ BTRFS_EXTENT_FLAG_TREE_BLOCK);
+ return -EUCLEAN;
+ }
+ is_tree_block = !!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK);
+ if (is_tree_block) {
+ if (unlikely(key->type == BTRFS_EXTENT_ITEM_KEY &&
+ key->offset != fs_info->nodesize)) {
+ extent_err(leaf, slot,
+ "invalid extent length, have %llu expect %u",
+ key->offset, fs_info->nodesize);
+ return -EUCLEAN;
+ }
+ } else {
+ if (unlikely(key->type != BTRFS_EXTENT_ITEM_KEY)) {
+ extent_err(leaf, slot,
+ "invalid key type, have %u expect %u for data backref",
+ key->type, BTRFS_EXTENT_ITEM_KEY);
+ return -EUCLEAN;
+ }
+ if (unlikely(!IS_ALIGNED(key->offset, fs_info->sectorsize))) {
+ extent_err(leaf, slot,
+ "invalid extent length, have %llu expect aligned to %u",
+ key->offset, fs_info->sectorsize);
+ return -EUCLEAN;
+ }
+ if (unlikely(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
+ extent_err(leaf, slot,
+ "invalid extent flag, data has full backref set");
+ return -EUCLEAN;
+ }
+ }
+ ptr = (unsigned long)(struct btrfs_extent_item *)(ei + 1);
+
+ /* Check the special case of btrfs_tree_block_info */
+ if (is_tree_block && key->type != BTRFS_METADATA_ITEM_KEY) {
+ struct btrfs_tree_block_info *info;
+
+ info = (struct btrfs_tree_block_info *)ptr;
+ if (unlikely(btrfs_tree_block_level(leaf, info) >= BTRFS_MAX_LEVEL)) {
+ extent_err(leaf, slot,
+ "invalid tree block info level, have %u expect [0, %u]",
+ btrfs_tree_block_level(leaf, info),
+ BTRFS_MAX_LEVEL - 1);
+ return -EUCLEAN;
+ }
+ ptr = (unsigned long)(struct btrfs_tree_block_info *)(info + 1);
+ }
+
+ /* Check inline refs */
+ while (ptr < end) {
+ struct btrfs_extent_inline_ref *iref;
+ struct btrfs_extent_data_ref *dref;
+ struct btrfs_shared_data_ref *sref;
+ u64 dref_offset;
+ u64 inline_offset;
+ u8 inline_type;
+
+ if (unlikely(ptr + sizeof(*iref) > end)) {
+ extent_err(leaf, slot,
+"inline ref item overflows extent item, ptr %lu iref size %zu end %lu",
+ ptr, sizeof(*iref), end);
+ return -EUCLEAN;
+ }
+ iref = (struct btrfs_extent_inline_ref *)ptr;
+ inline_type = btrfs_extent_inline_ref_type(leaf, iref);
+ inline_offset = btrfs_extent_inline_ref_offset(leaf, iref);
+ if (unlikely(ptr + btrfs_extent_inline_ref_size(inline_type) > end)) {
+ extent_err(leaf, slot,
+"inline ref item overflows extent item, ptr %lu iref size %u end %lu",
+ ptr, btrfs_extent_inline_ref_size(inline_type), end);
+ return -EUCLEAN;
+ }
+
+ switch (inline_type) {
+ /* inline_offset is subvolid of the owner, no need to check */
+ case BTRFS_TREE_BLOCK_REF_KEY:
+ inline_refs++;
+ break;
+ /* Contains parent bytenr */
+ case BTRFS_SHARED_BLOCK_REF_KEY:
+ if (unlikely(!IS_ALIGNED(inline_offset,
+ fs_info->sectorsize))) {
+ extent_err(leaf, slot,
+ "invalid tree parent bytenr, have %llu expect aligned to %u",
+ inline_offset, fs_info->sectorsize);
+ return -EUCLEAN;
+ }
+ inline_refs++;
+ break;
+ /*
+ * Contains owner subvolid, owner key objectid, adjusted offset.
+ * The only obvious corruption can happen in that offset.
+ */
+ case BTRFS_EXTENT_DATA_REF_KEY:
+ dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+ dref_offset = btrfs_extent_data_ref_offset(leaf, dref);
+ if (unlikely(!IS_ALIGNED(dref_offset,
+ fs_info->sectorsize))) {
+ extent_err(leaf, slot,
+ "invalid data ref offset, have %llu expect aligned to %u",
+ dref_offset, fs_info->sectorsize);
+ return -EUCLEAN;
+ }
+ inline_refs += btrfs_extent_data_ref_count(leaf, dref);
+ break;
+ /* Contains parent bytenr and ref count */
+ case BTRFS_SHARED_DATA_REF_KEY:
+ sref = (struct btrfs_shared_data_ref *)(iref + 1);
+ if (unlikely(!IS_ALIGNED(inline_offset,
+ fs_info->sectorsize))) {
+ extent_err(leaf, slot,
+ "invalid data parent bytenr, have %llu expect aligned to %u",
+ inline_offset, fs_info->sectorsize);
+ return -EUCLEAN;
+ }
+ inline_refs += btrfs_shared_data_ref_count(leaf, sref);
+ break;
+ default:
+ extent_err(leaf, slot, "unknown inline ref type: %u",
+ inline_type);
+ return -EUCLEAN;
+ }
+ ptr += btrfs_extent_inline_ref_size(inline_type);
+ }
+ /* No padding is allowed */
+ if (unlikely(ptr != end)) {
+ extent_err(leaf, slot,
+ "invalid extent item size, padding bytes found");
+ return -EUCLEAN;
+ }
+
+ /* Finally, check the inline refs against total refs */
+ if (unlikely(inline_refs > total_refs)) {
+ extent_err(leaf, slot,
+ "invalid extent refs, have %llu expect >= inline %llu",
+ total_refs, inline_refs);
+ return -EUCLEAN;
+ }
+
+ if ((prev_key->type == BTRFS_EXTENT_ITEM_KEY) ||
+ (prev_key->type == BTRFS_METADATA_ITEM_KEY)) {
+ u64 prev_end = prev_key->objectid;
+
+ if (prev_key->type == BTRFS_METADATA_ITEM_KEY)
+ prev_end += fs_info->nodesize;
+ else
+ prev_end += prev_key->offset;
+
+ if (unlikely(prev_end > key->objectid)) {
+ extent_err(leaf, slot,
+ "previous extent [%llu %u %llu] overlaps current extent [%llu %u %llu]",
+ prev_key->objectid, prev_key->type,
+ prev_key->offset, key->objectid, key->type,
+ key->offset);
+ return -EUCLEAN;
+ }
+ }
+
+ return 0;
+}
+
+static int check_simple_keyed_refs(struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot)
+{
+ u32 expect_item_size = 0;
+
+ if (key->type == BTRFS_SHARED_DATA_REF_KEY)
+ expect_item_size = sizeof(struct btrfs_shared_data_ref);
+
+ if (unlikely(btrfs_item_size(leaf, slot) != expect_item_size)) {
+ generic_err(leaf, slot,
+ "invalid item size, have %u expect %u for key type %u",
+ btrfs_item_size(leaf, slot),
+ expect_item_size, key->type);
+ return -EUCLEAN;
+ }
+ if (unlikely(!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize))) {
+ generic_err(leaf, slot,
+"invalid key objectid for shared block ref, have %llu expect aligned to %u",
+ key->objectid, leaf->fs_info->sectorsize);
+ return -EUCLEAN;
+ }
+ if (unlikely(key->type != BTRFS_TREE_BLOCK_REF_KEY &&
+ !IS_ALIGNED(key->offset, leaf->fs_info->sectorsize))) {
+ extent_err(leaf, slot,
+ "invalid tree parent bytenr, have %llu expect aligned to %u",
+ key->offset, leaf->fs_info->sectorsize);
+ return -EUCLEAN;
+ }
+ return 0;
+}
+
+static int check_extent_data_ref(struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot)
+{
+ struct btrfs_extent_data_ref *dref;
+ unsigned long ptr = btrfs_item_ptr_offset(leaf, slot);
+ const unsigned long end = ptr + btrfs_item_size(leaf, slot);
+
+ if (unlikely(btrfs_item_size(leaf, slot) % sizeof(*dref) != 0)) {
+ generic_err(leaf, slot,
+ "invalid item size, have %u expect aligned to %zu for key type %u",
+ btrfs_item_size(leaf, slot),
+ sizeof(*dref), key->type);
+ return -EUCLEAN;
+ }
+ if (unlikely(!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize))) {
+ generic_err(leaf, slot,
+"invalid key objectid for shared block ref, have %llu expect aligned to %u",
+ key->objectid, leaf->fs_info->sectorsize);
+ return -EUCLEAN;
+ }
+ for (; ptr < end; ptr += sizeof(*dref)) {
+ u64 offset;
+
+ /*
+ * We cannot check the extent_data_ref hash due to possible
+ * overflow from the leaf due to hash collisions.
+ */
+ dref = (struct btrfs_extent_data_ref *)ptr;
+ offset = btrfs_extent_data_ref_offset(leaf, dref);
+ if (unlikely(!IS_ALIGNED(offset, leaf->fs_info->sectorsize))) {
+ extent_err(leaf, slot,
+ "invalid extent data backref offset, have %llu expect aligned to %u",
+ offset, leaf->fs_info->sectorsize);
+ return -EUCLEAN;
+ }
+ }
+ return 0;
+}
+
+#define inode_ref_err(eb, slot, fmt, args...) \
+ inode_item_err(eb, slot, fmt, ##args)
+static int check_inode_ref(struct extent_buffer *leaf,
+ struct btrfs_key *key, struct btrfs_key *prev_key,
+ int slot)
+{
+ struct btrfs_inode_ref *iref;
+ unsigned long ptr;
+ unsigned long end;
+
+ if (unlikely(!check_prev_ino(leaf, key, slot, prev_key)))
+ return -EUCLEAN;
+ /* namelen can't be 0, so item_size == sizeof() is also invalid */
+ if (unlikely(btrfs_item_size(leaf, slot) <= sizeof(*iref))) {
+ inode_ref_err(leaf, slot,
+ "invalid item size, have %u expect (%zu, %u)",
+ btrfs_item_size(leaf, slot),
+ sizeof(*iref), BTRFS_LEAF_DATA_SIZE(leaf->fs_info));
+ return -EUCLEAN;
+ }
+
+ ptr = btrfs_item_ptr_offset(leaf, slot);
+ end = ptr + btrfs_item_size(leaf, slot);
+ while (ptr < end) {
+ u16 namelen;
+
+ if (unlikely(ptr + sizeof(iref) > end)) {
+ inode_ref_err(leaf, slot,
+ "inode ref overflow, ptr %lu end %lu inode_ref_size %zu",
+ ptr, end, sizeof(iref));
+ return -EUCLEAN;
+ }
+
+ iref = (struct btrfs_inode_ref *)ptr;
+ namelen = btrfs_inode_ref_name_len(leaf, iref);
+ if (unlikely(ptr + sizeof(*iref) + namelen > end)) {
+ inode_ref_err(leaf, slot,
+ "inode ref overflow, ptr %lu end %lu namelen %u",
+ ptr, end, namelen);
+ return -EUCLEAN;
+ }
+
+ /*
+ * NOTE: In theory we should record all found index numbers
+ * to find any duplicated indexes, but that will be too time
+ * consuming for inodes with too many hard links.
+ */
+ ptr += sizeof(*iref) + namelen;
+ }
+ return 0;
+}
+
+/*
+ * Common point to switch the item-specific validation.
+ */
+static int check_leaf_item(struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot,
+ struct btrfs_key *prev_key)
+{
+ int ret = 0;
+ struct btrfs_chunk *chunk;
+
+ switch (key->type) {
+ case BTRFS_EXTENT_DATA_KEY:
+ ret = check_extent_data_item(leaf, key, slot, prev_key);
+ break;
+ case BTRFS_EXTENT_CSUM_KEY:
+ ret = check_csum_item(leaf, key, slot, prev_key);
+ break;
+ case BTRFS_DIR_ITEM_KEY:
+ case BTRFS_DIR_INDEX_KEY:
+ case BTRFS_XATTR_ITEM_KEY:
+ ret = check_dir_item(leaf, key, prev_key, slot);
+ break;
+ case BTRFS_INODE_REF_KEY:
+ ret = check_inode_ref(leaf, key, prev_key, slot);
+ break;
+ case BTRFS_BLOCK_GROUP_ITEM_KEY:
+ ret = check_block_group_item(leaf, key, slot);
+ break;
+ case BTRFS_CHUNK_ITEM_KEY:
+ chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+ ret = check_leaf_chunk_item(leaf, chunk, key, slot);
+ break;
+ case BTRFS_DEV_ITEM_KEY:
+ ret = check_dev_item(leaf, key, slot);
+ break;
+ case BTRFS_INODE_ITEM_KEY:
+ ret = check_inode_item(leaf, key, slot);
+ break;
+ case BTRFS_ROOT_ITEM_KEY:
+ ret = check_root_item(leaf, key, slot);
+ break;
+ case BTRFS_EXTENT_ITEM_KEY:
+ case BTRFS_METADATA_ITEM_KEY:
+ ret = check_extent_item(leaf, key, slot, prev_key);
+ break;
+ case BTRFS_TREE_BLOCK_REF_KEY:
+ case BTRFS_SHARED_DATA_REF_KEY:
+ case BTRFS_SHARED_BLOCK_REF_KEY:
+ ret = check_simple_keyed_refs(leaf, key, slot);
+ break;
+ case BTRFS_EXTENT_DATA_REF_KEY:
+ ret = check_extent_data_ref(leaf, key, slot);
+ break;
+ }
+ return ret;
+}
+
+static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
+{
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
+ /* No valid key type is 0, so all key should be larger than this key */
+ struct btrfs_key prev_key = {0, 0, 0};
+ struct btrfs_key key;
+ u32 nritems = btrfs_header_nritems(leaf);
+ int slot;
+
+ if (unlikely(btrfs_header_level(leaf) != 0)) {
+ generic_err(leaf, 0,
+ "invalid level for leaf, have %d expect 0",
+ btrfs_header_level(leaf));
+ return -EUCLEAN;
+ }
+
+ /*
+ * Extent buffers from a relocation tree have a owner field that
+ * corresponds to the subvolume tree they are based on. So just from an
+ * extent buffer alone we can not find out what is the id of the
+ * corresponding subvolume tree, so we can not figure out if the extent
+ * buffer corresponds to the root of the relocation tree or not. So
+ * skip this check for relocation trees.
+ */
+ if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) {
+ u64 owner = btrfs_header_owner(leaf);
+
+ /* These trees must never be empty */
+ if (unlikely(owner == BTRFS_ROOT_TREE_OBJECTID ||
+ owner == BTRFS_CHUNK_TREE_OBJECTID ||
+ owner == BTRFS_DEV_TREE_OBJECTID ||
+ owner == BTRFS_FS_TREE_OBJECTID ||
+ owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) {
+ generic_err(leaf, 0,
+ "invalid root, root %llu must never be empty",
+ owner);
+ return -EUCLEAN;
+ }
+
+ /* Unknown tree */
+ if (unlikely(owner == 0)) {
+ generic_err(leaf, 0,
+ "invalid owner, root 0 is not defined");
+ return -EUCLEAN;
+ }
+
+ /* EXTENT_TREE_V2 can have empty extent trees. */
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ return 0;
+
+ if (unlikely(owner == BTRFS_EXTENT_TREE_OBJECTID)) {
+ generic_err(leaf, 0,
+ "invalid root, root %llu must never be empty",
+ owner);
+ return -EUCLEAN;
+ }
+
+ return 0;
+ }
+
+ if (unlikely(nritems == 0))
+ return 0;
+
+ /*
+ * Check the following things to make sure this is a good leaf, and
+ * leaf users won't need to bother with similar sanity checks:
+ *
+ * 1) key ordering
+ * 2) item offset and size
+ * No overlap, no hole, all inside the leaf.
+ * 3) item content
+ * If possible, do comprehensive sanity check.
+ * NOTE: All checks must only rely on the item data itself.
+ */
+ for (slot = 0; slot < nritems; slot++) {
+ u32 item_end_expected;
+ u64 item_data_end;
+ int ret;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ /* Make sure the keys are in the right order */
+ if (unlikely(btrfs_comp_cpu_keys(&prev_key, &key) >= 0)) {
+ generic_err(leaf, slot,
+ "bad key order, prev (%llu %u %llu) current (%llu %u %llu)",
+ prev_key.objectid, prev_key.type,
+ prev_key.offset, key.objectid, key.type,
+ key.offset);
+ return -EUCLEAN;
+ }
+
+ item_data_end = (u64)btrfs_item_offset(leaf, slot) +
+ btrfs_item_size(leaf, slot);
+ /*
+ * Make sure the offset and ends are right, remember that the
+ * item data starts at the end of the leaf and grows towards the
+ * front.
+ */
+ if (slot == 0)
+ item_end_expected = BTRFS_LEAF_DATA_SIZE(fs_info);
+ else
+ item_end_expected = btrfs_item_offset(leaf,
+ slot - 1);
+ if (unlikely(item_data_end != item_end_expected)) {
+ generic_err(leaf, slot,
+ "unexpected item end, have %llu expect %u",
+ item_data_end, item_end_expected);
+ return -EUCLEAN;
+ }
+
+ /*
+ * Check to make sure that we don't point outside of the leaf,
+ * just in case all the items are consistent to each other, but
+ * all point outside of the leaf.
+ */
+ if (unlikely(item_data_end > BTRFS_LEAF_DATA_SIZE(fs_info))) {
+ generic_err(leaf, slot,
+ "slot end outside of leaf, have %llu expect range [0, %u]",
+ item_data_end, BTRFS_LEAF_DATA_SIZE(fs_info));
+ return -EUCLEAN;
+ }
+
+ /* Also check if the item pointer overlaps with btrfs item. */
+ if (unlikely(btrfs_item_ptr_offset(leaf, slot) <
+ btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item))) {
+ generic_err(leaf, slot,
+ "slot overlaps with its data, item end %lu data start %lu",
+ btrfs_item_nr_offset(slot) +
+ sizeof(struct btrfs_item),
+ btrfs_item_ptr_offset(leaf, slot));
+ return -EUCLEAN;
+ }
+
+ if (check_item_data) {
+ /*
+ * Check if the item size and content meet other
+ * criteria
+ */
+ ret = check_leaf_item(leaf, &key, slot, &prev_key);
+ if (unlikely(ret < 0))
+ return ret;
+ }
+
+ prev_key.objectid = key.objectid;
+ prev_key.type = key.type;
+ prev_key.offset = key.offset;
+ }
+
+ return 0;
+}
+
+int btrfs_check_leaf_full(struct extent_buffer *leaf)
+{
+ return check_leaf(leaf, true);
+}
+ALLOW_ERROR_INJECTION(btrfs_check_leaf_full, ERRNO);
+
+int btrfs_check_leaf_relaxed(struct extent_buffer *leaf)
+{
+ return check_leaf(leaf, false);
+}
+
+int btrfs_check_node(struct extent_buffer *node)
+{
+ struct btrfs_fs_info *fs_info = node->fs_info;
+ unsigned long nr = btrfs_header_nritems(node);
+ struct btrfs_key key, next_key;
+ int slot;
+ int level = btrfs_header_level(node);
+ u64 bytenr;
+ int ret = 0;
+
+ if (unlikely(level <= 0 || level >= BTRFS_MAX_LEVEL)) {
+ generic_err(node, 0,
+ "invalid level for node, have %d expect [1, %d]",
+ level, BTRFS_MAX_LEVEL - 1);
+ return -EUCLEAN;
+ }
+ if (unlikely(nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(fs_info))) {
+ btrfs_crit(fs_info,
+"corrupt node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%u]",
+ btrfs_header_owner(node), node->start,
+ nr == 0 ? "small" : "large", nr,
+ BTRFS_NODEPTRS_PER_BLOCK(fs_info));
+ return -EUCLEAN;
+ }
+
+ for (slot = 0; slot < nr - 1; slot++) {
+ bytenr = btrfs_node_blockptr(node, slot);
+ btrfs_node_key_to_cpu(node, &key, slot);
+ btrfs_node_key_to_cpu(node, &next_key, slot + 1);
+
+ if (unlikely(!bytenr)) {
+ generic_err(node, slot,
+ "invalid NULL node pointer");
+ ret = -EUCLEAN;
+ goto out;
+ }
+ if (unlikely(!IS_ALIGNED(bytenr, fs_info->sectorsize))) {
+ generic_err(node, slot,
+ "unaligned pointer, have %llu should be aligned to %u",
+ bytenr, fs_info->sectorsize);
+ ret = -EUCLEAN;
+ goto out;
+ }
+
+ if (unlikely(btrfs_comp_cpu_keys(&key, &next_key) >= 0)) {
+ generic_err(node, slot,
+ "bad key order, current (%llu %u %llu) next (%llu %u %llu)",
+ key.objectid, key.type, key.offset,
+ next_key.objectid, next_key.type,
+ next_key.offset);
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+out:
+ return ret;
+}
+ALLOW_ERROR_INJECTION(btrfs_check_node, ERRNO);
+
+int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner)
+{
+ const bool is_subvol = is_fstree(root_owner);
+ const u64 eb_owner = btrfs_header_owner(eb);
+
+ /*
+ * Skip dummy fs, as selftests don't create unique ebs for each dummy
+ * root.
+ */
+ if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &eb->fs_info->fs_state))
+ return 0;
+ /*
+ * There are several call sites (backref walking, qgroup, and data
+ * reloc) passing 0 as @root_owner, as they are not holding the
+ * tree root. In that case, we can not do a reliable ownership check,
+ * so just exit.
+ */
+ if (root_owner == 0)
+ return 0;
+ /*
+ * These trees use key.offset as their owner, our callers don't have
+ * the extra capacity to pass key.offset here. So we just skip them.
+ */
+ if (root_owner == BTRFS_TREE_LOG_OBJECTID ||
+ root_owner == BTRFS_TREE_RELOC_OBJECTID)
+ return 0;
+
+ if (!is_subvol) {
+ /* For non-subvolume trees, the eb owner should match root owner */
+ if (unlikely(root_owner != eb_owner)) {
+ btrfs_crit(eb->fs_info,
+"corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect %llu",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ root_owner, btrfs_header_bytenr(eb), eb_owner,
+ root_owner);
+ return -EUCLEAN;
+ }
+ return 0;
+ }
+
+ /*
+ * For subvolume trees, owners can mismatch, but they should all belong
+ * to subvolume trees.
+ */
+ if (unlikely(is_subvol != is_fstree(eb_owner))) {
+ btrfs_crit(eb->fs_info,
+"corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect [%llu, %llu]",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ root_owner, btrfs_header_bytenr(eb), eb_owner,
+ BTRFS_FIRST_FREE_OBJECTID, BTRFS_LAST_FREE_OBJECTID);
+ return -EUCLEAN;
+ }
+ return 0;
+}
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
new file mode 100644
index 000000000..ece497e26
--- /dev/null
+++ b/fs/btrfs/tree-checker.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) Qu Wenruo 2017. All rights reserved.
+ */
+
+#ifndef BTRFS_TREE_CHECKER_H
+#define BTRFS_TREE_CHECKER_H
+
+#include "ctree.h"
+#include "extent_io.h"
+
+/*
+ * Comprehensive leaf checker.
+ * Will check not only the item pointers, but also every possible member
+ * in item data.
+ */
+int btrfs_check_leaf_full(struct extent_buffer *leaf);
+
+/*
+ * Less strict leaf checker.
+ * Will only check item pointers, not reading item data.
+ */
+int btrfs_check_leaf_relaxed(struct extent_buffer *leaf);
+int btrfs_check_node(struct extent_buffer *node);
+
+int btrfs_check_chunk_valid(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk, u64 logical);
+int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner);
+
+#endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
new file mode 100644
index 000000000..0520d6d32
--- /dev/null
+++ b/fs/btrfs/tree-defrag.c
@@ -0,0 +1,471 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ */
+
+#include <linux/sched.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "print-tree.h"
+#include "transaction.h"
+#include "locking.h"
+
+static struct kmem_cache *btrfs_inode_defrag_cachep;
+
+/*
+ * When auto defrag is enabled we queue up these defrag structs to remember
+ * which inodes need defragging passes.
+ */
+struct inode_defrag {
+ struct rb_node rb_node;
+ /* Inode number */
+ u64 ino;
+ /*
+ * Transid where the defrag was added, we search for extents newer than
+ * this.
+ */
+ u64 transid;
+
+ /* Root objectid */
+ u64 root;
+
+ /*
+ * The extent size threshold for autodefrag.
+ *
+ * This value is different for compressed/non-compressed extents, thus
+ * needs to be passed from higher layer.
+ * (aka, inode_should_defrag())
+ */
+ u32 extent_thresh;
+};
+
+static int __compare_inode_defrag(struct inode_defrag *defrag1,
+ struct inode_defrag *defrag2)
+{
+ if (defrag1->root > defrag2->root)
+ return 1;
+ else if (defrag1->root < defrag2->root)
+ return -1;
+ else if (defrag1->ino > defrag2->ino)
+ return 1;
+ else if (defrag1->ino < defrag2->ino)
+ return -1;
+ else
+ return 0;
+}
+
+/*
+ * Pop a record for an inode into the defrag tree. The lock must be held
+ * already.
+ *
+ * If you're inserting a record for an older transid than an existing record,
+ * the transid already in the tree is lowered.
+ *
+ * If an existing record is found the defrag item you pass in is freed.
+ */
+static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
+ struct inode_defrag *defrag)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct inode_defrag *entry;
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ int ret;
+
+ p = &fs_info->defrag_inodes.rb_node;
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct inode_defrag, rb_node);
+
+ ret = __compare_inode_defrag(defrag, entry);
+ if (ret < 0)
+ p = &parent->rb_left;
+ else if (ret > 0)
+ p = &parent->rb_right;
+ else {
+ /*
+ * If we're reinserting an entry for an old defrag run,
+ * make sure to lower the transid of our existing
+ * record.
+ */
+ if (defrag->transid < entry->transid)
+ entry->transid = defrag->transid;
+ entry->extent_thresh = min(defrag->extent_thresh,
+ entry->extent_thresh);
+ return -EEXIST;
+ }
+ }
+ set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
+ rb_link_node(&defrag->rb_node, parent, p);
+ rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
+ return 0;
+}
+
+static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
+{
+ if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
+ return 0;
+
+ if (btrfs_fs_closing(fs_info))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * Insert a defrag record for this inode if auto defrag is enabled.
+ */
+int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode, u32 extent_thresh)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct inode_defrag *defrag;
+ u64 transid;
+ int ret;
+
+ if (!__need_auto_defrag(fs_info))
+ return 0;
+
+ if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
+ return 0;
+
+ if (trans)
+ transid = trans->transid;
+ else
+ transid = inode->root->last_trans;
+
+ defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
+ if (!defrag)
+ return -ENOMEM;
+
+ defrag->ino = btrfs_ino(inode);
+ defrag->transid = transid;
+ defrag->root = root->root_key.objectid;
+ defrag->extent_thresh = extent_thresh;
+
+ spin_lock(&fs_info->defrag_inodes_lock);
+ if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
+ /*
+ * If we set IN_DEFRAG flag and evict the inode from memory,
+ * and then re-read this inode, this new inode doesn't have
+ * IN_DEFRAG flag. At the case, we may find the existed defrag.
+ */
+ ret = __btrfs_add_inode_defrag(inode, defrag);
+ if (ret)
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ } else {
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ }
+ spin_unlock(&fs_info->defrag_inodes_lock);
+ return 0;
+}
+
+/*
+ * Pick the defragable inode that we want, if it doesn't exist, we will get the
+ * next one.
+ */
+static struct inode_defrag *btrfs_pick_defrag_inode(
+ struct btrfs_fs_info *fs_info, u64 root, u64 ino)
+{
+ struct inode_defrag *entry = NULL;
+ struct inode_defrag tmp;
+ struct rb_node *p;
+ struct rb_node *parent = NULL;
+ int ret;
+
+ tmp.ino = ino;
+ tmp.root = root;
+
+ spin_lock(&fs_info->defrag_inodes_lock);
+ p = fs_info->defrag_inodes.rb_node;
+ while (p) {
+ parent = p;
+ entry = rb_entry(parent, struct inode_defrag, rb_node);
+
+ ret = __compare_inode_defrag(&tmp, entry);
+ if (ret < 0)
+ p = parent->rb_left;
+ else if (ret > 0)
+ p = parent->rb_right;
+ else
+ goto out;
+ }
+
+ if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
+ parent = rb_next(parent);
+ if (parent)
+ entry = rb_entry(parent, struct inode_defrag, rb_node);
+ else
+ entry = NULL;
+ }
+out:
+ if (entry)
+ rb_erase(parent, &fs_info->defrag_inodes);
+ spin_unlock(&fs_info->defrag_inodes_lock);
+ return entry;
+}
+
+void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
+{
+ struct inode_defrag *defrag;
+ struct rb_node *node;
+
+ spin_lock(&fs_info->defrag_inodes_lock);
+ node = rb_first(&fs_info->defrag_inodes);
+ while (node) {
+ rb_erase(node, &fs_info->defrag_inodes);
+ defrag = rb_entry(node, struct inode_defrag, rb_node);
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+
+ cond_resched_lock(&fs_info->defrag_inodes_lock);
+
+ node = rb_first(&fs_info->defrag_inodes);
+ }
+ spin_unlock(&fs_info->defrag_inodes_lock);
+}
+
+#define BTRFS_DEFRAG_BATCH 1024
+
+static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
+ struct inode_defrag *defrag)
+{
+ struct btrfs_root *inode_root;
+ struct inode *inode;
+ struct btrfs_ioctl_defrag_range_args range;
+ int ret = 0;
+ u64 cur = 0;
+
+again:
+ if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
+ goto cleanup;
+ if (!__need_auto_defrag(fs_info))
+ goto cleanup;
+
+ /* Get the inode */
+ inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
+ if (IS_ERR(inode_root)) {
+ ret = PTR_ERR(inode_root);
+ goto cleanup;
+ }
+
+ inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
+ btrfs_put_root(inode_root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ goto cleanup;
+ }
+
+ if (cur >= i_size_read(inode)) {
+ iput(inode);
+ goto cleanup;
+ }
+
+ /* Do a chunk of defrag */
+ clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
+ memset(&range, 0, sizeof(range));
+ range.len = (u64)-1;
+ range.start = cur;
+ range.extent_thresh = defrag->extent_thresh;
+
+ sb_start_write(fs_info->sb);
+ ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
+ BTRFS_DEFRAG_BATCH);
+ sb_end_write(fs_info->sb);
+ iput(inode);
+
+ if (ret < 0)
+ goto cleanup;
+
+ cur = max(cur + fs_info->sectorsize, range.start);
+ goto again;
+
+cleanup:
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ return ret;
+}
+
+/*
+ * Run through the list of inodes in the FS that need defragging.
+ */
+int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
+{
+ struct inode_defrag *defrag;
+ u64 first_ino = 0;
+ u64 root_objectid = 0;
+
+ atomic_inc(&fs_info->defrag_running);
+ while (1) {
+ /* Pause the auto defragger. */
+ if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
+ break;
+
+ if (!__need_auto_defrag(fs_info))
+ break;
+
+ /* find an inode to defrag */
+ defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino);
+ if (!defrag) {
+ if (root_objectid || first_ino) {
+ root_objectid = 0;
+ first_ino = 0;
+ continue;
+ } else {
+ break;
+ }
+ }
+
+ first_ino = defrag->ino + 1;
+ root_objectid = defrag->root;
+
+ __btrfs_run_defrag_inode(fs_info, defrag);
+ }
+ atomic_dec(&fs_info->defrag_running);
+
+ /*
+ * During unmount, we use the transaction_wait queue to wait for the
+ * defragger to stop.
+ */
+ wake_up(&fs_info->transaction_wait);
+ return 0;
+}
+
+/*
+ * Defrag all the leaves in a given btree.
+ * Read all the leaves and try to get key order to
+ * better reflect disk order
+ */
+
+int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_path *path = NULL;
+ struct btrfs_key key;
+ int ret = 0;
+ int wret;
+ int level;
+ int next_key_ret = 0;
+ u64 last_ret = 0;
+
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
+ goto out;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ level = btrfs_header_level(root->node);
+
+ if (level == 0)
+ goto out;
+
+ if (root->defrag_progress.objectid == 0) {
+ struct extent_buffer *root_node;
+ u32 nritems;
+
+ root_node = btrfs_lock_root_node(root);
+ nritems = btrfs_header_nritems(root_node);
+ root->defrag_max.objectid = 0;
+ /* from above we know this is not a leaf */
+ btrfs_node_key_to_cpu(root_node, &root->defrag_max,
+ nritems - 1);
+ btrfs_tree_unlock(root_node);
+ free_extent_buffer(root_node);
+ memset(&key, 0, sizeof(key));
+ } else {
+ memcpy(&key, &root->defrag_progress, sizeof(key));
+ }
+
+ path->keep_locks = 1;
+
+ ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = 0;
+ goto out;
+ }
+ btrfs_release_path(path);
+ /*
+ * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
+ * leafs from path->nodes[1], so set lowest_level to 1 to avoid later
+ * a deadlock (attempting to write lock an already write locked leaf).
+ */
+ path->lowest_level = 1;
+ wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+
+ if (wret < 0) {
+ ret = wret;
+ goto out;
+ }
+ if (!path->nodes[1]) {
+ ret = 0;
+ goto out;
+ }
+ /*
+ * The node at level 1 must always be locked when our path has
+ * keep_locks set and lowest_level is 1, regardless of the value of
+ * path->slots[1].
+ */
+ BUG_ON(path->locks[1] == 0);
+ ret = btrfs_realloc_node(trans, root,
+ path->nodes[1], 0,
+ &last_ret,
+ &root->defrag_progress);
+ if (ret) {
+ WARN_ON(ret == -EAGAIN);
+ goto out;
+ }
+ /*
+ * Now that we reallocated the node we can find the next key. Note that
+ * btrfs_find_next_key() can release our path and do another search
+ * without COWing, this is because even with path->keep_locks = 1,
+ * btrfs_search_slot() / ctree.c:unlock_up() does not keeps a lock on a
+ * node when path->slots[node_level - 1] does not point to the last
+ * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
+ * we search for the next key after reallocating our node.
+ */
+ path->slots[1] = btrfs_header_nritems(path->nodes[1]);
+ next_key_ret = btrfs_find_next_key(root, path, &key, 1,
+ BTRFS_OLDEST_GENERATION);
+ if (next_key_ret == 0) {
+ memcpy(&root->defrag_progress, &key, sizeof(key));
+ ret = -EAGAIN;
+ }
+out:
+ btrfs_free_path(path);
+ if (ret == -EAGAIN) {
+ if (root->defrag_max.objectid > root->defrag_progress.objectid)
+ goto done;
+ if (root->defrag_max.type > root->defrag_progress.type)
+ goto done;
+ if (root->defrag_max.offset > root->defrag_progress.offset)
+ goto done;
+ ret = 0;
+ }
+done:
+ if (ret != -EAGAIN)
+ memset(&root->defrag_progress, 0,
+ sizeof(root->defrag_progress));
+
+ return ret;
+}
+
+void __cold btrfs_auto_defrag_exit(void)
+{
+ kmem_cache_destroy(btrfs_inode_defrag_cachep);
+}
+
+int __init btrfs_auto_defrag_init(void)
+{
+ btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
+ sizeof(struct inode_defrag), 0,
+ SLAB_MEM_SPREAD,
+ NULL);
+ if (!btrfs_inode_defrag_cachep)
+ return -ENOMEM;
+
+ return 0;
+}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
new file mode 100644
index 000000000..7c33b28c0
--- /dev/null
+++ b/fs/btrfs/tree-log.c
@@ -0,0 +1,7562 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/list_sort.h>
+#include <linux/iversion.h>
+#include "misc.h"
+#include "ctree.h"
+#include "tree-log.h"
+#include "disk-io.h"
+#include "locking.h"
+#include "print-tree.h"
+#include "backref.h"
+#include "compression.h"
+#include "qgroup.h"
+#include "block-group.h"
+#include "space-info.h"
+#include "zoned.h"
+#include "inode-item.h"
+
+#define MAX_CONFLICT_INODES 10
+
+/* magic values for the inode_only field in btrfs_log_inode:
+ *
+ * LOG_INODE_ALL means to log everything
+ * LOG_INODE_EXISTS means to log just enough to recreate the inode
+ * during log replay
+ */
+enum {
+ LOG_INODE_ALL,
+ LOG_INODE_EXISTS,
+};
+
+/*
+ * directory trouble cases
+ *
+ * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
+ * log, we must force a full commit before doing an fsync of the directory
+ * where the unlink was done.
+ * ---> record transid of last unlink/rename per directory
+ *
+ * mkdir foo/some_dir
+ * normal commit
+ * rename foo/some_dir foo2/some_dir
+ * mkdir foo/some_dir
+ * fsync foo/some_dir/some_file
+ *
+ * The fsync above will unlink the original some_dir without recording
+ * it in its new location (foo2). After a crash, some_dir will be gone
+ * unless the fsync of some_file forces a full commit
+ *
+ * 2) we must log any new names for any file or dir that is in the fsync
+ * log. ---> check inode while renaming/linking.
+ *
+ * 2a) we must log any new names for any file or dir during rename
+ * when the directory they are being removed from was logged.
+ * ---> check inode and old parent dir during rename
+ *
+ * 2a is actually the more important variant. With the extra logging
+ * a crash might unlink the old name without recreating the new one
+ *
+ * 3) after a crash, we must go through any directories with a link count
+ * of zero and redo the rm -rf
+ *
+ * mkdir f1/foo
+ * normal commit
+ * rm -rf f1/foo
+ * fsync(f1)
+ *
+ * The directory f1 was fully removed from the FS, but fsync was never
+ * called on f1, only its parent dir. After a crash the rm -rf must
+ * be replayed. This must be able to recurse down the entire
+ * directory tree. The inode link count fixup code takes care of the
+ * ugly details.
+ */
+
+/*
+ * stages for the tree walking. The first
+ * stage (0) is to only pin down the blocks we find
+ * the second stage (1) is to make sure that all the inodes
+ * we find in the log are created in the subvolume.
+ *
+ * The last stage is to deal with directories and links and extents
+ * and all the other fun semantics
+ */
+enum {
+ LOG_WALK_PIN_ONLY,
+ LOG_WALK_REPLAY_INODES,
+ LOG_WALK_REPLAY_DIR_INDEX,
+ LOG_WALK_REPLAY_ALL,
+};
+
+static int btrfs_log_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ int inode_only,
+ struct btrfs_log_ctx *ctx);
+static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid);
+static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ u64 dirid, int del_all);
+static void wait_log_commit(struct btrfs_root *root, int transid);
+
+/*
+ * tree logging is a special write ahead log used to make sure that
+ * fsyncs and O_SYNCs can happen without doing full tree commits.
+ *
+ * Full tree commits are expensive because they require commonly
+ * modified blocks to be recowed, creating many dirty pages in the
+ * extent tree an 4x-6x higher write load than ext3.
+ *
+ * Instead of doing a tree commit on every fsync, we use the
+ * key ranges and transaction ids to find items for a given file or directory
+ * that have changed in this transaction. Those items are copied into
+ * a special tree (one per subvolume root), that tree is written to disk
+ * and then the fsync is considered complete.
+ *
+ * After a crash, items are copied out of the log-tree back into the
+ * subvolume tree. Any file data extents found are recorded in the extent
+ * allocation tree, and the log-tree freed.
+ *
+ * The log tree is read three times, once to pin down all the extents it is
+ * using in ram and once, once to create all the inodes logged in the tree
+ * and once to do all the other items.
+ */
+
+/*
+ * start a sub transaction and setup the log tree
+ * this increments the log tree writer count to make the people
+ * syncing the tree wait for us to finish
+ */
+static int start_log_trans(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ const bool zoned = btrfs_is_zoned(fs_info);
+ int ret = 0;
+ bool created = false;
+
+ /*
+ * First check if the log root tree was already created. If not, create
+ * it before locking the root's log_mutex, just to keep lockdep happy.
+ */
+ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
+ mutex_lock(&tree_root->log_mutex);
+ if (!fs_info->log_root_tree) {
+ ret = btrfs_init_log_root_tree(trans, fs_info);
+ if (!ret) {
+ set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
+ created = true;
+ }
+ }
+ mutex_unlock(&tree_root->log_mutex);
+ if (ret)
+ return ret;
+ }
+
+ mutex_lock(&root->log_mutex);
+
+again:
+ if (root->log_root) {
+ int index = (root->log_transid + 1) % 2;
+
+ if (btrfs_need_log_full_commit(trans)) {
+ ret = BTRFS_LOG_FORCE_COMMIT;
+ goto out;
+ }
+
+ if (zoned && atomic_read(&root->log_commit[index])) {
+ wait_log_commit(root, root->log_transid - 1);
+ goto again;
+ }
+
+ if (!root->log_start_pid) {
+ clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
+ root->log_start_pid = current->pid;
+ } else if (root->log_start_pid != current->pid) {
+ set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
+ }
+ } else {
+ /*
+ * This means fs_info->log_root_tree was already created
+ * for some other FS trees. Do the full commit not to mix
+ * nodes from multiple log transactions to do sequential
+ * writing.
+ */
+ if (zoned && !created) {
+ ret = BTRFS_LOG_FORCE_COMMIT;
+ goto out;
+ }
+
+ ret = btrfs_add_log_tree(trans, root);
+ if (ret)
+ goto out;
+
+ set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
+ clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
+ root->log_start_pid = current->pid;
+ }
+
+ atomic_inc(&root->log_writers);
+ if (!ctx->logging_new_name) {
+ int index = root->log_transid % 2;
+ list_add_tail(&ctx->list, &root->log_ctxs[index]);
+ ctx->log_transid = root->log_transid;
+ }
+
+out:
+ mutex_unlock(&root->log_mutex);
+ return ret;
+}
+
+/*
+ * returns 0 if there was a log transaction running and we were able
+ * to join, or returns -ENOENT if there were not transactions
+ * in progress
+ */
+static int join_running_log_trans(struct btrfs_root *root)
+{
+ const bool zoned = btrfs_is_zoned(root->fs_info);
+ int ret = -ENOENT;
+
+ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
+ return ret;
+
+ mutex_lock(&root->log_mutex);
+again:
+ if (root->log_root) {
+ int index = (root->log_transid + 1) % 2;
+
+ ret = 0;
+ if (zoned && atomic_read(&root->log_commit[index])) {
+ wait_log_commit(root, root->log_transid - 1);
+ goto again;
+ }
+ atomic_inc(&root->log_writers);
+ }
+ mutex_unlock(&root->log_mutex);
+ return ret;
+}
+
+/*
+ * This either makes the current running log transaction wait
+ * until you call btrfs_end_log_trans() or it makes any future
+ * log transactions wait until you call btrfs_end_log_trans()
+ */
+void btrfs_pin_log_trans(struct btrfs_root *root)
+{
+ atomic_inc(&root->log_writers);
+}
+
+/*
+ * indicate we're done making changes to the log tree
+ * and wake up anyone waiting to do a sync
+ */
+void btrfs_end_log_trans(struct btrfs_root *root)
+{
+ if (atomic_dec_and_test(&root->log_writers)) {
+ /* atomic_dec_and_test implies a barrier */
+ cond_wake_up_nomb(&root->log_writer_wait);
+ }
+}
+
+static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
+{
+ filemap_fdatawait_range(buf->pages[0]->mapping,
+ buf->start, buf->start + buf->len - 1);
+}
+
+/*
+ * the walk control struct is used to pass state down the chain when
+ * processing the log tree. The stage field tells us which part
+ * of the log tree processing we are currently doing. The others
+ * are state fields used for that specific part
+ */
+struct walk_control {
+ /* should we free the extent on disk when done? This is used
+ * at transaction commit time while freeing a log tree
+ */
+ int free;
+
+ /* pin only walk, we record which extents on disk belong to the
+ * log trees
+ */
+ int pin;
+
+ /* what stage of the replay code we're currently in */
+ int stage;
+
+ /*
+ * Ignore any items from the inode currently being processed. Needs
+ * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
+ * the LOG_WALK_REPLAY_INODES stage.
+ */
+ bool ignore_cur_inode;
+
+ /* the root we are currently replaying */
+ struct btrfs_root *replay_dest;
+
+ /* the trans handle for the current replay */
+ struct btrfs_trans_handle *trans;
+
+ /* the function that gets used to process blocks we find in the
+ * tree. Note the extent_buffer might not be up to date when it is
+ * passed in, and it must be checked or read if you need the data
+ * inside it
+ */
+ int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
+ struct walk_control *wc, u64 gen, int level);
+};
+
+/*
+ * process_func used to pin down extents, write them or wait on them
+ */
+static int process_one_buffer(struct btrfs_root *log,
+ struct extent_buffer *eb,
+ struct walk_control *wc, u64 gen, int level)
+{
+ struct btrfs_fs_info *fs_info = log->fs_info;
+ int ret = 0;
+
+ /*
+ * If this fs is mixed then we need to be able to process the leaves to
+ * pin down any logged extents, so we have to read the block.
+ */
+ if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
+ ret = btrfs_read_extent_buffer(eb, gen, level, NULL);
+ if (ret)
+ return ret;
+ }
+
+ if (wc->pin) {
+ ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
+ eb->len);
+ if (ret)
+ return ret;
+
+ if (btrfs_buffer_uptodate(eb, gen, 0) &&
+ btrfs_header_level(eb) == 0)
+ ret = btrfs_exclude_logged_extents(eb);
+ }
+ return ret;
+}
+
+static int do_overwrite_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
+{
+ int ret;
+ u32 item_size;
+ u64 saved_i_size = 0;
+ int save_old_i_size = 0;
+ unsigned long src_ptr;
+ unsigned long dst_ptr;
+ int overwrite_root = 0;
+ bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
+
+ if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+ overwrite_root = 1;
+
+ item_size = btrfs_item_size(eb, slot);
+ src_ptr = btrfs_item_ptr_offset(eb, slot);
+
+ /* Our caller must have done a search for the key for us. */
+ ASSERT(path->nodes[0] != NULL);
+
+ /*
+ * And the slot must point to the exact key or the slot where the key
+ * should be at (the first item with a key greater than 'key')
+ */
+ if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
+ struct btrfs_key found_key;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
+ ret = btrfs_comp_cpu_keys(&found_key, key);
+ ASSERT(ret >= 0);
+ } else {
+ ret = 1;
+ }
+
+ if (ret == 0) {
+ char *src_copy;
+ char *dst_copy;
+ u32 dst_size = btrfs_item_size(path->nodes[0],
+ path->slots[0]);
+ if (dst_size != item_size)
+ goto insert;
+
+ if (item_size == 0) {
+ btrfs_release_path(path);
+ return 0;
+ }
+ dst_copy = kmalloc(item_size, GFP_NOFS);
+ src_copy = kmalloc(item_size, GFP_NOFS);
+ if (!dst_copy || !src_copy) {
+ btrfs_release_path(path);
+ kfree(dst_copy);
+ kfree(src_copy);
+ return -ENOMEM;
+ }
+
+ read_extent_buffer(eb, src_copy, src_ptr, item_size);
+
+ dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+ read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
+ item_size);
+ ret = memcmp(dst_copy, src_copy, item_size);
+
+ kfree(dst_copy);
+ kfree(src_copy);
+ /*
+ * they have the same contents, just return, this saves
+ * us from cowing blocks in the destination tree and doing
+ * extra writes that may not have been done by a previous
+ * sync
+ */
+ if (ret == 0) {
+ btrfs_release_path(path);
+ return 0;
+ }
+
+ /*
+ * We need to load the old nbytes into the inode so when we
+ * replay the extents we've logged we get the right nbytes.
+ */
+ if (inode_item) {
+ struct btrfs_inode_item *item;
+ u64 nbytes;
+ u32 mode;
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_item);
+ nbytes = btrfs_inode_nbytes(path->nodes[0], item);
+ item = btrfs_item_ptr(eb, slot,
+ struct btrfs_inode_item);
+ btrfs_set_inode_nbytes(eb, item, nbytes);
+
+ /*
+ * If this is a directory we need to reset the i_size to
+ * 0 so that we can set it up properly when replaying
+ * the rest of the items in this log.
+ */
+ mode = btrfs_inode_mode(eb, item);
+ if (S_ISDIR(mode))
+ btrfs_set_inode_size(eb, item, 0);
+ }
+ } else if (inode_item) {
+ struct btrfs_inode_item *item;
+ u32 mode;
+
+ /*
+ * New inode, set nbytes to 0 so that the nbytes comes out
+ * properly when we replay the extents.
+ */
+ item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
+ btrfs_set_inode_nbytes(eb, item, 0);
+
+ /*
+ * If this is a directory we need to reset the i_size to 0 so
+ * that we can set it up properly when replaying the rest of
+ * the items in this log.
+ */
+ mode = btrfs_inode_mode(eb, item);
+ if (S_ISDIR(mode))
+ btrfs_set_inode_size(eb, item, 0);
+ }
+insert:
+ btrfs_release_path(path);
+ /* try to insert the key into the destination tree */
+ path->skip_release_on_error = 1;
+ ret = btrfs_insert_empty_item(trans, root, path,
+ key, item_size);
+ path->skip_release_on_error = 0;
+
+ /* make sure any existing item is the correct size */
+ if (ret == -EEXIST || ret == -EOVERFLOW) {
+ u32 found_size;
+ found_size = btrfs_item_size(path->nodes[0],
+ path->slots[0]);
+ if (found_size > item_size)
+ btrfs_truncate_item(path, item_size, 1);
+ else if (found_size < item_size)
+ btrfs_extend_item(path, item_size - found_size);
+ } else if (ret) {
+ return ret;
+ }
+ dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
+ path->slots[0]);
+
+ /* don't overwrite an existing inode if the generation number
+ * was logged as zero. This is done when the tree logging code
+ * is just logging an inode to make sure it exists after recovery.
+ *
+ * Also, don't overwrite i_size on directories during replay.
+ * log replay inserts and removes directory items based on the
+ * state of the tree found in the subvolume, and i_size is modified
+ * as it goes
+ */
+ if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
+ struct btrfs_inode_item *src_item;
+ struct btrfs_inode_item *dst_item;
+
+ src_item = (struct btrfs_inode_item *)src_ptr;
+ dst_item = (struct btrfs_inode_item *)dst_ptr;
+
+ if (btrfs_inode_generation(eb, src_item) == 0) {
+ struct extent_buffer *dst_eb = path->nodes[0];
+ const u64 ino_size = btrfs_inode_size(eb, src_item);
+
+ /*
+ * For regular files an ino_size == 0 is used only when
+ * logging that an inode exists, as part of a directory
+ * fsync, and the inode wasn't fsynced before. In this
+ * case don't set the size of the inode in the fs/subvol
+ * tree, otherwise we would be throwing valid data away.
+ */
+ if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
+ S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
+ ino_size != 0)
+ btrfs_set_inode_size(dst_eb, dst_item, ino_size);
+ goto no_copy;
+ }
+
+ if (overwrite_root &&
+ S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
+ S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
+ save_old_i_size = 1;
+ saved_i_size = btrfs_inode_size(path->nodes[0],
+ dst_item);
+ }
+ }
+
+ copy_extent_buffer(path->nodes[0], eb, dst_ptr,
+ src_ptr, item_size);
+
+ if (save_old_i_size) {
+ struct btrfs_inode_item *dst_item;
+ dst_item = (struct btrfs_inode_item *)dst_ptr;
+ btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
+ }
+
+ /* make sure the generation is filled in */
+ if (key->type == BTRFS_INODE_ITEM_KEY) {
+ struct btrfs_inode_item *dst_item;
+ dst_item = (struct btrfs_inode_item *)dst_ptr;
+ if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
+ btrfs_set_inode_generation(path->nodes[0], dst_item,
+ trans->transid);
+ }
+ }
+no_copy:
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_release_path(path);
+ return 0;
+}
+
+/*
+ * Item overwrite used by replay and tree logging. eb, slot and key all refer
+ * to the src data we are copying out.
+ *
+ * root is the tree we are copying into, and path is a scratch
+ * path for use in this function (it should be released on entry and
+ * will be released on exit).
+ *
+ * If the key is already in the destination tree the existing item is
+ * overwritten. If the existing item isn't big enough, it is extended.
+ * If it is too large, it is truncated.
+ *
+ * If the key isn't in the destination yet, a new item is inserted.
+ */
+static int overwrite_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
+{
+ int ret;
+
+ /* Look for the key in the destination tree. */
+ ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ return do_overwrite_item(trans, root, path, eb, slot, key);
+}
+
+static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
+ struct fscrypt_str *name)
+{
+ char *buf;
+
+ buf = kmalloc(len, GFP_NOFS);
+ if (!buf)
+ return -ENOMEM;
+
+ read_extent_buffer(eb, buf, (unsigned long)start, len);
+ name->name = buf;
+ name->len = len;
+ return 0;
+}
+
+/*
+ * simple helper to read an inode off the disk from a given root
+ * This can only be called for subvolume roots and not for the log
+ */
+static noinline struct inode *read_one_inode(struct btrfs_root *root,
+ u64 objectid)
+{
+ struct inode *inode;
+
+ inode = btrfs_iget(root->fs_info->sb, objectid, root);
+ if (IS_ERR(inode))
+ inode = NULL;
+ return inode;
+}
+
+/* replays a single extent in 'eb' at 'slot' with 'key' into the
+ * subvolume 'root'. path is released on entry and should be released
+ * on exit.
+ *
+ * extents in the log tree have not been allocated out of the extent
+ * tree yet. So, this completes the allocation, taking a reference
+ * as required if the extent already exists or creating a new extent
+ * if it isn't in the extent allocation tree yet.
+ *
+ * The extent is inserted into the file, dropping any existing extents
+ * from the file that overlap the new one.
+ */
+static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
+{
+ struct btrfs_drop_extents_args drop_args = { 0 };
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int found_type;
+ u64 extent_end;
+ u64 start = key->offset;
+ u64 nbytes = 0;
+ struct btrfs_file_extent_item *item;
+ struct inode *inode = NULL;
+ unsigned long size;
+ int ret = 0;
+
+ item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+ found_type = btrfs_file_extent_type(eb, item);
+
+ if (found_type == BTRFS_FILE_EXTENT_REG ||
+ found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ nbytes = btrfs_file_extent_num_bytes(eb, item);
+ extent_end = start + nbytes;
+
+ /*
+ * We don't add to the inodes nbytes if we are prealloc or a
+ * hole.
+ */
+ if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
+ nbytes = 0;
+ } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ size = btrfs_file_extent_ram_bytes(eb, item);
+ nbytes = btrfs_file_extent_ram_bytes(eb, item);
+ extent_end = ALIGN(start + size,
+ fs_info->sectorsize);
+ } else {
+ ret = 0;
+ goto out;
+ }
+
+ inode = read_one_inode(root, key->objectid);
+ if (!inode) {
+ ret = -EIO;
+ goto out;
+ }
+
+ /*
+ * first check to see if we already have this extent in the
+ * file. This must be done before the btrfs_drop_extents run
+ * so we don't try to drop this extent.
+ */
+ ret = btrfs_lookup_file_extent(trans, root, path,
+ btrfs_ino(BTRFS_I(inode)), start, 0);
+
+ if (ret == 0 &&
+ (found_type == BTRFS_FILE_EXTENT_REG ||
+ found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
+ struct btrfs_file_extent_item cmp1;
+ struct btrfs_file_extent_item cmp2;
+ struct btrfs_file_extent_item *existing;
+ struct extent_buffer *leaf;
+
+ leaf = path->nodes[0];
+ existing = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ read_extent_buffer(eb, &cmp1, (unsigned long)item,
+ sizeof(cmp1));
+ read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
+ sizeof(cmp2));
+
+ /*
+ * we already have a pointer to this exact extent,
+ * we don't have to do anything
+ */
+ if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
+ btrfs_release_path(path);
+ goto out;
+ }
+ }
+ btrfs_release_path(path);
+
+ /* drop any overlapping extents */
+ drop_args.start = start;
+ drop_args.end = extent_end;
+ drop_args.drop_cache = true;
+ ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
+ if (ret)
+ goto out;
+
+ if (found_type == BTRFS_FILE_EXTENT_REG ||
+ found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ u64 offset;
+ unsigned long dest_offset;
+ struct btrfs_key ins;
+
+ if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
+ btrfs_fs_incompat(fs_info, NO_HOLES))
+ goto update_inode;
+
+ ret = btrfs_insert_empty_item(trans, root, path, key,
+ sizeof(*item));
+ if (ret)
+ goto out;
+ dest_offset = btrfs_item_ptr_offset(path->nodes[0],
+ path->slots[0]);
+ copy_extent_buffer(path->nodes[0], eb, dest_offset,
+ (unsigned long)item, sizeof(*item));
+
+ ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
+ ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
+ ins.type = BTRFS_EXTENT_ITEM_KEY;
+ offset = key->offset - btrfs_file_extent_offset(eb, item);
+
+ /*
+ * Manually record dirty extent, as here we did a shallow
+ * file extent item copy and skip normal backref update,
+ * but modifying extent tree all by ourselves.
+ * So need to manually record dirty extent for qgroup,
+ * as the owner of the file extent changed from log tree
+ * (doesn't affect qgroup) to fs/file tree(affects qgroup)
+ */
+ ret = btrfs_qgroup_trace_extent(trans,
+ btrfs_file_extent_disk_bytenr(eb, item),
+ btrfs_file_extent_disk_num_bytes(eb, item),
+ GFP_NOFS);
+ if (ret < 0)
+ goto out;
+
+ if (ins.objectid > 0) {
+ struct btrfs_ref ref = { 0 };
+ u64 csum_start;
+ u64 csum_end;
+ LIST_HEAD(ordered_sums);
+
+ /*
+ * is this extent already allocated in the extent
+ * allocation tree? If so, just add a reference
+ */
+ ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
+ ins.offset);
+ if (ret < 0) {
+ goto out;
+ } else if (ret == 0) {
+ btrfs_init_generic_ref(&ref,
+ BTRFS_ADD_DELAYED_REF,
+ ins.objectid, ins.offset, 0);
+ btrfs_init_data_ref(&ref,
+ root->root_key.objectid,
+ key->objectid, offset, 0, false);
+ ret = btrfs_inc_extent_ref(trans, &ref);
+ if (ret)
+ goto out;
+ } else {
+ /*
+ * insert the extent pointer in the extent
+ * allocation tree
+ */
+ ret = btrfs_alloc_logged_file_extent(trans,
+ root->root_key.objectid,
+ key->objectid, offset, &ins);
+ if (ret)
+ goto out;
+ }
+ btrfs_release_path(path);
+
+ if (btrfs_file_extent_compression(eb, item)) {
+ csum_start = ins.objectid;
+ csum_end = csum_start + ins.offset;
+ } else {
+ csum_start = ins.objectid +
+ btrfs_file_extent_offset(eb, item);
+ csum_end = csum_start +
+ btrfs_file_extent_num_bytes(eb, item);
+ }
+
+ ret = btrfs_lookup_csums_range(root->log_root,
+ csum_start, csum_end - 1,
+ &ordered_sums, 0, false);
+ if (ret)
+ goto out;
+ /*
+ * Now delete all existing cums in the csum root that
+ * cover our range. We do this because we can have an
+ * extent that is completely referenced by one file
+ * extent item and partially referenced by another
+ * file extent item (like after using the clone or
+ * extent_same ioctls). In this case if we end up doing
+ * the replay of the one that partially references the
+ * extent first, and we do not do the csum deletion
+ * below, we can get 2 csum items in the csum tree that
+ * overlap each other. For example, imagine our log has
+ * the two following file extent items:
+ *
+ * key (257 EXTENT_DATA 409600)
+ * extent data disk byte 12845056 nr 102400
+ * extent data offset 20480 nr 20480 ram 102400
+ *
+ * key (257 EXTENT_DATA 819200)
+ * extent data disk byte 12845056 nr 102400
+ * extent data offset 0 nr 102400 ram 102400
+ *
+ * Where the second one fully references the 100K extent
+ * that starts at disk byte 12845056, and the log tree
+ * has a single csum item that covers the entire range
+ * of the extent:
+ *
+ * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
+ *
+ * After the first file extent item is replayed, the
+ * csum tree gets the following csum item:
+ *
+ * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
+ *
+ * Which covers the 20K sub-range starting at offset 20K
+ * of our extent. Now when we replay the second file
+ * extent item, if we do not delete existing csum items
+ * that cover any of its blocks, we end up getting two
+ * csum items in our csum tree that overlap each other:
+ *
+ * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
+ * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
+ *
+ * Which is a problem, because after this anyone trying
+ * to lookup up for the checksum of any block of our
+ * extent starting at an offset of 40K or higher, will
+ * end up looking at the second csum item only, which
+ * does not contain the checksum for any block starting
+ * at offset 40K or higher of our extent.
+ */
+ while (!list_empty(&ordered_sums)) {
+ struct btrfs_ordered_sum *sums;
+ struct btrfs_root *csum_root;
+
+ sums = list_entry(ordered_sums.next,
+ struct btrfs_ordered_sum,
+ list);
+ csum_root = btrfs_csum_root(fs_info,
+ sums->bytenr);
+ if (!ret)
+ ret = btrfs_del_csums(trans, csum_root,
+ sums->bytenr,
+ sums->len);
+ if (!ret)
+ ret = btrfs_csum_file_blocks(trans,
+ csum_root,
+ sums);
+ list_del(&sums->list);
+ kfree(sums);
+ }
+ if (ret)
+ goto out;
+ } else {
+ btrfs_release_path(path);
+ }
+ } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ /* inline extents are easy, we just overwrite them */
+ ret = overwrite_item(trans, root, path, eb, slot, key);
+ if (ret)
+ goto out;
+ }
+
+ ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
+ extent_end - start);
+ if (ret)
+ goto out;
+
+update_inode:
+ btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+out:
+ iput(inode);
+ return ret;
+}
+
+static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *dir,
+ struct btrfs_inode *inode,
+ const struct fscrypt_str *name)
+{
+ int ret;
+
+ ret = btrfs_unlink_inode(trans, dir, inode, name);
+ if (ret)
+ return ret;
+ /*
+ * Whenever we need to check if a name exists or not, we check the
+ * fs/subvolume tree. So after an unlink we must run delayed items, so
+ * that future checks for a name during log replay see that the name
+ * does not exists anymore.
+ */
+ return btrfs_run_delayed_items(trans);
+}
+
+/*
+ * when cleaning up conflicts between the directory names in the
+ * subvolume, directory names in the log and directory names in the
+ * inode back references, we may have to unlink inodes from directories.
+ *
+ * This is a helper function to do the unlink of a specific directory
+ * item
+ */
+static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_inode *dir,
+ struct btrfs_dir_item *di)
+{
+ struct btrfs_root *root = dir->root;
+ struct inode *inode;
+ struct fscrypt_str name;
+ struct extent_buffer *leaf;
+ struct btrfs_key location;
+ int ret;
+
+ leaf = path->nodes[0];
+
+ btrfs_dir_item_key_to_cpu(leaf, di, &location);
+ ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name);
+ if (ret)
+ return -ENOMEM;
+
+ btrfs_release_path(path);
+
+ inode = read_one_inode(root, location.objectid);
+ if (!inode) {
+ ret = -EIO;
+ goto out;
+ }
+
+ ret = link_to_fixup_dir(trans, root, path, location.objectid);
+ if (ret)
+ goto out;
+
+ ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), &name);
+out:
+ kfree(name.name);
+ iput(inode);
+ return ret;
+}
+
+/*
+ * See if a given name and sequence number found in an inode back reference are
+ * already in a directory and correctly point to this inode.
+ *
+ * Returns: < 0 on error, 0 if the directory entry does not exists and 1 if it
+ * exists.
+ */
+static noinline int inode_in_dir(struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 dirid, u64 objectid, u64 index,
+ struct fscrypt_str *name)
+{
+ struct btrfs_dir_item *di;
+ struct btrfs_key location;
+ int ret = 0;
+
+ di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
+ index, name, 0);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ } else if (di) {
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
+ if (location.objectid != objectid)
+ goto out;
+ } else {
+ goto out;
+ }
+
+ btrfs_release_path(path);
+ di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, 0);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ } else if (di) {
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
+ if (location.objectid == objectid)
+ ret = 1;
+ }
+out:
+ btrfs_release_path(path);
+ return ret;
+}
+
+/*
+ * helper function to check a log tree for a named back reference in
+ * an inode. This is used to decide if a back reference that is
+ * found in the subvolume conflicts with what we find in the log.
+ *
+ * inode backreferences may have multiple refs in a single item,
+ * during replay we process one reference at a time, and we don't
+ * want to delete valid links to a file from the subvolume if that
+ * link is also in the log.
+ */
+static noinline int backref_in_log(struct btrfs_root *log,
+ struct btrfs_key *key,
+ u64 ref_objectid,
+ const struct fscrypt_str *name)
+{
+ struct btrfs_path *path;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
+ if (ret < 0) {
+ goto out;
+ } else if (ret == 1) {
+ ret = 0;
+ goto out;
+ }
+
+ if (key->type == BTRFS_INODE_EXTREF_KEY)
+ ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
+ path->slots[0],
+ ref_objectid, name);
+ else
+ ret = !!btrfs_find_name_in_backref(path->nodes[0],
+ path->slots[0], name);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_root *log_root,
+ struct btrfs_inode *dir,
+ struct btrfs_inode *inode,
+ u64 inode_objectid, u64 parent_objectid,
+ u64 ref_index, struct fscrypt_str *name)
+{
+ int ret;
+ struct extent_buffer *leaf;
+ struct btrfs_dir_item *di;
+ struct btrfs_key search_key;
+ struct btrfs_inode_extref *extref;
+
+again:
+ /* Search old style refs */
+ search_key.objectid = inode_objectid;
+ search_key.type = BTRFS_INODE_REF_KEY;
+ search_key.offset = parent_objectid;
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ if (ret == 0) {
+ struct btrfs_inode_ref *victim_ref;
+ unsigned long ptr;
+ unsigned long ptr_end;
+
+ leaf = path->nodes[0];
+
+ /* are we trying to overwrite a back ref for the root directory
+ * if so, just jump out, we're done
+ */
+ if (search_key.objectid == search_key.offset)
+ return 1;
+
+ /* check all the names in this back reference to see
+ * if they are in the log. if so, we allow them to stay
+ * otherwise they must be unlinked as a conflict
+ */
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
+ while (ptr < ptr_end) {
+ struct fscrypt_str victim_name;
+
+ victim_ref = (struct btrfs_inode_ref *)ptr;
+ ret = read_alloc_one_name(leaf, (victim_ref + 1),
+ btrfs_inode_ref_name_len(leaf, victim_ref),
+ &victim_name);
+ if (ret)
+ return ret;
+
+ ret = backref_in_log(log_root, &search_key,
+ parent_objectid, &victim_name);
+ if (ret < 0) {
+ kfree(victim_name.name);
+ return ret;
+ } else if (!ret) {
+ inc_nlink(&inode->vfs_inode);
+ btrfs_release_path(path);
+
+ ret = unlink_inode_for_log_replay(trans, dir, inode,
+ &victim_name);
+ kfree(victim_name.name);
+ if (ret)
+ return ret;
+ goto again;
+ }
+ kfree(victim_name.name);
+
+ ptr = (unsigned long)(victim_ref + 1) + victim_name.len;
+ }
+ }
+ btrfs_release_path(path);
+
+ /* Same search but for extended refs */
+ extref = btrfs_lookup_inode_extref(NULL, root, path, name,
+ inode_objectid, parent_objectid, 0,
+ 0);
+ if (IS_ERR(extref)) {
+ return PTR_ERR(extref);
+ } else if (extref) {
+ u32 item_size;
+ u32 cur_offset = 0;
+ unsigned long base;
+ struct inode *victim_parent;
+
+ leaf = path->nodes[0];
+
+ item_size = btrfs_item_size(leaf, path->slots[0]);
+ base = btrfs_item_ptr_offset(leaf, path->slots[0]);
+
+ while (cur_offset < item_size) {
+ struct fscrypt_str victim_name;
+
+ extref = (struct btrfs_inode_extref *)(base + cur_offset);
+
+ if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
+ goto next;
+
+ ret = read_alloc_one_name(leaf, &extref->name,
+ btrfs_inode_extref_name_len(leaf, extref),
+ &victim_name);
+ if (ret)
+ return ret;
+
+ search_key.objectid = inode_objectid;
+ search_key.type = BTRFS_INODE_EXTREF_KEY;
+ search_key.offset = btrfs_extref_hash(parent_objectid,
+ victim_name.name,
+ victim_name.len);
+ ret = backref_in_log(log_root, &search_key,
+ parent_objectid, &victim_name);
+ if (ret < 0) {
+ kfree(victim_name.name);
+ return ret;
+ } else if (!ret) {
+ ret = -ENOENT;
+ victim_parent = read_one_inode(root,
+ parent_objectid);
+ if (victim_parent) {
+ inc_nlink(&inode->vfs_inode);
+ btrfs_release_path(path);
+
+ ret = unlink_inode_for_log_replay(trans,
+ BTRFS_I(victim_parent),
+ inode, &victim_name);
+ }
+ iput(victim_parent);
+ kfree(victim_name.name);
+ if (ret)
+ return ret;
+ goto again;
+ }
+ kfree(victim_name.name);
+next:
+ cur_offset += victim_name.len + sizeof(*extref);
+ }
+ }
+ btrfs_release_path(path);
+
+ /* look for a conflicting sequence number */
+ di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
+ ref_index, name, 0);
+ if (IS_ERR(di)) {
+ return PTR_ERR(di);
+ } else if (di) {
+ ret = drop_one_dir_item(trans, path, dir, di);
+ if (ret)
+ return ret;
+ }
+ btrfs_release_path(path);
+
+ /* look for a conflicting name */
+ di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, 0);
+ if (IS_ERR(di)) {
+ return PTR_ERR(di);
+ } else if (di) {
+ ret = drop_one_dir_item(trans, path, dir, di);
+ if (ret)
+ return ret;
+ }
+ btrfs_release_path(path);
+
+ return 0;
+}
+
+static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
+ struct fscrypt_str *name, u64 *index,
+ u64 *parent_objectid)
+{
+ struct btrfs_inode_extref *extref;
+ int ret;
+
+ extref = (struct btrfs_inode_extref *)ref_ptr;
+
+ ret = read_alloc_one_name(eb, &extref->name,
+ btrfs_inode_extref_name_len(eb, extref), name);
+ if (ret)
+ return ret;
+
+ if (index)
+ *index = btrfs_inode_extref_index(eb, extref);
+ if (parent_objectid)
+ *parent_objectid = btrfs_inode_extref_parent(eb, extref);
+
+ return 0;
+}
+
+static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
+ struct fscrypt_str *name, u64 *index)
+{
+ struct btrfs_inode_ref *ref;
+ int ret;
+
+ ref = (struct btrfs_inode_ref *)ref_ptr;
+
+ ret = read_alloc_one_name(eb, ref + 1, btrfs_inode_ref_name_len(eb, ref),
+ name);
+ if (ret)
+ return ret;
+
+ if (index)
+ *index = btrfs_inode_ref_index(eb, ref);
+
+ return 0;
+}
+
+/*
+ * Take an inode reference item from the log tree and iterate all names from the
+ * inode reference item in the subvolume tree with the same key (if it exists).
+ * For any name that is not in the inode reference item from the log tree, do a
+ * proper unlink of that name (that is, remove its entry from the inode
+ * reference item and both dir index keys).
+ */
+static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_inode *inode,
+ struct extent_buffer *log_eb,
+ int log_slot,
+ struct btrfs_key *key)
+{
+ int ret;
+ unsigned long ref_ptr;
+ unsigned long ref_end;
+ struct extent_buffer *eb;
+
+again:
+ btrfs_release_path(path);
+ ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ if (ret > 0) {
+ ret = 0;
+ goto out;
+ }
+ if (ret < 0)
+ goto out;
+
+ eb = path->nodes[0];
+ ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
+ ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]);
+ while (ref_ptr < ref_end) {
+ struct fscrypt_str name;
+ u64 parent_id;
+
+ if (key->type == BTRFS_INODE_EXTREF_KEY) {
+ ret = extref_get_fields(eb, ref_ptr, &name,
+ NULL, &parent_id);
+ } else {
+ parent_id = key->offset;
+ ret = ref_get_fields(eb, ref_ptr, &name, NULL);
+ }
+ if (ret)
+ goto out;
+
+ if (key->type == BTRFS_INODE_EXTREF_KEY)
+ ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
+ parent_id, &name);
+ else
+ ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name);
+
+ if (!ret) {
+ struct inode *dir;
+
+ btrfs_release_path(path);
+ dir = read_one_inode(root, parent_id);
+ if (!dir) {
+ ret = -ENOENT;
+ kfree(name.name);
+ goto out;
+ }
+ ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir),
+ inode, &name);
+ kfree(name.name);
+ iput(dir);
+ if (ret)
+ goto out;
+ goto again;
+ }
+
+ kfree(name.name);
+ ref_ptr += name.len;
+ if (key->type == BTRFS_INODE_EXTREF_KEY)
+ ref_ptr += sizeof(struct btrfs_inode_extref);
+ else
+ ref_ptr += sizeof(struct btrfs_inode_ref);
+ }
+ ret = 0;
+ out:
+ btrfs_release_path(path);
+ return ret;
+}
+
+/*
+ * replay one inode back reference item found in the log tree.
+ * eb, slot and key refer to the buffer and key found in the log tree.
+ * root is the destination we are replaying into, and path is for temp
+ * use by this function. (it should be released on return).
+ */
+static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
+{
+ struct inode *dir = NULL;
+ struct inode *inode = NULL;
+ unsigned long ref_ptr;
+ unsigned long ref_end;
+ struct fscrypt_str name;
+ int ret;
+ int log_ref_ver = 0;
+ u64 parent_objectid;
+ u64 inode_objectid;
+ u64 ref_index = 0;
+ int ref_struct_size;
+
+ ref_ptr = btrfs_item_ptr_offset(eb, slot);
+ ref_end = ref_ptr + btrfs_item_size(eb, slot);
+
+ if (key->type == BTRFS_INODE_EXTREF_KEY) {
+ struct btrfs_inode_extref *r;
+
+ ref_struct_size = sizeof(struct btrfs_inode_extref);
+ log_ref_ver = 1;
+ r = (struct btrfs_inode_extref *)ref_ptr;
+ parent_objectid = btrfs_inode_extref_parent(eb, r);
+ } else {
+ ref_struct_size = sizeof(struct btrfs_inode_ref);
+ parent_objectid = key->offset;
+ }
+ inode_objectid = key->objectid;
+
+ /*
+ * it is possible that we didn't log all the parent directories
+ * for a given inode. If we don't find the dir, just don't
+ * copy the back ref in. The link count fixup code will take
+ * care of the rest
+ */
+ dir = read_one_inode(root, parent_objectid);
+ if (!dir) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ inode = read_one_inode(root, inode_objectid);
+ if (!inode) {
+ ret = -EIO;
+ goto out;
+ }
+
+ while (ref_ptr < ref_end) {
+ if (log_ref_ver) {
+ ret = extref_get_fields(eb, ref_ptr, &name,
+ &ref_index, &parent_objectid);
+ /*
+ * parent object can change from one array
+ * item to another.
+ */
+ if (!dir)
+ dir = read_one_inode(root, parent_objectid);
+ if (!dir) {
+ ret = -ENOENT;
+ goto out;
+ }
+ } else {
+ ret = ref_get_fields(eb, ref_ptr, &name, &ref_index);
+ }
+ if (ret)
+ goto out;
+
+ ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
+ btrfs_ino(BTRFS_I(inode)), ref_index, &name);
+ if (ret < 0) {
+ goto out;
+ } else if (ret == 0) {
+ /*
+ * look for a conflicting back reference in the
+ * metadata. if we find one we have to unlink that name
+ * of the file before we add our new link. Later on, we
+ * overwrite any existing back reference, and we don't
+ * want to create dangling pointers in the directory.
+ */
+ ret = __add_inode_ref(trans, root, path, log,
+ BTRFS_I(dir), BTRFS_I(inode),
+ inode_objectid, parent_objectid,
+ ref_index, &name);
+ if (ret) {
+ if (ret == 1)
+ ret = 0;
+ goto out;
+ }
+
+ /* insert our name */
+ ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
+ &name, 0, ref_index);
+ if (ret)
+ goto out;
+
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ if (ret)
+ goto out;
+ }
+ /* Else, ret == 1, we already have a perfect match, we're done. */
+
+ ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len;
+ kfree(name.name);
+ name.name = NULL;
+ if (log_ref_ver) {
+ iput(dir);
+ dir = NULL;
+ }
+ }
+
+ /*
+ * Before we overwrite the inode reference item in the subvolume tree
+ * with the item from the log tree, we must unlink all names from the
+ * parent directory that are in the subvolume's tree inode reference
+ * item, otherwise we end up with an inconsistent subvolume tree where
+ * dir index entries exist for a name but there is no inode reference
+ * item with the same name.
+ */
+ ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
+ key);
+ if (ret)
+ goto out;
+
+ /* finally write the back reference in the inode */
+ ret = overwrite_item(trans, root, path, eb, slot, key);
+out:
+ btrfs_release_path(path);
+ kfree(name.name);
+ iput(dir);
+ iput(inode);
+ return ret;
+}
+
+static int count_inode_extrefs(struct btrfs_root *root,
+ struct btrfs_inode *inode, struct btrfs_path *path)
+{
+ int ret = 0;
+ int name_len;
+ unsigned int nlink = 0;
+ u32 item_size;
+ u32 cur_offset = 0;
+ u64 inode_objectid = btrfs_ino(inode);
+ u64 offset = 0;
+ unsigned long ptr;
+ struct btrfs_inode_extref *extref;
+ struct extent_buffer *leaf;
+
+ while (1) {
+ ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
+ &extref, &offset);
+ if (ret)
+ break;
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size(leaf, path->slots[0]);
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ cur_offset = 0;
+
+ while (cur_offset < item_size) {
+ extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
+ name_len = btrfs_inode_extref_name_len(leaf, extref);
+
+ nlink++;
+
+ cur_offset += name_len + sizeof(*extref);
+ }
+
+ offset++;
+ btrfs_release_path(path);
+ }
+ btrfs_release_path(path);
+
+ if (ret < 0 && ret != -ENOENT)
+ return ret;
+ return nlink;
+}
+
+static int count_inode_refs(struct btrfs_root *root,
+ struct btrfs_inode *inode, struct btrfs_path *path)
+{
+ int ret;
+ struct btrfs_key key;
+ unsigned int nlink = 0;
+ unsigned long ptr;
+ unsigned long ptr_end;
+ int name_len;
+ u64 ino = btrfs_ino(inode);
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = (u64)-1;
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ break;
+ if (ret > 0) {
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+ }
+process_slot:
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0]);
+ if (key.objectid != ino ||
+ key.type != BTRFS_INODE_REF_KEY)
+ break;
+ ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+ ptr_end = ptr + btrfs_item_size(path->nodes[0],
+ path->slots[0]);
+ while (ptr < ptr_end) {
+ struct btrfs_inode_ref *ref;
+
+ ref = (struct btrfs_inode_ref *)ptr;
+ name_len = btrfs_inode_ref_name_len(path->nodes[0],
+ ref);
+ ptr = (unsigned long)(ref + 1) + name_len;
+ nlink++;
+ }
+
+ if (key.offset == 0)
+ break;
+ if (path->slots[0] > 0) {
+ path->slots[0]--;
+ goto process_slot;
+ }
+ key.offset--;
+ btrfs_release_path(path);
+ }
+ btrfs_release_path(path);
+
+ return nlink;
+}
+
+/*
+ * There are a few corners where the link count of the file can't
+ * be properly maintained during replay. So, instead of adding
+ * lots of complexity to the log code, we just scan the backrefs
+ * for any file that has been through replay.
+ *
+ * The scan will update the link count on the inode to reflect the
+ * number of back refs found. If it goes down to zero, the iput
+ * will free the inode.
+ */
+static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode)
+{
+ struct btrfs_path *path;
+ int ret;
+ u64 nlink = 0;
+ u64 ino = btrfs_ino(BTRFS_I(inode));
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = count_inode_refs(root, BTRFS_I(inode), path);
+ if (ret < 0)
+ goto out;
+
+ nlink = ret;
+
+ ret = count_inode_extrefs(root, BTRFS_I(inode), path);
+ if (ret < 0)
+ goto out;
+
+ nlink += ret;
+
+ ret = 0;
+
+ if (nlink != inode->i_nlink) {
+ set_nlink(inode, nlink);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ if (ret)
+ goto out;
+ }
+ BTRFS_I(inode)->index_cnt = (u64)-1;
+
+ if (inode->i_nlink == 0) {
+ if (S_ISDIR(inode->i_mode)) {
+ ret = replay_dir_deletes(trans, root, NULL, path,
+ ino, 1);
+ if (ret)
+ goto out;
+ }
+ ret = btrfs_insert_orphan_item(trans, root, ino);
+ if (ret == -EEXIST)
+ ret = 0;
+ }
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path)
+{
+ int ret;
+ struct btrfs_key key;
+ struct inode *inode;
+
+ key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
+ key.offset = (u64)-1;
+ while (1) {
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ break;
+
+ if (ret == 1) {
+ ret = 0;
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
+ key.type != BTRFS_ORPHAN_ITEM_KEY)
+ break;
+
+ ret = btrfs_del_item(trans, root, path);
+ if (ret)
+ break;
+
+ btrfs_release_path(path);
+ inode = read_one_inode(root, key.offset);
+ if (!inode) {
+ ret = -EIO;
+ break;
+ }
+
+ ret = fixup_inode_link_count(trans, root, inode);
+ iput(inode);
+ if (ret)
+ break;
+
+ /*
+ * fixup on a directory may create new entries,
+ * make sure we always look for the highset possible
+ * offset
+ */
+ key.offset = (u64)-1;
+ }
+ btrfs_release_path(path);
+ return ret;
+}
+
+
+/*
+ * record a given inode in the fixup dir so we can check its link
+ * count when replay is done. The link count is incremented here
+ * so the inode won't go away until we check it
+ */
+static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 objectid)
+{
+ struct btrfs_key key;
+ int ret = 0;
+ struct inode *inode;
+
+ inode = read_one_inode(root, objectid);
+ if (!inode)
+ return -EIO;
+
+ key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
+ key.offset = objectid;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+
+ btrfs_release_path(path);
+ if (ret == 0) {
+ if (!inode->i_nlink)
+ set_nlink(inode, 1);
+ else
+ inc_nlink(inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ } else if (ret == -EEXIST) {
+ ret = 0;
+ }
+ iput(inode);
+
+ return ret;
+}
+
+/*
+ * when replaying the log for a directory, we only insert names
+ * for inodes that actually exist. This means an fsync on a directory
+ * does not implicitly fsync all the new files in it
+ */
+static noinline int insert_one_name(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 dirid, u64 index,
+ const struct fscrypt_str *name,
+ struct btrfs_key *location)
+{
+ struct inode *inode;
+ struct inode *dir;
+ int ret;
+
+ inode = read_one_inode(root, location->objectid);
+ if (!inode)
+ return -ENOENT;
+
+ dir = read_one_inode(root, dirid);
+ if (!dir) {
+ iput(inode);
+ return -EIO;
+ }
+
+ ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
+ 1, index);
+
+ /* FIXME, put inode into FIXUP list */
+
+ iput(inode);
+ iput(dir);
+ return ret;
+}
+
+static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *dir,
+ struct btrfs_path *path,
+ struct btrfs_dir_item *dst_di,
+ const struct btrfs_key *log_key,
+ u8 log_type,
+ bool exists)
+{
+ struct btrfs_key found_key;
+
+ btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
+ /* The existing dentry points to the same inode, don't delete it. */
+ if (found_key.objectid == log_key->objectid &&
+ found_key.type == log_key->type &&
+ found_key.offset == log_key->offset &&
+ btrfs_dir_type(path->nodes[0], dst_di) == log_type)
+ return 1;
+
+ /*
+ * Don't drop the conflicting directory entry if the inode for the new
+ * entry doesn't exist.
+ */
+ if (!exists)
+ return 0;
+
+ return drop_one_dir_item(trans, path, dir, dst_di);
+}
+
+/*
+ * take a single entry in a log directory item and replay it into
+ * the subvolume.
+ *
+ * if a conflicting item exists in the subdirectory already,
+ * the inode it points to is unlinked and put into the link count
+ * fix up tree.
+ *
+ * If a name from the log points to a file or directory that does
+ * not exist in the FS, it is skipped. fsyncs on directories
+ * do not force down inodes inside that directory, just changes to the
+ * names or unlinks in a directory.
+ *
+ * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
+ * non-existing inode) and 1 if the name was replayed.
+ */
+static noinline int replay_one_name(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *eb,
+ struct btrfs_dir_item *di,
+ struct btrfs_key *key)
+{
+ struct fscrypt_str name;
+ struct btrfs_dir_item *dir_dst_di;
+ struct btrfs_dir_item *index_dst_di;
+ bool dir_dst_matches = false;
+ bool index_dst_matches = false;
+ struct btrfs_key log_key;
+ struct btrfs_key search_key;
+ struct inode *dir;
+ u8 log_type;
+ bool exists;
+ int ret;
+ bool update_size = true;
+ bool name_added = false;
+
+ dir = read_one_inode(root, key->objectid);
+ if (!dir)
+ return -EIO;
+
+ ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
+ if (ret)
+ goto out;
+
+ log_type = btrfs_dir_type(eb, di);
+ btrfs_dir_item_key_to_cpu(eb, di, &log_key);
+ ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
+ btrfs_release_path(path);
+ if (ret < 0)
+ goto out;
+ exists = (ret == 0);
+ ret = 0;
+
+ dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
+ &name, 1);
+ if (IS_ERR(dir_dst_di)) {
+ ret = PTR_ERR(dir_dst_di);
+ goto out;
+ } else if (dir_dst_di) {
+ ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
+ dir_dst_di, &log_key, log_type,
+ exists);
+ if (ret < 0)
+ goto out;
+ dir_dst_matches = (ret == 1);
+ }
+
+ btrfs_release_path(path);
+
+ index_dst_di = btrfs_lookup_dir_index_item(trans, root, path,
+ key->objectid, key->offset,
+ &name, 1);
+ if (IS_ERR(index_dst_di)) {
+ ret = PTR_ERR(index_dst_di);
+ goto out;
+ } else if (index_dst_di) {
+ ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
+ index_dst_di, &log_key,
+ log_type, exists);
+ if (ret < 0)
+ goto out;
+ index_dst_matches = (ret == 1);
+ }
+
+ btrfs_release_path(path);
+
+ if (dir_dst_matches && index_dst_matches) {
+ ret = 0;
+ update_size = false;
+ goto out;
+ }
+
+ /*
+ * Check if the inode reference exists in the log for the given name,
+ * inode and parent inode
+ */
+ search_key.objectid = log_key.objectid;
+ search_key.type = BTRFS_INODE_REF_KEY;
+ search_key.offset = key->objectid;
+ ret = backref_in_log(root->log_root, &search_key, 0, &name);
+ if (ret < 0) {
+ goto out;
+ } else if (ret) {
+ /* The dentry will be added later. */
+ ret = 0;
+ update_size = false;
+ goto out;
+ }
+
+ search_key.objectid = log_key.objectid;
+ search_key.type = BTRFS_INODE_EXTREF_KEY;
+ search_key.offset = key->objectid;
+ ret = backref_in_log(root->log_root, &search_key, key->objectid, &name);
+ if (ret < 0) {
+ goto out;
+ } else if (ret) {
+ /* The dentry will be added later. */
+ ret = 0;
+ update_size = false;
+ goto out;
+ }
+ btrfs_release_path(path);
+ ret = insert_one_name(trans, root, key->objectid, key->offset,
+ &name, &log_key);
+ if (ret && ret != -ENOENT && ret != -EEXIST)
+ goto out;
+ if (!ret)
+ name_added = true;
+ update_size = false;
+ ret = 0;
+
+out:
+ if (!ret && update_size) {
+ btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
+ }
+ kfree(name.name);
+ iput(dir);
+ if (!ret && name_added)
+ ret = 1;
+ return ret;
+}
+
+/* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */
+static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
+{
+ int ret;
+ struct btrfs_dir_item *di;
+
+ /* We only log dir index keys, which only contain a single dir item. */
+ ASSERT(key->type == BTRFS_DIR_INDEX_KEY);
+
+ di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
+ ret = replay_one_name(trans, root, path, eb, di, key);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * If this entry refers to a non-directory (directories can not have a
+ * link count > 1) and it was added in the transaction that was not
+ * committed, make sure we fixup the link count of the inode the entry
+ * points to. Otherwise something like the following would result in a
+ * directory pointing to an inode with a wrong link that does not account
+ * for this dir entry:
+ *
+ * mkdir testdir
+ * touch testdir/foo
+ * touch testdir/bar
+ * sync
+ *
+ * ln testdir/bar testdir/bar_link
+ * ln testdir/foo testdir/foo_link
+ * xfs_io -c "fsync" testdir/bar
+ *
+ * <power failure>
+ *
+ * mount fs, log replay happens
+ *
+ * File foo would remain with a link count of 1 when it has two entries
+ * pointing to it in the directory testdir. This would make it impossible
+ * to ever delete the parent directory has it would result in stale
+ * dentries that can never be deleted.
+ */
+ if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
+ struct btrfs_path *fixup_path;
+ struct btrfs_key di_key;
+
+ fixup_path = btrfs_alloc_path();
+ if (!fixup_path)
+ return -ENOMEM;
+
+ btrfs_dir_item_key_to_cpu(eb, di, &di_key);
+ ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid);
+ btrfs_free_path(fixup_path);
+ }
+
+ return ret;
+}
+
+/*
+ * directory replay has two parts. There are the standard directory
+ * items in the log copied from the subvolume, and range items
+ * created in the log while the subvolume was logged.
+ *
+ * The range items tell us which parts of the key space the log
+ * is authoritative for. During replay, if a key in the subvolume
+ * directory is in a logged range item, but not actually in the log
+ * that means it was deleted from the directory before the fsync
+ * and should be removed.
+ */
+static noinline int find_dir_range(struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 dirid,
+ u64 *start_ret, u64 *end_ret)
+{
+ struct btrfs_key key;
+ u64 found_end;
+ struct btrfs_dir_log_item *item;
+ int ret;
+ int nritems;
+
+ if (*start_ret == (u64)-1)
+ return 1;
+
+ key.objectid = dirid;
+ key.type = BTRFS_DIR_LOG_INDEX_KEY;
+ key.offset = *start_ret;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ if (path->slots[0] == 0)
+ goto out;
+ path->slots[0]--;
+ }
+ if (ret != 0)
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
+ ret = 1;
+ goto next;
+ }
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_dir_log_item);
+ found_end = btrfs_dir_log_end(path->nodes[0], item);
+
+ if (*start_ret >= key.offset && *start_ret <= found_end) {
+ ret = 0;
+ *start_ret = key.offset;
+ *end_ret = found_end;
+ goto out;
+ }
+ ret = 1;
+next:
+ /* check the next slot in the tree to see if it is a valid item */
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ path->slots[0]++;
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret)
+ goto out;
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
+ ret = 1;
+ goto out;
+ }
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_dir_log_item);
+ found_end = btrfs_dir_log_end(path->nodes[0], item);
+ *start_ret = key.offset;
+ *end_ret = found_end;
+ ret = 0;
+out:
+ btrfs_release_path(path);
+ return ret;
+}
+
+/*
+ * this looks for a given directory item in the log. If the directory
+ * item is not in the log, the item is removed and the inode it points
+ * to is unlinked
+ */
+static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ struct btrfs_path *log_path,
+ struct inode *dir,
+ struct btrfs_key *dir_key)
+{
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ int ret;
+ struct extent_buffer *eb;
+ int slot;
+ struct btrfs_dir_item *di;
+ struct fscrypt_str name;
+ struct inode *inode = NULL;
+ struct btrfs_key location;
+
+ /*
+ * Currently we only log dir index keys. Even if we replay a log created
+ * by an older kernel that logged both dir index and dir item keys, all
+ * we need to do is process the dir index keys, we (and our caller) can
+ * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
+ */
+ ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY);
+
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
+ ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
+ if (ret)
+ goto out;
+
+ if (log) {
+ struct btrfs_dir_item *log_di;
+
+ log_di = btrfs_lookup_dir_index_item(trans, log, log_path,
+ dir_key->objectid,
+ dir_key->offset, &name, 0);
+ if (IS_ERR(log_di)) {
+ ret = PTR_ERR(log_di);
+ goto out;
+ } else if (log_di) {
+ /* The dentry exists in the log, we have nothing to do. */
+ ret = 0;
+ goto out;
+ }
+ }
+
+ btrfs_dir_item_key_to_cpu(eb, di, &location);
+ btrfs_release_path(path);
+ btrfs_release_path(log_path);
+ inode = read_one_inode(root, location.objectid);
+ if (!inode) {
+ ret = -EIO;
+ goto out;
+ }
+
+ ret = link_to_fixup_dir(trans, root, path, location.objectid);
+ if (ret)
+ goto out;
+
+ inc_nlink(inode);
+ ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode),
+ &name);
+ /*
+ * Unlike dir item keys, dir index keys can only have one name (entry) in
+ * them, as there are no key collisions since each key has a unique offset
+ * (an index number), so we're done.
+ */
+out:
+ btrfs_release_path(path);
+ btrfs_release_path(log_path);
+ kfree(name.name);
+ iput(inode);
+ return ret;
+}
+
+static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ const u64 ino)
+{
+ struct btrfs_key search_key;
+ struct btrfs_path *log_path;
+ int i;
+ int nritems;
+ int ret;
+
+ log_path = btrfs_alloc_path();
+ if (!log_path)
+ return -ENOMEM;
+
+ search_key.objectid = ino;
+ search_key.type = BTRFS_XATTR_ITEM_KEY;
+ search_key.offset = 0;
+again:
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+process_leaf:
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ for (i = path->slots[0]; i < nritems; i++) {
+ struct btrfs_key key;
+ struct btrfs_dir_item *di;
+ struct btrfs_dir_item *log_di;
+ u32 total_size;
+ u32 cur;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, i);
+ if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
+ ret = 0;
+ goto out;
+ }
+
+ di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
+ total_size = btrfs_item_size(path->nodes[0], i);
+ cur = 0;
+ while (cur < total_size) {
+ u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
+ u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
+ u32 this_len = sizeof(*di) + name_len + data_len;
+ char *name;
+
+ name = kmalloc(name_len, GFP_NOFS);
+ if (!name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ read_extent_buffer(path->nodes[0], name,
+ (unsigned long)(di + 1), name_len);
+
+ log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
+ name, name_len, 0);
+ btrfs_release_path(log_path);
+ if (!log_di) {
+ /* Doesn't exist in log tree, so delete it. */
+ btrfs_release_path(path);
+ di = btrfs_lookup_xattr(trans, root, path, ino,
+ name, name_len, -1);
+ kfree(name);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ }
+ ASSERT(di);
+ ret = btrfs_delete_one_dir_name(trans, root,
+ path, di);
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+ search_key = key;
+ goto again;
+ }
+ kfree(name);
+ if (IS_ERR(log_di)) {
+ ret = PTR_ERR(log_di);
+ goto out;
+ }
+ cur += this_len;
+ di = (struct btrfs_dir_item *)((char *)di + this_len);
+ }
+ }
+ ret = btrfs_next_leaf(root, path);
+ if (ret > 0)
+ ret = 0;
+ else if (ret == 0)
+ goto process_leaf;
+out:
+ btrfs_free_path(log_path);
+ btrfs_release_path(path);
+ return ret;
+}
+
+
+/*
+ * deletion replay happens before we copy any new directory items
+ * out of the log or out of backreferences from inodes. It
+ * scans the log to find ranges of keys that log is authoritative for,
+ * and then scans the directory to find items in those ranges that are
+ * not present in the log.
+ *
+ * Anything we don't find in the log is unlinked and removed from the
+ * directory.
+ */
+static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ u64 dirid, int del_all)
+{
+ u64 range_start;
+ u64 range_end;
+ int ret = 0;
+ struct btrfs_key dir_key;
+ struct btrfs_key found_key;
+ struct btrfs_path *log_path;
+ struct inode *dir;
+
+ dir_key.objectid = dirid;
+ dir_key.type = BTRFS_DIR_INDEX_KEY;
+ log_path = btrfs_alloc_path();
+ if (!log_path)
+ return -ENOMEM;
+
+ dir = read_one_inode(root, dirid);
+ /* it isn't an error if the inode isn't there, that can happen
+ * because we replay the deletes before we copy in the inode item
+ * from the log
+ */
+ if (!dir) {
+ btrfs_free_path(log_path);
+ return 0;
+ }
+
+ range_start = 0;
+ range_end = 0;
+ while (1) {
+ if (del_all)
+ range_end = (u64)-1;
+ else {
+ ret = find_dir_range(log, path, dirid,
+ &range_start, &range_end);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ }
+
+ dir_key.offset = range_start;
+ while (1) {
+ int nritems;
+ ret = btrfs_search_slot(NULL, root, &dir_key, path,
+ 0, 0);
+ if (ret < 0)
+ goto out;
+
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret == 1)
+ break;
+ else if (ret < 0)
+ goto out;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ path->slots[0]);
+ if (found_key.objectid != dirid ||
+ found_key.type != dir_key.type) {
+ ret = 0;
+ goto out;
+ }
+
+ if (found_key.offset > range_end)
+ break;
+
+ ret = check_item_in_log(trans, log, path,
+ log_path, dir,
+ &found_key);
+ if (ret)
+ goto out;
+ if (found_key.offset == (u64)-1)
+ break;
+ dir_key.offset = found_key.offset + 1;
+ }
+ btrfs_release_path(path);
+ if (range_end == (u64)-1)
+ break;
+ range_start = range_end + 1;
+ }
+ ret = 0;
+out:
+ btrfs_release_path(path);
+ btrfs_free_path(log_path);
+ iput(dir);
+ return ret;
+}
+
+/*
+ * the process_func used to replay items from the log tree. This
+ * gets called in two different stages. The first stage just looks
+ * for inodes and makes sure they are all copied into the subvolume.
+ *
+ * The second stage copies all the other item types from the log into
+ * the subvolume. The two stage approach is slower, but gets rid of
+ * lots of complexity around inodes referencing other inodes that exist
+ * only in the log (references come from either directory items or inode
+ * back refs).
+ */
+static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
+ struct walk_control *wc, u64 gen, int level)
+{
+ int nritems;
+ struct btrfs_path *path;
+ struct btrfs_root *root = wc->replay_dest;
+ struct btrfs_key key;
+ int i;
+ int ret;
+
+ ret = btrfs_read_extent_buffer(eb, gen, level, NULL);
+ if (ret)
+ return ret;
+
+ level = btrfs_header_level(eb);
+
+ if (level != 0)
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ nritems = btrfs_header_nritems(eb);
+ for (i = 0; i < nritems; i++) {
+ btrfs_item_key_to_cpu(eb, &key, i);
+
+ /* inode keys are done during the first stage */
+ if (key.type == BTRFS_INODE_ITEM_KEY &&
+ wc->stage == LOG_WALK_REPLAY_INODES) {
+ struct btrfs_inode_item *inode_item;
+ u32 mode;
+
+ inode_item = btrfs_item_ptr(eb, i,
+ struct btrfs_inode_item);
+ /*
+ * If we have a tmpfile (O_TMPFILE) that got fsync'ed
+ * and never got linked before the fsync, skip it, as
+ * replaying it is pointless since it would be deleted
+ * later. We skip logging tmpfiles, but it's always
+ * possible we are replaying a log created with a kernel
+ * that used to log tmpfiles.
+ */
+ if (btrfs_inode_nlink(eb, inode_item) == 0) {
+ wc->ignore_cur_inode = true;
+ continue;
+ } else {
+ wc->ignore_cur_inode = false;
+ }
+ ret = replay_xattr_deletes(wc->trans, root, log,
+ path, key.objectid);
+ if (ret)
+ break;
+ mode = btrfs_inode_mode(eb, inode_item);
+ if (S_ISDIR(mode)) {
+ ret = replay_dir_deletes(wc->trans,
+ root, log, path, key.objectid, 0);
+ if (ret)
+ break;
+ }
+ ret = overwrite_item(wc->trans, root, path,
+ eb, i, &key);
+ if (ret)
+ break;
+
+ /*
+ * Before replaying extents, truncate the inode to its
+ * size. We need to do it now and not after log replay
+ * because before an fsync we can have prealloc extents
+ * added beyond the inode's i_size. If we did it after,
+ * through orphan cleanup for example, we would drop
+ * those prealloc extents just after replaying them.
+ */
+ if (S_ISREG(mode)) {
+ struct btrfs_drop_extents_args drop_args = { 0 };
+ struct inode *inode;
+ u64 from;
+
+ inode = read_one_inode(root, key.objectid);
+ if (!inode) {
+ ret = -EIO;
+ break;
+ }
+ from = ALIGN(i_size_read(inode),
+ root->fs_info->sectorsize);
+ drop_args.start = from;
+ drop_args.end = (u64)-1;
+ drop_args.drop_cache = true;
+ ret = btrfs_drop_extents(wc->trans, root,
+ BTRFS_I(inode),
+ &drop_args);
+ if (!ret) {
+ inode_sub_bytes(inode,
+ drop_args.bytes_found);
+ /* Update the inode's nbytes. */
+ ret = btrfs_update_inode(wc->trans,
+ root, BTRFS_I(inode));
+ }
+ iput(inode);
+ if (ret)
+ break;
+ }
+
+ ret = link_to_fixup_dir(wc->trans, root,
+ path, key.objectid);
+ if (ret)
+ break;
+ }
+
+ if (wc->ignore_cur_inode)
+ continue;
+
+ if (key.type == BTRFS_DIR_INDEX_KEY &&
+ wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
+ ret = replay_one_dir_item(wc->trans, root, path,
+ eb, i, &key);
+ if (ret)
+ break;
+ }
+
+ if (wc->stage < LOG_WALK_REPLAY_ALL)
+ continue;
+
+ /* these keys are simply copied */
+ if (key.type == BTRFS_XATTR_ITEM_KEY) {
+ ret = overwrite_item(wc->trans, root, path,
+ eb, i, &key);
+ if (ret)
+ break;
+ } else if (key.type == BTRFS_INODE_REF_KEY ||
+ key.type == BTRFS_INODE_EXTREF_KEY) {
+ ret = add_inode_ref(wc->trans, root, log, path,
+ eb, i, &key);
+ if (ret && ret != -ENOENT)
+ break;
+ ret = 0;
+ } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
+ ret = replay_one_extent(wc->trans, root, path,
+ eb, i, &key);
+ if (ret)
+ break;
+ }
+ /*
+ * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the
+ * BTRFS_DIR_INDEX_KEY items which we use to derive the
+ * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an
+ * older kernel with such keys, ignore them.
+ */
+ }
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Correctly adjust the reserved bytes occupied by a log tree extent buffer
+ */
+static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
+{
+ struct btrfs_block_group *cache;
+
+ cache = btrfs_lookup_block_group(fs_info, start);
+ if (!cache) {
+ btrfs_err(fs_info, "unable to find block group for %llu", start);
+ return;
+ }
+
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ cache->reserved -= fs_info->nodesize;
+ cache->space_info->bytes_reserved -= fs_info->nodesize;
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+
+ btrfs_put_block_group(cache);
+}
+
+static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int *level,
+ struct walk_control *wc)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 bytenr;
+ u64 ptr_gen;
+ struct extent_buffer *next;
+ struct extent_buffer *cur;
+ u32 blocksize;
+ int ret = 0;
+
+ while (*level > 0) {
+ struct btrfs_key first_key;
+
+ cur = path->nodes[*level];
+
+ WARN_ON(btrfs_header_level(cur) != *level);
+
+ if (path->slots[*level] >=
+ btrfs_header_nritems(cur))
+ break;
+
+ bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
+ ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
+ btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
+ blocksize = fs_info->nodesize;
+
+ next = btrfs_find_create_tree_block(fs_info, bytenr,
+ btrfs_header_owner(cur),
+ *level - 1);
+ if (IS_ERR(next))
+ return PTR_ERR(next);
+
+ if (*level == 1) {
+ ret = wc->process_func(root, next, wc, ptr_gen,
+ *level - 1);
+ if (ret) {
+ free_extent_buffer(next);
+ return ret;
+ }
+
+ path->slots[*level]++;
+ if (wc->free) {
+ ret = btrfs_read_extent_buffer(next, ptr_gen,
+ *level - 1, &first_key);
+ if (ret) {
+ free_extent_buffer(next);
+ return ret;
+ }
+
+ if (trans) {
+ btrfs_tree_lock(next);
+ btrfs_clean_tree_block(next);
+ btrfs_wait_tree_block_writeback(next);
+ btrfs_tree_unlock(next);
+ ret = btrfs_pin_reserved_extent(trans,
+ bytenr, blocksize);
+ if (ret) {
+ free_extent_buffer(next);
+ return ret;
+ }
+ btrfs_redirty_list_add(
+ trans->transaction, next);
+ } else {
+ if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
+ clear_extent_buffer_dirty(next);
+ unaccount_log_buffer(fs_info, bytenr);
+ }
+ }
+ free_extent_buffer(next);
+ continue;
+ }
+ ret = btrfs_read_extent_buffer(next, ptr_gen, *level - 1, &first_key);
+ if (ret) {
+ free_extent_buffer(next);
+ return ret;
+ }
+
+ if (path->nodes[*level-1])
+ free_extent_buffer(path->nodes[*level-1]);
+ path->nodes[*level-1] = next;
+ *level = btrfs_header_level(next);
+ path->slots[*level] = 0;
+ cond_resched();
+ }
+ path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
+
+ cond_resched();
+ return 0;
+}
+
+static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int *level,
+ struct walk_control *wc)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int i;
+ int slot;
+ int ret;
+
+ for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
+ slot = path->slots[i];
+ if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
+ path->slots[i]++;
+ *level = i;
+ WARN_ON(*level == 0);
+ return 0;
+ } else {
+ ret = wc->process_func(root, path->nodes[*level], wc,
+ btrfs_header_generation(path->nodes[*level]),
+ *level);
+ if (ret)
+ return ret;
+
+ if (wc->free) {
+ struct extent_buffer *next;
+
+ next = path->nodes[*level];
+
+ if (trans) {
+ btrfs_tree_lock(next);
+ btrfs_clean_tree_block(next);
+ btrfs_wait_tree_block_writeback(next);
+ btrfs_tree_unlock(next);
+ ret = btrfs_pin_reserved_extent(trans,
+ path->nodes[*level]->start,
+ path->nodes[*level]->len);
+ if (ret)
+ return ret;
+ btrfs_redirty_list_add(trans->transaction,
+ next);
+ } else {
+ if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
+ clear_extent_buffer_dirty(next);
+
+ unaccount_log_buffer(fs_info,
+ path->nodes[*level]->start);
+ }
+ }
+ free_extent_buffer(path->nodes[*level]);
+ path->nodes[*level] = NULL;
+ *level = i + 1;
+ }
+ }
+ return 1;
+}
+
+/*
+ * drop the reference count on the tree rooted at 'snap'. This traverses
+ * the tree freeing any blocks that have a ref count of zero after being
+ * decremented.
+ */
+static int walk_log_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log, struct walk_control *wc)
+{
+ struct btrfs_fs_info *fs_info = log->fs_info;
+ int ret = 0;
+ int wret;
+ int level;
+ struct btrfs_path *path;
+ int orig_level;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ level = btrfs_header_level(log->node);
+ orig_level = level;
+ path->nodes[level] = log->node;
+ atomic_inc(&log->node->refs);
+ path->slots[level] = 0;
+
+ while (1) {
+ wret = walk_down_log_tree(trans, log, path, &level, wc);
+ if (wret > 0)
+ break;
+ if (wret < 0) {
+ ret = wret;
+ goto out;
+ }
+
+ wret = walk_up_log_tree(trans, log, path, &level, wc);
+ if (wret > 0)
+ break;
+ if (wret < 0) {
+ ret = wret;
+ goto out;
+ }
+ }
+
+ /* was the root node processed? if not, catch it here */
+ if (path->nodes[orig_level]) {
+ ret = wc->process_func(log, path->nodes[orig_level], wc,
+ btrfs_header_generation(path->nodes[orig_level]),
+ orig_level);
+ if (ret)
+ goto out;
+ if (wc->free) {
+ struct extent_buffer *next;
+
+ next = path->nodes[orig_level];
+
+ if (trans) {
+ btrfs_tree_lock(next);
+ btrfs_clean_tree_block(next);
+ btrfs_wait_tree_block_writeback(next);
+ btrfs_tree_unlock(next);
+ ret = btrfs_pin_reserved_extent(trans,
+ next->start, next->len);
+ if (ret)
+ goto out;
+ btrfs_redirty_list_add(trans->transaction, next);
+ } else {
+ if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
+ clear_extent_buffer_dirty(next);
+ unaccount_log_buffer(fs_info, next->start);
+ }
+ }
+ }
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * helper function to update the item for a given subvolumes log root
+ * in the tree of log roots
+ */
+static int update_log_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log,
+ struct btrfs_root_item *root_item)
+{
+ struct btrfs_fs_info *fs_info = log->fs_info;
+ int ret;
+
+ if (log->log_transid == 1) {
+ /* insert root item on the first sync */
+ ret = btrfs_insert_root(trans, fs_info->log_root_tree,
+ &log->root_key, root_item);
+ } else {
+ ret = btrfs_update_root(trans, fs_info->log_root_tree,
+ &log->root_key, root_item);
+ }
+ return ret;
+}
+
+static void wait_log_commit(struct btrfs_root *root, int transid)
+{
+ DEFINE_WAIT(wait);
+ int index = transid % 2;
+
+ /*
+ * we only allow two pending log transactions at a time,
+ * so we know that if ours is more than 2 older than the
+ * current transaction, we're done
+ */
+ for (;;) {
+ prepare_to_wait(&root->log_commit_wait[index],
+ &wait, TASK_UNINTERRUPTIBLE);
+
+ if (!(root->log_transid_committed < transid &&
+ atomic_read(&root->log_commit[index])))
+ break;
+
+ mutex_unlock(&root->log_mutex);
+ schedule();
+ mutex_lock(&root->log_mutex);
+ }
+ finish_wait(&root->log_commit_wait[index], &wait);
+}
+
+static void wait_for_writer(struct btrfs_root *root)
+{
+ DEFINE_WAIT(wait);
+
+ for (;;) {
+ prepare_to_wait(&root->log_writer_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ if (!atomic_read(&root->log_writers))
+ break;
+
+ mutex_unlock(&root->log_mutex);
+ schedule();
+ mutex_lock(&root->log_mutex);
+ }
+ finish_wait(&root->log_writer_wait, &wait);
+}
+
+static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
+ struct btrfs_log_ctx *ctx)
+{
+ mutex_lock(&root->log_mutex);
+ list_del_init(&ctx->list);
+ mutex_unlock(&root->log_mutex);
+}
+
+/*
+ * Invoked in log mutex context, or be sure there is no other task which
+ * can access the list.
+ */
+static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
+ int index, int error)
+{
+ struct btrfs_log_ctx *ctx;
+ struct btrfs_log_ctx *safe;
+
+ list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
+ list_del_init(&ctx->list);
+ ctx->log_ret = error;
+ }
+}
+
+/*
+ * btrfs_sync_log does sends a given tree log down to the disk and
+ * updates the super blocks to record it. When this call is done,
+ * you know that any inodes previously logged are safely on disk only
+ * if it returns 0.
+ *
+ * Any other return value means you need to call btrfs_commit_transaction.
+ * Some of the edge cases for fsyncing directories that have had unlinks
+ * or renames done in the past mean that sometimes the only safe
+ * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
+ * that has happened.
+ */
+int btrfs_sync_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_log_ctx *ctx)
+{
+ int index1;
+ int index2;
+ int mark;
+ int ret;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_root *log = root->log_root;
+ struct btrfs_root *log_root_tree = fs_info->log_root_tree;
+ struct btrfs_root_item new_root_item;
+ int log_transid = 0;
+ struct btrfs_log_ctx root_log_ctx;
+ struct blk_plug plug;
+ u64 log_root_start;
+ u64 log_root_level;
+
+ mutex_lock(&root->log_mutex);
+ log_transid = ctx->log_transid;
+ if (root->log_transid_committed >= log_transid) {
+ mutex_unlock(&root->log_mutex);
+ return ctx->log_ret;
+ }
+
+ index1 = log_transid % 2;
+ if (atomic_read(&root->log_commit[index1])) {
+ wait_log_commit(root, log_transid);
+ mutex_unlock(&root->log_mutex);
+ return ctx->log_ret;
+ }
+ ASSERT(log_transid == root->log_transid);
+ atomic_set(&root->log_commit[index1], 1);
+
+ /* wait for previous tree log sync to complete */
+ if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
+ wait_log_commit(root, log_transid - 1);
+
+ while (1) {
+ int batch = atomic_read(&root->log_batch);
+ /* when we're on an ssd, just kick the log commit out */
+ if (!btrfs_test_opt(fs_info, SSD) &&
+ test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
+ mutex_unlock(&root->log_mutex);
+ schedule_timeout_uninterruptible(1);
+ mutex_lock(&root->log_mutex);
+ }
+ wait_for_writer(root);
+ if (batch == atomic_read(&root->log_batch))
+ break;
+ }
+
+ /* bail out if we need to do a full commit */
+ if (btrfs_need_log_full_commit(trans)) {
+ ret = BTRFS_LOG_FORCE_COMMIT;
+ mutex_unlock(&root->log_mutex);
+ goto out;
+ }
+
+ if (log_transid % 2 == 0)
+ mark = EXTENT_DIRTY;
+ else
+ mark = EXTENT_NEW;
+
+ /* we start IO on all the marked extents here, but we don't actually
+ * wait for them until later.
+ */
+ blk_start_plug(&plug);
+ ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
+ /*
+ * -EAGAIN happens when someone, e.g., a concurrent transaction
+ * commit, writes a dirty extent in this tree-log commit. This
+ * concurrent write will create a hole writing out the extents,
+ * and we cannot proceed on a zoned filesystem, requiring
+ * sequential writing. While we can bail out to a full commit
+ * here, but we can continue hoping the concurrent writing fills
+ * the hole.
+ */
+ if (ret == -EAGAIN && btrfs_is_zoned(fs_info))
+ ret = 0;
+ if (ret) {
+ blk_finish_plug(&plug);
+ btrfs_set_log_full_commit(trans);
+ mutex_unlock(&root->log_mutex);
+ goto out;
+ }
+
+ /*
+ * We _must_ update under the root->log_mutex in order to make sure we
+ * have a consistent view of the log root we are trying to commit at
+ * this moment.
+ *
+ * We _must_ copy this into a local copy, because we are not holding the
+ * log_root_tree->log_mutex yet. This is important because when we
+ * commit the log_root_tree we must have a consistent view of the
+ * log_root_tree when we update the super block to point at the
+ * log_root_tree bytenr. If we update the log_root_tree here we'll race
+ * with the commit and possibly point at the new block which we may not
+ * have written out.
+ */
+ btrfs_set_root_node(&log->root_item, log->node);
+ memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
+
+ root->log_transid++;
+ log->log_transid = root->log_transid;
+ root->log_start_pid = 0;
+ /*
+ * IO has been started, blocks of the log tree have WRITTEN flag set
+ * in their headers. new modifications of the log will be written to
+ * new positions. so it's safe to allow log writers to go in.
+ */
+ mutex_unlock(&root->log_mutex);
+
+ if (btrfs_is_zoned(fs_info)) {
+ mutex_lock(&fs_info->tree_root->log_mutex);
+ if (!log_root_tree->node) {
+ ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
+ if (ret) {
+ mutex_unlock(&fs_info->tree_root->log_mutex);
+ blk_finish_plug(&plug);
+ goto out;
+ }
+ }
+ mutex_unlock(&fs_info->tree_root->log_mutex);
+ }
+
+ btrfs_init_log_ctx(&root_log_ctx, NULL);
+
+ mutex_lock(&log_root_tree->log_mutex);
+
+ index2 = log_root_tree->log_transid % 2;
+ list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
+ root_log_ctx.log_transid = log_root_tree->log_transid;
+
+ /*
+ * Now we are safe to update the log_root_tree because we're under the
+ * log_mutex, and we're a current writer so we're holding the commit
+ * open until we drop the log_mutex.
+ */
+ ret = update_log_root(trans, log, &new_root_item);
+ if (ret) {
+ if (!list_empty(&root_log_ctx.list))
+ list_del_init(&root_log_ctx.list);
+
+ blk_finish_plug(&plug);
+ btrfs_set_log_full_commit(trans);
+ if (ret != -ENOSPC)
+ btrfs_err(fs_info,
+ "failed to update log for root %llu ret %d",
+ root->root_key.objectid, ret);
+ btrfs_wait_tree_log_extents(log, mark);
+ mutex_unlock(&log_root_tree->log_mutex);
+ goto out;
+ }
+
+ if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
+ blk_finish_plug(&plug);
+ list_del_init(&root_log_ctx.list);
+ mutex_unlock(&log_root_tree->log_mutex);
+ ret = root_log_ctx.log_ret;
+ goto out;
+ }
+
+ index2 = root_log_ctx.log_transid % 2;
+ if (atomic_read(&log_root_tree->log_commit[index2])) {
+ blk_finish_plug(&plug);
+ ret = btrfs_wait_tree_log_extents(log, mark);
+ wait_log_commit(log_root_tree,
+ root_log_ctx.log_transid);
+ mutex_unlock(&log_root_tree->log_mutex);
+ if (!ret)
+ ret = root_log_ctx.log_ret;
+ goto out;
+ }
+ ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
+ atomic_set(&log_root_tree->log_commit[index2], 1);
+
+ if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
+ wait_log_commit(log_root_tree,
+ root_log_ctx.log_transid - 1);
+ }
+
+ /*
+ * now that we've moved on to the tree of log tree roots,
+ * check the full commit flag again
+ */
+ if (btrfs_need_log_full_commit(trans)) {
+ blk_finish_plug(&plug);
+ btrfs_wait_tree_log_extents(log, mark);
+ mutex_unlock(&log_root_tree->log_mutex);
+ ret = BTRFS_LOG_FORCE_COMMIT;
+ goto out_wake_log_root;
+ }
+
+ ret = btrfs_write_marked_extents(fs_info,
+ &log_root_tree->dirty_log_pages,
+ EXTENT_DIRTY | EXTENT_NEW);
+ blk_finish_plug(&plug);
+ /*
+ * As described above, -EAGAIN indicates a hole in the extents. We
+ * cannot wait for these write outs since the waiting cause a
+ * deadlock. Bail out to the full commit instead.
+ */
+ if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) {
+ btrfs_set_log_full_commit(trans);
+ btrfs_wait_tree_log_extents(log, mark);
+ mutex_unlock(&log_root_tree->log_mutex);
+ goto out_wake_log_root;
+ } else if (ret) {
+ btrfs_set_log_full_commit(trans);
+ mutex_unlock(&log_root_tree->log_mutex);
+ goto out_wake_log_root;
+ }
+ ret = btrfs_wait_tree_log_extents(log, mark);
+ if (!ret)
+ ret = btrfs_wait_tree_log_extents(log_root_tree,
+ EXTENT_NEW | EXTENT_DIRTY);
+ if (ret) {
+ btrfs_set_log_full_commit(trans);
+ mutex_unlock(&log_root_tree->log_mutex);
+ goto out_wake_log_root;
+ }
+
+ log_root_start = log_root_tree->node->start;
+ log_root_level = btrfs_header_level(log_root_tree->node);
+ log_root_tree->log_transid++;
+ mutex_unlock(&log_root_tree->log_mutex);
+
+ /*
+ * Here we are guaranteed that nobody is going to write the superblock
+ * for the current transaction before us and that neither we do write
+ * our superblock before the previous transaction finishes its commit
+ * and writes its superblock, because:
+ *
+ * 1) We are holding a handle on the current transaction, so no body
+ * can commit it until we release the handle;
+ *
+ * 2) Before writing our superblock we acquire the tree_log_mutex, so
+ * if the previous transaction is still committing, and hasn't yet
+ * written its superblock, we wait for it to do it, because a
+ * transaction commit acquires the tree_log_mutex when the commit
+ * begins and releases it only after writing its superblock.
+ */
+ mutex_lock(&fs_info->tree_log_mutex);
+
+ /*
+ * The previous transaction writeout phase could have failed, and thus
+ * marked the fs in an error state. We must not commit here, as we
+ * could have updated our generation in the super_for_commit and
+ * writing the super here would result in transid mismatches. If there
+ * is an error here just bail.
+ */
+ if (BTRFS_FS_ERROR(fs_info)) {
+ ret = -EIO;
+ btrfs_set_log_full_commit(trans);
+ btrfs_abort_transaction(trans, ret);
+ mutex_unlock(&fs_info->tree_log_mutex);
+ goto out_wake_log_root;
+ }
+
+ btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start);
+ btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
+ ret = write_all_supers(fs_info, 1);
+ mutex_unlock(&fs_info->tree_log_mutex);
+ if (ret) {
+ btrfs_set_log_full_commit(trans);
+ btrfs_abort_transaction(trans, ret);
+ goto out_wake_log_root;
+ }
+
+ /*
+ * We know there can only be one task here, since we have not yet set
+ * root->log_commit[index1] to 0 and any task attempting to sync the
+ * log must wait for the previous log transaction to commit if it's
+ * still in progress or wait for the current log transaction commit if
+ * someone else already started it. We use <= and not < because the
+ * first log transaction has an ID of 0.
+ */
+ ASSERT(root->last_log_commit <= log_transid);
+ root->last_log_commit = log_transid;
+
+out_wake_log_root:
+ mutex_lock(&log_root_tree->log_mutex);
+ btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
+
+ log_root_tree->log_transid_committed++;
+ atomic_set(&log_root_tree->log_commit[index2], 0);
+ mutex_unlock(&log_root_tree->log_mutex);
+
+ /*
+ * The barrier before waitqueue_active (in cond_wake_up) is needed so
+ * all the updates above are seen by the woken threads. It might not be
+ * necessary, but proving that seems to be hard.
+ */
+ cond_wake_up(&log_root_tree->log_commit_wait[index2]);
+out:
+ mutex_lock(&root->log_mutex);
+ btrfs_remove_all_log_ctxs(root, index1, ret);
+ root->log_transid_committed++;
+ atomic_set(&root->log_commit[index1], 0);
+ mutex_unlock(&root->log_mutex);
+
+ /*
+ * The barrier before waitqueue_active (in cond_wake_up) is needed so
+ * all the updates above are seen by the woken threads. It might not be
+ * necessary, but proving that seems to be hard.
+ */
+ cond_wake_up(&root->log_commit_wait[index1]);
+ return ret;
+}
+
+static void free_log_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log)
+{
+ int ret;
+ struct walk_control wc = {
+ .free = 1,
+ .process_func = process_one_buffer
+ };
+
+ if (log->node) {
+ ret = walk_log_tree(trans, log, &wc);
+ if (ret) {
+ /*
+ * We weren't able to traverse the entire log tree, the
+ * typical scenario is getting an -EIO when reading an
+ * extent buffer of the tree, due to a previous writeback
+ * failure of it.
+ */
+ set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
+ &log->fs_info->fs_state);
+
+ /*
+ * Some extent buffers of the log tree may still be dirty
+ * and not yet written back to storage, because we may
+ * have updates to a log tree without syncing a log tree,
+ * such as during rename and link operations. So flush
+ * them out and wait for their writeback to complete, so
+ * that we properly cleanup their state and pages.
+ */
+ btrfs_write_marked_extents(log->fs_info,
+ &log->dirty_log_pages,
+ EXTENT_DIRTY | EXTENT_NEW);
+ btrfs_wait_tree_log_extents(log,
+ EXTENT_DIRTY | EXTENT_NEW);
+
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(log->fs_info, ret, NULL);
+ }
+ }
+
+ clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
+ EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
+ extent_io_tree_release(&log->log_csum_range);
+
+ btrfs_put_root(log);
+}
+
+/*
+ * free all the extents used by the tree log. This should be called
+ * at commit time of the full transaction
+ */
+int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
+{
+ if (root->log_root) {
+ free_log_tree(trans, root->log_root);
+ root->log_root = NULL;
+ clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
+ }
+ return 0;
+}
+
+int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ if (fs_info->log_root_tree) {
+ free_log_tree(trans, fs_info->log_root_tree);
+ fs_info->log_root_tree = NULL;
+ clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state);
+ }
+ return 0;
+}
+
+/*
+ * Check if an inode was logged in the current transaction. This correctly deals
+ * with the case where the inode was logged but has a logged_trans of 0, which
+ * happens if the inode is evicted and loaded again, as logged_trans is an in
+ * memory only field (not persisted).
+ *
+ * Returns 1 if the inode was logged before in the transaction, 0 if it was not,
+ * and < 0 on error.
+ */
+static int inode_logged(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path_in)
+{
+ struct btrfs_path *path = path_in;
+ struct btrfs_key key;
+ int ret;
+
+ if (inode->logged_trans == trans->transid)
+ return 1;
+
+ /*
+ * If logged_trans is not 0, then we know the inode logged was not logged
+ * in this transaction, so we can return false right away.
+ */
+ if (inode->logged_trans > 0)
+ return 0;
+
+ /*
+ * If no log tree was created for this root in this transaction, then
+ * the inode can not have been logged in this transaction. In that case
+ * set logged_trans to anything greater than 0 and less than the current
+ * transaction's ID, to avoid the search below in a future call in case
+ * a log tree gets created after this.
+ */
+ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) {
+ inode->logged_trans = trans->transid - 1;
+ return 0;
+ }
+
+ /*
+ * We have a log tree and the inode's logged_trans is 0. We can't tell
+ * for sure if the inode was logged before in this transaction by looking
+ * only at logged_trans. We could be pessimistic and assume it was, but
+ * that can lead to unnecessarily logging an inode during rename and link
+ * operations, and then further updating the log in followup rename and
+ * link operations, specially if it's a directory, which adds latency
+ * visible to applications doing a series of rename or link operations.
+ *
+ * A logged_trans of 0 here can mean several things:
+ *
+ * 1) The inode was never logged since the filesystem was mounted, and may
+ * or may have not been evicted and loaded again;
+ *
+ * 2) The inode was logged in a previous transaction, then evicted and
+ * then loaded again;
+ *
+ * 3) The inode was logged in the current transaction, then evicted and
+ * then loaded again.
+ *
+ * For cases 1) and 2) we don't want to return true, but we need to detect
+ * case 3) and return true. So we do a search in the log root for the inode
+ * item.
+ */
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ if (!path) {
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ }
+
+ ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
+
+ if (path_in)
+ btrfs_release_path(path);
+ else
+ btrfs_free_path(path);
+
+ /*
+ * Logging an inode always results in logging its inode item. So if we
+ * did not find the item we know the inode was not logged for sure.
+ */
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ /*
+ * Set logged_trans to a value greater than 0 and less then the
+ * current transaction to avoid doing the search in future calls.
+ */
+ inode->logged_trans = trans->transid - 1;
+ return 0;
+ }
+
+ /*
+ * The inode was previously logged and then evicted, set logged_trans to
+ * the current transacion's ID, to avoid future tree searches as long as
+ * the inode is not evicted again.
+ */
+ inode->logged_trans = trans->transid;
+
+ /*
+ * If it's a directory, then we must set last_dir_index_offset to the
+ * maximum possible value, so that the next attempt to log the inode does
+ * not skip checking if dir index keys found in modified subvolume tree
+ * leaves have been logged before, otherwise it would result in attempts
+ * to insert duplicate dir index keys in the log tree. This must be done
+ * because last_dir_index_offset is an in-memory only field, not persisted
+ * in the inode item or any other on-disk structure, so its value is lost
+ * once the inode is evicted.
+ */
+ if (S_ISDIR(inode->vfs_inode.i_mode))
+ inode->last_dir_index_offset = (u64)-1;
+
+ return 1;
+}
+
+/*
+ * Delete a directory entry from the log if it exists.
+ *
+ * Returns < 0 on error
+ * 1 if the entry does not exists
+ * 0 if the entry existed and was successfully deleted
+ */
+static int del_logged_dentry(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ u64 dir_ino,
+ const struct fscrypt_str *name,
+ u64 index)
+{
+ struct btrfs_dir_item *di;
+
+ /*
+ * We only log dir index items of a directory, so we don't need to look
+ * for dir item keys.
+ */
+ di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
+ index, name, -1);
+ if (IS_ERR(di))
+ return PTR_ERR(di);
+ else if (!di)
+ return 1;
+
+ /*
+ * We do not need to update the size field of the directory's
+ * inode item because on log replay we update the field to reflect
+ * all existing entries in the directory (see overwrite_item()).
+ */
+ return btrfs_delete_one_dir_name(trans, log, path, di);
+}
+
+/*
+ * If both a file and directory are logged, and unlinks or renames are
+ * mixed in, we have a few interesting corners:
+ *
+ * create file X in dir Y
+ * link file X to X.link in dir Y
+ * fsync file X
+ * unlink file X but leave X.link
+ * fsync dir Y
+ *
+ * After a crash we would expect only X.link to exist. But file X
+ * didn't get fsync'd again so the log has back refs for X and X.link.
+ *
+ * We solve this by removing directory entries and inode backrefs from the
+ * log when a file that was logged in the current transaction is
+ * unlinked. Any later fsync will include the updated log entries, and
+ * we'll be able to reconstruct the proper directory items from backrefs.
+ *
+ * This optimizations allows us to avoid relogging the entire inode
+ * or the entire directory.
+ */
+void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const struct fscrypt_str *name,
+ struct btrfs_inode *dir, u64 index)
+{
+ struct btrfs_path *path;
+ int ret;
+
+ ret = inode_logged(trans, dir, NULL);
+ if (ret == 0)
+ return;
+ else if (ret < 0) {
+ btrfs_set_log_full_commit(trans);
+ return;
+ }
+
+ ret = join_running_log_trans(root);
+ if (ret)
+ return;
+
+ mutex_lock(&dir->log_mutex);
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir),
+ name, index);
+ btrfs_free_path(path);
+out_unlock:
+ mutex_unlock(&dir->log_mutex);
+ if (ret < 0)
+ btrfs_set_log_full_commit(trans);
+ btrfs_end_log_trans(root);
+}
+
+/* see comments for btrfs_del_dir_entries_in_log */
+void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const struct fscrypt_str *name,
+ struct btrfs_inode *inode, u64 dirid)
+{
+ struct btrfs_root *log;
+ u64 index;
+ int ret;
+
+ ret = inode_logged(trans, inode, NULL);
+ if (ret == 0)
+ return;
+ else if (ret < 0) {
+ btrfs_set_log_full_commit(trans);
+ return;
+ }
+
+ ret = join_running_log_trans(root);
+ if (ret)
+ return;
+ log = root->log_root;
+ mutex_lock(&inode->log_mutex);
+
+ ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode),
+ dirid, &index);
+ mutex_unlock(&inode->log_mutex);
+ if (ret < 0 && ret != -ENOENT)
+ btrfs_set_log_full_commit(trans);
+ btrfs_end_log_trans(root);
+}
+
+/*
+ * creates a range item in the log for 'dirid'. first_offset and
+ * last_offset tell us which parts of the key space the log should
+ * be considered authoritative for.
+ */
+static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ u64 dirid,
+ u64 first_offset, u64 last_offset)
+{
+ int ret;
+ struct btrfs_key key;
+ struct btrfs_dir_log_item *item;
+
+ key.objectid = dirid;
+ key.offset = first_offset;
+ key.type = BTRFS_DIR_LOG_INDEX_KEY;
+ ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
+ /*
+ * -EEXIST is fine and can happen sporadically when we are logging a
+ * directory and have concurrent insertions in the subvolume's tree for
+ * items from other inodes and that result in pushing off some dir items
+ * from one leaf to another in order to accommodate for the new items.
+ * This results in logging the same dir index range key.
+ */
+ if (ret && ret != -EEXIST)
+ return ret;
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_dir_log_item);
+ if (ret == -EEXIST) {
+ const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item);
+
+ /*
+ * btrfs_del_dir_entries_in_log() might have been called during
+ * an unlink between the initial insertion of this key and the
+ * current update, or we might be logging a single entry deletion
+ * during a rename, so set the new last_offset to the max value.
+ */
+ last_offset = max(last_offset, curr_end);
+ }
+ btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_release_path(path);
+ return 0;
+}
+
+static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct extent_buffer *src,
+ struct btrfs_path *dst_path,
+ int start_slot,
+ int count)
+{
+ struct btrfs_root *log = inode->root->log_root;
+ char *ins_data = NULL;
+ struct btrfs_item_batch batch;
+ struct extent_buffer *dst;
+ unsigned long src_offset;
+ unsigned long dst_offset;
+ u64 last_index;
+ struct btrfs_key key;
+ u32 item_size;
+ int ret;
+ int i;
+
+ ASSERT(count > 0);
+ batch.nr = count;
+
+ if (count == 1) {
+ btrfs_item_key_to_cpu(src, &key, start_slot);
+ item_size = btrfs_item_size(src, start_slot);
+ batch.keys = &key;
+ batch.data_sizes = &item_size;
+ batch.total_data_size = item_size;
+ } else {
+ struct btrfs_key *ins_keys;
+ u32 *ins_sizes;
+
+ ins_data = kmalloc(count * sizeof(u32) +
+ count * sizeof(struct btrfs_key), GFP_NOFS);
+ if (!ins_data)
+ return -ENOMEM;
+
+ ins_sizes = (u32 *)ins_data;
+ ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32));
+ batch.keys = ins_keys;
+ batch.data_sizes = ins_sizes;
+ batch.total_data_size = 0;
+
+ for (i = 0; i < count; i++) {
+ const int slot = start_slot + i;
+
+ btrfs_item_key_to_cpu(src, &ins_keys[i], slot);
+ ins_sizes[i] = btrfs_item_size(src, slot);
+ batch.total_data_size += ins_sizes[i];
+ }
+ }
+
+ ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
+ if (ret)
+ goto out;
+
+ dst = dst_path->nodes[0];
+ /*
+ * Copy all the items in bulk, in a single copy operation. Item data is
+ * organized such that it's placed at the end of a leaf and from right
+ * to left. For example, the data for the second item ends at an offset
+ * that matches the offset where the data for the first item starts, the
+ * data for the third item ends at an offset that matches the offset
+ * where the data of the second items starts, and so on.
+ * Therefore our source and destination start offsets for copy match the
+ * offsets of the last items (highest slots).
+ */
+ dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1);
+ src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1);
+ copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size);
+ btrfs_release_path(dst_path);
+
+ last_index = batch.keys[count - 1].offset;
+ ASSERT(last_index > inode->last_dir_index_offset);
+
+ /*
+ * If for some unexpected reason the last item's index is not greater
+ * than the last index we logged, warn and return an error to fallback
+ * to a transaction commit.
+ */
+ if (WARN_ON(last_index <= inode->last_dir_index_offset))
+ ret = -EUCLEAN;
+ else
+ inode->last_dir_index_offset = last_index;
+out:
+ kfree(ins_data);
+
+ return ret;
+}
+
+static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_path *dst_path,
+ struct btrfs_log_ctx *ctx,
+ u64 *last_old_dentry_offset)
+{
+ struct btrfs_root *log = inode->root->log_root;
+ struct extent_buffer *src;
+ const int nritems = btrfs_header_nritems(path->nodes[0]);
+ const u64 ino = btrfs_ino(inode);
+ bool last_found = false;
+ int batch_start = 0;
+ int batch_size = 0;
+ int i;
+
+ /*
+ * We need to clone the leaf, release the read lock on it, and use the
+ * clone before modifying the log tree. See the comment at copy_items()
+ * about why we need to do this.
+ */
+ src = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!src)
+ return -ENOMEM;
+
+ i = path->slots[0];
+ btrfs_release_path(path);
+ path->nodes[0] = src;
+ path->slots[0] = i;
+
+ for (; i < nritems; i++) {
+ struct btrfs_dir_item *di;
+ struct btrfs_key key;
+ int ret;
+
+ btrfs_item_key_to_cpu(src, &key, i);
+
+ if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) {
+ last_found = true;
+ break;
+ }
+
+ di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
+
+ /*
+ * Skip ranges of items that consist only of dir item keys created
+ * in past transactions. However if we find a gap, we must log a
+ * dir index range item for that gap, so that index keys in that
+ * gap are deleted during log replay.
+ */
+ if (btrfs_dir_transid(src, di) < trans->transid) {
+ if (key.offset > *last_old_dentry_offset + 1) {
+ ret = insert_dir_log_key(trans, log, dst_path,
+ ino, *last_old_dentry_offset + 1,
+ key.offset - 1);
+ if (ret < 0)
+ return ret;
+ }
+
+ *last_old_dentry_offset = key.offset;
+ continue;
+ }
+
+ /* If we logged this dir index item before, we can skip it. */
+ if (key.offset <= inode->last_dir_index_offset)
+ continue;
+
+ /*
+ * We must make sure that when we log a directory entry, the
+ * corresponding inode, after log replay, has a matching link
+ * count. For example:
+ *
+ * touch foo
+ * mkdir mydir
+ * sync
+ * ln foo mydir/bar
+ * xfs_io -c "fsync" mydir
+ * <crash>
+ * <mount fs and log replay>
+ *
+ * Would result in a fsync log that when replayed, our file inode
+ * would have a link count of 1, but we get two directory entries
+ * pointing to the same inode. After removing one of the names,
+ * it would not be possible to remove the other name, which
+ * resulted always in stale file handle errors, and would not be
+ * possible to rmdir the parent directory, since its i_size could
+ * never be decremented to the value BTRFS_EMPTY_DIR_SIZE,
+ * resulting in -ENOTEMPTY errors.
+ */
+ if (!ctx->log_new_dentries) {
+ struct btrfs_key di_key;
+
+ btrfs_dir_item_key_to_cpu(src, di, &di_key);
+ if (di_key.type != BTRFS_ROOT_ITEM_KEY)
+ ctx->log_new_dentries = true;
+ }
+
+ if (batch_size == 0)
+ batch_start = i;
+ batch_size++;
+ }
+
+ if (batch_size > 0) {
+ int ret;
+
+ ret = flush_dir_items_batch(trans, inode, src, dst_path,
+ batch_start, batch_size);
+ if (ret < 0)
+ return ret;
+ }
+
+ return last_found ? 1 : 0;
+}
+
+/*
+ * log all the items included in the current transaction for a given
+ * directory. This also creates the range items in the log tree required
+ * to replay anything deleted before the fsync
+ */
+static noinline int log_dir_items(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_path *dst_path,
+ struct btrfs_log_ctx *ctx,
+ u64 min_offset, u64 *last_offset_ret)
+{
+ struct btrfs_key min_key;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_root *log = root->log_root;
+ int err = 0;
+ int ret;
+ u64 last_old_dentry_offset = min_offset - 1;
+ u64 last_offset = (u64)-1;
+ u64 ino = btrfs_ino(inode);
+
+ min_key.objectid = ino;
+ min_key.type = BTRFS_DIR_INDEX_KEY;
+ min_key.offset = min_offset;
+
+ ret = btrfs_search_forward(root, &min_key, path, trans->transid);
+
+ /*
+ * we didn't find anything from this transaction, see if there
+ * is anything at all
+ */
+ if (ret != 0 || min_key.objectid != ino ||
+ min_key.type != BTRFS_DIR_INDEX_KEY) {
+ min_key.objectid = ino;
+ min_key.type = BTRFS_DIR_INDEX_KEY;
+ min_key.offset = (u64)-1;
+ btrfs_release_path(path);
+ ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
+ if (ret < 0) {
+ btrfs_release_path(path);
+ return ret;
+ }
+ ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
+
+ /* if ret == 0 there are items for this type,
+ * create a range to tell us the last key of this type.
+ * otherwise, there are no items in this directory after
+ * *min_offset, and we create a range to indicate that.
+ */
+ if (ret == 0) {
+ struct btrfs_key tmp;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &tmp,
+ path->slots[0]);
+ if (tmp.type == BTRFS_DIR_INDEX_KEY)
+ last_old_dentry_offset = tmp.offset;
+ } else if (ret < 0) {
+ err = ret;
+ }
+
+ goto done;
+ }
+
+ /* go backward to find any previous key */
+ ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
+ if (ret == 0) {
+ struct btrfs_key tmp;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
+ /*
+ * The dir index key before the first one we found that needs to
+ * be logged might be in a previous leaf, and there might be a
+ * gap between these keys, meaning that we had deletions that
+ * happened. So the key range item we log (key type
+ * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the
+ * previous key's offset plus 1, so that those deletes are replayed.
+ */
+ if (tmp.type == BTRFS_DIR_INDEX_KEY)
+ last_old_dentry_offset = tmp.offset;
+ } else if (ret < 0) {
+ err = ret;
+ goto done;
+ }
+
+ btrfs_release_path(path);
+
+ /*
+ * Find the first key from this transaction again or the one we were at
+ * in the loop below in case we had to reschedule. We may be logging the
+ * directory without holding its VFS lock, which happen when logging new
+ * dentries (through log_new_dir_dentries()) or in some cases when we
+ * need to log the parent directory of an inode. This means a dir index
+ * key might be deleted from the inode's root, and therefore we may not
+ * find it anymore. If we can't find it, just move to the next key. We
+ * can not bail out and ignore, because if we do that we will simply
+ * not log dir index keys that come after the one that was just deleted
+ * and we can end up logging a dir index range that ends at (u64)-1
+ * (@last_offset is initialized to that), resulting in removing dir
+ * entries we should not remove at log replay time.
+ */
+search:
+ ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
+ if (ret > 0)
+ ret = btrfs_next_item(root, path);
+ if (ret < 0)
+ err = ret;
+ /* If ret is 1, there are no more keys in the inode's root. */
+ if (ret != 0)
+ goto done;
+
+ /*
+ * we have a block from this transaction, log every item in it
+ * from our directory
+ */
+ while (1) {
+ ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx,
+ &last_old_dentry_offset);
+ if (ret != 0) {
+ if (ret < 0)
+ err = ret;
+ goto done;
+ }
+ path->slots[0] = btrfs_header_nritems(path->nodes[0]);
+
+ /*
+ * look ahead to the next item and see if it is also
+ * from this directory and from this transaction
+ */
+ ret = btrfs_next_leaf(root, path);
+ if (ret) {
+ if (ret == 1)
+ last_offset = (u64)-1;
+ else
+ err = ret;
+ goto done;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]);
+ if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) {
+ last_offset = (u64)-1;
+ goto done;
+ }
+ if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
+ /*
+ * The next leaf was not changed in the current transaction
+ * and has at least one dir index key.
+ * We check for the next key because there might have been
+ * one or more deletions between the last key we logged and
+ * that next key. So the key range item we log (key type
+ * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's
+ * offset minus 1, so that those deletes are replayed.
+ */
+ last_offset = min_key.offset - 1;
+ goto done;
+ }
+ if (need_resched()) {
+ btrfs_release_path(path);
+ cond_resched();
+ goto search;
+ }
+ }
+done:
+ btrfs_release_path(path);
+ btrfs_release_path(dst_path);
+
+ if (err == 0) {
+ *last_offset_ret = last_offset;
+ /*
+ * In case the leaf was changed in the current transaction but
+ * all its dir items are from a past transaction, the last item
+ * in the leaf is a dir item and there's no gap between that last
+ * dir item and the first one on the next leaf (which did not
+ * change in the current transaction), then we don't need to log
+ * a range, last_old_dentry_offset is == to last_offset.
+ */
+ ASSERT(last_old_dentry_offset <= last_offset);
+ if (last_old_dentry_offset < last_offset) {
+ ret = insert_dir_log_key(trans, log, path, ino,
+ last_old_dentry_offset + 1,
+ last_offset);
+ if (ret)
+ err = ret;
+ }
+ }
+ return err;
+}
+
+/*
+ * If the inode was logged before and it was evicted, then its
+ * last_dir_index_offset is (u64)-1, so we don't the value of the last index
+ * key offset. If that's the case, search for it and update the inode. This
+ * is to avoid lookups in the log tree every time we try to insert a dir index
+ * key from a leaf changed in the current transaction, and to allow us to always
+ * do batch insertions of dir index keys.
+ */
+static int update_last_dir_index_offset(struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct btrfs_log_ctx *ctx)
+{
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_key key;
+ int ret;
+
+ lockdep_assert_held(&inode->log_mutex);
+
+ if (inode->last_dir_index_offset != (u64)-1)
+ return 0;
+
+ if (!ctx->logged_before) {
+ inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
+ return 0;
+ }
+
+ key.objectid = ino;
+ key.type = BTRFS_DIR_INDEX_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
+ /*
+ * An error happened or we actually have an index key with an offset
+ * value of (u64)-1. Bail out, we're done.
+ */
+ if (ret <= 0)
+ goto out;
+
+ ret = 0;
+ inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
+
+ /*
+ * No dir index items, bail out and leave last_dir_index_offset with
+ * the value right before the first valid index value.
+ */
+ if (path->slots[0] == 0)
+ goto out;
+
+ /*
+ * btrfs_search_slot() left us at one slot beyond the slot with the last
+ * index key, or beyond the last key of the directory that is not an
+ * index key. If we have an index key before, set last_dir_index_offset
+ * to its offset value, otherwise leave it with a value right before the
+ * first valid index value, as it means we have an empty directory.
+ */
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+ if (key.objectid == ino && key.type == BTRFS_DIR_INDEX_KEY)
+ inode->last_dir_index_offset = key.offset;
+
+out:
+ btrfs_release_path(path);
+
+ return ret;
+}
+
+/*
+ * logging directories is very similar to logging inodes, We find all the items
+ * from the current transaction and write them to the log.
+ *
+ * The recovery code scans the directory in the subvolume, and if it finds a
+ * key in the range logged that is not present in the log tree, then it means
+ * that dir entry was unlinked during the transaction.
+ *
+ * In order for that scan to work, we must include one key smaller than
+ * the smallest logged by this transaction and one key larger than the largest
+ * key logged by this transaction.
+ */
+static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_path *dst_path,
+ struct btrfs_log_ctx *ctx)
+{
+ u64 min_key;
+ u64 max_key;
+ int ret;
+
+ ret = update_last_dir_index_offset(inode, path, ctx);
+ if (ret)
+ return ret;
+
+ min_key = BTRFS_DIR_START_INDEX;
+ max_key = 0;
+
+ while (1) {
+ ret = log_dir_items(trans, inode, path, dst_path,
+ ctx, min_key, &max_key);
+ if (ret)
+ return ret;
+ if (max_key == (u64)-1)
+ break;
+ min_key = max_key + 1;
+ }
+
+ return 0;
+}
+
+/*
+ * a helper function to drop items from the log before we relog an
+ * inode. max_key_type indicates the highest item type to remove.
+ * This cannot be run for file data extents because it does not
+ * free the extents they point to.
+ */
+static int drop_inode_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ struct btrfs_inode *inode,
+ int max_key_type)
+{
+ int ret;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ int start_slot;
+
+ key.objectid = btrfs_ino(inode);
+ key.type = max_key_type;
+ key.offset = (u64)-1;
+
+ while (1) {
+ ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
+ BUG_ON(ret == 0); /* Logic error */
+ if (ret < 0)
+ break;
+
+ if (path->slots[0] == 0)
+ break;
+
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ path->slots[0]);
+
+ if (found_key.objectid != key.objectid)
+ break;
+
+ found_key.offset = 0;
+ found_key.type = 0;
+ ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot);
+ if (ret < 0)
+ break;
+
+ ret = btrfs_del_items(trans, log, path, start_slot,
+ path->slots[0] - start_slot + 1);
+ /*
+ * If start slot isn't 0 then we don't need to re-search, we've
+ * found the last guy with the objectid in this tree.
+ */
+ if (ret || start_slot != 0)
+ break;
+ btrfs_release_path(path);
+ }
+ btrfs_release_path(path);
+ if (ret > 0)
+ ret = 0;
+ return ret;
+}
+
+static int truncate_inode_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log_root,
+ struct btrfs_inode *inode,
+ u64 new_size, u32 min_type)
+{
+ struct btrfs_truncate_control control = {
+ .new_size = new_size,
+ .ino = btrfs_ino(inode),
+ .min_type = min_type,
+ .skip_ref_updates = true,
+ };
+
+ return btrfs_truncate_inode_items(trans, log_root, &control);
+}
+
+static void fill_inode_item(struct btrfs_trans_handle *trans,
+ struct extent_buffer *leaf,
+ struct btrfs_inode_item *item,
+ struct inode *inode, int log_inode_only,
+ u64 logged_isize)
+{
+ struct btrfs_map_token token;
+ u64 flags;
+
+ btrfs_init_map_token(&token, leaf);
+
+ if (log_inode_only) {
+ /* set the generation to zero so the recover code
+ * can tell the difference between an logging
+ * just to say 'this inode exists' and a logging
+ * to say 'update this inode with these values'
+ */
+ btrfs_set_token_inode_generation(&token, item, 0);
+ btrfs_set_token_inode_size(&token, item, logged_isize);
+ } else {
+ btrfs_set_token_inode_generation(&token, item,
+ BTRFS_I(inode)->generation);
+ btrfs_set_token_inode_size(&token, item, inode->i_size);
+ }
+
+ btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
+ btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
+ btrfs_set_token_inode_mode(&token, item, inode->i_mode);
+ btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
+
+ btrfs_set_token_timespec_sec(&token, &item->atime,
+ inode->i_atime.tv_sec);
+ btrfs_set_token_timespec_nsec(&token, &item->atime,
+ inode->i_atime.tv_nsec);
+
+ btrfs_set_token_timespec_sec(&token, &item->mtime,
+ inode->i_mtime.tv_sec);
+ btrfs_set_token_timespec_nsec(&token, &item->mtime,
+ inode->i_mtime.tv_nsec);
+
+ btrfs_set_token_timespec_sec(&token, &item->ctime,
+ inode->i_ctime.tv_sec);
+ btrfs_set_token_timespec_nsec(&token, &item->ctime,
+ inode->i_ctime.tv_nsec);
+
+ /*
+ * We do not need to set the nbytes field, in fact during a fast fsync
+ * its value may not even be correct, since a fast fsync does not wait
+ * for ordered extent completion, which is where we update nbytes, it
+ * only waits for writeback to complete. During log replay as we find
+ * file extent items and replay them, we adjust the nbytes field of the
+ * inode item in subvolume tree as needed (see overwrite_item()).
+ */
+
+ btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
+ btrfs_set_token_inode_transid(&token, item, trans->transid);
+ btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
+ flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
+ BTRFS_I(inode)->ro_flags);
+ btrfs_set_token_inode_flags(&token, item, flags);
+ btrfs_set_token_inode_block_group(&token, item, 0);
+}
+
+static int log_inode_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log, struct btrfs_path *path,
+ struct btrfs_inode *inode, bool inode_item_dropped)
+{
+ struct btrfs_inode_item *inode_item;
+ int ret;
+
+ /*
+ * If we are doing a fast fsync and the inode was logged before in the
+ * current transaction, then we know the inode was previously logged and
+ * it exists in the log tree. For performance reasons, in this case use
+ * btrfs_search_slot() directly with ins_len set to 0 so that we never
+ * attempt a write lock on the leaf's parent, which adds unnecessary lock
+ * contention in case there are concurrent fsyncs for other inodes of the
+ * same subvolume. Using btrfs_insert_empty_item() when the inode item
+ * already exists can also result in unnecessarily splitting a leaf.
+ */
+ if (!inode_item_dropped && inode->logged_trans == trans->transid) {
+ ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1);
+ ASSERT(ret <= 0);
+ if (ret > 0)
+ ret = -ENOENT;
+ } else {
+ /*
+ * This means it is the first fsync in the current transaction,
+ * so the inode item is not in the log and we need to insert it.
+ * We can never get -EEXIST because we are only called for a fast
+ * fsync and in case an inode eviction happens after the inode was
+ * logged before in the current transaction, when we load again
+ * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime
+ * flags and set ->logged_trans to 0.
+ */
+ ret = btrfs_insert_empty_item(trans, log, path, &inode->location,
+ sizeof(*inode_item));
+ ASSERT(ret != -EEXIST);
+ }
+ if (ret)
+ return ret;
+ inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_item);
+ fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
+ 0, 0);
+ btrfs_release_path(path);
+ return 0;
+}
+
+static int log_csums(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_root *log_root,
+ struct btrfs_ordered_sum *sums)
+{
+ const u64 lock_end = sums->bytenr + sums->len - 1;
+ struct extent_state *cached_state = NULL;
+ int ret;
+
+ /*
+ * If this inode was not used for reflink operations in the current
+ * transaction with new extents, then do the fast path, no need to
+ * worry about logging checksum items with overlapping ranges.
+ */
+ if (inode->last_reflink_trans < trans->transid)
+ return btrfs_csum_file_blocks(trans, log_root, sums);
+
+ /*
+ * Serialize logging for checksums. This is to avoid racing with the
+ * same checksum being logged by another task that is logging another
+ * file which happens to refer to the same extent as well. Such races
+ * can leave checksum items in the log with overlapping ranges.
+ */
+ ret = lock_extent(&log_root->log_csum_range, sums->bytenr, lock_end,
+ &cached_state);
+ if (ret)
+ return ret;
+ /*
+ * Due to extent cloning, we might have logged a csum item that covers a
+ * subrange of a cloned extent, and later we can end up logging a csum
+ * item for a larger subrange of the same extent or the entire range.
+ * This would leave csum items in the log tree that cover the same range
+ * and break the searches for checksums in the log tree, resulting in
+ * some checksums missing in the fs/subvolume tree. So just delete (or
+ * trim and adjust) any existing csum items in the log for this range.
+ */
+ ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len);
+ if (!ret)
+ ret = btrfs_csum_file_blocks(trans, log_root, sums);
+
+ unlock_extent(&log_root->log_csum_range, sums->bytenr, lock_end,
+ &cached_state);
+
+ return ret;
+}
+
+static noinline int copy_items(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *dst_path,
+ struct btrfs_path *src_path,
+ int start_slot, int nr, int inode_only,
+ u64 logged_isize)
+{
+ struct btrfs_root *log = inode->root->log_root;
+ struct btrfs_file_extent_item *extent;
+ struct extent_buffer *src;
+ int ret = 0;
+ struct btrfs_key *ins_keys;
+ u32 *ins_sizes;
+ struct btrfs_item_batch batch;
+ char *ins_data;
+ int i;
+ int dst_index;
+ const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM);
+ const u64 i_size = i_size_read(&inode->vfs_inode);
+
+ /*
+ * To keep lockdep happy and avoid deadlocks, clone the source leaf and
+ * use the clone. This is because otherwise we would be changing the log
+ * tree, to insert items from the subvolume tree or insert csum items,
+ * while holding a read lock on a leaf from the subvolume tree, which
+ * creates a nasty lock dependency when COWing log tree nodes/leaves:
+ *
+ * 1) Modifying the log tree triggers an extent buffer allocation while
+ * holding a write lock on a parent extent buffer from the log tree.
+ * Allocating the pages for an extent buffer, or the extent buffer
+ * struct, can trigger inode eviction and finally the inode eviction
+ * will trigger a release/remove of a delayed node, which requires
+ * taking the delayed node's mutex;
+ *
+ * 2) Allocating a metadata extent for a log tree can trigger the async
+ * reclaim thread and make us wait for it to release enough space and
+ * unblock our reservation ticket. The reclaim thread can start
+ * flushing delayed items, and that in turn results in the need to
+ * lock delayed node mutexes and in the need to write lock extent
+ * buffers of a subvolume tree - all this while holding a write lock
+ * on the parent extent buffer in the log tree.
+ *
+ * So one task in scenario 1) running in parallel with another task in
+ * scenario 2) could lead to a deadlock, one wanting to lock a delayed
+ * node mutex while having a read lock on a leaf from the subvolume,
+ * while the other is holding the delayed node's mutex and wants to
+ * write lock the same subvolume leaf for flushing delayed items.
+ */
+ src = btrfs_clone_extent_buffer(src_path->nodes[0]);
+ if (!src)
+ return -ENOMEM;
+
+ i = src_path->slots[0];
+ btrfs_release_path(src_path);
+ src_path->nodes[0] = src;
+ src_path->slots[0] = i;
+
+ ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
+ nr * sizeof(u32), GFP_NOFS);
+ if (!ins_data)
+ return -ENOMEM;
+
+ ins_sizes = (u32 *)ins_data;
+ ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
+ batch.keys = ins_keys;
+ batch.data_sizes = ins_sizes;
+ batch.total_data_size = 0;
+ batch.nr = 0;
+
+ dst_index = 0;
+ for (i = 0; i < nr; i++) {
+ const int src_slot = start_slot + i;
+ struct btrfs_root *csum_root;
+ struct btrfs_ordered_sum *sums;
+ struct btrfs_ordered_sum *sums_next;
+ LIST_HEAD(ordered_sums);
+ u64 disk_bytenr;
+ u64 disk_num_bytes;
+ u64 extent_offset;
+ u64 extent_num_bytes;
+ bool is_old_extent;
+
+ btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot);
+
+ if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY)
+ goto add_to_batch;
+
+ extent = btrfs_item_ptr(src, src_slot,
+ struct btrfs_file_extent_item);
+
+ is_old_extent = (btrfs_file_extent_generation(src, extent) <
+ trans->transid);
+
+ /*
+ * Don't copy extents from past generations. That would make us
+ * log a lot more metadata for common cases like doing only a
+ * few random writes into a file and then fsync it for the first
+ * time or after the full sync flag is set on the inode. We can
+ * get leaves full of extent items, most of which are from past
+ * generations, so we can skip them - as long as the inode has
+ * not been the target of a reflink operation in this transaction,
+ * as in that case it might have had file extent items with old
+ * generations copied into it. We also must always log prealloc
+ * extents that start at or beyond eof, otherwise we would lose
+ * them on log replay.
+ */
+ if (is_old_extent &&
+ ins_keys[dst_index].offset < i_size &&
+ inode->last_reflink_trans < trans->transid)
+ continue;
+
+ if (skip_csum)
+ goto add_to_batch;
+
+ /* Only regular extents have checksums. */
+ if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG)
+ goto add_to_batch;
+
+ /*
+ * If it's an extent created in a past transaction, then its
+ * checksums are already accessible from the committed csum tree,
+ * no need to log them.
+ */
+ if (is_old_extent)
+ goto add_to_batch;
+
+ disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent);
+ /* If it's an explicit hole, there are no checksums. */
+ if (disk_bytenr == 0)
+ goto add_to_batch;
+
+ disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent);
+
+ if (btrfs_file_extent_compression(src, extent)) {
+ extent_offset = 0;
+ extent_num_bytes = disk_num_bytes;
+ } else {
+ extent_offset = btrfs_file_extent_offset(src, extent);
+ extent_num_bytes = btrfs_file_extent_num_bytes(src, extent);
+ }
+
+ csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr);
+ disk_bytenr += extent_offset;
+ ret = btrfs_lookup_csums_range(csum_root, disk_bytenr,
+ disk_bytenr + extent_num_bytes - 1,
+ &ordered_sums, 0, false);
+ if (ret)
+ goto out;
+
+ list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) {
+ if (!ret)
+ ret = log_csums(trans, inode, log, sums);
+ list_del(&sums->list);
+ kfree(sums);
+ }
+ if (ret)
+ goto out;
+
+add_to_batch:
+ ins_sizes[dst_index] = btrfs_item_size(src, src_slot);
+ batch.total_data_size += ins_sizes[dst_index];
+ batch.nr++;
+ dst_index++;
+ }
+
+ /*
+ * We have a leaf full of old extent items that don't need to be logged,
+ * so we don't need to do anything.
+ */
+ if (batch.nr == 0)
+ goto out;
+
+ ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
+ if (ret)
+ goto out;
+
+ dst_index = 0;
+ for (i = 0; i < nr; i++) {
+ const int src_slot = start_slot + i;
+ const int dst_slot = dst_path->slots[0] + dst_index;
+ struct btrfs_key key;
+ unsigned long src_offset;
+ unsigned long dst_offset;
+
+ /*
+ * We're done, all the remaining items in the source leaf
+ * correspond to old file extent items.
+ */
+ if (dst_index >= batch.nr)
+ break;
+
+ btrfs_item_key_to_cpu(src, &key, src_slot);
+
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ goto copy_item;
+
+ extent = btrfs_item_ptr(src, src_slot,
+ struct btrfs_file_extent_item);
+
+ /* See the comment in the previous loop, same logic. */
+ if (btrfs_file_extent_generation(src, extent) < trans->transid &&
+ key.offset < i_size &&
+ inode->last_reflink_trans < trans->transid)
+ continue;
+
+copy_item:
+ dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot);
+ src_offset = btrfs_item_ptr_offset(src, src_slot);
+
+ if (key.type == BTRFS_INODE_ITEM_KEY) {
+ struct btrfs_inode_item *inode_item;
+
+ inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot,
+ struct btrfs_inode_item);
+ fill_inode_item(trans, dst_path->nodes[0], inode_item,
+ &inode->vfs_inode,
+ inode_only == LOG_INODE_EXISTS,
+ logged_isize);
+ } else {
+ copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
+ src_offset, ins_sizes[dst_index]);
+ }
+
+ dst_index++;
+ }
+
+ btrfs_mark_buffer_dirty(dst_path->nodes[0]);
+ btrfs_release_path(dst_path);
+out:
+ kfree(ins_data);
+
+ return ret;
+}
+
+static int extent_cmp(void *priv, const struct list_head *a,
+ const struct list_head *b)
+{
+ const struct extent_map *em1, *em2;
+
+ em1 = list_entry(a, struct extent_map, list);
+ em2 = list_entry(b, struct extent_map, list);
+
+ if (em1->start < em2->start)
+ return -1;
+ else if (em1->start > em2->start)
+ return 1;
+ return 0;
+}
+
+static int log_extent_csums(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_root *log_root,
+ const struct extent_map *em,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_root *csum_root;
+ u64 csum_offset;
+ u64 csum_len;
+ u64 mod_start = em->mod_start;
+ u64 mod_len = em->mod_len;
+ LIST_HEAD(ordered_sums);
+ int ret = 0;
+
+ if (inode->flags & BTRFS_INODE_NODATASUM ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+ em->block_start == EXTENT_MAP_HOLE)
+ return 0;
+
+ list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
+ const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
+ const u64 mod_end = mod_start + mod_len;
+ struct btrfs_ordered_sum *sums;
+
+ if (mod_len == 0)
+ break;
+
+ if (ordered_end <= mod_start)
+ continue;
+ if (mod_end <= ordered->file_offset)
+ break;
+
+ /*
+ * We are going to copy all the csums on this ordered extent, so
+ * go ahead and adjust mod_start and mod_len in case this ordered
+ * extent has already been logged.
+ */
+ if (ordered->file_offset > mod_start) {
+ if (ordered_end >= mod_end)
+ mod_len = ordered->file_offset - mod_start;
+ /*
+ * If we have this case
+ *
+ * |--------- logged extent ---------|
+ * |----- ordered extent ----|
+ *
+ * Just don't mess with mod_start and mod_len, we'll
+ * just end up logging more csums than we need and it
+ * will be ok.
+ */
+ } else {
+ if (ordered_end < mod_end) {
+ mod_len = mod_end - ordered_end;
+ mod_start = ordered_end;
+ } else {
+ mod_len = 0;
+ }
+ }
+
+ /*
+ * To keep us from looping for the above case of an ordered
+ * extent that falls inside of the logged extent.
+ */
+ if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
+ continue;
+
+ list_for_each_entry(sums, &ordered->list, list) {
+ ret = log_csums(trans, inode, log_root, sums);
+ if (ret)
+ return ret;
+ }
+ }
+
+ /* We're done, found all csums in the ordered extents. */
+ if (mod_len == 0)
+ return 0;
+
+ /* If we're compressed we have to save the entire range of csums. */
+ if (em->compress_type) {
+ csum_offset = 0;
+ csum_len = max(em->block_len, em->orig_block_len);
+ } else {
+ csum_offset = mod_start - em->start;
+ csum_len = mod_len;
+ }
+
+ /* block start is already adjusted for the file extent offset. */
+ csum_root = btrfs_csum_root(trans->fs_info, em->block_start);
+ ret = btrfs_lookup_csums_range(csum_root,
+ em->block_start + csum_offset,
+ em->block_start + csum_offset +
+ csum_len - 1, &ordered_sums, 0, false);
+ if (ret)
+ return ret;
+
+ while (!list_empty(&ordered_sums)) {
+ struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
+ struct btrfs_ordered_sum,
+ list);
+ if (!ret)
+ ret = log_csums(trans, inode, log_root, sums);
+ list_del(&sums->list);
+ kfree(sums);
+ }
+
+ return ret;
+}
+
+static int log_one_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ const struct extent_map *em,
+ struct btrfs_path *path,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_drop_extents_args drop_args = { 0 };
+ struct btrfs_root *log = inode->root->log_root;
+ struct btrfs_file_extent_item fi = { 0 };
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ u64 extent_offset = em->start - em->orig_start;
+ u64 block_len;
+ int ret;
+
+ btrfs_set_stack_file_extent_generation(&fi, trans->transid);
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC);
+ else
+ btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG);
+
+ block_len = max(em->block_len, em->orig_block_len);
+ if (em->compress_type != BTRFS_COMPRESS_NONE) {
+ btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start);
+ btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
+ } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+ btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start -
+ extent_offset);
+ btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
+ }
+
+ btrfs_set_stack_file_extent_offset(&fi, extent_offset);
+ btrfs_set_stack_file_extent_num_bytes(&fi, em->len);
+ btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes);
+ btrfs_set_stack_file_extent_compression(&fi, em->compress_type);
+
+ ret = log_extent_csums(trans, inode, log, em, ctx);
+ if (ret)
+ return ret;
+
+ /*
+ * If this is the first time we are logging the inode in the current
+ * transaction, we can avoid btrfs_drop_extents(), which is expensive
+ * because it does a deletion search, which always acquires write locks
+ * for extent buffers at levels 2, 1 and 0. This not only wastes time
+ * but also adds significant contention in a log tree, since log trees
+ * are small, with a root at level 2 or 3 at most, due to their short
+ * life span.
+ */
+ if (ctx->logged_before) {
+ drop_args.path = path;
+ drop_args.start = em->start;
+ drop_args.end = em->start + em->len;
+ drop_args.replace_extent = true;
+ drop_args.extent_item_size = sizeof(fi);
+ ret = btrfs_drop_extents(trans, log, inode, &drop_args);
+ if (ret)
+ return ret;
+ }
+
+ if (!drop_args.extent_inserted) {
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = em->start;
+
+ ret = btrfs_insert_empty_item(trans, log, path, &key,
+ sizeof(fi));
+ if (ret)
+ return ret;
+ }
+ leaf = path->nodes[0];
+ write_extent_buffer(leaf, &fi,
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
+ sizeof(fi));
+ btrfs_mark_buffer_dirty(leaf);
+
+ btrfs_release_path(path);
+
+ return ret;
+}
+
+/*
+ * Log all prealloc extents beyond the inode's i_size to make sure we do not
+ * lose them after doing a full/fast fsync and replaying the log. We scan the
+ * subvolume's root instead of iterating the inode's extent map tree because
+ * otherwise we can log incorrect extent items based on extent map conversion.
+ * That can happen due to the fact that extent maps are merged when they
+ * are not in the extent map tree's list of modified extents.
+ */
+static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_key key;
+ const u64 i_size = i_size_read(&inode->vfs_inode);
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_path *dst_path = NULL;
+ bool dropped_extents = false;
+ u64 truncate_offset = i_size;
+ struct extent_buffer *leaf;
+ int slot;
+ int ins_nr = 0;
+ int start_slot = 0;
+ int ret;
+
+ if (!(inode->flags & BTRFS_INODE_PREALLOC))
+ return 0;
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = i_size;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * We must check if there is a prealloc extent that starts before the
+ * i_size and crosses the i_size boundary. This is to ensure later we
+ * truncate down to the end of that extent and not to the i_size, as
+ * otherwise we end up losing part of the prealloc extent after a log
+ * replay and with an implicit hole if there is another prealloc extent
+ * that starts at an offset beyond i_size.
+ */
+ ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
+ if (ret < 0)
+ goto out;
+
+ if (ret == 0) {
+ struct btrfs_file_extent_item *ei;
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_type(leaf, ei) ==
+ BTRFS_FILE_EXTENT_PREALLOC) {
+ u64 extent_end;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ extent_end = key.offset +
+ btrfs_file_extent_num_bytes(leaf, ei);
+
+ if (extent_end > i_size)
+ truncate_offset = extent_end;
+ }
+ } else {
+ ret = 0;
+ }
+
+ while (true) {
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ if (ins_nr > 0) {
+ ret = copy_items(trans, inode, dst_path, path,
+ start_slot, ins_nr, 1, 0);
+ if (ret < 0)
+ goto out;
+ ins_nr = 0;
+ }
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid > ino)
+ break;
+ if (WARN_ON_ONCE(key.objectid < ino) ||
+ key.type < BTRFS_EXTENT_DATA_KEY ||
+ key.offset < i_size) {
+ path->slots[0]++;
+ continue;
+ }
+ if (!dropped_extents) {
+ /*
+ * Avoid logging extent items logged in past fsync calls
+ * and leading to duplicate keys in the log tree.
+ */
+ ret = truncate_inode_items(trans, root->log_root, inode,
+ truncate_offset,
+ BTRFS_EXTENT_DATA_KEY);
+ if (ret)
+ goto out;
+ dropped_extents = true;
+ }
+ if (ins_nr == 0)
+ start_slot = slot;
+ ins_nr++;
+ path->slots[0]++;
+ if (!dst_path) {
+ dst_path = btrfs_alloc_path();
+ if (!dst_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ }
+ if (ins_nr > 0)
+ ret = copy_items(trans, inode, dst_path, path,
+ start_slot, ins_nr, 1, 0);
+out:
+ btrfs_release_path(path);
+ btrfs_free_path(dst_path);
+ return ret;
+}
+
+static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_ordered_extent *tmp;
+ struct extent_map *em, *n;
+ struct list_head extents;
+ struct extent_map_tree *tree = &inode->extent_tree;
+ int ret = 0;
+ int num = 0;
+
+ INIT_LIST_HEAD(&extents);
+
+ write_lock(&tree->lock);
+
+ list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
+ list_del_init(&em->list);
+ /*
+ * Just an arbitrary number, this can be really CPU intensive
+ * once we start getting a lot of extents, and really once we
+ * have a bunch of extents we just want to commit since it will
+ * be faster.
+ */
+ if (++num > 32768) {
+ list_del_init(&tree->modified_extents);
+ ret = -EFBIG;
+ goto process;
+ }
+
+ if (em->generation < trans->transid)
+ continue;
+
+ /* We log prealloc extents beyond eof later. */
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
+ em->start >= i_size_read(&inode->vfs_inode))
+ continue;
+
+ /* Need a ref to keep it from getting evicted from cache */
+ refcount_inc(&em->refs);
+ set_bit(EXTENT_FLAG_LOGGING, &em->flags);
+ list_add_tail(&em->list, &extents);
+ num++;
+ }
+
+ list_sort(NULL, &extents, extent_cmp);
+process:
+ while (!list_empty(&extents)) {
+ em = list_entry(extents.next, struct extent_map, list);
+
+ list_del_init(&em->list);
+
+ /*
+ * If we had an error we just need to delete everybody from our
+ * private list.
+ */
+ if (ret) {
+ clear_em_logging(tree, em);
+ free_extent_map(em);
+ continue;
+ }
+
+ write_unlock(&tree->lock);
+
+ ret = log_one_extent(trans, inode, em, path, ctx);
+ write_lock(&tree->lock);
+ clear_em_logging(tree, em);
+ free_extent_map(em);
+ }
+ WARN_ON(!list_empty(&extents));
+ write_unlock(&tree->lock);
+
+ if (!ret)
+ ret = btrfs_log_prealloc_extents(trans, inode, path);
+ if (ret)
+ return ret;
+
+ /*
+ * We have logged all extents successfully, now make sure the commit of
+ * the current transaction waits for the ordered extents to complete
+ * before it commits and wipes out the log trees, otherwise we would
+ * lose data if an ordered extents completes after the transaction
+ * commits and a power failure happens after the transaction commit.
+ */
+ list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
+ list_del_init(&ordered->log_list);
+ set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
+
+ if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
+ spin_lock_irq(&inode->ordered_tree.lock);
+ if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
+ set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
+ atomic_inc(&trans->transaction->pending_ordered);
+ }
+ spin_unlock_irq(&inode->ordered_tree.lock);
+ }
+ btrfs_put_ordered_extent(ordered);
+ }
+
+ return 0;
+}
+
+static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
+ struct btrfs_path *path, u64 *size_ret)
+{
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ *size_ret = 0;
+ } else {
+ struct btrfs_inode_item *item;
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_item);
+ *size_ret = btrfs_inode_size(path->nodes[0], item);
+ /*
+ * If the in-memory inode's i_size is smaller then the inode
+ * size stored in the btree, return the inode's i_size, so
+ * that we get a correct inode size after replaying the log
+ * when before a power failure we had a shrinking truncate
+ * followed by addition of a new name (rename / new hard link).
+ * Otherwise return the inode size from the btree, to avoid
+ * data loss when replaying a log due to previously doing a
+ * write that expands the inode's size and logging a new name
+ * immediately after.
+ */
+ if (*size_ret > inode->vfs_inode.i_size)
+ *size_ret = inode->vfs_inode.i_size;
+ }
+
+ btrfs_release_path(path);
+ return 0;
+}
+
+/*
+ * At the moment we always log all xattrs. This is to figure out at log replay
+ * time which xattrs must have their deletion replayed. If a xattr is missing
+ * in the log tree and exists in the fs/subvol tree, we delete it. This is
+ * because if a xattr is deleted, the inode is fsynced and a power failure
+ * happens, causing the log to be replayed the next time the fs is mounted,
+ * we want the xattr to not exist anymore (same behaviour as other filesystems
+ * with a journal, ext3/4, xfs, f2fs, etc).
+ */
+static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_path *dst_path)
+{
+ struct btrfs_root *root = inode->root;
+ int ret;
+ struct btrfs_key key;
+ const u64 ino = btrfs_ino(inode);
+ int ins_nr = 0;
+ int start_slot = 0;
+ bool found_xattrs = false;
+
+ if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
+ return 0;
+
+ key.objectid = ino;
+ key.type = BTRFS_XATTR_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ while (true) {
+ int slot = path->slots[0];
+ struct extent_buffer *leaf = path->nodes[0];
+ int nritems = btrfs_header_nritems(leaf);
+
+ if (slot >= nritems) {
+ if (ins_nr > 0) {
+ ret = copy_items(trans, inode, dst_path, path,
+ start_slot, ins_nr, 1, 0);
+ if (ret < 0)
+ return ret;
+ ins_nr = 0;
+ }
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
+ break;
+
+ if (ins_nr == 0)
+ start_slot = slot;
+ ins_nr++;
+ path->slots[0]++;
+ found_xattrs = true;
+ cond_resched();
+ }
+ if (ins_nr > 0) {
+ ret = copy_items(trans, inode, dst_path, path,
+ start_slot, ins_nr, 1, 0);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (!found_xattrs)
+ set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
+
+ return 0;
+}
+
+/*
+ * When using the NO_HOLES feature if we punched a hole that causes the
+ * deletion of entire leafs or all the extent items of the first leaf (the one
+ * that contains the inode item and references) we may end up not processing
+ * any extents, because there are no leafs with a generation matching the
+ * current transaction that have extent items for our inode. So we need to find
+ * if any holes exist and then log them. We also need to log holes after any
+ * truncate operation that changes the inode's size.
+ */
+static int btrfs_log_holes(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_key key;
+ const u64 ino = btrfs_ino(inode);
+ const u64 i_size = i_size_read(&inode->vfs_inode);
+ u64 prev_extent_end = 0;
+ int ret;
+
+ if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
+ return 0;
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ while (true) {
+ struct extent_buffer *leaf = path->nodes[0];
+
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
+ break;
+
+ /* We have a hole, log it. */
+ if (prev_extent_end < key.offset) {
+ const u64 hole_len = key.offset - prev_extent_end;
+
+ /*
+ * Release the path to avoid deadlocks with other code
+ * paths that search the root while holding locks on
+ * leafs from the log root.
+ */
+ btrfs_release_path(path);
+ ret = btrfs_insert_hole_extent(trans, root->log_root,
+ ino, prev_extent_end,
+ hole_len);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * Search for the same key again in the root. Since it's
+ * an extent item and we are holding the inode lock, the
+ * key must still exist. If it doesn't just emit warning
+ * and return an error to fall back to a transaction
+ * commit.
+ */
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+ if (WARN_ON(ret > 0))
+ return -ENOENT;
+ leaf = path->nodes[0];
+ }
+
+ prev_extent_end = btrfs_file_extent_end(path);
+ path->slots[0]++;
+ cond_resched();
+ }
+
+ if (prev_extent_end < i_size) {
+ u64 hole_len;
+
+ btrfs_release_path(path);
+ hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
+ ret = btrfs_insert_hole_extent(trans, root->log_root, ino,
+ prev_extent_end, hole_len);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * When we are logging a new inode X, check if it doesn't have a reference that
+ * matches the reference from some other inode Y created in a past transaction
+ * and that was renamed in the current transaction. If we don't do this, then at
+ * log replay time we can lose inode Y (and all its files if it's a directory):
+ *
+ * mkdir /mnt/x
+ * echo "hello world" > /mnt/x/foobar
+ * sync
+ * mv /mnt/x /mnt/y
+ * mkdir /mnt/x # or touch /mnt/x
+ * xfs_io -c fsync /mnt/x
+ * <power fail>
+ * mount fs, trigger log replay
+ *
+ * After the log replay procedure, we would lose the first directory and all its
+ * files (file foobar).
+ * For the case where inode Y is not a directory we simply end up losing it:
+ *
+ * echo "123" > /mnt/foo
+ * sync
+ * mv /mnt/foo /mnt/bar
+ * echo "abc" > /mnt/foo
+ * xfs_io -c fsync /mnt/foo
+ * <power fail>
+ *
+ * We also need this for cases where a snapshot entry is replaced by some other
+ * entry (file or directory) otherwise we end up with an unreplayable log due to
+ * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
+ * if it were a regular entry:
+ *
+ * mkdir /mnt/x
+ * btrfs subvolume snapshot /mnt /mnt/x/snap
+ * btrfs subvolume delete /mnt/x/snap
+ * rmdir /mnt/x
+ * mkdir /mnt/x
+ * fsync /mnt/x or fsync some new file inside it
+ * <power fail>
+ *
+ * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
+ * the same transaction.
+ */
+static int btrfs_check_ref_name_override(struct extent_buffer *eb,
+ const int slot,
+ const struct btrfs_key *key,
+ struct btrfs_inode *inode,
+ u64 *other_ino, u64 *other_parent)
+{
+ int ret;
+ struct btrfs_path *search_path;
+ char *name = NULL;
+ u32 name_len = 0;
+ u32 item_size = btrfs_item_size(eb, slot);
+ u32 cur_offset = 0;
+ unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
+
+ search_path = btrfs_alloc_path();
+ if (!search_path)
+ return -ENOMEM;
+ search_path->search_commit_root = 1;
+ search_path->skip_locking = 1;
+
+ while (cur_offset < item_size) {
+ u64 parent;
+ u32 this_name_len;
+ u32 this_len;
+ unsigned long name_ptr;
+ struct btrfs_dir_item *di;
+ struct fscrypt_str name_str;
+
+ if (key->type == BTRFS_INODE_REF_KEY) {
+ struct btrfs_inode_ref *iref;
+
+ iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
+ parent = key->offset;
+ this_name_len = btrfs_inode_ref_name_len(eb, iref);
+ name_ptr = (unsigned long)(iref + 1);
+ this_len = sizeof(*iref) + this_name_len;
+ } else {
+ struct btrfs_inode_extref *extref;
+
+ extref = (struct btrfs_inode_extref *)(ptr +
+ cur_offset);
+ parent = btrfs_inode_extref_parent(eb, extref);
+ this_name_len = btrfs_inode_extref_name_len(eb, extref);
+ name_ptr = (unsigned long)&extref->name;
+ this_len = sizeof(*extref) + this_name_len;
+ }
+
+ if (this_name_len > name_len) {
+ char *new_name;
+
+ new_name = krealloc(name, this_name_len, GFP_NOFS);
+ if (!new_name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ name_len = this_name_len;
+ name = new_name;
+ }
+
+ read_extent_buffer(eb, name, name_ptr, this_name_len);
+
+ name_str.name = name;
+ name_str.len = this_name_len;
+ di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
+ parent, &name_str, 0);
+ if (di && !IS_ERR(di)) {
+ struct btrfs_key di_key;
+
+ btrfs_dir_item_key_to_cpu(search_path->nodes[0],
+ di, &di_key);
+ if (di_key.type == BTRFS_INODE_ITEM_KEY) {
+ if (di_key.objectid != key->objectid) {
+ ret = 1;
+ *other_ino = di_key.objectid;
+ *other_parent = parent;
+ } else {
+ ret = 0;
+ }
+ } else {
+ ret = -EAGAIN;
+ }
+ goto out;
+ } else if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ }
+ btrfs_release_path(search_path);
+
+ cur_offset += this_len;
+ }
+ ret = 0;
+out:
+ btrfs_free_path(search_path);
+ kfree(name);
+ return ret;
+}
+
+/*
+ * Check if we need to log an inode. This is used in contexts where while
+ * logging an inode we need to log another inode (either that it exists or in
+ * full mode). This is used instead of btrfs_inode_in_log() because the later
+ * requires the inode to be in the log and have the log transaction committed,
+ * while here we do not care if the log transaction was already committed - our
+ * caller will commit the log later - and we want to avoid logging an inode
+ * multiple times when multiple tasks have joined the same log transaction.
+ */
+static bool need_log_inode(const struct btrfs_trans_handle *trans,
+ const struct btrfs_inode *inode)
+{
+ /*
+ * If a directory was not modified, no dentries added or removed, we can
+ * and should avoid logging it.
+ */
+ if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
+ return false;
+
+ /*
+ * If this inode does not have new/updated/deleted xattrs since the last
+ * time it was logged and is flagged as logged in the current transaction,
+ * we can skip logging it. As for new/deleted names, those are updated in
+ * the log by link/unlink/rename operations.
+ * In case the inode was logged and then evicted and reloaded, its
+ * logged_trans will be 0, in which case we have to fully log it since
+ * logged_trans is a transient field, not persisted.
+ */
+ if (inode->logged_trans == trans->transid &&
+ !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
+ return false;
+
+ return true;
+}
+
+struct btrfs_dir_list {
+ u64 ino;
+ struct list_head list;
+};
+
+/*
+ * Log the inodes of the new dentries of a directory.
+ * See process_dir_items_leaf() for details about why it is needed.
+ * This is a recursive operation - if an existing dentry corresponds to a
+ * directory, that directory's new entries are logged too (same behaviour as
+ * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
+ * the dentries point to we do not acquire their VFS lock, otherwise lockdep
+ * complains about the following circular lock dependency / possible deadlock:
+ *
+ * CPU0 CPU1
+ * ---- ----
+ * lock(&type->i_mutex_dir_key#3/2);
+ * lock(sb_internal#2);
+ * lock(&type->i_mutex_dir_key#3/2);
+ * lock(&sb->s_type->i_mutex_key#14);
+ *
+ * Where sb_internal is the lock (a counter that works as a lock) acquired by
+ * sb_start_intwrite() in btrfs_start_transaction().
+ * Not acquiring the VFS lock of the inodes is still safe because:
+ *
+ * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
+ * that while logging the inode new references (names) are added or removed
+ * from the inode, leaving the logged inode item with a link count that does
+ * not match the number of logged inode reference items. This is fine because
+ * at log replay time we compute the real number of links and correct the
+ * link count in the inode item (see replay_one_buffer() and
+ * link_to_fixup_dir());
+ *
+ * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
+ * while logging the inode's items new index items (key type
+ * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
+ * has a size that doesn't match the sum of the lengths of all the logged
+ * names - this is ok, not a problem, because at log replay time we set the
+ * directory's i_size to the correct value (see replay_one_name() and
+ * do_overwrite_item()).
+ */
+static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *start_inode,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *root = start_inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_path *path;
+ LIST_HEAD(dir_list);
+ struct btrfs_dir_list *dir_elem;
+ u64 ino = btrfs_ino(start_inode);
+ int ret = 0;
+
+ /*
+ * If we are logging a new name, as part of a link or rename operation,
+ * don't bother logging new dentries, as we just want to log the names
+ * of an inode and that any new parents exist.
+ */
+ if (ctx->logging_new_name)
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ while (true) {
+ struct extent_buffer *leaf;
+ struct btrfs_key min_key;
+ bool continue_curr_inode = true;
+ int nritems;
+ int i;
+
+ min_key.objectid = ino;
+ min_key.type = BTRFS_DIR_INDEX_KEY;
+ min_key.offset = 0;
+again:
+ btrfs_release_path(path);
+ ret = btrfs_search_forward(root, &min_key, path, trans->transid);
+ if (ret < 0) {
+ break;
+ } else if (ret > 0) {
+ ret = 0;
+ goto next;
+ }
+
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ for (i = path->slots[0]; i < nritems; i++) {
+ struct btrfs_dir_item *di;
+ struct btrfs_key di_key;
+ struct inode *di_inode;
+ int log_mode = LOG_INODE_EXISTS;
+ int type;
+
+ btrfs_item_key_to_cpu(leaf, &min_key, i);
+ if (min_key.objectid != ino ||
+ min_key.type != BTRFS_DIR_INDEX_KEY) {
+ continue_curr_inode = false;
+ break;
+ }
+
+ di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
+ type = btrfs_dir_type(leaf, di);
+ if (btrfs_dir_transid(leaf, di) < trans->transid)
+ continue;
+ btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
+ if (di_key.type == BTRFS_ROOT_ITEM_KEY)
+ continue;
+
+ btrfs_release_path(path);
+ di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
+ if (IS_ERR(di_inode)) {
+ ret = PTR_ERR(di_inode);
+ goto out;
+ }
+
+ if (!need_log_inode(trans, BTRFS_I(di_inode))) {
+ btrfs_add_delayed_iput(di_inode);
+ break;
+ }
+
+ ctx->log_new_dentries = false;
+ if (type == BTRFS_FT_DIR)
+ log_mode = LOG_INODE_ALL;
+ ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
+ log_mode, ctx);
+ btrfs_add_delayed_iput(di_inode);
+ if (ret)
+ goto out;
+ if (ctx->log_new_dentries) {
+ dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
+ if (!dir_elem) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ dir_elem->ino = di_key.objectid;
+ list_add_tail(&dir_elem->list, &dir_list);
+ }
+ break;
+ }
+
+ if (continue_curr_inode && min_key.offset < (u64)-1) {
+ min_key.offset++;
+ goto again;
+ }
+
+next:
+ if (list_empty(&dir_list))
+ break;
+
+ dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, list);
+ ino = dir_elem->ino;
+ list_del(&dir_elem->list);
+ kfree(dir_elem);
+ }
+out:
+ btrfs_free_path(path);
+ if (ret) {
+ struct btrfs_dir_list *next;
+
+ list_for_each_entry_safe(dir_elem, next, &dir_list, list)
+ kfree(dir_elem);
+ }
+
+ return ret;
+}
+
+struct btrfs_ino_list {
+ u64 ino;
+ u64 parent;
+ struct list_head list;
+};
+
+static void free_conflicting_inodes(struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_ino_list *curr;
+ struct btrfs_ino_list *next;
+
+ list_for_each_entry_safe(curr, next, &ctx->conflict_inodes, list) {
+ list_del(&curr->list);
+ kfree(curr);
+ }
+}
+
+static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino,
+ struct btrfs_path *path)
+{
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (WARN_ON_ONCE(ret > 0)) {
+ /*
+ * We have previously found the inode through the commit root
+ * so this should not happen. If it does, just error out and
+ * fallback to a transaction commit.
+ */
+ ret = -ENOENT;
+ } else if (ret == 0) {
+ struct btrfs_inode_item *item;
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_item);
+ if (S_ISDIR(btrfs_inode_mode(path->nodes[0], item)))
+ ret = 1;
+ }
+
+ btrfs_release_path(path);
+ path->search_commit_root = 0;
+ path->skip_locking = 0;
+
+ return ret;
+}
+
+static int add_conflicting_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 ino, u64 parent,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_ino_list *ino_elem;
+ struct inode *inode;
+
+ /*
+ * It's rare to have a lot of conflicting inodes, in practice it is not
+ * common to have more than 1 or 2. We don't want to collect too many,
+ * as we could end up logging too many inodes (even if only in
+ * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction
+ * commits.
+ */
+ if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) {
+ btrfs_set_log_full_commit(trans);
+ return BTRFS_LOG_FORCE_COMMIT;
+ }
+
+ inode = btrfs_iget(root->fs_info->sb, ino, root);
+ /*
+ * If the other inode that had a conflicting dir entry was deleted in
+ * the current transaction then we either:
+ *
+ * 1) Log the parent directory (later after adding it to the list) if
+ * the inode is a directory. This is because it may be a deleted
+ * subvolume/snapshot or it may be a regular directory that had
+ * deleted subvolumes/snapshots (or subdirectories that had them),
+ * and at the moment we can't deal with dropping subvolumes/snapshots
+ * during log replay. So we just log the parent, which will result in
+ * a fallback to a transaction commit if we are dealing with those
+ * cases (last_unlink_trans will match the current transaction);
+ *
+ * 2) Do nothing if it's not a directory. During log replay we simply
+ * unlink the conflicting dentry from the parent directory and then
+ * add the dentry for our inode. Like this we can avoid logging the
+ * parent directory (and maybe fallback to a transaction commit in
+ * case it has a last_unlink_trans == trans->transid, due to moving
+ * some inode from it to some other directory).
+ */
+ if (IS_ERR(inode)) {
+ int ret = PTR_ERR(inode);
+
+ if (ret != -ENOENT)
+ return ret;
+
+ ret = conflicting_inode_is_dir(root, ino, path);
+ /* Not a directory or we got an error. */
+ if (ret <= 0)
+ return ret;
+
+ /* Conflicting inode is a directory, so we'll log its parent. */
+ ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
+ if (!ino_elem)
+ return -ENOMEM;
+ ino_elem->ino = ino;
+ ino_elem->parent = parent;
+ list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
+ ctx->num_conflict_inodes++;
+
+ return 0;
+ }
+
+ /*
+ * If the inode was already logged skip it - otherwise we can hit an
+ * infinite loop. Example:
+ *
+ * From the commit root (previous transaction) we have the following
+ * inodes:
+ *
+ * inode 257 a directory
+ * inode 258 with references "zz" and "zz_link" on inode 257
+ * inode 259 with reference "a" on inode 257
+ *
+ * And in the current (uncommitted) transaction we have:
+ *
+ * inode 257 a directory, unchanged
+ * inode 258 with references "a" and "a2" on inode 257
+ * inode 259 with reference "zz_link" on inode 257
+ * inode 261 with reference "zz" on inode 257
+ *
+ * When logging inode 261 the following infinite loop could
+ * happen if we don't skip already logged inodes:
+ *
+ * - we detect inode 258 as a conflicting inode, with inode 261
+ * on reference "zz", and log it;
+ *
+ * - we detect inode 259 as a conflicting inode, with inode 258
+ * on reference "a", and log it;
+ *
+ * - we detect inode 258 as a conflicting inode, with inode 259
+ * on reference "zz_link", and log it - again! After this we
+ * repeat the above steps forever.
+ *
+ * Here we can use need_log_inode() because we only need to log the
+ * inode in LOG_INODE_EXISTS mode and rename operations update the log,
+ * so that the log ends up with the new name and without the old name.
+ */
+ if (!need_log_inode(trans, BTRFS_I(inode))) {
+ btrfs_add_delayed_iput(inode);
+ return 0;
+ }
+
+ btrfs_add_delayed_iput(inode);
+
+ ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
+ if (!ino_elem)
+ return -ENOMEM;
+ ino_elem->ino = ino;
+ ino_elem->parent = parent;
+ list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
+ ctx->num_conflict_inodes++;
+
+ return 0;
+}
+
+static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret = 0;
+
+ /*
+ * Conflicting inodes are logged by the first call to btrfs_log_inode(),
+ * otherwise we could have unbounded recursion of btrfs_log_inode()
+ * calls. This check guarantees we can have only 1 level of recursion.
+ */
+ if (ctx->logging_conflict_inodes)
+ return 0;
+
+ ctx->logging_conflict_inodes = true;
+
+ /*
+ * New conflicting inodes may be found and added to the list while we
+ * are logging a conflicting inode, so keep iterating while the list is
+ * not empty.
+ */
+ while (!list_empty(&ctx->conflict_inodes)) {
+ struct btrfs_ino_list *curr;
+ struct inode *inode;
+ u64 ino;
+ u64 parent;
+
+ curr = list_first_entry(&ctx->conflict_inodes,
+ struct btrfs_ino_list, list);
+ ino = curr->ino;
+ parent = curr->parent;
+ list_del(&curr->list);
+ kfree(curr);
+
+ inode = btrfs_iget(fs_info->sb, ino, root);
+ /*
+ * If the other inode that had a conflicting dir entry was
+ * deleted in the current transaction, we need to log its parent
+ * directory. See the comment at add_conflicting_inode().
+ */
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ if (ret != -ENOENT)
+ break;
+
+ inode = btrfs_iget(fs_info->sb, parent, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ break;
+ }
+
+ /*
+ * Always log the directory, we cannot make this
+ * conditional on need_log_inode() because the directory
+ * might have been logged in LOG_INODE_EXISTS mode or
+ * the dir index of the conflicting inode is not in a
+ * dir index key range logged for the directory. So we
+ * must make sure the deletion is recorded.
+ */
+ ret = btrfs_log_inode(trans, BTRFS_I(inode),
+ LOG_INODE_ALL, ctx);
+ btrfs_add_delayed_iput(inode);
+ if (ret)
+ break;
+ continue;
+ }
+
+ /*
+ * Here we can use need_log_inode() because we only need to log
+ * the inode in LOG_INODE_EXISTS mode and rename operations
+ * update the log, so that the log ends up with the new name and
+ * without the old name.
+ *
+ * We did this check at add_conflicting_inode(), but here we do
+ * it again because if some other task logged the inode after
+ * that, we can avoid doing it again.
+ */
+ if (!need_log_inode(trans, BTRFS_I(inode))) {
+ btrfs_add_delayed_iput(inode);
+ continue;
+ }
+
+ /*
+ * We are safe logging the other inode without acquiring its
+ * lock as long as we log with the LOG_INODE_EXISTS mode. We
+ * are safe against concurrent renames of the other inode as
+ * well because during a rename we pin the log and update the
+ * log with the new name before we unpin it.
+ */
+ ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx);
+ btrfs_add_delayed_iput(inode);
+ if (ret)
+ break;
+ }
+
+ ctx->logging_conflict_inodes = false;
+ if (ret)
+ free_conflicting_inodes(ctx);
+
+ return ret;
+}
+
+static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_key *min_key,
+ const struct btrfs_key *max_key,
+ struct btrfs_path *path,
+ struct btrfs_path *dst_path,
+ const u64 logged_isize,
+ const int inode_only,
+ struct btrfs_log_ctx *ctx,
+ bool *need_log_inode_item)
+{
+ const u64 i_size = i_size_read(&inode->vfs_inode);
+ struct btrfs_root *root = inode->root;
+ int ins_start_slot = 0;
+ int ins_nr = 0;
+ int ret;
+
+ while (1) {
+ ret = btrfs_search_forward(root, min_key, path, trans->transid);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+again:
+ /* Note, ins_nr might be > 0 here, cleanup outside the loop */
+ if (min_key->objectid != max_key->objectid)
+ break;
+ if (min_key->type > max_key->type)
+ break;
+
+ if (min_key->type == BTRFS_INODE_ITEM_KEY) {
+ *need_log_inode_item = false;
+ } else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
+ min_key->offset >= i_size) {
+ /*
+ * Extents at and beyond eof are logged with
+ * btrfs_log_prealloc_extents().
+ * Only regular files have BTRFS_EXTENT_DATA_KEY keys,
+ * and no keys greater than that, so bail out.
+ */
+ break;
+ } else if ((min_key->type == BTRFS_INODE_REF_KEY ||
+ min_key->type == BTRFS_INODE_EXTREF_KEY) &&
+ (inode->generation == trans->transid ||
+ ctx->logging_conflict_inodes)) {
+ u64 other_ino = 0;
+ u64 other_parent = 0;
+
+ ret = btrfs_check_ref_name_override(path->nodes[0],
+ path->slots[0], min_key, inode,
+ &other_ino, &other_parent);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0 &&
+ other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
+ if (ins_nr > 0) {
+ ins_nr++;
+ } else {
+ ins_nr = 1;
+ ins_start_slot = path->slots[0];
+ }
+ ret = copy_items(trans, inode, dst_path, path,
+ ins_start_slot, ins_nr,
+ inode_only, logged_isize);
+ if (ret < 0)
+ return ret;
+ ins_nr = 0;
+
+ btrfs_release_path(path);
+ ret = add_conflicting_inode(trans, root, path,
+ other_ino,
+ other_parent, ctx);
+ if (ret)
+ return ret;
+ goto next_key;
+ }
+ } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
+ /* Skip xattrs, logged later with btrfs_log_all_xattrs() */
+ if (ins_nr == 0)
+ goto next_slot;
+ ret = copy_items(trans, inode, dst_path, path,
+ ins_start_slot,
+ ins_nr, inode_only, logged_isize);
+ if (ret < 0)
+ return ret;
+ ins_nr = 0;
+ goto next_slot;
+ }
+
+ if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
+ ins_nr++;
+ goto next_slot;
+ } else if (!ins_nr) {
+ ins_start_slot = path->slots[0];
+ ins_nr = 1;
+ goto next_slot;
+ }
+
+ ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
+ ins_nr, inode_only, logged_isize);
+ if (ret < 0)
+ return ret;
+ ins_nr = 1;
+ ins_start_slot = path->slots[0];
+next_slot:
+ path->slots[0]++;
+ if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
+ btrfs_item_key_to_cpu(path->nodes[0], min_key,
+ path->slots[0]);
+ goto again;
+ }
+ if (ins_nr) {
+ ret = copy_items(trans, inode, dst_path, path,
+ ins_start_slot, ins_nr, inode_only,
+ logged_isize);
+ if (ret < 0)
+ return ret;
+ ins_nr = 0;
+ }
+ btrfs_release_path(path);
+next_key:
+ if (min_key->offset < (u64)-1) {
+ min_key->offset++;
+ } else if (min_key->type < max_key->type) {
+ min_key->type++;
+ min_key->offset = 0;
+ } else {
+ break;
+ }
+
+ /*
+ * We may process many leaves full of items for our inode, so
+ * avoid monopolizing a cpu for too long by rescheduling while
+ * not holding locks on any tree.
+ */
+ cond_resched();
+ }
+ if (ins_nr) {
+ ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
+ ins_nr, inode_only, logged_isize);
+ if (ret)
+ return ret;
+ }
+
+ if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
+ /*
+ * Release the path because otherwise we might attempt to double
+ * lock the same leaf with btrfs_log_prealloc_extents() below.
+ */
+ btrfs_release_path(path);
+ ret = btrfs_log_prealloc_extents(trans, inode, dst_path);
+ }
+
+ return ret;
+}
+
+static int insert_delayed_items_batch(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ const struct btrfs_item_batch *batch,
+ const struct btrfs_delayed_item *first_item)
+{
+ const struct btrfs_delayed_item *curr = first_item;
+ int ret;
+
+ ret = btrfs_insert_empty_items(trans, log, path, batch);
+ if (ret)
+ return ret;
+
+ for (int i = 0; i < batch->nr; i++) {
+ char *data_ptr;
+
+ data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
+ write_extent_buffer(path->nodes[0], &curr->data,
+ (unsigned long)data_ptr, curr->data_len);
+ curr = list_next_entry(curr, log_list);
+ path->slots[0]++;
+ }
+
+ btrfs_release_path(path);
+
+ return 0;
+}
+
+static int log_delayed_insertion_items(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct list_head *delayed_ins_list,
+ struct btrfs_log_ctx *ctx)
+{
+ /* 195 (4095 bytes of keys and sizes) fits in a single 4K page. */
+ const int max_batch_size = 195;
+ const int leaf_data_size = BTRFS_LEAF_DATA_SIZE(trans->fs_info);
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_root *log = inode->root->log_root;
+ struct btrfs_item_batch batch = {
+ .nr = 0,
+ .total_data_size = 0,
+ };
+ const struct btrfs_delayed_item *first = NULL;
+ const struct btrfs_delayed_item *curr;
+ char *ins_data;
+ struct btrfs_key *ins_keys;
+ u32 *ins_sizes;
+ u64 curr_batch_size = 0;
+ int batch_idx = 0;
+ int ret;
+
+ /* We are adding dir index items to the log tree. */
+ lockdep_assert_held(&inode->log_mutex);
+
+ /*
+ * We collect delayed items before copying index keys from the subvolume
+ * to the log tree. However just after we collected them, they may have
+ * been flushed (all of them or just some of them), and therefore we
+ * could have copied them from the subvolume tree to the log tree.
+ * So find the first delayed item that was not yet logged (they are
+ * sorted by index number).
+ */
+ list_for_each_entry(curr, delayed_ins_list, log_list) {
+ if (curr->index > inode->last_dir_index_offset) {
+ first = curr;
+ break;
+ }
+ }
+
+ /* Empty list or all delayed items were already logged. */
+ if (!first)
+ return 0;
+
+ ins_data = kmalloc(max_batch_size * sizeof(u32) +
+ max_batch_size * sizeof(struct btrfs_key), GFP_NOFS);
+ if (!ins_data)
+ return -ENOMEM;
+ ins_sizes = (u32 *)ins_data;
+ batch.data_sizes = ins_sizes;
+ ins_keys = (struct btrfs_key *)(ins_data + max_batch_size * sizeof(u32));
+ batch.keys = ins_keys;
+
+ curr = first;
+ while (!list_entry_is_head(curr, delayed_ins_list, log_list)) {
+ const u32 curr_size = curr->data_len + sizeof(struct btrfs_item);
+
+ if (curr_batch_size + curr_size > leaf_data_size ||
+ batch.nr == max_batch_size) {
+ ret = insert_delayed_items_batch(trans, log, path,
+ &batch, first);
+ if (ret)
+ goto out;
+ batch_idx = 0;
+ batch.nr = 0;
+ batch.total_data_size = 0;
+ curr_batch_size = 0;
+ first = curr;
+ }
+
+ ins_sizes[batch_idx] = curr->data_len;
+ ins_keys[batch_idx].objectid = ino;
+ ins_keys[batch_idx].type = BTRFS_DIR_INDEX_KEY;
+ ins_keys[batch_idx].offset = curr->index;
+ curr_batch_size += curr_size;
+ batch.total_data_size += curr->data_len;
+ batch.nr++;
+ batch_idx++;
+ curr = list_next_entry(curr, log_list);
+ }
+
+ ASSERT(batch.nr >= 1);
+ ret = insert_delayed_items_batch(trans, log, path, &batch, first);
+
+ curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item,
+ log_list);
+ inode->last_dir_index_offset = curr->index;
+out:
+ kfree(ins_data);
+
+ return ret;
+}
+
+static int log_delayed_deletions_full(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct list_head *delayed_del_list,
+ struct btrfs_log_ctx *ctx)
+{
+ const u64 ino = btrfs_ino(inode);
+ const struct btrfs_delayed_item *curr;
+
+ curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
+ log_list);
+
+ while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
+ u64 first_dir_index = curr->index;
+ u64 last_dir_index;
+ const struct btrfs_delayed_item *next;
+ int ret;
+
+ /*
+ * Find a range of consecutive dir index items to delete. Like
+ * this we log a single dir range item spanning several contiguous
+ * dir items instead of logging one range item per dir index item.
+ */
+ next = list_next_entry(curr, log_list);
+ while (!list_entry_is_head(next, delayed_del_list, log_list)) {
+ if (next->index != curr->index + 1)
+ break;
+ curr = next;
+ next = list_next_entry(next, log_list);
+ }
+
+ last_dir_index = curr->index;
+ ASSERT(last_dir_index >= first_dir_index);
+
+ ret = insert_dir_log_key(trans, inode->root->log_root, path,
+ ino, first_dir_index, last_dir_index);
+ if (ret)
+ return ret;
+ curr = list_next_entry(curr, log_list);
+ }
+
+ return 0;
+}
+
+static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_log_ctx *ctx,
+ const struct list_head *delayed_del_list,
+ const struct btrfs_delayed_item *first,
+ const struct btrfs_delayed_item **last_ret)
+{
+ const struct btrfs_delayed_item *next;
+ struct extent_buffer *leaf = path->nodes[0];
+ const int last_slot = btrfs_header_nritems(leaf) - 1;
+ int slot = path->slots[0] + 1;
+ const u64 ino = btrfs_ino(inode);
+
+ next = list_next_entry(first, log_list);
+
+ while (slot < last_slot &&
+ !list_entry_is_head(next, delayed_del_list, log_list)) {
+ struct btrfs_key key;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != ino ||
+ key.type != BTRFS_DIR_INDEX_KEY ||
+ key.offset != next->index)
+ break;
+
+ slot++;
+ *last_ret = next;
+ next = list_next_entry(next, log_list);
+ }
+
+ return btrfs_del_items(trans, inode->root->log_root, path,
+ path->slots[0], slot - path->slots[0]);
+}
+
+static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct list_head *delayed_del_list,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *log = inode->root->log_root;
+ const struct btrfs_delayed_item *curr;
+ u64 last_range_start = 0;
+ u64 last_range_end = 0;
+ struct btrfs_key key;
+
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_DIR_INDEX_KEY;
+ curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
+ log_list);
+
+ while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
+ const struct btrfs_delayed_item *last = curr;
+ u64 first_dir_index = curr->index;
+ u64 last_dir_index;
+ bool deleted_items = false;
+ int ret;
+
+ key.offset = curr->index;
+ ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
+ if (ret < 0) {
+ return ret;
+ } else if (ret == 0) {
+ ret = batch_delete_dir_index_items(trans, inode, path, ctx,
+ delayed_del_list, curr,
+ &last);
+ if (ret)
+ return ret;
+ deleted_items = true;
+ }
+
+ btrfs_release_path(path);
+
+ /*
+ * If we deleted items from the leaf, it means we have a range
+ * item logging their range, so no need to add one or update an
+ * existing one. Otherwise we have to log a dir range item.
+ */
+ if (deleted_items)
+ goto next_batch;
+
+ last_dir_index = last->index;
+ ASSERT(last_dir_index >= first_dir_index);
+ /*
+ * If this range starts right after where the previous one ends,
+ * then we want to reuse the previous range item and change its
+ * end offset to the end of this range. This is just to minimize
+ * leaf space usage, by avoiding adding a new range item.
+ */
+ if (last_range_end != 0 && first_dir_index == last_range_end + 1)
+ first_dir_index = last_range_start;
+
+ ret = insert_dir_log_key(trans, log, path, key.objectid,
+ first_dir_index, last_dir_index);
+ if (ret)
+ return ret;
+
+ last_range_start = first_dir_index;
+ last_range_end = last_dir_index;
+next_batch:
+ curr = list_next_entry(last, log_list);
+ }
+
+ return 0;
+}
+
+static int log_delayed_deletion_items(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct list_head *delayed_del_list,
+ struct btrfs_log_ctx *ctx)
+{
+ /*
+ * We are deleting dir index items from the log tree or adding range
+ * items to it.
+ */
+ lockdep_assert_held(&inode->log_mutex);
+
+ if (list_empty(delayed_del_list))
+ return 0;
+
+ if (ctx->logged_before)
+ return log_delayed_deletions_incremental(trans, inode, path,
+ delayed_del_list, ctx);
+
+ return log_delayed_deletions_full(trans, inode, path, delayed_del_list,
+ ctx);
+}
+
+/*
+ * Similar logic as for log_new_dir_dentries(), but it iterates over the delayed
+ * items instead of the subvolume tree.
+ */
+static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ const struct list_head *delayed_ins_list,
+ struct btrfs_log_ctx *ctx)
+{
+ const bool orig_log_new_dentries = ctx->log_new_dentries;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_delayed_item *item;
+ int ret = 0;
+
+ /*
+ * No need for the log mutex, plus to avoid potential deadlocks or
+ * lockdep annotations due to nesting of delayed inode mutexes and log
+ * mutexes.
+ */
+ lockdep_assert_not_held(&inode->log_mutex);
+
+ ASSERT(!ctx->logging_new_delayed_dentries);
+ ctx->logging_new_delayed_dentries = true;
+
+ list_for_each_entry(item, delayed_ins_list, log_list) {
+ struct btrfs_dir_item *dir_item;
+ struct inode *di_inode;
+ struct btrfs_key key;
+ int log_mode = LOG_INODE_EXISTS;
+
+ dir_item = (struct btrfs_dir_item *)item->data;
+ btrfs_disk_key_to_cpu(&key, &dir_item->location);
+
+ if (key.type == BTRFS_ROOT_ITEM_KEY)
+ continue;
+
+ di_inode = btrfs_iget(fs_info->sb, key.objectid, inode->root);
+ if (IS_ERR(di_inode)) {
+ ret = PTR_ERR(di_inode);
+ break;
+ }
+
+ if (!need_log_inode(trans, BTRFS_I(di_inode))) {
+ btrfs_add_delayed_iput(di_inode);
+ continue;
+ }
+
+ if (btrfs_stack_dir_type(dir_item) == BTRFS_FT_DIR)
+ log_mode = LOG_INODE_ALL;
+
+ ctx->log_new_dentries = false;
+ ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx);
+
+ if (!ret && ctx->log_new_dentries)
+ ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx);
+
+ btrfs_add_delayed_iput(di_inode);
+
+ if (ret)
+ break;
+ }
+
+ ctx->log_new_dentries = orig_log_new_dentries;
+ ctx->logging_new_delayed_dentries = false;
+
+ return ret;
+}
+
+/* log a single inode in the tree log.
+ * At least one parent directory for this inode must exist in the tree
+ * or be logged already.
+ *
+ * Any items from this inode changed by the current transaction are copied
+ * to the log tree. An extra reference is taken on any extents in this
+ * file, allowing us to avoid a whole pile of corner cases around logging
+ * blocks that have been removed from the tree.
+ *
+ * See LOG_INODE_ALL and related defines for a description of what inode_only
+ * does.
+ *
+ * This handles both files and directories.
+ */
+static int btrfs_log_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ int inode_only,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_path *path;
+ struct btrfs_path *dst_path;
+ struct btrfs_key min_key;
+ struct btrfs_key max_key;
+ struct btrfs_root *log = inode->root->log_root;
+ int ret;
+ bool fast_search = false;
+ u64 ino = btrfs_ino(inode);
+ struct extent_map_tree *em_tree = &inode->extent_tree;
+ u64 logged_isize = 0;
+ bool need_log_inode_item = true;
+ bool xattrs_logged = false;
+ bool inode_item_dropped = true;
+ bool full_dir_logging = false;
+ LIST_HEAD(delayed_ins_list);
+ LIST_HEAD(delayed_del_list);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ dst_path = btrfs_alloc_path();
+ if (!dst_path) {
+ btrfs_free_path(path);
+ return -ENOMEM;
+ }
+
+ min_key.objectid = ino;
+ min_key.type = BTRFS_INODE_ITEM_KEY;
+ min_key.offset = 0;
+
+ max_key.objectid = ino;
+
+
+ /* today the code can only do partial logging of directories */
+ if (S_ISDIR(inode->vfs_inode.i_mode) ||
+ (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &inode->runtime_flags) &&
+ inode_only >= LOG_INODE_EXISTS))
+ max_key.type = BTRFS_XATTR_ITEM_KEY;
+ else
+ max_key.type = (u8)-1;
+ max_key.offset = (u64)-1;
+
+ if (S_ISDIR(inode->vfs_inode.i_mode) && inode_only == LOG_INODE_ALL)
+ full_dir_logging = true;
+
+ /*
+ * If we are logging a directory while we are logging dentries of the
+ * delayed items of some other inode, then we need to flush the delayed
+ * items of this directory and not log the delayed items directly. This
+ * is to prevent more than one level of recursion into btrfs_log_inode()
+ * by having something like this:
+ *
+ * $ mkdir -p a/b/c/d/e/f/g/h/...
+ * $ xfs_io -c "fsync" a
+ *
+ * Where all directories in the path did not exist before and are
+ * created in the current transaction.
+ * So in such a case we directly log the delayed items of the main
+ * directory ("a") without flushing them first, while for each of its
+ * subdirectories we flush their delayed items before logging them.
+ * This prevents a potential unbounded recursion like this:
+ *
+ * btrfs_log_inode()
+ * log_new_delayed_dentries()
+ * btrfs_log_inode()
+ * log_new_delayed_dentries()
+ * btrfs_log_inode()
+ * log_new_delayed_dentries()
+ * (...)
+ *
+ * We have thresholds for the maximum number of delayed items to have in
+ * memory, and once they are hit, the items are flushed asynchronously.
+ * However the limit is quite high, so lets prevent deep levels of
+ * recursion to happen by limiting the maximum depth to be 1.
+ */
+ if (full_dir_logging && ctx->logging_new_delayed_dentries) {
+ ret = btrfs_commit_inode_delayed_items(trans, inode);
+ if (ret)
+ goto out;
+ }
+
+ mutex_lock(&inode->log_mutex);
+
+ /*
+ * For symlinks, we must always log their content, which is stored in an
+ * inline extent, otherwise we could end up with an empty symlink after
+ * log replay, which is invalid on linux (symlink(2) returns -ENOENT if
+ * one attempts to create an empty symlink).
+ * We don't need to worry about flushing delalloc, because when we create
+ * the inline extent when the symlink is created (we never have delalloc
+ * for symlinks).
+ */
+ if (S_ISLNK(inode->vfs_inode.i_mode))
+ inode_only = LOG_INODE_ALL;
+
+ /*
+ * Before logging the inode item, cache the value returned by
+ * inode_logged(), because after that we have the need to figure out if
+ * the inode was previously logged in this transaction.
+ */
+ ret = inode_logged(trans, inode, path);
+ if (ret < 0)
+ goto out_unlock;
+ ctx->logged_before = (ret == 1);
+ ret = 0;
+
+ /*
+ * This is for cases where logging a directory could result in losing a
+ * a file after replaying the log. For example, if we move a file from a
+ * directory A to a directory B, then fsync directory A, we have no way
+ * to known the file was moved from A to B, so logging just A would
+ * result in losing the file after a log replay.
+ */
+ if (full_dir_logging && inode->last_unlink_trans >= trans->transid) {
+ btrfs_set_log_full_commit(trans);
+ ret = BTRFS_LOG_FORCE_COMMIT;
+ goto out_unlock;
+ }
+
+ /*
+ * a brute force approach to making sure we get the most uptodate
+ * copies of everything.
+ */
+ if (S_ISDIR(inode->vfs_inode.i_mode)) {
+ clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
+ if (ctx->logged_before)
+ ret = drop_inode_items(trans, log, path, inode,
+ BTRFS_XATTR_ITEM_KEY);
+ } else {
+ if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) {
+ /*
+ * Make sure the new inode item we write to the log has
+ * the same isize as the current one (if it exists).
+ * This is necessary to prevent data loss after log
+ * replay, and also to prevent doing a wrong expanding
+ * truncate - for e.g. create file, write 4K into offset
+ * 0, fsync, write 4K into offset 4096, add hard link,
+ * fsync some other file (to sync log), power fail - if
+ * we use the inode's current i_size, after log replay
+ * we get a 8Kb file, with the last 4Kb extent as a hole
+ * (zeroes), as if an expanding truncate happened,
+ * instead of getting a file of 4Kb only.
+ */
+ ret = logged_inode_size(log, inode, path, &logged_isize);
+ if (ret)
+ goto out_unlock;
+ }
+ if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &inode->runtime_flags)) {
+ if (inode_only == LOG_INODE_EXISTS) {
+ max_key.type = BTRFS_XATTR_ITEM_KEY;
+ if (ctx->logged_before)
+ ret = drop_inode_items(trans, log, path,
+ inode, max_key.type);
+ } else {
+ clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &inode->runtime_flags);
+ clear_bit(BTRFS_INODE_COPY_EVERYTHING,
+ &inode->runtime_flags);
+ if (ctx->logged_before)
+ ret = truncate_inode_items(trans, log,
+ inode, 0, 0);
+ }
+ } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
+ &inode->runtime_flags) ||
+ inode_only == LOG_INODE_EXISTS) {
+ if (inode_only == LOG_INODE_ALL)
+ fast_search = true;
+ max_key.type = BTRFS_XATTR_ITEM_KEY;
+ if (ctx->logged_before)
+ ret = drop_inode_items(trans, log, path, inode,
+ max_key.type);
+ } else {
+ if (inode_only == LOG_INODE_ALL)
+ fast_search = true;
+ inode_item_dropped = false;
+ goto log_extents;
+ }
+
+ }
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * If we are logging a directory in full mode, collect the delayed items
+ * before iterating the subvolume tree, so that we don't miss any new
+ * dir index items in case they get flushed while or right after we are
+ * iterating the subvolume tree.
+ */
+ if (full_dir_logging && !ctx->logging_new_delayed_dentries)
+ btrfs_log_get_delayed_items(inode, &delayed_ins_list,
+ &delayed_del_list);
+
+ ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
+ path, dst_path, logged_isize,
+ inode_only, ctx,
+ &need_log_inode_item);
+ if (ret)
+ goto out_unlock;
+
+ btrfs_release_path(path);
+ btrfs_release_path(dst_path);
+ ret = btrfs_log_all_xattrs(trans, inode, path, dst_path);
+ if (ret)
+ goto out_unlock;
+ xattrs_logged = true;
+ if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
+ btrfs_release_path(path);
+ btrfs_release_path(dst_path);
+ ret = btrfs_log_holes(trans, inode, path);
+ if (ret)
+ goto out_unlock;
+ }
+log_extents:
+ btrfs_release_path(path);
+ btrfs_release_path(dst_path);
+ if (need_log_inode_item) {
+ ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
+ if (ret)
+ goto out_unlock;
+ /*
+ * If we are doing a fast fsync and the inode was logged before
+ * in this transaction, we don't need to log the xattrs because
+ * they were logged before. If xattrs were added, changed or
+ * deleted since the last time we logged the inode, then we have
+ * already logged them because the inode had the runtime flag
+ * BTRFS_INODE_COPY_EVERYTHING set.
+ */
+ if (!xattrs_logged && inode->logged_trans < trans->transid) {
+ ret = btrfs_log_all_xattrs(trans, inode, path, dst_path);
+ if (ret)
+ goto out_unlock;
+ btrfs_release_path(path);
+ }
+ }
+ if (fast_search) {
+ ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx);
+ if (ret)
+ goto out_unlock;
+ } else if (inode_only == LOG_INODE_ALL) {
+ struct extent_map *em, *n;
+
+ write_lock(&em_tree->lock);
+ list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
+ list_del_init(&em->list);
+ write_unlock(&em_tree->lock);
+ }
+
+ if (full_dir_logging) {
+ ret = log_directory_changes(trans, inode, path, dst_path, ctx);
+ if (ret)
+ goto out_unlock;
+ ret = log_delayed_insertion_items(trans, inode, path,
+ &delayed_ins_list, ctx);
+ if (ret)
+ goto out_unlock;
+ ret = log_delayed_deletion_items(trans, inode, path,
+ &delayed_del_list, ctx);
+ if (ret)
+ goto out_unlock;
+ }
+
+ spin_lock(&inode->lock);
+ inode->logged_trans = trans->transid;
+ /*
+ * Don't update last_log_commit if we logged that an inode exists.
+ * We do this for three reasons:
+ *
+ * 1) We might have had buffered writes to this inode that were
+ * flushed and had their ordered extents completed in this
+ * transaction, but we did not previously log the inode with
+ * LOG_INODE_ALL. Later the inode was evicted and after that
+ * it was loaded again and this LOG_INODE_EXISTS log operation
+ * happened. We must make sure that if an explicit fsync against
+ * the inode is performed later, it logs the new extents, an
+ * updated inode item, etc, and syncs the log. The same logic
+ * applies to direct IO writes instead of buffered writes.
+ *
+ * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
+ * is logged with an i_size of 0 or whatever value was logged
+ * before. If later the i_size of the inode is increased by a
+ * truncate operation, the log is synced through an fsync of
+ * some other inode and then finally an explicit fsync against
+ * this inode is made, we must make sure this fsync logs the
+ * inode with the new i_size, the hole between old i_size and
+ * the new i_size, and syncs the log.
+ *
+ * 3) If we are logging that an ancestor inode exists as part of
+ * logging a new name from a link or rename operation, don't update
+ * its last_log_commit - otherwise if an explicit fsync is made
+ * against an ancestor, the fsync considers the inode in the log
+ * and doesn't sync the log, resulting in the ancestor missing after
+ * a power failure unless the log was synced as part of an fsync
+ * against any other unrelated inode.
+ */
+ if (inode_only != LOG_INODE_EXISTS)
+ inode->last_log_commit = inode->last_sub_trans;
+ spin_unlock(&inode->lock);
+
+ /*
+ * Reset the last_reflink_trans so that the next fsync does not need to
+ * go through the slower path when logging extents and their checksums.
+ */
+ if (inode_only == LOG_INODE_ALL)
+ inode->last_reflink_trans = 0;
+
+out_unlock:
+ mutex_unlock(&inode->log_mutex);
+out:
+ btrfs_free_path(path);
+ btrfs_free_path(dst_path);
+
+ if (ret)
+ free_conflicting_inodes(ctx);
+ else
+ ret = log_conflicting_inodes(trans, inode->root, ctx);
+
+ if (full_dir_logging && !ctx->logging_new_delayed_dentries) {
+ if (!ret)
+ ret = log_new_delayed_dentries(trans, inode,
+ &delayed_ins_list, ctx);
+
+ btrfs_log_put_delayed_items(inode, &delayed_ins_list,
+ &delayed_del_list);
+ }
+
+ return ret;
+}
+
+static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_root *root = inode->root;
+ const u64 ino = btrfs_ino(inode);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->skip_locking = 1;
+ path->search_commit_root = 1;
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ while (true) {
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+ u32 cur_offset = 0;
+ u32 item_size;
+ unsigned long ptr;
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
+ if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
+ break;
+
+ item_size = btrfs_item_size(leaf, slot);
+ ptr = btrfs_item_ptr_offset(leaf, slot);
+ while (cur_offset < item_size) {
+ struct btrfs_key inode_key;
+ struct inode *dir_inode;
+
+ inode_key.type = BTRFS_INODE_ITEM_KEY;
+ inode_key.offset = 0;
+
+ if (key.type == BTRFS_INODE_EXTREF_KEY) {
+ struct btrfs_inode_extref *extref;
+
+ extref = (struct btrfs_inode_extref *)
+ (ptr + cur_offset);
+ inode_key.objectid = btrfs_inode_extref_parent(
+ leaf, extref);
+ cur_offset += sizeof(*extref);
+ cur_offset += btrfs_inode_extref_name_len(leaf,
+ extref);
+ } else {
+ inode_key.objectid = key.offset;
+ cur_offset = item_size;
+ }
+
+ dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid,
+ root);
+ /*
+ * If the parent inode was deleted, return an error to
+ * fallback to a transaction commit. This is to prevent
+ * getting an inode that was moved from one parent A to
+ * a parent B, got its former parent A deleted and then
+ * it got fsync'ed, from existing at both parents after
+ * a log replay (and the old parent still existing).
+ * Example:
+ *
+ * mkdir /mnt/A
+ * mkdir /mnt/B
+ * touch /mnt/B/bar
+ * sync
+ * mv /mnt/B/bar /mnt/A/bar
+ * mv -T /mnt/A /mnt/B
+ * fsync /mnt/B/bar
+ * <power fail>
+ *
+ * If we ignore the old parent B which got deleted,
+ * after a log replay we would have file bar linked
+ * at both parents and the old parent B would still
+ * exist.
+ */
+ if (IS_ERR(dir_inode)) {
+ ret = PTR_ERR(dir_inode);
+ goto out;
+ }
+
+ if (!need_log_inode(trans, BTRFS_I(dir_inode))) {
+ btrfs_add_delayed_iput(dir_inode);
+ continue;
+ }
+
+ ctx->log_new_dentries = false;
+ ret = btrfs_log_inode(trans, BTRFS_I(dir_inode),
+ LOG_INODE_ALL, ctx);
+ if (!ret && ctx->log_new_dentries)
+ ret = log_new_dir_dentries(trans,
+ BTRFS_I(dir_inode), ctx);
+ btrfs_add_delayed_iput(dir_inode);
+ if (ret)
+ goto out;
+ }
+ path->slots[0]++;
+ }
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int log_new_ancestors(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_key found_key;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
+
+ while (true) {
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+ struct btrfs_key search_key;
+ struct inode *inode;
+ u64 ino;
+ int ret = 0;
+
+ btrfs_release_path(path);
+
+ ino = found_key.offset;
+
+ search_key.objectid = found_key.offset;
+ search_key.type = BTRFS_INODE_ITEM_KEY;
+ search_key.offset = 0;
+ inode = btrfs_iget(fs_info->sb, ino, root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ if (BTRFS_I(inode)->generation >= trans->transid &&
+ need_log_inode(trans, BTRFS_I(inode)))
+ ret = btrfs_log_inode(trans, BTRFS_I(inode),
+ LOG_INODE_EXISTS, ctx);
+ btrfs_add_delayed_iput(inode);
+ if (ret)
+ return ret;
+
+ if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
+ break;
+
+ search_key.type = BTRFS_INODE_REF_KEY;
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ return -ENOENT;
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ if (found_key.objectid != search_key.objectid ||
+ found_key.type != BTRFS_INODE_REF_KEY)
+ return -ENOENT;
+ }
+ return 0;
+}
+
+static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct dentry *parent,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *root = inode->root;
+ struct dentry *old_parent = NULL;
+ struct super_block *sb = inode->vfs_inode.i_sb;
+ int ret = 0;
+
+ while (true) {
+ if (!parent || d_really_is_negative(parent) ||
+ sb != parent->d_sb)
+ break;
+
+ inode = BTRFS_I(d_inode(parent));
+ if (root != inode->root)
+ break;
+
+ if (inode->generation >= trans->transid &&
+ need_log_inode(trans, inode)) {
+ ret = btrfs_log_inode(trans, inode,
+ LOG_INODE_EXISTS, ctx);
+ if (ret)
+ break;
+ }
+ if (IS_ROOT(parent))
+ break;
+
+ parent = dget_parent(parent);
+ dput(old_parent);
+ old_parent = parent;
+ }
+ dput(old_parent);
+
+ return ret;
+}
+
+static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct dentry *parent,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *root = inode->root;
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_path *path;
+ struct btrfs_key search_key;
+ int ret;
+
+ /*
+ * For a single hard link case, go through a fast path that does not
+ * need to iterate the fs/subvolume tree.
+ */
+ if (inode->vfs_inode.i_nlink < 2)
+ return log_new_ancestors_fast(trans, inode, parent, ctx);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ search_key.objectid = ino;
+ search_key.type = BTRFS_INODE_REF_KEY;
+ search_key.offset = 0;
+again:
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ if (ret == 0)
+ path->slots[0]++;
+
+ while (true) {
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+ struct btrfs_key found_key;
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ if (found_key.objectid != ino ||
+ found_key.type > BTRFS_INODE_EXTREF_KEY)
+ break;
+
+ /*
+ * Don't deal with extended references because they are rare
+ * cases and too complex to deal with (we would need to keep
+ * track of which subitem we are processing for each item in
+ * this loop, etc). So just return some error to fallback to
+ * a transaction commit.
+ */
+ if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
+ ret = -EMLINK;
+ goto out;
+ }
+
+ /*
+ * Logging ancestors needs to do more searches on the fs/subvol
+ * tree, so it releases the path as needed to avoid deadlocks.
+ * Keep track of the last inode ref key and resume from that key
+ * after logging all new ancestors for the current hard link.
+ */
+ memcpy(&search_key, &found_key, sizeof(search_key));
+
+ ret = log_new_ancestors(trans, root, path, ctx);
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+ goto again;
+ }
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * helper function around btrfs_log_inode to make sure newly created
+ * parent directories also end up in the log. A minimal inode and backref
+ * only logging is done of any parent directories that are older than
+ * the last committed transaction
+ */
+static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct dentry *parent,
+ int inode_only,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret = 0;
+ bool log_dentries = false;
+
+ if (btrfs_test_opt(fs_info, NOTREELOG)) {
+ ret = BTRFS_LOG_FORCE_COMMIT;
+ goto end_no_trans;
+ }
+
+ if (btrfs_root_refs(&root->root_item) == 0) {
+ ret = BTRFS_LOG_FORCE_COMMIT;
+ goto end_no_trans;
+ }
+
+ /*
+ * Skip already logged inodes or inodes corresponding to tmpfiles
+ * (since logging them is pointless, a link count of 0 means they
+ * will never be accessible).
+ */
+ if ((btrfs_inode_in_log(inode, trans->transid) &&
+ list_empty(&ctx->ordered_extents)) ||
+ inode->vfs_inode.i_nlink == 0) {
+ ret = BTRFS_NO_LOG_SYNC;
+ goto end_no_trans;
+ }
+
+ ret = start_log_trans(trans, root, ctx);
+ if (ret)
+ goto end_no_trans;
+
+ ret = btrfs_log_inode(trans, inode, inode_only, ctx);
+ if (ret)
+ goto end_trans;
+
+ /*
+ * for regular files, if its inode is already on disk, we don't
+ * have to worry about the parents at all. This is because
+ * we can use the last_unlink_trans field to record renames
+ * and other fun in this file.
+ */
+ if (S_ISREG(inode->vfs_inode.i_mode) &&
+ inode->generation < trans->transid &&
+ inode->last_unlink_trans < trans->transid) {
+ ret = 0;
+ goto end_trans;
+ }
+
+ if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries)
+ log_dentries = true;
+
+ /*
+ * On unlink we must make sure all our current and old parent directory
+ * inodes are fully logged. This is to prevent leaving dangling
+ * directory index entries in directories that were our parents but are
+ * not anymore. Not doing this results in old parent directory being
+ * impossible to delete after log replay (rmdir will always fail with
+ * error -ENOTEMPTY).
+ *
+ * Example 1:
+ *
+ * mkdir testdir
+ * touch testdir/foo
+ * ln testdir/foo testdir/bar
+ * sync
+ * unlink testdir/bar
+ * xfs_io -c fsync testdir/foo
+ * <power failure>
+ * mount fs, triggers log replay
+ *
+ * If we don't log the parent directory (testdir), after log replay the
+ * directory still has an entry pointing to the file inode using the bar
+ * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
+ * the file inode has a link count of 1.
+ *
+ * Example 2:
+ *
+ * mkdir testdir
+ * touch foo
+ * ln foo testdir/foo2
+ * ln foo testdir/foo3
+ * sync
+ * unlink testdir/foo3
+ * xfs_io -c fsync foo
+ * <power failure>
+ * mount fs, triggers log replay
+ *
+ * Similar as the first example, after log replay the parent directory
+ * testdir still has an entry pointing to the inode file with name foo3
+ * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
+ * and has a link count of 2.
+ */
+ if (inode->last_unlink_trans >= trans->transid) {
+ ret = btrfs_log_all_parents(trans, inode, ctx);
+ if (ret)
+ goto end_trans;
+ }
+
+ ret = log_all_new_ancestors(trans, inode, parent, ctx);
+ if (ret)
+ goto end_trans;
+
+ if (log_dentries)
+ ret = log_new_dir_dentries(trans, inode, ctx);
+ else
+ ret = 0;
+end_trans:
+ if (ret < 0) {
+ btrfs_set_log_full_commit(trans);
+ ret = BTRFS_LOG_FORCE_COMMIT;
+ }
+
+ if (ret)
+ btrfs_remove_log_ctx(root, ctx);
+ btrfs_end_log_trans(root);
+end_no_trans:
+ return ret;
+}
+
+/*
+ * it is not safe to log dentry if the chunk root has added new
+ * chunks. This returns 0 if the dentry was logged, and 1 otherwise.
+ * If this returns 1, you must commit the transaction to safely get your
+ * data on disk.
+ */
+int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
+ struct dentry *dentry,
+ struct btrfs_log_ctx *ctx)
+{
+ struct dentry *parent = dget_parent(dentry);
+ int ret;
+
+ ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
+ LOG_INODE_ALL, ctx);
+ dput(parent);
+
+ return ret;
+}
+
+/*
+ * should be called during mount to recover any replay any log trees
+ * from the FS
+ */
+int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_root *log;
+ struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
+ struct walk_control wc = {
+ .process_func = process_one_buffer,
+ .stage = LOG_WALK_PIN_ONLY,
+ };
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
+
+ trans = btrfs_start_transaction(fs_info->tree_root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto error;
+ }
+
+ wc.trans = trans;
+ wc.pin = 1;
+
+ ret = walk_log_tree(trans, log_root_tree, &wc);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto error;
+ }
+
+again:
+ key.objectid = BTRFS_TREE_LOG_OBJECTID;
+ key.offset = (u64)-1;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
+
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto error;
+ }
+ if (ret > 0) {
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ path->slots[0]);
+ btrfs_release_path(path);
+ if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+ break;
+
+ log = btrfs_read_tree_root(log_root_tree, &found_key);
+ if (IS_ERR(log)) {
+ ret = PTR_ERR(log);
+ btrfs_abort_transaction(trans, ret);
+ goto error;
+ }
+
+ wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
+ true);
+ if (IS_ERR(wc.replay_dest)) {
+ ret = PTR_ERR(wc.replay_dest);
+
+ /*
+ * We didn't find the subvol, likely because it was
+ * deleted. This is ok, simply skip this log and go to
+ * the next one.
+ *
+ * We need to exclude the root because we can't have
+ * other log replays overwriting this log as we'll read
+ * it back in a few more times. This will keep our
+ * block from being modified, and we'll just bail for
+ * each subsequent pass.
+ */
+ if (ret == -ENOENT)
+ ret = btrfs_pin_extent_for_log_replay(trans,
+ log->node->start,
+ log->node->len);
+ btrfs_put_root(log);
+
+ if (!ret)
+ goto next;
+ btrfs_abort_transaction(trans, ret);
+ goto error;
+ }
+
+ wc.replay_dest->log_root = log;
+ ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
+ if (ret)
+ /* The loop needs to continue due to the root refs */
+ btrfs_abort_transaction(trans, ret);
+ else
+ ret = walk_log_tree(trans, log, &wc);
+
+ if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
+ ret = fixup_inode_link_counts(trans, wc.replay_dest,
+ path);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ }
+
+ if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
+ struct btrfs_root *root = wc.replay_dest;
+
+ btrfs_release_path(path);
+
+ /*
+ * We have just replayed everything, and the highest
+ * objectid of fs roots probably has changed in case
+ * some inode_item's got replayed.
+ *
+ * root->objectid_mutex is not acquired as log replay
+ * could only happen during mount.
+ */
+ ret = btrfs_init_root_free_objectid(root);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ }
+
+ wc.replay_dest->log_root = NULL;
+ btrfs_put_root(wc.replay_dest);
+ btrfs_put_root(log);
+
+ if (ret)
+ goto error;
+next:
+ if (found_key.offset == 0)
+ break;
+ key.offset = found_key.offset - 1;
+ }
+ btrfs_release_path(path);
+
+ /* step one is to pin it all, step two is to replay just inodes */
+ if (wc.pin) {
+ wc.pin = 0;
+ wc.process_func = replay_one_buffer;
+ wc.stage = LOG_WALK_REPLAY_INODES;
+ goto again;
+ }
+ /* step three is to replay everything */
+ if (wc.stage < LOG_WALK_REPLAY_ALL) {
+ wc.stage++;
+ goto again;
+ }
+
+ btrfs_free_path(path);
+
+ /* step 4: commit the transaction, which also unpins the blocks */
+ ret = btrfs_commit_transaction(trans);
+ if (ret)
+ return ret;
+
+ log_root_tree->log_root = NULL;
+ clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
+ btrfs_put_root(log_root_tree);
+
+ return 0;
+error:
+ if (wc.trans)
+ btrfs_end_transaction(wc.trans);
+ clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * there are some corner cases where we want to force a full
+ * commit instead of allowing a directory to be logged.
+ *
+ * They revolve around files there were unlinked from the directory, and
+ * this function updates the parent directory so that a full commit is
+ * properly done if it is fsync'd later after the unlinks are done.
+ *
+ * Must be called before the unlink operations (updates to the subvolume tree,
+ * inodes, etc) are done.
+ */
+void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *dir, struct btrfs_inode *inode,
+ int for_rename)
+{
+ /*
+ * when we're logging a file, if it hasn't been renamed
+ * or unlinked, and its inode is fully committed on disk,
+ * we don't have to worry about walking up the directory chain
+ * to log its parents.
+ *
+ * So, we use the last_unlink_trans field to put this transid
+ * into the file. When the file is logged we check it and
+ * don't log the parents if the file is fully on disk.
+ */
+ mutex_lock(&inode->log_mutex);
+ inode->last_unlink_trans = trans->transid;
+ mutex_unlock(&inode->log_mutex);
+
+ /*
+ * if this directory was already logged any new
+ * names for this file/dir will get recorded
+ */
+ if (dir->logged_trans == trans->transid)
+ return;
+
+ /*
+ * if the inode we're about to unlink was logged,
+ * the log will be properly updated for any new names
+ */
+ if (inode->logged_trans == trans->transid)
+ return;
+
+ /*
+ * when renaming files across directories, if the directory
+ * there we're unlinking from gets fsync'd later on, there's
+ * no way to find the destination directory later and fsync it
+ * properly. So, we have to be conservative and force commits
+ * so the new name gets discovered.
+ */
+ if (for_rename)
+ goto record;
+
+ /* we can safely do the unlink without any special recording */
+ return;
+
+record:
+ mutex_lock(&dir->log_mutex);
+ dir->last_unlink_trans = trans->transid;
+ mutex_unlock(&dir->log_mutex);
+}
+
+/*
+ * Make sure that if someone attempts to fsync the parent directory of a deleted
+ * snapshot, it ends up triggering a transaction commit. This is to guarantee
+ * that after replaying the log tree of the parent directory's root we will not
+ * see the snapshot anymore and at log replay time we will not see any log tree
+ * corresponding to the deleted snapshot's root, which could lead to replaying
+ * it after replaying the log tree of the parent directory (which would replay
+ * the snapshot delete operation).
+ *
+ * Must be called before the actual snapshot destroy operation (updates to the
+ * parent root and tree of tree roots trees, etc) are done.
+ */
+void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *dir)
+{
+ mutex_lock(&dir->log_mutex);
+ dir->last_unlink_trans = trans->transid;
+ mutex_unlock(&dir->log_mutex);
+}
+
+/**
+ * Update the log after adding a new name for an inode.
+ *
+ * @trans: Transaction handle.
+ * @old_dentry: The dentry associated with the old name and the old
+ * parent directory.
+ * @old_dir: The inode of the previous parent directory for the case
+ * of a rename. For a link operation, it must be NULL.
+ * @old_dir_index: The index number associated with the old name, meaningful
+ * only for rename operations (when @old_dir is not NULL).
+ * Ignored for link operations.
+ * @parent: The dentry associated with the directory under which the
+ * new name is located.
+ *
+ * Call this after adding a new name for an inode, as a result of a link or
+ * rename operation, and it will properly update the log to reflect the new name.
+ */
+void btrfs_log_new_name(struct btrfs_trans_handle *trans,
+ struct dentry *old_dentry, struct btrfs_inode *old_dir,
+ u64 old_dir_index, struct dentry *parent)
+{
+ struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry));
+ struct btrfs_root *root = inode->root;
+ struct btrfs_log_ctx ctx;
+ bool log_pinned = false;
+ int ret;
+
+ /*
+ * this will force the logging code to walk the dentry chain
+ * up for the file
+ */
+ if (!S_ISDIR(inode->vfs_inode.i_mode))
+ inode->last_unlink_trans = trans->transid;
+
+ /*
+ * if this inode hasn't been logged and directory we're renaming it
+ * from hasn't been logged, we don't need to log it
+ */
+ ret = inode_logged(trans, inode, NULL);
+ if (ret < 0) {
+ goto out;
+ } else if (ret == 0) {
+ if (!old_dir)
+ return;
+ /*
+ * If the inode was not logged and we are doing a rename (old_dir is not
+ * NULL), check if old_dir was logged - if it was not we can return and
+ * do nothing.
+ */
+ ret = inode_logged(trans, old_dir, NULL);
+ if (ret < 0)
+ goto out;
+ else if (ret == 0)
+ return;
+ }
+ ret = 0;
+
+ /*
+ * If we are doing a rename (old_dir is not NULL) from a directory that
+ * was previously logged, make sure that on log replay we get the old
+ * dir entry deleted. This is needed because we will also log the new
+ * name of the renamed inode, so we need to make sure that after log
+ * replay we don't end up with both the new and old dir entries existing.
+ */
+ if (old_dir && old_dir->logged_trans == trans->transid) {
+ struct btrfs_root *log = old_dir->root->log_root;
+ struct btrfs_path *path;
+ struct fscrypt_name fname;
+
+ ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX);
+
+ ret = fscrypt_setup_filename(&old_dir->vfs_inode,
+ &old_dentry->d_name, 0, &fname);
+ if (ret)
+ goto out;
+ /*
+ * We have two inodes to update in the log, the old directory and
+ * the inode that got renamed, so we must pin the log to prevent
+ * anyone from syncing the log until we have updated both inodes
+ * in the log.
+ */
+ ret = join_running_log_trans(root);
+ /*
+ * At least one of the inodes was logged before, so this should
+ * not fail, but if it does, it's not serious, just bail out and
+ * mark the log for a full commit.
+ */
+ if (WARN_ON_ONCE(ret < 0)) {
+ fscrypt_free_filename(&fname);
+ goto out;
+ }
+
+ log_pinned = true;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ fscrypt_free_filename(&fname);
+ goto out;
+ }
+
+ /*
+ * Other concurrent task might be logging the old directory,
+ * as it can be triggered when logging other inode that had or
+ * still has a dentry in the old directory. We lock the old
+ * directory's log_mutex to ensure the deletion of the old
+ * name is persisted, because during directory logging we
+ * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of
+ * the old name's dir index item is in the delayed items, so
+ * it could be missed by an in progress directory logging.
+ */
+ mutex_lock(&old_dir->log_mutex);
+ ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir),
+ &fname.disk_name, old_dir_index);
+ if (ret > 0) {
+ /*
+ * The dentry does not exist in the log, so record its
+ * deletion.
+ */
+ btrfs_release_path(path);
+ ret = insert_dir_log_key(trans, log, path,
+ btrfs_ino(old_dir),
+ old_dir_index, old_dir_index);
+ }
+ mutex_unlock(&old_dir->log_mutex);
+
+ btrfs_free_path(path);
+ fscrypt_free_filename(&fname);
+ if (ret < 0)
+ goto out;
+ }
+
+ btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
+ ctx.logging_new_name = true;
+ /*
+ * We don't care about the return value. If we fail to log the new name
+ * then we know the next attempt to sync the log will fallback to a full
+ * transaction commit (due to a call to btrfs_set_log_full_commit()), so
+ * we don't need to worry about getting a log committed that has an
+ * inconsistent state after a rename operation.
+ */
+ btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
+ ASSERT(list_empty(&ctx.conflict_inodes));
+out:
+ /*
+ * If an error happened mark the log for a full commit because it's not
+ * consistent and up to date or we couldn't find out if one of the
+ * inodes was logged before in this transaction. Do it before unpinning
+ * the log, to avoid any races with someone else trying to commit it.
+ */
+ if (ret < 0)
+ btrfs_set_log_full_commit(trans);
+ if (log_pinned)
+ btrfs_end_log_trans(root);
+}
+
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
new file mode 100644
index 000000000..8adebf4c9
--- /dev/null
+++ b/fs/btrfs/tree-log.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ */
+
+#ifndef BTRFS_TREE_LOG_H
+#define BTRFS_TREE_LOG_H
+
+#include "ctree.h"
+#include "transaction.h"
+
+/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
+#define BTRFS_NO_LOG_SYNC 256
+
+/* We can't use the tree log for whatever reason, force a transaction commit */
+#define BTRFS_LOG_FORCE_COMMIT (1)
+
+struct btrfs_log_ctx {
+ int log_ret;
+ int log_transid;
+ bool log_new_dentries;
+ bool logging_new_name;
+ bool logging_new_delayed_dentries;
+ /* Indicate if the inode being logged was logged before. */
+ bool logged_before;
+ struct inode *inode;
+ struct list_head list;
+ /* Only used for fast fsyncs. */
+ struct list_head ordered_extents;
+ struct list_head conflict_inodes;
+ int num_conflict_inodes;
+ bool logging_conflict_inodes;
+};
+
+static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
+ struct inode *inode)
+{
+ ctx->log_ret = 0;
+ ctx->log_transid = 0;
+ ctx->log_new_dentries = false;
+ ctx->logging_new_name = false;
+ ctx->logging_new_delayed_dentries = false;
+ ctx->logged_before = false;
+ ctx->inode = inode;
+ INIT_LIST_HEAD(&ctx->list);
+ INIT_LIST_HEAD(&ctx->ordered_extents);
+ INIT_LIST_HEAD(&ctx->conflict_inodes);
+ ctx->num_conflict_inodes = 0;
+ ctx->logging_conflict_inodes = false;
+}
+
+static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_ordered_extent *tmp;
+
+ ASSERT(inode_is_locked(ctx->inode));
+
+ list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
+ list_del_init(&ordered->log_list);
+ btrfs_put_ordered_extent(ordered);
+ }
+}
+
+static inline void btrfs_set_log_full_commit(struct btrfs_trans_handle *trans)
+{
+ WRITE_ONCE(trans->fs_info->last_trans_log_full_commit, trans->transid);
+}
+
+static inline int btrfs_need_log_full_commit(struct btrfs_trans_handle *trans)
+{
+ return READ_ONCE(trans->fs_info->last_trans_log_full_commit) ==
+ trans->transid;
+}
+
+int btrfs_sync_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_log_ctx *ctx);
+int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
+int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int btrfs_recover_log_trees(struct btrfs_root *tree_root);
+int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
+ struct dentry *dentry,
+ struct btrfs_log_ctx *ctx);
+void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const struct fscrypt_str *name,
+ struct btrfs_inode *dir, u64 index);
+void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const struct fscrypt_str *name,
+ struct btrfs_inode *inode, u64 dirid);
+void btrfs_end_log_trans(struct btrfs_root *root);
+void btrfs_pin_log_trans(struct btrfs_root *root);
+void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *dir, struct btrfs_inode *inode,
+ int for_rename);
+void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *dir);
+void btrfs_log_new_name(struct btrfs_trans_handle *trans,
+ struct dentry *old_dentry, struct btrfs_inode *old_dir,
+ u64 old_dir_index, struct dentry *parent);
+
+#endif
diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c
new file mode 100644
index 000000000..8a3a14686
--- /dev/null
+++ b/fs/btrfs/tree-mod-log.c
@@ -0,0 +1,929 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "tree-mod-log.h"
+#include "disk-io.h"
+
+struct tree_mod_root {
+ u64 logical;
+ u8 level;
+};
+
+struct tree_mod_elem {
+ struct rb_node node;
+ u64 logical;
+ u64 seq;
+ enum btrfs_mod_log_op op;
+
+ /*
+ * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS
+ * operations.
+ */
+ int slot;
+
+ /* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */
+ u64 generation;
+
+ /* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */
+ struct btrfs_disk_key key;
+ u64 blockptr;
+
+ /* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
+ struct {
+ int dst_slot;
+ int nr_items;
+ } move;
+
+ /* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
+ struct tree_mod_root old_root;
+};
+
+/*
+ * Pull a new tree mod seq number for our operation.
+ */
+static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
+{
+ return atomic64_inc_return(&fs_info->tree_mod_seq);
+}
+
+/*
+ * This adds a new blocker to the tree mod log's blocker list if the @elem
+ * passed does not already have a sequence number set. So when a caller expects
+ * to record tree modifications, it should ensure to set elem->seq to zero
+ * before calling btrfs_get_tree_mod_seq.
+ * Returns a fresh, unused tree log modification sequence number, even if no new
+ * blocker was added.
+ */
+u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct btrfs_seq_list *elem)
+{
+ write_lock(&fs_info->tree_mod_log_lock);
+ if (!elem->seq) {
+ elem->seq = btrfs_inc_tree_mod_seq(fs_info);
+ list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
+ set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
+ }
+ write_unlock(&fs_info->tree_mod_log_lock);
+
+ return elem->seq;
+}
+
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct btrfs_seq_list *elem)
+{
+ struct rb_root *tm_root;
+ struct rb_node *node;
+ struct rb_node *next;
+ struct tree_mod_elem *tm;
+ u64 min_seq = BTRFS_SEQ_LAST;
+ u64 seq_putting = elem->seq;
+
+ if (!seq_putting)
+ return;
+
+ write_lock(&fs_info->tree_mod_log_lock);
+ list_del(&elem->list);
+ elem->seq = 0;
+
+ if (list_empty(&fs_info->tree_mod_seq_list)) {
+ clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
+ } else {
+ struct btrfs_seq_list *first;
+
+ first = list_first_entry(&fs_info->tree_mod_seq_list,
+ struct btrfs_seq_list, list);
+ if (seq_putting > first->seq) {
+ /*
+ * Blocker with lower sequence number exists, we cannot
+ * remove anything from the log.
+ */
+ write_unlock(&fs_info->tree_mod_log_lock);
+ return;
+ }
+ min_seq = first->seq;
+ }
+
+ /*
+ * Anything that's lower than the lowest existing (read: blocked)
+ * sequence number can be removed from the tree.
+ */
+ tm_root = &fs_info->tree_mod_log;
+ for (node = rb_first(tm_root); node; node = next) {
+ next = rb_next(node);
+ tm = rb_entry(node, struct tree_mod_elem, node);
+ if (tm->seq >= min_seq)
+ continue;
+ rb_erase(node, tm_root);
+ kfree(tm);
+ }
+ write_unlock(&fs_info->tree_mod_log_lock);
+}
+
+/*
+ * Key order of the log:
+ * node/leaf start address -> sequence
+ *
+ * The 'start address' is the logical address of the *new* root node for root
+ * replace operations, or the logical address of the affected block for all
+ * other operations.
+ */
+static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
+ struct tree_mod_elem *tm)
+{
+ struct rb_root *tm_root;
+ struct rb_node **new;
+ struct rb_node *parent = NULL;
+ struct tree_mod_elem *cur;
+
+ lockdep_assert_held_write(&fs_info->tree_mod_log_lock);
+
+ tm->seq = btrfs_inc_tree_mod_seq(fs_info);
+
+ tm_root = &fs_info->tree_mod_log;
+ new = &tm_root->rb_node;
+ while (*new) {
+ cur = rb_entry(*new, struct tree_mod_elem, node);
+ parent = *new;
+ if (cur->logical < tm->logical)
+ new = &((*new)->rb_left);
+ else if (cur->logical > tm->logical)
+ new = &((*new)->rb_right);
+ else if (cur->seq < tm->seq)
+ new = &((*new)->rb_left);
+ else if (cur->seq > tm->seq)
+ new = &((*new)->rb_right);
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node(&tm->node, parent, new);
+ rb_insert_color(&tm->node, tm_root);
+ return 0;
+}
+
+/*
+ * Determines if logging can be omitted. Returns true if it can. Otherwise, it
+ * returns false with the tree_mod_log_lock acquired. The caller must hold
+ * this until all tree mod log insertions are recorded in the rb tree and then
+ * write unlock fs_info::tree_mod_log_lock.
+ */
+static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb)
+{
+ if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
+ return true;
+ if (eb && btrfs_header_level(eb) == 0)
+ return true;
+
+ write_lock(&fs_info->tree_mod_log_lock);
+ if (list_empty(&(fs_info)->tree_mod_seq_list)) {
+ write_unlock(&fs_info->tree_mod_log_lock);
+ return true;
+ }
+
+ return false;
+}
+
+/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
+static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb)
+{
+ if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
+ return false;
+ if (eb && btrfs_header_level(eb) == 0)
+ return false;
+
+ return true;
+}
+
+static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
+ int slot,
+ enum btrfs_mod_log_op op,
+ gfp_t flags)
+{
+ struct tree_mod_elem *tm;
+
+ tm = kzalloc(sizeof(*tm), flags);
+ if (!tm)
+ return NULL;
+
+ tm->logical = eb->start;
+ if (op != BTRFS_MOD_LOG_KEY_ADD) {
+ btrfs_node_key(eb, &tm->key, slot);
+ tm->blockptr = btrfs_node_blockptr(eb, slot);
+ }
+ tm->op = op;
+ tm->slot = slot;
+ tm->generation = btrfs_node_ptr_generation(eb, slot);
+ RB_CLEAR_NODE(&tm->node);
+
+ return tm;
+}
+
+int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
+ enum btrfs_mod_log_op op, gfp_t flags)
+{
+ struct tree_mod_elem *tm;
+ int ret;
+
+ if (!tree_mod_need_log(eb->fs_info, eb))
+ return 0;
+
+ tm = alloc_tree_mod_elem(eb, slot, op, flags);
+ if (!tm)
+ return -ENOMEM;
+
+ if (tree_mod_dont_log(eb->fs_info, eb)) {
+ kfree(tm);
+ return 0;
+ }
+
+ ret = tree_mod_log_insert(eb->fs_info, tm);
+ write_unlock(&eb->fs_info->tree_mod_log_lock);
+ if (ret)
+ kfree(tm);
+
+ return ret;
+}
+
+int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
+ int dst_slot, int src_slot,
+ int nr_items)
+{
+ struct tree_mod_elem *tm = NULL;
+ struct tree_mod_elem **tm_list = NULL;
+ int ret = 0;
+ int i;
+ bool locked = false;
+
+ if (!tree_mod_need_log(eb->fs_info, eb))
+ return 0;
+
+ tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
+ if (!tm_list)
+ return -ENOMEM;
+
+ tm = kzalloc(sizeof(*tm), GFP_NOFS);
+ if (!tm) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+
+ tm->logical = eb->start;
+ tm->slot = src_slot;
+ tm->move.dst_slot = dst_slot;
+ tm->move.nr_items = nr_items;
+ tm->op = BTRFS_MOD_LOG_MOVE_KEYS;
+
+ for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
+ tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
+ BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
+ if (!tm_list[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ }
+
+ if (tree_mod_dont_log(eb->fs_info, eb))
+ goto free_tms;
+ locked = true;
+
+ /*
+ * When we override something during the move, we log these removals.
+ * This can only happen when we move towards the beginning of the
+ * buffer, i.e. dst_slot < src_slot.
+ */
+ for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
+ ret = tree_mod_log_insert(eb->fs_info, tm_list[i]);
+ if (ret)
+ goto free_tms;
+ }
+
+ ret = tree_mod_log_insert(eb->fs_info, tm);
+ if (ret)
+ goto free_tms;
+ write_unlock(&eb->fs_info->tree_mod_log_lock);
+ kfree(tm_list);
+
+ return 0;
+
+free_tms:
+ for (i = 0; i < nr_items; i++) {
+ if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
+ rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
+ kfree(tm_list[i]);
+ }
+ if (locked)
+ write_unlock(&eb->fs_info->tree_mod_log_lock);
+ kfree(tm_list);
+ kfree(tm);
+
+ return ret;
+}
+
+static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
+ struct tree_mod_elem **tm_list,
+ int nritems)
+{
+ int i, j;
+ int ret;
+
+ for (i = nritems - 1; i >= 0; i--) {
+ ret = tree_mod_log_insert(fs_info, tm_list[i]);
+ if (ret) {
+ for (j = nritems - 1; j > i; j--)
+ rb_erase(&tm_list[j]->node,
+ &fs_info->tree_mod_log);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
+ struct extent_buffer *new_root,
+ bool log_removal)
+{
+ struct btrfs_fs_info *fs_info = old_root->fs_info;
+ struct tree_mod_elem *tm = NULL;
+ struct tree_mod_elem **tm_list = NULL;
+ int nritems = 0;
+ int ret = 0;
+ int i;
+
+ if (!tree_mod_need_log(fs_info, NULL))
+ return 0;
+
+ if (log_removal && btrfs_header_level(old_root) > 0) {
+ nritems = btrfs_header_nritems(old_root);
+ tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
+ GFP_NOFS);
+ if (!tm_list) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ for (i = 0; i < nritems; i++) {
+ tm_list[i] = alloc_tree_mod_elem(old_root, i,
+ BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
+ if (!tm_list[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ }
+ }
+
+ tm = kzalloc(sizeof(*tm), GFP_NOFS);
+ if (!tm) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+
+ tm->logical = new_root->start;
+ tm->old_root.logical = old_root->start;
+ tm->old_root.level = btrfs_header_level(old_root);
+ tm->generation = btrfs_header_generation(old_root);
+ tm->op = BTRFS_MOD_LOG_ROOT_REPLACE;
+
+ if (tree_mod_dont_log(fs_info, NULL))
+ goto free_tms;
+
+ if (tm_list)
+ ret = tree_mod_log_free_eb(fs_info, tm_list, nritems);
+ if (!ret)
+ ret = tree_mod_log_insert(fs_info, tm);
+
+ write_unlock(&fs_info->tree_mod_log_lock);
+ if (ret)
+ goto free_tms;
+ kfree(tm_list);
+
+ return ret;
+
+free_tms:
+ if (tm_list) {
+ for (i = 0; i < nritems; i++)
+ kfree(tm_list[i]);
+ kfree(tm_list);
+ }
+ kfree(tm);
+
+ return ret;
+}
+
+static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info,
+ u64 start, u64 min_seq,
+ bool smallest)
+{
+ struct rb_root *tm_root;
+ struct rb_node *node;
+ struct tree_mod_elem *cur = NULL;
+ struct tree_mod_elem *found = NULL;
+
+ read_lock(&fs_info->tree_mod_log_lock);
+ tm_root = &fs_info->tree_mod_log;
+ node = tm_root->rb_node;
+ while (node) {
+ cur = rb_entry(node, struct tree_mod_elem, node);
+ if (cur->logical < start) {
+ node = node->rb_left;
+ } else if (cur->logical > start) {
+ node = node->rb_right;
+ } else if (cur->seq < min_seq) {
+ node = node->rb_left;
+ } else if (!smallest) {
+ /* We want the node with the highest seq */
+ if (found)
+ BUG_ON(found->seq > cur->seq);
+ found = cur;
+ node = node->rb_left;
+ } else if (cur->seq > min_seq) {
+ /* We want the node with the smallest seq */
+ if (found)
+ BUG_ON(found->seq < cur->seq);
+ found = cur;
+ node = node->rb_right;
+ } else {
+ found = cur;
+ break;
+ }
+ }
+ read_unlock(&fs_info->tree_mod_log_lock);
+
+ return found;
+}
+
+/*
+ * This returns the element from the log with the smallest time sequence
+ * value that's in the log (the oldest log item). Any element with a time
+ * sequence lower than min_seq will be ignored.
+ */
+static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info,
+ u64 start, u64 min_seq)
+{
+ return __tree_mod_log_search(fs_info, start, min_seq, true);
+}
+
+/*
+ * This returns the element from the log with the largest time sequence
+ * value that's in the log (the most recent log item). Any element with
+ * a time sequence lower than min_seq will be ignored.
+ */
+static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info,
+ u64 start, u64 min_seq)
+{
+ return __tree_mod_log_search(fs_info, start, min_seq, false);
+}
+
+int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
+ struct extent_buffer *src,
+ unsigned long dst_offset,
+ unsigned long src_offset,
+ int nr_items)
+{
+ struct btrfs_fs_info *fs_info = dst->fs_info;
+ int ret = 0;
+ struct tree_mod_elem **tm_list = NULL;
+ struct tree_mod_elem **tm_list_add, **tm_list_rem;
+ int i;
+ bool locked = false;
+
+ if (!tree_mod_need_log(fs_info, NULL))
+ return 0;
+
+ if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
+ return 0;
+
+ tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
+ GFP_NOFS);
+ if (!tm_list)
+ return -ENOMEM;
+
+ tm_list_add = tm_list;
+ tm_list_rem = tm_list + nr_items;
+ for (i = 0; i < nr_items; i++) {
+ tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
+ BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
+ if (!tm_list_rem[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+
+ tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
+ BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
+ if (!tm_list_add[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ }
+
+ if (tree_mod_dont_log(fs_info, NULL))
+ goto free_tms;
+ locked = true;
+
+ for (i = 0; i < nr_items; i++) {
+ ret = tree_mod_log_insert(fs_info, tm_list_rem[i]);
+ if (ret)
+ goto free_tms;
+ ret = tree_mod_log_insert(fs_info, tm_list_add[i]);
+ if (ret)
+ goto free_tms;
+ }
+
+ write_unlock(&fs_info->tree_mod_log_lock);
+ kfree(tm_list);
+
+ return 0;
+
+free_tms:
+ for (i = 0; i < nr_items * 2; i++) {
+ if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
+ rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
+ kfree(tm_list[i]);
+ }
+ if (locked)
+ write_unlock(&fs_info->tree_mod_log_lock);
+ kfree(tm_list);
+
+ return ret;
+}
+
+int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
+{
+ struct tree_mod_elem **tm_list = NULL;
+ int nritems = 0;
+ int i;
+ int ret = 0;
+
+ if (!tree_mod_need_log(eb->fs_info, eb))
+ return 0;
+
+ nritems = btrfs_header_nritems(eb);
+ tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
+ if (!tm_list)
+ return -ENOMEM;
+
+ for (i = 0; i < nritems; i++) {
+ tm_list[i] = alloc_tree_mod_elem(eb, i,
+ BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
+ if (!tm_list[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ }
+
+ if (tree_mod_dont_log(eb->fs_info, eb))
+ goto free_tms;
+
+ ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
+ write_unlock(&eb->fs_info->tree_mod_log_lock);
+ if (ret)
+ goto free_tms;
+ kfree(tm_list);
+
+ return 0;
+
+free_tms:
+ for (i = 0; i < nritems; i++)
+ kfree(tm_list[i]);
+ kfree(tm_list);
+
+ return ret;
+}
+
+/*
+ * Returns the logical address of the oldest predecessor of the given root.
+ * Entries older than time_seq are ignored.
+ */
+static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root,
+ u64 time_seq)
+{
+ struct tree_mod_elem *tm;
+ struct tree_mod_elem *found = NULL;
+ u64 root_logical = eb_root->start;
+ bool looped = false;
+
+ if (!time_seq)
+ return NULL;
+
+ /*
+ * The very last operation that's logged for a root is the replacement
+ * operation (if it is replaced at all). This has the logical address
+ * of the *new* root, making it the very first operation that's logged
+ * for this root.
+ */
+ while (1) {
+ tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
+ time_seq);
+ if (!looped && !tm)
+ return NULL;
+ /*
+ * If there are no tree operation for the oldest root, we simply
+ * return it. This should only happen if that (old) root is at
+ * level 0.
+ */
+ if (!tm)
+ break;
+
+ /*
+ * If there's an operation that's not a root replacement, we
+ * found the oldest version of our root. Normally, we'll find a
+ * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
+ */
+ if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE)
+ break;
+
+ found = tm;
+ root_logical = tm->old_root.logical;
+ looped = true;
+ }
+
+ /* If there's no old root to return, return what we found instead */
+ if (!found)
+ found = tm;
+
+ return found;
+}
+
+
+/*
+ * tm is a pointer to the first operation to rewind within eb. Then, all
+ * previous operations will be rewound (until we reach something older than
+ * time_seq).
+ */
+static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb,
+ u64 time_seq,
+ struct tree_mod_elem *first_tm)
+{
+ u32 n;
+ struct rb_node *next;
+ struct tree_mod_elem *tm = first_tm;
+ unsigned long o_dst;
+ unsigned long o_src;
+ unsigned long p_size = sizeof(struct btrfs_key_ptr);
+
+ n = btrfs_header_nritems(eb);
+ read_lock(&fs_info->tree_mod_log_lock);
+ while (tm && tm->seq >= time_seq) {
+ /*
+ * All the operations are recorded with the operator used for
+ * the modification. As we're going backwards, we do the
+ * opposite of each operation here.
+ */
+ switch (tm->op) {
+ case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
+ BUG_ON(tm->slot < n);
+ fallthrough;
+ case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
+ case BTRFS_MOD_LOG_KEY_REMOVE:
+ btrfs_set_node_key(eb, &tm->key, tm->slot);
+ btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+ btrfs_set_node_ptr_generation(eb, tm->slot,
+ tm->generation);
+ n++;
+ break;
+ case BTRFS_MOD_LOG_KEY_REPLACE:
+ BUG_ON(tm->slot >= n);
+ btrfs_set_node_key(eb, &tm->key, tm->slot);
+ btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+ btrfs_set_node_ptr_generation(eb, tm->slot,
+ tm->generation);
+ break;
+ case BTRFS_MOD_LOG_KEY_ADD:
+ /* if a move operation is needed it's in the log */
+ n--;
+ break;
+ case BTRFS_MOD_LOG_MOVE_KEYS:
+ o_dst = btrfs_node_key_ptr_offset(tm->slot);
+ o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
+ memmove_extent_buffer(eb, o_dst, o_src,
+ tm->move.nr_items * p_size);
+ break;
+ case BTRFS_MOD_LOG_ROOT_REPLACE:
+ /*
+ * This operation is special. For roots, this must be
+ * handled explicitly before rewinding.
+ * For non-roots, this operation may exist if the node
+ * was a root: root A -> child B; then A gets empty and
+ * B is promoted to the new root. In the mod log, we'll
+ * have a root-replace operation for B, a tree block
+ * that is no root. We simply ignore that operation.
+ */
+ break;
+ }
+ next = rb_next(&tm->node);
+ if (!next)
+ break;
+ tm = rb_entry(next, struct tree_mod_elem, node);
+ if (tm->logical != first_tm->logical)
+ break;
+ }
+ read_unlock(&fs_info->tree_mod_log_lock);
+ btrfs_set_header_nritems(eb, n);
+}
+
+/*
+ * Called with eb read locked. If the buffer cannot be rewound, the same buffer
+ * is returned. If rewind operations happen, a fresh buffer is returned. The
+ * returned buffer is always read-locked. If the returned buffer is not the
+ * input buffer, the lock on the input buffer is released and the input buffer
+ * is freed (its refcount is decremented).
+ */
+struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ struct extent_buffer *eb,
+ u64 time_seq)
+{
+ struct extent_buffer *eb_rewin;
+ struct tree_mod_elem *tm;
+
+ if (!time_seq)
+ return eb;
+
+ if (btrfs_header_level(eb) == 0)
+ return eb;
+
+ tm = tree_mod_log_search(fs_info, eb->start, time_seq);
+ if (!tm)
+ return eb;
+
+ if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
+ BUG_ON(tm->slot != 0);
+ eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
+ if (!eb_rewin) {
+ btrfs_tree_read_unlock(eb);
+ free_extent_buffer(eb);
+ return NULL;
+ }
+ btrfs_set_header_bytenr(eb_rewin, eb->start);
+ btrfs_set_header_backref_rev(eb_rewin,
+ btrfs_header_backref_rev(eb));
+ btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
+ btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
+ } else {
+ eb_rewin = btrfs_clone_extent_buffer(eb);
+ if (!eb_rewin) {
+ btrfs_tree_read_unlock(eb);
+ free_extent_buffer(eb);
+ return NULL;
+ }
+ }
+
+ btrfs_tree_read_unlock(eb);
+ free_extent_buffer(eb);
+
+ btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
+ eb_rewin, btrfs_header_level(eb_rewin));
+ btrfs_tree_read_lock(eb_rewin);
+ tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
+ WARN_ON(btrfs_header_nritems(eb_rewin) >
+ BTRFS_NODEPTRS_PER_BLOCK(fs_info));
+
+ return eb_rewin;
+}
+
+/*
+ * Rewind the state of @root's root node to the given @time_seq value.
+ * If there are no changes, the current root->root_node is returned. If anything
+ * changed in between, there's a fresh buffer allocated on which the rewind
+ * operations are done. In any case, the returned buffer is read locked.
+ * Returns NULL on error (with no locks held).
+ */
+struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct tree_mod_elem *tm;
+ struct extent_buffer *eb = NULL;
+ struct extent_buffer *eb_root;
+ u64 eb_root_owner = 0;
+ struct extent_buffer *old;
+ struct tree_mod_root *old_root = NULL;
+ u64 old_generation = 0;
+ u64 logical;
+ int level;
+
+ eb_root = btrfs_read_lock_root_node(root);
+ tm = tree_mod_log_oldest_root(eb_root, time_seq);
+ if (!tm)
+ return eb_root;
+
+ if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) {
+ old_root = &tm->old_root;
+ old_generation = tm->generation;
+ logical = old_root->logical;
+ level = old_root->level;
+ } else {
+ logical = eb_root->start;
+ level = btrfs_header_level(eb_root);
+ }
+
+ tm = tree_mod_log_search(fs_info, logical, time_seq);
+ if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
+ btrfs_tree_read_unlock(eb_root);
+ free_extent_buffer(eb_root);
+ old = read_tree_block(fs_info, logical, root->root_key.objectid,
+ 0, level, NULL);
+ if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
+ if (!IS_ERR(old))
+ free_extent_buffer(old);
+ btrfs_warn(fs_info,
+ "failed to read tree block %llu from get_old_root",
+ logical);
+ } else {
+ struct tree_mod_elem *tm2;
+
+ btrfs_tree_read_lock(old);
+ eb = btrfs_clone_extent_buffer(old);
+ /*
+ * After the lookup for the most recent tree mod operation
+ * above and before we locked and cloned the extent buffer
+ * 'old', a new tree mod log operation may have been added.
+ * So lookup for a more recent one to make sure the number
+ * of mod log operations we replay is consistent with the
+ * number of items we have in the cloned extent buffer,
+ * otherwise we can hit a BUG_ON when rewinding the extent
+ * buffer.
+ */
+ tm2 = tree_mod_log_search(fs_info, logical, time_seq);
+ btrfs_tree_read_unlock(old);
+ free_extent_buffer(old);
+ ASSERT(tm2);
+ ASSERT(tm2 == tm || tm2->seq > tm->seq);
+ if (!tm2 || tm2->seq < tm->seq) {
+ free_extent_buffer(eb);
+ return NULL;
+ }
+ tm = tm2;
+ }
+ } else if (old_root) {
+ eb_root_owner = btrfs_header_owner(eb_root);
+ btrfs_tree_read_unlock(eb_root);
+ free_extent_buffer(eb_root);
+ eb = alloc_dummy_extent_buffer(fs_info, logical);
+ } else {
+ eb = btrfs_clone_extent_buffer(eb_root);
+ btrfs_tree_read_unlock(eb_root);
+ free_extent_buffer(eb_root);
+ }
+
+ if (!eb)
+ return NULL;
+ if (old_root) {
+ btrfs_set_header_bytenr(eb, eb->start);
+ btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
+ btrfs_set_header_owner(eb, eb_root_owner);
+ btrfs_set_header_level(eb, old_root->level);
+ btrfs_set_header_generation(eb, old_generation);
+ }
+ btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
+ btrfs_header_level(eb));
+ btrfs_tree_read_lock(eb);
+ if (tm)
+ tree_mod_log_rewind(fs_info, eb, time_seq, tm);
+ else
+ WARN_ON(btrfs_header_level(eb) != 0);
+ WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));
+
+ return eb;
+}
+
+int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
+{
+ struct tree_mod_elem *tm;
+ int level;
+ struct extent_buffer *eb_root = btrfs_root_node(root);
+
+ tm = tree_mod_log_oldest_root(eb_root, time_seq);
+ if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE)
+ level = tm->old_root.level;
+ else
+ level = btrfs_header_level(eb_root);
+
+ free_extent_buffer(eb_root);
+
+ return level;
+}
+
+/*
+ * Return the lowest sequence number in the tree modification log.
+ *
+ * Return the sequence number of the oldest tree modification log user, which
+ * corresponds to the lowest sequence number of all existing users. If there are
+ * no users it returns 0.
+ */
+u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info)
+{
+ u64 ret = 0;
+
+ read_lock(&fs_info->tree_mod_log_lock);
+ if (!list_empty(&fs_info->tree_mod_seq_list)) {
+ struct btrfs_seq_list *elem;
+
+ elem = list_first_entry(&fs_info->tree_mod_seq_list,
+ struct btrfs_seq_list, list);
+ ret = elem->seq;
+ }
+ read_unlock(&fs_info->tree_mod_log_lock);
+
+ return ret;
+}
diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h
new file mode 100644
index 000000000..12605d196
--- /dev/null
+++ b/fs/btrfs/tree-mod-log.h
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#ifndef BTRFS_TREE_MOD_LOG_H
+#define BTRFS_TREE_MOD_LOG_H
+
+#include "ctree.h"
+
+/* Represents a tree mod log user. */
+struct btrfs_seq_list {
+ struct list_head list;
+ u64 seq;
+};
+
+#define BTRFS_SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 }
+#define BTRFS_SEQ_LAST ((u64)-1)
+
+enum btrfs_mod_log_op {
+ BTRFS_MOD_LOG_KEY_REPLACE,
+ BTRFS_MOD_LOG_KEY_ADD,
+ BTRFS_MOD_LOG_KEY_REMOVE,
+ BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING,
+ BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING,
+ BTRFS_MOD_LOG_MOVE_KEYS,
+ BTRFS_MOD_LOG_ROOT_REPLACE,
+};
+
+u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct btrfs_seq_list *elem);
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct btrfs_seq_list *elem);
+int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
+ struct extent_buffer *new_root,
+ bool log_removal);
+int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
+ enum btrfs_mod_log_op op, gfp_t flags);
+int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb);
+struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ struct extent_buffer *eb,
+ u64 time_seq);
+struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq);
+int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
+int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
+ struct extent_buffer *src,
+ unsigned long dst_offset,
+ unsigned long src_offset,
+ int nr_items);
+int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
+ int dst_slot, int src_slot,
+ int nr_items);
+u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info);
+
+#endif
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
new file mode 100644
index 000000000..3374c9e9b
--- /dev/null
+++ b/fs/btrfs/ulist.c
@@ -0,0 +1,276 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2011 STRATO AG
+ * written by Arne Jansen <sensille@gmx.net>
+ */
+
+#include <linux/slab.h>
+#include "ulist.h"
+#include "ctree.h"
+
+/*
+ * ulist is a generic data structure to hold a collection of unique u64
+ * values. The only operations it supports is adding to the list and
+ * enumerating it.
+ * It is possible to store an auxiliary value along with the key.
+ *
+ * A sample usage for ulists is the enumeration of directed graphs without
+ * visiting a node twice. The pseudo-code could look like this:
+ *
+ * ulist = ulist_alloc();
+ * ulist_add(ulist, root);
+ * ULIST_ITER_INIT(&uiter);
+ *
+ * while ((elem = ulist_next(ulist, &uiter)) {
+ * for (all child nodes n in elem)
+ * ulist_add(ulist, n);
+ * do something useful with the node;
+ * }
+ * ulist_free(ulist);
+ *
+ * This assumes the graph nodes are addressable by u64. This stems from the
+ * usage for tree enumeration in btrfs, where the logical addresses are
+ * 64 bit.
+ *
+ * It is also useful for tree enumeration which could be done elegantly
+ * recursively, but is not possible due to kernel stack limitations. The
+ * loop would be similar to the above.
+ */
+
+/**
+ * ulist_init - freshly initialize a ulist
+ * @ulist: the ulist to initialize
+ *
+ * Note: don't use this function to init an already used ulist, use
+ * ulist_reinit instead.
+ */
+void ulist_init(struct ulist *ulist)
+{
+ INIT_LIST_HEAD(&ulist->nodes);
+ ulist->root = RB_ROOT;
+ ulist->nnodes = 0;
+}
+
+/**
+ * ulist_release - free up additionally allocated memory for the ulist
+ * @ulist: the ulist from which to free the additional memory
+ *
+ * This is useful in cases where the base 'struct ulist' has been statically
+ * allocated.
+ */
+void ulist_release(struct ulist *ulist)
+{
+ struct ulist_node *node;
+ struct ulist_node *next;
+
+ list_for_each_entry_safe(node, next, &ulist->nodes, list) {
+ kfree(node);
+ }
+ ulist->root = RB_ROOT;
+ INIT_LIST_HEAD(&ulist->nodes);
+}
+
+/**
+ * ulist_reinit - prepare a ulist for reuse
+ * @ulist: ulist to be reused
+ *
+ * Free up all additional memory allocated for the list elements and reinit
+ * the ulist.
+ */
+void ulist_reinit(struct ulist *ulist)
+{
+ ulist_release(ulist);
+ ulist_init(ulist);
+}
+
+/**
+ * ulist_alloc - dynamically allocate a ulist
+ * @gfp_mask: allocation flags to for base allocation
+ *
+ * The allocated ulist will be returned in an initialized state.
+ */
+struct ulist *ulist_alloc(gfp_t gfp_mask)
+{
+ struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask);
+
+ if (!ulist)
+ return NULL;
+
+ ulist_init(ulist);
+
+ return ulist;
+}
+
+/**
+ * ulist_free - free dynamically allocated ulist
+ * @ulist: ulist to free
+ *
+ * It is not necessary to call ulist_release before.
+ */
+void ulist_free(struct ulist *ulist)
+{
+ if (!ulist)
+ return;
+ ulist_release(ulist);
+ kfree(ulist);
+}
+
+static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val)
+{
+ struct rb_node *n = ulist->root.rb_node;
+ struct ulist_node *u = NULL;
+
+ while (n) {
+ u = rb_entry(n, struct ulist_node, rb_node);
+ if (u->val < val)
+ n = n->rb_right;
+ else if (u->val > val)
+ n = n->rb_left;
+ else
+ return u;
+ }
+ return NULL;
+}
+
+static void ulist_rbtree_erase(struct ulist *ulist, struct ulist_node *node)
+{
+ rb_erase(&node->rb_node, &ulist->root);
+ list_del(&node->list);
+ kfree(node);
+ BUG_ON(ulist->nnodes == 0);
+ ulist->nnodes--;
+}
+
+static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins)
+{
+ struct rb_node **p = &ulist->root.rb_node;
+ struct rb_node *parent = NULL;
+ struct ulist_node *cur = NULL;
+
+ while (*p) {
+ parent = *p;
+ cur = rb_entry(parent, struct ulist_node, rb_node);
+
+ if (cur->val < ins->val)
+ p = &(*p)->rb_right;
+ else if (cur->val > ins->val)
+ p = &(*p)->rb_left;
+ else
+ return -EEXIST;
+ }
+ rb_link_node(&ins->rb_node, parent, p);
+ rb_insert_color(&ins->rb_node, &ulist->root);
+ return 0;
+}
+
+/**
+ * ulist_add - add an element to the ulist
+ * @ulist: ulist to add the element to
+ * @val: value to add to ulist
+ * @aux: auxiliary value to store along with val
+ * @gfp_mask: flags to use for allocation
+ *
+ * Note: locking must be provided by the caller. In case of rwlocks write
+ * locking is needed
+ *
+ * Add an element to a ulist. The @val will only be added if it doesn't
+ * already exist. If it is added, the auxiliary value @aux is stored along with
+ * it. In case @val already exists in the ulist, @aux is ignored, even if
+ * it differs from the already stored value.
+ *
+ * ulist_add returns 0 if @val already exists in ulist and 1 if @val has been
+ * inserted.
+ * In case of allocation failure -ENOMEM is returned and the ulist stays
+ * unaltered.
+ */
+int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask)
+{
+ return ulist_add_merge(ulist, val, aux, NULL, gfp_mask);
+}
+
+int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
+ u64 *old_aux, gfp_t gfp_mask)
+{
+ int ret;
+ struct ulist_node *node;
+
+ node = ulist_rbtree_search(ulist, val);
+ if (node) {
+ if (old_aux)
+ *old_aux = node->aux;
+ return 0;
+ }
+ node = kmalloc(sizeof(*node), gfp_mask);
+ if (!node)
+ return -ENOMEM;
+
+ node->val = val;
+ node->aux = aux;
+
+ ret = ulist_rbtree_insert(ulist, node);
+ ASSERT(!ret);
+ list_add_tail(&node->list, &ulist->nodes);
+ ulist->nnodes++;
+
+ return 1;
+}
+
+/*
+ * ulist_del - delete one node from ulist
+ * @ulist: ulist to remove node from
+ * @val: value to delete
+ * @aux: aux to delete
+ *
+ * The deletion will only be done when *BOTH* val and aux matches.
+ * Return 0 for successful delete.
+ * Return > 0 for not found.
+ */
+int ulist_del(struct ulist *ulist, u64 val, u64 aux)
+{
+ struct ulist_node *node;
+
+ node = ulist_rbtree_search(ulist, val);
+ /* Not found */
+ if (!node)
+ return 1;
+
+ if (node->aux != aux)
+ return 1;
+
+ /* Found and delete */
+ ulist_rbtree_erase(ulist, node);
+ return 0;
+}
+
+/**
+ * ulist_next - iterate ulist
+ * @ulist: ulist to iterate
+ * @uiter: iterator variable, initialized with ULIST_ITER_INIT(&iterator)
+ *
+ * Note: locking must be provided by the caller. In case of rwlocks only read
+ * locking is needed
+ *
+ * This function is used to iterate an ulist.
+ * It returns the next element from the ulist or %NULL when the
+ * end is reached. No guarantee is made with respect to the order in which
+ * the elements are returned. They might neither be returned in order of
+ * addition nor in ascending order.
+ * It is allowed to call ulist_add during an enumeration. Newly added items
+ * are guaranteed to show up in the running enumeration.
+ */
+struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter)
+{
+ struct ulist_node *node;
+
+ if (list_empty(&ulist->nodes))
+ return NULL;
+ if (uiter->cur_list && uiter->cur_list->next == &ulist->nodes)
+ return NULL;
+ if (uiter->cur_list) {
+ uiter->cur_list = uiter->cur_list->next;
+ } else {
+ uiter->cur_list = ulist->nodes.next;
+ }
+ node = list_entry(uiter->cur_list, struct ulist_node, list);
+ return node;
+}
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
new file mode 100644
index 000000000..02fda0a2d
--- /dev/null
+++ b/fs/btrfs/ulist.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2011 STRATO AG
+ * written by Arne Jansen <sensille@gmx.net>
+ */
+
+#ifndef BTRFS_ULIST_H
+#define BTRFS_ULIST_H
+
+#include <linux/list.h>
+#include <linux/rbtree.h>
+
+/*
+ * ulist is a generic data structure to hold a collection of unique u64
+ * values. The only operations it supports is adding to the list and
+ * enumerating it.
+ * It is possible to store an auxiliary value along with the key.
+ *
+ */
+struct ulist_iterator {
+ struct list_head *cur_list; /* hint to start search */
+};
+
+/*
+ * element of the list
+ */
+struct ulist_node {
+ u64 val; /* value to store */
+ u64 aux; /* auxiliary value saved along with the val */
+
+ struct list_head list; /* used to link node */
+ struct rb_node rb_node; /* used to speed up search */
+};
+
+struct ulist {
+ /*
+ * number of elements stored in list
+ */
+ unsigned long nnodes;
+
+ struct list_head nodes;
+ struct rb_root root;
+};
+
+void ulist_init(struct ulist *ulist);
+void ulist_release(struct ulist *ulist);
+void ulist_reinit(struct ulist *ulist);
+struct ulist *ulist_alloc(gfp_t gfp_mask);
+void ulist_free(struct ulist *ulist);
+int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask);
+int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
+ u64 *old_aux, gfp_t gfp_mask);
+int ulist_del(struct ulist *ulist, u64 val, u64 aux);
+
+/* just like ulist_add_merge() but take a pointer for the aux data */
+static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux,
+ void **old_aux, gfp_t gfp_mask)
+{
+#if BITS_PER_LONG == 32
+ u64 old64 = (uintptr_t)*old_aux;
+ int ret = ulist_add_merge(ulist, val, (uintptr_t)aux, &old64, gfp_mask);
+ *old_aux = (void *)((uintptr_t)old64);
+ return ret;
+#else
+ return ulist_add_merge(ulist, val, (u64)aux, (u64 *)old_aux, gfp_mask);
+#endif
+}
+
+struct ulist_node *ulist_next(struct ulist *ulist,
+ struct ulist_iterator *uiter);
+
+#define ULIST_ITER_INIT(uiter) ((uiter)->cur_list = NULL)
+
+#endif
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
new file mode 100644
index 000000000..b458452a1
--- /dev/null
+++ b/fs/btrfs/uuid-tree.c
@@ -0,0 +1,390 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) STRATO AG 2013. All rights reserved.
+ */
+
+#include <linux/uuid.h>
+#include <asm/unaligned.h>
+#include "ctree.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "print-tree.h"
+
+
+static void btrfs_uuid_to_key(u8 *uuid, u8 type, struct btrfs_key *key)
+{
+ key->type = type;
+ key->objectid = get_unaligned_le64(uuid);
+ key->offset = get_unaligned_le64(uuid + sizeof(u64));
+}
+
+/* return -ENOENT for !found, < 0 for errors, or 0 if an item was found */
+static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, u8 *uuid,
+ u8 type, u64 subid)
+{
+ int ret;
+ struct btrfs_path *path = NULL;
+ struct extent_buffer *eb;
+ int slot;
+ u32 item_size;
+ unsigned long offset;
+ struct btrfs_key key;
+
+ if (WARN_ON_ONCE(!uuid_root)) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ btrfs_uuid_to_key(uuid, type, &key);
+ ret = btrfs_search_slot(NULL, uuid_root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ item_size = btrfs_item_size(eb, slot);
+ offset = btrfs_item_ptr_offset(eb, slot);
+ ret = -ENOENT;
+
+ if (!IS_ALIGNED(item_size, sizeof(u64))) {
+ btrfs_warn(uuid_root->fs_info,
+ "uuid item with illegal size %lu!",
+ (unsigned long)item_size);
+ goto out;
+ }
+ while (item_size) {
+ __le64 data;
+
+ read_extent_buffer(eb, &data, offset, sizeof(data));
+ if (le64_to_cpu(data) == subid) {
+ ret = 0;
+ break;
+ }
+ offset += sizeof(data);
+ item_size -= sizeof(data);
+ }
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
+ u64 subid_cpu)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *uuid_root = fs_info->uuid_root;
+ int ret;
+ struct btrfs_path *path = NULL;
+ struct btrfs_key key;
+ struct extent_buffer *eb;
+ int slot;
+ unsigned long offset;
+ __le64 subid_le;
+
+ ret = btrfs_uuid_tree_lookup(uuid_root, uuid, type, subid_cpu);
+ if (ret != -ENOENT)
+ return ret;
+
+ if (WARN_ON_ONCE(!uuid_root)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ btrfs_uuid_to_key(uuid, type, &key);
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = btrfs_insert_empty_item(trans, uuid_root, path, &key,
+ sizeof(subid_le));
+ if (ret >= 0) {
+ /* Add an item for the type for the first time */
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ offset = btrfs_item_ptr_offset(eb, slot);
+ } else if (ret == -EEXIST) {
+ /*
+ * An item with that type already exists.
+ * Extend the item and store the new subid at the end.
+ */
+ btrfs_extend_item(path, sizeof(subid_le));
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ offset = btrfs_item_ptr_offset(eb, slot);
+ offset += btrfs_item_size(eb, slot) - sizeof(subid_le);
+ } else {
+ btrfs_warn(fs_info,
+ "insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!",
+ ret, key.objectid, key.offset, type);
+ goto out;
+ }
+
+ ret = 0;
+ subid_le = cpu_to_le64(subid_cpu);
+ write_extent_buffer(eb, &subid_le, offset, sizeof(subid_le));
+ btrfs_mark_buffer_dirty(eb);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
+ u64 subid)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *uuid_root = fs_info->uuid_root;
+ int ret;
+ struct btrfs_path *path = NULL;
+ struct btrfs_key key;
+ struct extent_buffer *eb;
+ int slot;
+ unsigned long offset;
+ u32 item_size;
+ unsigned long move_dst;
+ unsigned long move_src;
+ unsigned long move_len;
+
+ if (WARN_ON_ONCE(!uuid_root)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ btrfs_uuid_to_key(uuid, type, &key);
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = btrfs_search_slot(trans, uuid_root, &key, path, -1, 1);
+ if (ret < 0) {
+ btrfs_warn(fs_info, "error %d while searching for uuid item!",
+ ret);
+ goto out;
+ }
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ offset = btrfs_item_ptr_offset(eb, slot);
+ item_size = btrfs_item_size(eb, slot);
+ if (!IS_ALIGNED(item_size, sizeof(u64))) {
+ btrfs_warn(fs_info, "uuid item with illegal size %lu!",
+ (unsigned long)item_size);
+ ret = -ENOENT;
+ goto out;
+ }
+ while (item_size) {
+ __le64 read_subid;
+
+ read_extent_buffer(eb, &read_subid, offset, sizeof(read_subid));
+ if (le64_to_cpu(read_subid) == subid)
+ break;
+ offset += sizeof(read_subid);
+ item_size -= sizeof(read_subid);
+ }
+
+ if (!item_size) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ item_size = btrfs_item_size(eb, slot);
+ if (item_size == sizeof(subid)) {
+ ret = btrfs_del_item(trans, uuid_root, path);
+ goto out;
+ }
+
+ move_dst = offset;
+ move_src = offset + sizeof(subid);
+ move_len = item_size - (move_src - btrfs_item_ptr_offset(eb, slot));
+ memmove_extent_buffer(eb, move_dst, move_src, move_len);
+ btrfs_truncate_item(path, item_size - sizeof(subid), 1);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int btrfs_uuid_iter_rem(struct btrfs_root *uuid_root, u8 *uuid, u8 type,
+ u64 subid)
+{
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ /* 1 - for the uuid item */
+ trans = btrfs_start_transaction(uuid_root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ ret = btrfs_uuid_tree_remove(trans, uuid, type, subid);
+ btrfs_end_transaction(trans);
+
+out:
+ return ret;
+}
+
+/*
+ * Check if there's an matching subvolume for given UUID
+ *
+ * Return:
+ * 0 check succeeded, the entry is not outdated
+ * > 0 if the check failed, the caller should remove the entry
+ * < 0 if an error occurred
+ */
+static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
+ u8 *uuid, u8 type, u64 subvolid)
+{
+ int ret = 0;
+ struct btrfs_root *subvol_root;
+
+ if (type != BTRFS_UUID_KEY_SUBVOL &&
+ type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
+ goto out;
+
+ subvol_root = btrfs_get_fs_root(fs_info, subvolid, true);
+ if (IS_ERR(subvol_root)) {
+ ret = PTR_ERR(subvol_root);
+ if (ret == -ENOENT)
+ ret = 1;
+ goto out;
+ }
+
+ switch (type) {
+ case BTRFS_UUID_KEY_SUBVOL:
+ if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
+ ret = 1;
+ break;
+ case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
+ if (memcmp(uuid, subvol_root->root_item.received_uuid,
+ BTRFS_UUID_SIZE))
+ ret = 1;
+ break;
+ }
+ btrfs_put_root(subvol_root);
+out:
+ return ret;
+}
+
+int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root = fs_info->uuid_root;
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ int ret = 0;
+ struct extent_buffer *leaf;
+ int slot;
+ u32 item_size;
+ unsigned long offset;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ key.objectid = 0;
+ key.type = 0;
+ key.offset = 0;
+
+again_search_slot:
+ ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
+ if (ret) {
+ if (ret > 0)
+ ret = 0;
+ goto out;
+ }
+
+ while (1) {
+ if (btrfs_fs_closing(fs_info)) {
+ ret = -EINTR;
+ goto out;
+ }
+ cond_resched();
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ if (key.type != BTRFS_UUID_KEY_SUBVOL &&
+ key.type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
+ goto skip;
+
+ offset = btrfs_item_ptr_offset(leaf, slot);
+ item_size = btrfs_item_size(leaf, slot);
+ if (!IS_ALIGNED(item_size, sizeof(u64))) {
+ btrfs_warn(fs_info,
+ "uuid item with illegal size %lu!",
+ (unsigned long)item_size);
+ goto skip;
+ }
+ while (item_size) {
+ u8 uuid[BTRFS_UUID_SIZE];
+ __le64 subid_le;
+ u64 subid_cpu;
+
+ put_unaligned_le64(key.objectid, uuid);
+ put_unaligned_le64(key.offset, uuid + sizeof(u64));
+ read_extent_buffer(leaf, &subid_le, offset,
+ sizeof(subid_le));
+ subid_cpu = le64_to_cpu(subid_le);
+ ret = btrfs_check_uuid_tree_entry(fs_info, uuid,
+ key.type, subid_cpu);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ btrfs_release_path(path);
+ ret = btrfs_uuid_iter_rem(root, uuid, key.type,
+ subid_cpu);
+ if (ret == 0) {
+ /*
+ * this might look inefficient, but the
+ * justification is that it is an
+ * exception that check_func returns 1,
+ * and that in the regular case only one
+ * entry per UUID exists.
+ */
+ goto again_search_slot;
+ }
+ if (ret < 0 && ret != -ENOENT)
+ goto out;
+ key.offset++;
+ goto again_search_slot;
+ }
+ item_size -= sizeof(subid_le);
+ offset += sizeof(subid_le);
+ }
+
+skip:
+ ret = btrfs_next_item(root, path);
+ if (ret == 0)
+ continue;
+ else if (ret > 0)
+ ret = 0;
+ break;
+ }
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
new file mode 100644
index 000000000..ee00e33c3
--- /dev/null
+++ b/fs/btrfs/verity.c
@@ -0,0 +1,812 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/rwsem.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/iversion.h>
+#include <linux/fsverity.h>
+#include <linux/sched/mm.h>
+#include "ctree.h"
+#include "btrfs_inode.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "locking.h"
+
+/*
+ * Implementation of the interface defined in struct fsverity_operations.
+ *
+ * The main question is how and where to store the verity descriptor and the
+ * Merkle tree. We store both in dedicated btree items in the filesystem tree,
+ * together with the rest of the inode metadata. This means we'll need to do
+ * extra work to encrypt them once encryption is supported in btrfs, but btrfs
+ * has a lot of careful code around i_size and it seems better to make a new key
+ * type than try and adjust all of our expectations for i_size.
+ *
+ * Note that this differs from the implementation in ext4 and f2fs, where
+ * this data is stored as if it were in the file, but past EOF. However, btrfs
+ * does not have a widespread mechanism for caching opaque metadata pages, so we
+ * do pretend that the Merkle tree pages themselves are past EOF for the
+ * purposes of caching them (as opposed to creating a virtual inode).
+ *
+ * fs verity items are stored under two different key types on disk.
+ * The descriptor items:
+ * [ inode objectid, BTRFS_VERITY_DESC_ITEM_KEY, offset ]
+ *
+ * At offset 0, we store a btrfs_verity_descriptor_item which tracks the
+ * size of the descriptor item and some extra data for encryption.
+ * Starting at offset 1, these hold the generic fs verity descriptor.
+ * The latter are opaque to btrfs, we just read and write them as a blob for
+ * the higher level verity code. The most common descriptor size is 256 bytes.
+ *
+ * The merkle tree items:
+ * [ inode objectid, BTRFS_VERITY_MERKLE_ITEM_KEY, offset ]
+ *
+ * These also start at offset 0, and correspond to the merkle tree bytes.
+ * So when fsverity asks for page 0 of the merkle tree, we pull up one page
+ * starting at offset 0 for this key type. These are also opaque to btrfs,
+ * we're blindly storing whatever fsverity sends down.
+ *
+ * Another important consideration is the fact that the Merkle tree data scales
+ * linearly with the size of the file (with 4K pages/blocks and SHA-256, it's
+ * ~1/127th the size) so for large files, writing the tree can be a lengthy
+ * operation. For that reason, we guard the whole enable verity operation
+ * (between begin_enable_verity and end_enable_verity) with an orphan item.
+ * Again, because the data can be pretty large, it's quite possible that we
+ * could run out of space writing it, so we try our best to handle errors by
+ * stopping and rolling back rather than aborting the victim transaction.
+ */
+
+#define MERKLE_START_ALIGN 65536
+
+/*
+ * Compute the logical file offset where we cache the Merkle tree.
+ *
+ * @inode: inode of the verity file
+ *
+ * For the purposes of caching the Merkle tree pages, as required by
+ * fs-verity, it is convenient to do size computations in terms of a file
+ * offset, rather than in terms of page indices.
+ *
+ * Use 64K to be sure it's past the last page in the file, even with 64K pages.
+ * That rounding operation itself can overflow loff_t, so we do it in u64 and
+ * check.
+ *
+ * Returns the file offset on success, negative error code on failure.
+ */
+static loff_t merkle_file_pos(const struct inode *inode)
+{
+ u64 sz = inode->i_size;
+ u64 rounded = round_up(sz, MERKLE_START_ALIGN);
+
+ if (rounded > inode->i_sb->s_maxbytes)
+ return -EFBIG;
+
+ return rounded;
+}
+
+/*
+ * Drop all the items for this inode with this key_type.
+ *
+ * @inode: inode to drop items for
+ * @key_type: type of items to drop (BTRFS_VERITY_DESC_ITEM or
+ * BTRFS_VERITY_MERKLE_ITEM)
+ *
+ * Before doing a verity enable we cleanup any existing verity items.
+ * This is also used to clean up if a verity enable failed half way through.
+ *
+ * Returns number of dropped items on success, negative error code on failure.
+ */
+static int drop_verity_items(struct btrfs_inode *inode, u8 key_type)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int count = 0;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ while (1) {
+ /* 1 for the item being dropped */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ /*
+ * Walk backwards through all the items until we find one that
+ * isn't from our key type or objectid
+ */
+ key.objectid = btrfs_ino(inode);
+ key.type = key_type;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0) {
+ ret = 0;
+ /* No more keys of this type, we're done */
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+ } else if (ret < 0) {
+ btrfs_end_transaction(trans);
+ goto out;
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ /* No more keys of this type, we're done */
+ if (key.objectid != btrfs_ino(inode) || key.type != key_type)
+ break;
+
+ /*
+ * This shouldn't be a performance sensitive function because
+ * it's not used as part of truncate. If it ever becomes
+ * perf sensitive, change this to walk forward and bulk delete
+ * items
+ */
+ ret = btrfs_del_items(trans, root, path, path->slots[0], 1);
+ if (ret) {
+ btrfs_end_transaction(trans);
+ goto out;
+ }
+ count++;
+ btrfs_release_path(path);
+ btrfs_end_transaction(trans);
+ }
+ ret = count;
+ btrfs_end_transaction(trans);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Drop all verity items
+ *
+ * @inode: inode to drop verity items for
+ *
+ * In most contexts where we are dropping verity items, we want to do it for all
+ * the types of verity items, not a particular one.
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+int btrfs_drop_verity_items(struct btrfs_inode *inode)
+{
+ int ret;
+
+ ret = drop_verity_items(inode, BTRFS_VERITY_DESC_ITEM_KEY);
+ if (ret < 0)
+ return ret;
+ ret = drop_verity_items(inode, BTRFS_VERITY_MERKLE_ITEM_KEY);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+/*
+ * Insert and write inode items with a given key type and offset.
+ *
+ * @inode: inode to insert for
+ * @key_type: key type to insert
+ * @offset: item offset to insert at
+ * @src: source data to write
+ * @len: length of source data to write
+ *
+ * Write len bytes from src into items of up to 2K length.
+ * The inserted items will have key (ino, key_type, offset + off) where off is
+ * consecutively increasing from 0 up to the last item ending at offset + len.
+ *
+ * Returns 0 on success and a negative error code on failure.
+ */
+static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
+ const char *src, u64 len)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_path *path;
+ struct btrfs_root *root = inode->root;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ unsigned long copy_bytes;
+ unsigned long src_offset = 0;
+ void *data;
+ int ret = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ while (len > 0) {
+ /* 1 for the new item being inserted */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+
+ key.objectid = btrfs_ino(inode);
+ key.type = key_type;
+ key.offset = offset;
+
+ /*
+ * Insert 2K at a time mostly to be friendly for smaller leaf
+ * size filesystems
+ */
+ copy_bytes = min_t(u64, len, 2048);
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key, copy_bytes);
+ if (ret) {
+ btrfs_end_transaction(trans);
+ break;
+ }
+
+ leaf = path->nodes[0];
+
+ data = btrfs_item_ptr(leaf, path->slots[0], void);
+ write_extent_buffer(leaf, src + src_offset,
+ (unsigned long)data, copy_bytes);
+ offset += copy_bytes;
+ src_offset += copy_bytes;
+ len -= copy_bytes;
+
+ btrfs_release_path(path);
+ btrfs_end_transaction(trans);
+ }
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Read inode items of the given key type and offset from the btree.
+ *
+ * @inode: inode to read items of
+ * @key_type: key type to read
+ * @offset: item offset to read from
+ * @dest: Buffer to read into. This parameter has slightly tricky
+ * semantics. If it is NULL, the function will not do any copying
+ * and will just return the size of all the items up to len bytes.
+ * If dest_page is passed, then the function will kmap_local the
+ * page and ignore dest, but it must still be non-NULL to avoid the
+ * counting-only behavior.
+ * @len: length in bytes to read
+ * @dest_page: copy into this page instead of the dest buffer
+ *
+ * Helper function to read items from the btree. This returns the number of
+ * bytes read or < 0 for errors. We can return short reads if the items don't
+ * exist on disk or aren't big enough to fill the desired length. Supports
+ * reading into a provided buffer (dest) or into the page cache
+ *
+ * Returns number of bytes read or a negative error code on failure.
+ */
+static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
+ char *dest, u64 len, struct page *dest_page)
+{
+ struct btrfs_path *path;
+ struct btrfs_root *root = inode->root;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ u64 item_end;
+ u64 copy_end;
+ int copied = 0;
+ u32 copy_offset;
+ unsigned long copy_bytes;
+ unsigned long dest_offset = 0;
+ void *data;
+ char *kaddr = dest;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ if (dest_page)
+ path->reada = READA_FORWARD;
+
+ key.objectid = btrfs_ino(inode);
+ key.type = key_type;
+ key.offset = offset;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = 0;
+ if (path->slots[0] == 0)
+ goto out;
+ path->slots[0]--;
+ }
+
+ while (len > 0) {
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+ if (key.objectid != btrfs_ino(inode) || key.type != key_type)
+ break;
+
+ item_end = btrfs_item_size(leaf, path->slots[0]) + key.offset;
+
+ if (copied > 0) {
+ /*
+ * Once we've copied something, we want all of the items
+ * to be sequential
+ */
+ if (key.offset != offset)
+ break;
+ } else {
+ /*
+ * Our initial offset might be in the middle of an
+ * item. Make sure it all makes sense.
+ */
+ if (key.offset > offset)
+ break;
+ if (item_end <= offset)
+ break;
+ }
+
+ /* desc = NULL to just sum all the item lengths */
+ if (!dest)
+ copy_end = item_end;
+ else
+ copy_end = min(offset + len, item_end);
+
+ /* Number of bytes in this item we want to copy */
+ copy_bytes = copy_end - offset;
+
+ /* Offset from the start of item for copying */
+ copy_offset = offset - key.offset;
+
+ if (dest) {
+ if (dest_page)
+ kaddr = kmap_local_page(dest_page);
+
+ data = btrfs_item_ptr(leaf, path->slots[0], void);
+ read_extent_buffer(leaf, kaddr + dest_offset,
+ (unsigned long)data + copy_offset,
+ copy_bytes);
+
+ if (dest_page)
+ kunmap_local(kaddr);
+ }
+
+ offset += copy_bytes;
+ dest_offset += copy_bytes;
+ len -= copy_bytes;
+ copied += copy_bytes;
+
+ path->slots[0]++;
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ /*
+ * We've reached the last slot in this leaf and we need
+ * to go to the next leaf.
+ */
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ break;
+ } else if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ }
+ }
+out:
+ btrfs_free_path(path);
+ if (!ret)
+ ret = copied;
+ return ret;
+}
+
+/*
+ * Delete an fsverity orphan
+ *
+ * @trans: transaction to do the delete in
+ * @inode: inode to orphan
+ *
+ * Capture verity orphan specific logic that is repeated in the couple places
+ * we delete verity orphans. Specifically, handling ENOENT and ignoring inodes
+ * with 0 links.
+ *
+ * Returns zero on success or a negative error code on failure.
+ */
+static int del_orphan(struct btrfs_trans_handle *trans, struct btrfs_inode *inode)
+{
+ struct btrfs_root *root = inode->root;
+ int ret;
+
+ /*
+ * If the inode has no links, it is either already unlinked, or was
+ * created with O_TMPFILE. In either case, it should have an orphan from
+ * that other operation. Rather than reference count the orphans, we
+ * simply ignore them here, because we only invoke the verity path in
+ * the orphan logic when i_nlink is 1.
+ */
+ if (!inode->vfs_inode.i_nlink)
+ return 0;
+
+ ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
+ if (ret == -ENOENT)
+ ret = 0;
+ return ret;
+}
+
+/*
+ * Rollback in-progress verity if we encounter an error.
+ *
+ * @inode: inode verity had an error for
+ *
+ * We try to handle recoverable errors while enabling verity by rolling it back
+ * and just failing the operation, rather than having an fs level error no
+ * matter what. However, any error in rollback is unrecoverable.
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+static int rollback_verity(struct btrfs_inode *inode)
+{
+ struct btrfs_trans_handle *trans = NULL;
+ struct btrfs_root *root = inode->root;
+ int ret;
+
+ ASSERT(inode_is_locked(&inode->vfs_inode));
+ truncate_inode_pages(inode->vfs_inode.i_mapping, inode->vfs_inode.i_size);
+ clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags);
+ ret = btrfs_drop_verity_items(inode);
+ if (ret) {
+ btrfs_handle_fs_error(root->fs_info, ret,
+ "failed to drop verity items in rollback %llu",
+ (u64)inode->vfs_inode.i_ino);
+ goto out;
+ }
+
+ /*
+ * 1 for updating the inode flag
+ * 1 for deleting the orphan
+ */
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ btrfs_handle_fs_error(root->fs_info, ret,
+ "failed to start transaction in verity rollback %llu",
+ (u64)inode->vfs_inode.i_ino);
+ goto out;
+ }
+ inode->ro_flags &= ~BTRFS_INODE_RO_VERITY;
+ btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ ret = del_orphan(trans, inode);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+out:
+ if (trans)
+ btrfs_end_transaction(trans);
+ return ret;
+}
+
+/*
+ * Finalize making the file a valid verity file
+ *
+ * @inode: inode to be marked as verity
+ * @desc: contents of the verity descriptor to write (not NULL)
+ * @desc_size: size of the verity descriptor
+ *
+ * Do the actual work of finalizing verity after successfully writing the Merkle
+ * tree:
+ *
+ * - write out the descriptor items
+ * - mark the inode with the verity flag
+ * - delete the orphan item
+ * - mark the ro compat bit
+ * - clear the in progress bit
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+static int finish_verity(struct btrfs_inode *inode, const void *desc,
+ size_t desc_size)
+{
+ struct btrfs_trans_handle *trans = NULL;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_verity_descriptor_item item;
+ int ret;
+
+ /* Write out the descriptor item */
+ memset(&item, 0, sizeof(item));
+ btrfs_set_stack_verity_descriptor_size(&item, desc_size);
+ ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 0,
+ (const char *)&item, sizeof(item));
+ if (ret)
+ goto out;
+
+ /* Write out the descriptor itself */
+ ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 1,
+ desc, desc_size);
+ if (ret)
+ goto out;
+
+ /*
+ * 1 for updating the inode flag
+ * 1 for deleting the orphan
+ */
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+ inode->ro_flags |= BTRFS_INODE_RO_VERITY;
+ btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret)
+ goto end_trans;
+ ret = del_orphan(trans, inode);
+ if (ret)
+ goto end_trans;
+ clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags);
+ btrfs_set_fs_compat_ro(root->fs_info, VERITY);
+end_trans:
+ btrfs_end_transaction(trans);
+out:
+ return ret;
+
+}
+
+/*
+ * fsverity op that begins enabling verity.
+ *
+ * @filp: file to enable verity on
+ *
+ * Begin enabling fsverity for the file. We drop any existing verity items, add
+ * an orphan and set the in progress bit.
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+static int btrfs_begin_enable_verity(struct file *filp)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(filp));
+ struct btrfs_root *root = inode->root;
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ ASSERT(inode_is_locked(file_inode(filp)));
+
+ if (test_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags))
+ return -EBUSY;
+
+ /*
+ * This should almost never do anything, but theoretically, it's
+ * possible that we failed to enable verity on a file, then were
+ * interrupted or failed while rolling back, failed to cleanup the
+ * orphan, and finally attempt to enable verity again.
+ */
+ ret = btrfs_drop_verity_items(inode);
+ if (ret)
+ return ret;
+
+ /* 1 for the orphan item */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ ret = btrfs_orphan_add(trans, inode);
+ if (!ret)
+ set_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags);
+ btrfs_end_transaction(trans);
+
+ return 0;
+}
+
+/*
+ * fsverity op that ends enabling verity.
+ *
+ * @filp: file we are finishing enabling verity on
+ * @desc: verity descriptor to write out (NULL in error conditions)
+ * @desc_size: size of the verity descriptor (variable with signatures)
+ * @merkle_tree_size: size of the merkle tree in bytes
+ *
+ * If desc is null, then VFS is signaling an error occurred during verity
+ * enable, and we should try to rollback. Otherwise, attempt to finish verity.
+ *
+ * Returns 0 on success, negative error code on error.
+ */
+static int btrfs_end_enable_verity(struct file *filp, const void *desc,
+ size_t desc_size, u64 merkle_tree_size)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(filp));
+ int ret = 0;
+ int rollback_ret;
+
+ ASSERT(inode_is_locked(file_inode(filp)));
+
+ if (desc == NULL)
+ goto rollback;
+
+ ret = finish_verity(inode, desc, desc_size);
+ if (ret)
+ goto rollback;
+ return ret;
+
+rollback:
+ rollback_ret = rollback_verity(inode);
+ if (rollback_ret)
+ btrfs_err(inode->root->fs_info,
+ "failed to rollback verity items: %d", rollback_ret);
+ return ret;
+}
+
+/*
+ * fsverity op that gets the struct fsverity_descriptor.
+ *
+ * @inode: inode to get the descriptor of
+ * @buf: output buffer for the descriptor contents
+ * @buf_size: size of the output buffer. 0 to query the size
+ *
+ * fsverity does a two pass setup for reading the descriptor, in the first pass
+ * it calls with buf_size = 0 to query the size of the descriptor, and then in
+ * the second pass it actually reads the descriptor off disk.
+ *
+ * Returns the size on success or a negative error code on failure.
+ */
+int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size)
+{
+ u64 true_size;
+ int ret = 0;
+ struct btrfs_verity_descriptor_item item;
+
+ memset(&item, 0, sizeof(item));
+ ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_DESC_ITEM_KEY, 0,
+ (char *)&item, sizeof(item), NULL);
+ if (ret < 0)
+ return ret;
+
+ if (item.reserved[0] != 0 || item.reserved[1] != 0)
+ return -EUCLEAN;
+
+ true_size = btrfs_stack_verity_descriptor_size(&item);
+ if (true_size > INT_MAX)
+ return -EUCLEAN;
+
+ if (buf_size == 0)
+ return true_size;
+ if (buf_size < true_size)
+ return -ERANGE;
+
+ ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_DESC_ITEM_KEY, 1,
+ buf, buf_size, NULL);
+ if (ret < 0)
+ return ret;
+ if (ret != true_size)
+ return -EIO;
+
+ return true_size;
+}
+
+/*
+ * fsverity op that reads and caches a merkle tree page.
+ *
+ * @inode: inode to read a merkle tree page for
+ * @index: page index relative to the start of the merkle tree
+ * @num_ra_pages: number of pages to readahead. Optional, we ignore it
+ *
+ * The Merkle tree is stored in the filesystem btree, but its pages are cached
+ * with a logical position past EOF in the inode's mapping.
+ *
+ * Returns the page we read, or an ERR_PTR on error.
+ */
+static struct page *btrfs_read_merkle_tree_page(struct inode *inode,
+ pgoff_t index,
+ unsigned long num_ra_pages)
+{
+ struct page *page;
+ u64 off = (u64)index << PAGE_SHIFT;
+ loff_t merkle_pos = merkle_file_pos(inode);
+ int ret;
+
+ if (merkle_pos < 0)
+ return ERR_PTR(merkle_pos);
+ if (merkle_pos > inode->i_sb->s_maxbytes - off - PAGE_SIZE)
+ return ERR_PTR(-EFBIG);
+ index += merkle_pos >> PAGE_SHIFT;
+again:
+ page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED);
+ if (page) {
+ if (PageUptodate(page))
+ return page;
+
+ lock_page(page);
+ /*
+ * We only insert uptodate pages, so !Uptodate has to be
+ * an error
+ */
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(-EIO);
+ }
+ unlock_page(page);
+ return page;
+ }
+
+ page = __page_cache_alloc(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS));
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Merkle item keys are indexed from byte 0 in the merkle tree.
+ * They have the form:
+ *
+ * [ inode objectid, BTRFS_MERKLE_ITEM_KEY, offset in bytes ]
+ */
+ ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, off,
+ page_address(page), PAGE_SIZE, page);
+ if (ret < 0) {
+ put_page(page);
+ return ERR_PTR(ret);
+ }
+ if (ret < PAGE_SIZE)
+ memzero_page(page, ret, PAGE_SIZE - ret);
+
+ SetPageUptodate(page);
+ ret = add_to_page_cache_lru(page, inode->i_mapping, index, GFP_NOFS);
+
+ if (!ret) {
+ /* Inserted and ready for fsverity */
+ unlock_page(page);
+ } else {
+ put_page(page);
+ /* Did someone race us into inserting this page? */
+ if (ret == -EEXIST)
+ goto again;
+ page = ERR_PTR(ret);
+ }
+ return page;
+}
+
+/*
+ * fsverity op that writes a Merkle tree block into the btree.
+ *
+ * @inode: inode to write a Merkle tree block for
+ * @buf: Merkle tree data block to write
+ * @index: index of the block in the Merkle tree
+ * @log_blocksize: log base 2 of the Merkle tree block size
+ *
+ * Note that the block size could be different from the page size, so it is not
+ * safe to assume that index is a page index.
+ *
+ * Returns 0 on success or negative error code on failure
+ */
+static int btrfs_write_merkle_tree_block(struct inode *inode, const void *buf,
+ u64 index, int log_blocksize)
+{
+ u64 off = index << log_blocksize;
+ u64 len = 1ULL << log_blocksize;
+ loff_t merkle_pos = merkle_file_pos(inode);
+
+ if (merkle_pos < 0)
+ return merkle_pos;
+ if (merkle_pos > inode->i_sb->s_maxbytes - off - len)
+ return -EFBIG;
+
+ return write_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY,
+ off, buf, len);
+}
+
+const struct fsverity_operations btrfs_verityops = {
+ .begin_enable_verity = btrfs_begin_enable_verity,
+ .end_enable_verity = btrfs_end_enable_verity,
+ .get_verity_descriptor = btrfs_get_verity_descriptor,
+ .read_merkle_tree_page = btrfs_read_merkle_tree_page,
+ .write_merkle_tree_block = btrfs_write_merkle_tree_block,
+};
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
new file mode 100644
index 000000000..6fc2d9927
--- /dev/null
+++ b/fs/btrfs/volumes.c
@@ -0,0 +1,8497 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ */
+
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/ratelimit.h>
+#include <linux/kthread.h>
+#include <linux/raid/pq.h>
+#include <linux/semaphore.h>
+#include <linux/uuid.h>
+#include <linux/list_sort.h>
+#include <linux/namei.h>
+#include "misc.h"
+#include "ctree.h"
+#include "extent_map.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "raid56.h"
+#include "async-thread.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+#include "dev-replace.h"
+#include "sysfs.h"
+#include "tree-checker.h"
+#include "space-info.h"
+#include "block-group.h"
+#include "discard.h"
+#include "zoned.h"
+
+static struct bio_set btrfs_bioset;
+
+#define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
+ BTRFS_BLOCK_GROUP_RAID10 | \
+ BTRFS_BLOCK_GROUP_RAID56_MASK)
+
+const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
+ [BTRFS_RAID_RAID10] = {
+ .sub_stripes = 2,
+ .dev_stripes = 1,
+ .devs_max = 0, /* 0 == as many as possible */
+ .devs_min = 2,
+ .tolerated_failures = 1,
+ .devs_increment = 2,
+ .ncopies = 2,
+ .nparity = 0,
+ .raid_name = "raid10",
+ .bg_flag = BTRFS_BLOCK_GROUP_RAID10,
+ .mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
+ },
+ [BTRFS_RAID_RAID1] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 2,
+ .devs_min = 2,
+ .tolerated_failures = 1,
+ .devs_increment = 2,
+ .ncopies = 2,
+ .nparity = 0,
+ .raid_name = "raid1",
+ .bg_flag = BTRFS_BLOCK_GROUP_RAID1,
+ .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
+ },
+ [BTRFS_RAID_RAID1C3] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 3,
+ .devs_min = 3,
+ .tolerated_failures = 2,
+ .devs_increment = 3,
+ .ncopies = 3,
+ .nparity = 0,
+ .raid_name = "raid1c3",
+ .bg_flag = BTRFS_BLOCK_GROUP_RAID1C3,
+ .mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
+ },
+ [BTRFS_RAID_RAID1C4] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 4,
+ .devs_min = 4,
+ .tolerated_failures = 3,
+ .devs_increment = 4,
+ .ncopies = 4,
+ .nparity = 0,
+ .raid_name = "raid1c4",
+ .bg_flag = BTRFS_BLOCK_GROUP_RAID1C4,
+ .mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
+ },
+ [BTRFS_RAID_DUP] = {
+ .sub_stripes = 1,
+ .dev_stripes = 2,
+ .devs_max = 1,
+ .devs_min = 1,
+ .tolerated_failures = 0,
+ .devs_increment = 1,
+ .ncopies = 2,
+ .nparity = 0,
+ .raid_name = "dup",
+ .bg_flag = BTRFS_BLOCK_GROUP_DUP,
+ .mindev_error = 0,
+ },
+ [BTRFS_RAID_RAID0] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 0,
+ .devs_min = 1,
+ .tolerated_failures = 0,
+ .devs_increment = 1,
+ .ncopies = 1,
+ .nparity = 0,
+ .raid_name = "raid0",
+ .bg_flag = BTRFS_BLOCK_GROUP_RAID0,
+ .mindev_error = 0,
+ },
+ [BTRFS_RAID_SINGLE] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 1,
+ .devs_min = 1,
+ .tolerated_failures = 0,
+ .devs_increment = 1,
+ .ncopies = 1,
+ .nparity = 0,
+ .raid_name = "single",
+ .bg_flag = 0,
+ .mindev_error = 0,
+ },
+ [BTRFS_RAID_RAID5] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 0,
+ .devs_min = 2,
+ .tolerated_failures = 1,
+ .devs_increment = 1,
+ .ncopies = 1,
+ .nparity = 1,
+ .raid_name = "raid5",
+ .bg_flag = BTRFS_BLOCK_GROUP_RAID5,
+ .mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
+ },
+ [BTRFS_RAID_RAID6] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 0,
+ .devs_min = 3,
+ .tolerated_failures = 2,
+ .devs_increment = 1,
+ .ncopies = 1,
+ .nparity = 2,
+ .raid_name = "raid6",
+ .bg_flag = BTRFS_BLOCK_GROUP_RAID6,
+ .mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
+ },
+};
+
+/*
+ * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
+ * can be used as index to access btrfs_raid_array[].
+ */
+enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
+{
+ const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK);
+
+ if (!profile)
+ return BTRFS_RAID_SINGLE;
+
+ return BTRFS_BG_FLAG_TO_INDEX(profile);
+}
+
+const char *btrfs_bg_type_to_raid_name(u64 flags)
+{
+ const int index = btrfs_bg_flags_to_raid_index(flags);
+
+ if (index >= BTRFS_NR_RAID_TYPES)
+ return NULL;
+
+ return btrfs_raid_array[index].raid_name;
+}
+
+int btrfs_nr_parity_stripes(u64 type)
+{
+ enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(type);
+
+ return btrfs_raid_array[index].nparity;
+}
+
+/*
+ * Fill @buf with textual description of @bg_flags, no more than @size_buf
+ * bytes including terminating null byte.
+ */
+void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
+{
+ int i;
+ int ret;
+ char *bp = buf;
+ u64 flags = bg_flags;
+ u32 size_bp = size_buf;
+
+ if (!flags) {
+ strcpy(bp, "NONE");
+ return;
+ }
+
+#define DESCRIBE_FLAG(flag, desc) \
+ do { \
+ if (flags & (flag)) { \
+ ret = snprintf(bp, size_bp, "%s|", (desc)); \
+ if (ret < 0 || ret >= size_bp) \
+ goto out_overflow; \
+ size_bp -= ret; \
+ bp += ret; \
+ flags &= ~(flag); \
+ } \
+ } while (0)
+
+ DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
+ DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
+ DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
+
+ DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+ DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
+ btrfs_raid_array[i].raid_name);
+#undef DESCRIBE_FLAG
+
+ if (flags) {
+ ret = snprintf(bp, size_bp, "0x%llx|", flags);
+ size_bp -= ret;
+ }
+
+ if (size_bp < size_buf)
+ buf[size_buf - size_bp - 1] = '\0'; /* remove last | */
+
+ /*
+ * The text is trimmed, it's up to the caller to provide sufficiently
+ * large buffer
+ */
+out_overflow:;
+}
+
+static int init_first_rw_device(struct btrfs_trans_handle *trans);
+static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
+static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
+static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
+ enum btrfs_map_op op, u64 logical, u64 *length,
+ struct btrfs_io_context **bioc_ret,
+ struct btrfs_io_stripe *smap,
+ int *mirror_num_ret, int need_raid_map);
+
+/*
+ * Device locking
+ * ==============
+ *
+ * There are several mutexes that protect manipulation of devices and low-level
+ * structures like chunks but not block groups, extents or files
+ *
+ * uuid_mutex (global lock)
+ * ------------------------
+ * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
+ * the SCAN_DEV ioctl registration or from mount either implicitly (the first
+ * device) or requested by the device= mount option
+ *
+ * the mutex can be very coarse and can cover long-running operations
+ *
+ * protects: updates to fs_devices counters like missing devices, rw devices,
+ * seeding, structure cloning, opening/closing devices at mount/umount time
+ *
+ * global::fs_devs - add, remove, updates to the global list
+ *
+ * does not protect: manipulation of the fs_devices::devices list in general
+ * but in mount context it could be used to exclude list modifications by eg.
+ * scan ioctl
+ *
+ * btrfs_device::name - renames (write side), read is RCU
+ *
+ * fs_devices::device_list_mutex (per-fs, with RCU)
+ * ------------------------------------------------
+ * protects updates to fs_devices::devices, ie. adding and deleting
+ *
+ * simple list traversal with read-only actions can be done with RCU protection
+ *
+ * may be used to exclude some operations from running concurrently without any
+ * modifications to the list (see write_all_supers)
+ *
+ * Is not required at mount and close times, because our device list is
+ * protected by the uuid_mutex at that point.
+ *
+ * balance_mutex
+ * -------------
+ * protects balance structures (status, state) and context accessed from
+ * several places (internally, ioctl)
+ *
+ * chunk_mutex
+ * -----------
+ * protects chunks, adding or removing during allocation, trim or when a new
+ * device is added/removed. Additionally it also protects post_commit_list of
+ * individual devices, since they can be added to the transaction's
+ * post_commit_list only with chunk_mutex held.
+ *
+ * cleaner_mutex
+ * -------------
+ * a big lock that is held by the cleaner thread and prevents running subvolume
+ * cleaning together with relocation or delayed iputs
+ *
+ *
+ * Lock nesting
+ * ============
+ *
+ * uuid_mutex
+ * device_list_mutex
+ * chunk_mutex
+ * balance_mutex
+ *
+ *
+ * Exclusive operations
+ * ====================
+ *
+ * Maintains the exclusivity of the following operations that apply to the
+ * whole filesystem and cannot run in parallel.
+ *
+ * - Balance (*)
+ * - Device add
+ * - Device remove
+ * - Device replace (*)
+ * - Resize
+ *
+ * The device operations (as above) can be in one of the following states:
+ *
+ * - Running state
+ * - Paused state
+ * - Completed state
+ *
+ * Only device operations marked with (*) can go into the Paused state for the
+ * following reasons:
+ *
+ * - ioctl (only Balance can be Paused through ioctl)
+ * - filesystem remounted as read-only
+ * - filesystem unmounted and mounted as read-only
+ * - system power-cycle and filesystem mounted as read-only
+ * - filesystem or device errors leading to forced read-only
+ *
+ * The status of exclusive operation is set and cleared atomically.
+ * During the course of Paused state, fs_info::exclusive_operation remains set.
+ * A device operation in Paused or Running state can be canceled or resumed
+ * either by ioctl (Balance only) or when remounted as read-write.
+ * The exclusive status is cleared when the device operation is canceled or
+ * completed.
+ */
+
+DEFINE_MUTEX(uuid_mutex);
+static LIST_HEAD(fs_uuids);
+struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
+{
+ return &fs_uuids;
+}
+
+/*
+ * alloc_fs_devices - allocate struct btrfs_fs_devices
+ * @fsid: if not NULL, copy the UUID to fs_devices::fsid
+ * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid
+ *
+ * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
+ * The returned struct is not linked onto any lists and can be destroyed with
+ * kfree() right away.
+ */
+static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
+ const u8 *metadata_fsid)
+{
+ struct btrfs_fs_devices *fs_devs;
+
+ fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
+ if (!fs_devs)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_init(&fs_devs->device_list_mutex);
+
+ INIT_LIST_HEAD(&fs_devs->devices);
+ INIT_LIST_HEAD(&fs_devs->alloc_list);
+ INIT_LIST_HEAD(&fs_devs->fs_list);
+ INIT_LIST_HEAD(&fs_devs->seed_list);
+ if (fsid)
+ memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
+
+ if (metadata_fsid)
+ memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
+ else if (fsid)
+ memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
+
+ return fs_devs;
+}
+
+void btrfs_free_device(struct btrfs_device *device)
+{
+ WARN_ON(!list_empty(&device->post_commit_list));
+ rcu_string_free(device->name);
+ extent_io_tree_release(&device->alloc_state);
+ btrfs_destroy_dev_zone_info(device);
+ kfree(device);
+}
+
+static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
+{
+ struct btrfs_device *device;
+
+ WARN_ON(fs_devices->opened);
+ while (!list_empty(&fs_devices->devices)) {
+ device = list_entry(fs_devices->devices.next,
+ struct btrfs_device, dev_list);
+ list_del(&device->dev_list);
+ btrfs_free_device(device);
+ }
+ kfree(fs_devices);
+}
+
+void __exit btrfs_cleanup_fs_uuids(void)
+{
+ struct btrfs_fs_devices *fs_devices;
+
+ while (!list_empty(&fs_uuids)) {
+ fs_devices = list_entry(fs_uuids.next,
+ struct btrfs_fs_devices, fs_list);
+ list_del(&fs_devices->fs_list);
+ free_fs_devices(fs_devices);
+ }
+}
+
+static noinline struct btrfs_fs_devices *find_fsid(
+ const u8 *fsid, const u8 *metadata_fsid)
+{
+ struct btrfs_fs_devices *fs_devices;
+
+ ASSERT(fsid);
+
+ /* Handle non-split brain cases */
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+ if (metadata_fsid) {
+ if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
+ && memcmp(metadata_fsid, fs_devices->metadata_uuid,
+ BTRFS_FSID_SIZE) == 0)
+ return fs_devices;
+ } else {
+ if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
+ return fs_devices;
+ }
+ }
+ return NULL;
+}
+
+static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
+ struct btrfs_super_block *disk_super)
+{
+
+ struct btrfs_fs_devices *fs_devices;
+
+ /*
+ * Handle scanned device having completed its fsid change but
+ * belonging to a fs_devices that was created by first scanning
+ * a device which didn't have its fsid/metadata_uuid changed
+ * at all and the CHANGING_FSID_V2 flag set.
+ */
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+ if (fs_devices->fsid_change &&
+ memcmp(disk_super->metadata_uuid, fs_devices->fsid,
+ BTRFS_FSID_SIZE) == 0 &&
+ memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
+ BTRFS_FSID_SIZE) == 0) {
+ return fs_devices;
+ }
+ }
+ /*
+ * Handle scanned device having completed its fsid change but
+ * belonging to a fs_devices that was created by a device that
+ * has an outdated pair of fsid/metadata_uuid and
+ * CHANGING_FSID_V2 flag set.
+ */
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+ if (fs_devices->fsid_change &&
+ memcmp(fs_devices->metadata_uuid,
+ fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
+ memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
+ BTRFS_FSID_SIZE) == 0) {
+ return fs_devices;
+ }
+ }
+
+ return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
+}
+
+
+static int
+btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
+ int flush, struct block_device **bdev,
+ struct btrfs_super_block **disk_super)
+{
+ int ret;
+
+ *bdev = blkdev_get_by_path(device_path, flags, holder);
+
+ if (IS_ERR(*bdev)) {
+ ret = PTR_ERR(*bdev);
+ goto error;
+ }
+
+ if (flush)
+ sync_blockdev(*bdev);
+ ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
+ if (ret) {
+ blkdev_put(*bdev, flags);
+ goto error;
+ }
+ invalidate_bdev(*bdev);
+ *disk_super = btrfs_read_dev_super(*bdev);
+ if (IS_ERR(*disk_super)) {
+ ret = PTR_ERR(*disk_super);
+ blkdev_put(*bdev, flags);
+ goto error;
+ }
+
+ return 0;
+
+error:
+ *bdev = NULL;
+ return ret;
+}
+
+/**
+ * Search and remove all stale devices (which are not mounted).
+ * When both inputs are NULL, it will search and release all stale devices.
+ *
+ * @devt: Optional. When provided will it release all unmounted devices
+ * matching this devt only.
+ * @skip_device: Optional. Will skip this device when searching for the stale
+ * devices.
+ *
+ * Return: 0 for success or if @devt is 0.
+ * -EBUSY if @devt is a mounted device.
+ * -ENOENT if @devt does not match any device in the list.
+ */
+static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device)
+{
+ struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
+ struct btrfs_device *device, *tmp_device;
+ int ret = 0;
+
+ lockdep_assert_held(&uuid_mutex);
+
+ if (devt)
+ ret = -ENOENT;
+
+ list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry_safe(device, tmp_device,
+ &fs_devices->devices, dev_list) {
+ if (skip_device && skip_device == device)
+ continue;
+ if (devt && devt != device->devt)
+ continue;
+ if (fs_devices->opened) {
+ /* for an already deleted device return 0 */
+ if (devt && ret != 0)
+ ret = -EBUSY;
+ break;
+ }
+
+ /* delete the stale device */
+ fs_devices->num_devices--;
+ list_del(&device->dev_list);
+ btrfs_free_device(device);
+
+ ret = 0;
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ if (fs_devices->num_devices == 0) {
+ btrfs_sysfs_remove_fsid(fs_devices);
+ list_del(&fs_devices->fs_list);
+ free_fs_devices(fs_devices);
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * This is only used on mount, and we are protected from competing things
+ * messing with our fs_devices by the uuid_mutex, thus we do not need the
+ * fs_devices->device_list_mutex here.
+ */
+static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
+ struct btrfs_device *device, fmode_t flags,
+ void *holder)
+{
+ struct block_device *bdev;
+ struct btrfs_super_block *disk_super;
+ u64 devid;
+ int ret;
+
+ if (device->bdev)
+ return -EINVAL;
+ if (!device->name)
+ return -EINVAL;
+
+ ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
+ &bdev, &disk_super);
+ if (ret)
+ return ret;
+
+ devid = btrfs_stack_device_id(&disk_super->dev_item);
+ if (devid != device->devid)
+ goto error_free_page;
+
+ if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
+ goto error_free_page;
+
+ device->generation = btrfs_super_generation(disk_super);
+
+ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
+ if (btrfs_super_incompat_flags(disk_super) &
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
+ pr_err(
+ "BTRFS: Invalid seeding and uuid-changed device detected\n");
+ goto error_free_page;
+ }
+
+ clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+ fs_devices->seeding = true;
+ } else {
+ if (bdev_read_only(bdev))
+ clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+ else
+ set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+ }
+
+ if (!bdev_nonrot(bdev))
+ fs_devices->rotating = true;
+
+ device->bdev = bdev;
+ clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
+ device->mode = flags;
+
+ fs_devices->open_devices++;
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
+ device->devid != BTRFS_DEV_REPLACE_DEVID) {
+ fs_devices->rw_devices++;
+ list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
+ }
+ btrfs_release_disk_super(disk_super);
+
+ return 0;
+
+error_free_page:
+ btrfs_release_disk_super(disk_super);
+ blkdev_put(bdev, flags);
+
+ return -EINVAL;
+}
+
+u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb)
+{
+ bool has_metadata_uuid = (btrfs_super_incompat_flags(sb) &
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
+
+ return has_metadata_uuid ? sb->metadata_uuid : sb->fsid;
+}
+
+/*
+ * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
+ * being created with a disk that has already completed its fsid change. Such
+ * disk can belong to an fs which has its FSID changed or to one which doesn't.
+ * Handle both cases here.
+ */
+static struct btrfs_fs_devices *find_fsid_inprogress(
+ struct btrfs_super_block *disk_super)
+{
+ struct btrfs_fs_devices *fs_devices;
+
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+ if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
+ BTRFS_FSID_SIZE) != 0 &&
+ memcmp(fs_devices->metadata_uuid, disk_super->fsid,
+ BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
+ return fs_devices;
+ }
+ }
+
+ return find_fsid(disk_super->fsid, NULL);
+}
+
+
+static struct btrfs_fs_devices *find_fsid_changed(
+ struct btrfs_super_block *disk_super)
+{
+ struct btrfs_fs_devices *fs_devices;
+
+ /*
+ * Handles the case where scanned device is part of an fs that had
+ * multiple successful changes of FSID but currently device didn't
+ * observe it. Meaning our fsid will be different than theirs. We need
+ * to handle two subcases :
+ * 1 - The fs still continues to have different METADATA/FSID uuids.
+ * 2 - The fs is switched back to its original FSID (METADATA/FSID
+ * are equal).
+ */
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+ /* Changed UUIDs */
+ if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
+ BTRFS_FSID_SIZE) != 0 &&
+ memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
+ BTRFS_FSID_SIZE) == 0 &&
+ memcmp(fs_devices->fsid, disk_super->fsid,
+ BTRFS_FSID_SIZE) != 0)
+ return fs_devices;
+
+ /* Unchanged UUIDs */
+ if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
+ BTRFS_FSID_SIZE) == 0 &&
+ memcmp(fs_devices->fsid, disk_super->metadata_uuid,
+ BTRFS_FSID_SIZE) == 0)
+ return fs_devices;
+ }
+
+ return NULL;
+}
+
+static struct btrfs_fs_devices *find_fsid_reverted_metadata(
+ struct btrfs_super_block *disk_super)
+{
+ struct btrfs_fs_devices *fs_devices;
+
+ /*
+ * Handle the case where the scanned device is part of an fs whose last
+ * metadata UUID change reverted it to the original FSID. At the same
+ * time * fs_devices was first created by another constitutent device
+ * which didn't fully observe the operation. This results in an
+ * btrfs_fs_devices created with metadata/fsid different AND
+ * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
+ * fs_devices equal to the FSID of the disk.
+ */
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+ if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
+ BTRFS_FSID_SIZE) != 0 &&
+ memcmp(fs_devices->metadata_uuid, disk_super->fsid,
+ BTRFS_FSID_SIZE) == 0 &&
+ fs_devices->fsid_change)
+ return fs_devices;
+ }
+
+ return NULL;
+}
+/*
+ * Add new device to list of registered devices
+ *
+ * Returns:
+ * device pointer which was just added or updated when successful
+ * error pointer when failed
+ */
+static noinline struct btrfs_device *device_list_add(const char *path,
+ struct btrfs_super_block *disk_super,
+ bool *new_device_added)
+{
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *fs_devices = NULL;
+ struct rcu_string *name;
+ u64 found_transid = btrfs_super_generation(disk_super);
+ u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
+ dev_t path_devt;
+ int error;
+ bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
+ bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
+ BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
+
+ error = lookup_bdev(path, &path_devt);
+ if (error) {
+ btrfs_err(NULL, "failed to lookup block device for path %s: %d",
+ path, error);
+ return ERR_PTR(error);
+ }
+
+ if (fsid_change_in_progress) {
+ if (!has_metadata_uuid)
+ fs_devices = find_fsid_inprogress(disk_super);
+ else
+ fs_devices = find_fsid_changed(disk_super);
+ } else if (has_metadata_uuid) {
+ fs_devices = find_fsid_with_metadata_uuid(disk_super);
+ } else {
+ fs_devices = find_fsid_reverted_metadata(disk_super);
+ if (!fs_devices)
+ fs_devices = find_fsid(disk_super->fsid, NULL);
+ }
+
+
+ if (!fs_devices) {
+ if (has_metadata_uuid)
+ fs_devices = alloc_fs_devices(disk_super->fsid,
+ disk_super->metadata_uuid);
+ else
+ fs_devices = alloc_fs_devices(disk_super->fsid, NULL);
+
+ if (IS_ERR(fs_devices))
+ return ERR_CAST(fs_devices);
+
+ fs_devices->fsid_change = fsid_change_in_progress;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_add(&fs_devices->fs_list, &fs_uuids);
+
+ device = NULL;
+ } else {
+ struct btrfs_dev_lookup_args args = {
+ .devid = devid,
+ .uuid = disk_super->dev_item.uuid,
+ };
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ device = btrfs_find_device(fs_devices, &args);
+
+ /*
+ * If this disk has been pulled into an fs devices created by
+ * a device which had the CHANGING_FSID_V2 flag then replace the
+ * metadata_uuid/fsid values of the fs_devices.
+ */
+ if (fs_devices->fsid_change &&
+ found_transid > fs_devices->latest_generation) {
+ memcpy(fs_devices->fsid, disk_super->fsid,
+ BTRFS_FSID_SIZE);
+
+ if (has_metadata_uuid)
+ memcpy(fs_devices->metadata_uuid,
+ disk_super->metadata_uuid,
+ BTRFS_FSID_SIZE);
+ else
+ memcpy(fs_devices->metadata_uuid,
+ disk_super->fsid, BTRFS_FSID_SIZE);
+
+ fs_devices->fsid_change = false;
+ }
+ }
+
+ if (!device) {
+ if (fs_devices->opened) {
+ btrfs_err(NULL,
+ "device %s belongs to fsid %pU, and the fs is already mounted",
+ path, fs_devices->fsid);
+ mutex_unlock(&fs_devices->device_list_mutex);
+ return ERR_PTR(-EBUSY);
+ }
+
+ device = btrfs_alloc_device(NULL, &devid,
+ disk_super->dev_item.uuid);
+ if (IS_ERR(device)) {
+ mutex_unlock(&fs_devices->device_list_mutex);
+ /* we can safely leave the fs_devices entry around */
+ return device;
+ }
+
+ name = rcu_string_strdup(path, GFP_NOFS);
+ if (!name) {
+ btrfs_free_device(device);
+ mutex_unlock(&fs_devices->device_list_mutex);
+ return ERR_PTR(-ENOMEM);
+ }
+ rcu_assign_pointer(device->name, name);
+ device->devt = path_devt;
+
+ list_add_rcu(&device->dev_list, &fs_devices->devices);
+ fs_devices->num_devices++;
+
+ device->fs_devices = fs_devices;
+ *new_device_added = true;
+
+ if (disk_super->label[0])
+ pr_info(
+ "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
+ disk_super->label, devid, found_transid, path,
+ current->comm, task_pid_nr(current));
+ else
+ pr_info(
+ "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
+ disk_super->fsid, devid, found_transid, path,
+ current->comm, task_pid_nr(current));
+
+ } else if (!device->name || strcmp(device->name->str, path)) {
+ /*
+ * When FS is already mounted.
+ * 1. If you are here and if the device->name is NULL that
+ * means this device was missing at time of FS mount.
+ * 2. If you are here and if the device->name is different
+ * from 'path' that means either
+ * a. The same device disappeared and reappeared with
+ * different name. or
+ * b. The missing-disk-which-was-replaced, has
+ * reappeared now.
+ *
+ * We must allow 1 and 2a above. But 2b would be a spurious
+ * and unintentional.
+ *
+ * Further in case of 1 and 2a above, the disk at 'path'
+ * would have missed some transaction when it was away and
+ * in case of 2a the stale bdev has to be updated as well.
+ * 2b must not be allowed at all time.
+ */
+
+ /*
+ * For now, we do allow update to btrfs_fs_device through the
+ * btrfs dev scan cli after FS has been mounted. We're still
+ * tracking a problem where systems fail mount by subvolume id
+ * when we reject replacement on a mounted FS.
+ */
+ if (!fs_devices->opened && found_transid < device->generation) {
+ /*
+ * That is if the FS is _not_ mounted and if you
+ * are here, that means there is more than one
+ * disk with same uuid and devid.We keep the one
+ * with larger generation number or the last-in if
+ * generation are equal.
+ */
+ mutex_unlock(&fs_devices->device_list_mutex);
+ btrfs_err(NULL,
+"device %s already registered with a higher generation, found %llu expect %llu",
+ path, found_transid, device->generation);
+ return ERR_PTR(-EEXIST);
+ }
+
+ /*
+ * We are going to replace the device path for a given devid,
+ * make sure it's the same device if the device is mounted
+ *
+ * NOTE: the device->fs_info may not be reliable here so pass
+ * in a NULL to message helpers instead. This avoids a possible
+ * use-after-free when the fs_info and fs_info->sb are already
+ * torn down.
+ */
+ if (device->bdev) {
+ if (device->devt != path_devt) {
+ mutex_unlock(&fs_devices->device_list_mutex);
+ btrfs_warn_in_rcu(NULL,
+ "duplicate device %s devid %llu generation %llu scanned by %s (%d)",
+ path, devid, found_transid,
+ current->comm,
+ task_pid_nr(current));
+ return ERR_PTR(-EEXIST);
+ }
+ btrfs_info_in_rcu(NULL,
+ "devid %llu device path %s changed to %s scanned by %s (%d)",
+ devid, rcu_str_deref(device->name),
+ path, current->comm,
+ task_pid_nr(current));
+ }
+
+ name = rcu_string_strdup(path, GFP_NOFS);
+ if (!name) {
+ mutex_unlock(&fs_devices->device_list_mutex);
+ return ERR_PTR(-ENOMEM);
+ }
+ rcu_string_free(device->name);
+ rcu_assign_pointer(device->name, name);
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
+ fs_devices->missing_devices--;
+ clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
+ }
+ device->devt = path_devt;
+ }
+
+ /*
+ * Unmount does not free the btrfs_device struct but would zero
+ * generation along with most of the other members. So just update
+ * it back. We need it to pick the disk with largest generation
+ * (as above).
+ */
+ if (!fs_devices->opened) {
+ device->generation = found_transid;
+ fs_devices->latest_generation = max_t(u64, found_transid,
+ fs_devices->latest_generation);
+ }
+
+ fs_devices->total_devices = btrfs_super_num_devices(disk_super);
+
+ mutex_unlock(&fs_devices->device_list_mutex);
+ return device;
+}
+
+static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
+{
+ struct btrfs_fs_devices *fs_devices;
+ struct btrfs_device *device;
+ struct btrfs_device *orig_dev;
+ int ret = 0;
+
+ lockdep_assert_held(&uuid_mutex);
+
+ fs_devices = alloc_fs_devices(orig->fsid, NULL);
+ if (IS_ERR(fs_devices))
+ return fs_devices;
+
+ fs_devices->total_devices = orig->total_devices;
+
+ list_for_each_entry(orig_dev, &orig->devices, dev_list) {
+ struct rcu_string *name;
+
+ device = btrfs_alloc_device(NULL, &orig_dev->devid,
+ orig_dev->uuid);
+ if (IS_ERR(device)) {
+ ret = PTR_ERR(device);
+ goto error;
+ }
+
+ /*
+ * This is ok to do without rcu read locked because we hold the
+ * uuid mutex so nothing we touch in here is going to disappear.
+ */
+ if (orig_dev->name) {
+ name = rcu_string_strdup(orig_dev->name->str,
+ GFP_KERNEL);
+ if (!name) {
+ btrfs_free_device(device);
+ ret = -ENOMEM;
+ goto error;
+ }
+ rcu_assign_pointer(device->name, name);
+ }
+
+ if (orig_dev->zone_info) {
+ struct btrfs_zoned_device_info *zone_info;
+
+ zone_info = btrfs_clone_dev_zone_info(orig_dev);
+ if (!zone_info) {
+ btrfs_free_device(device);
+ ret = -ENOMEM;
+ goto error;
+ }
+ device->zone_info = zone_info;
+ }
+
+ list_add(&device->dev_list, &fs_devices->devices);
+ device->fs_devices = fs_devices;
+ fs_devices->num_devices++;
+ }
+ return fs_devices;
+error:
+ free_fs_devices(fs_devices);
+ return ERR_PTR(ret);
+}
+
+static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
+ struct btrfs_device **latest_dev)
+{
+ struct btrfs_device *device, *next;
+
+ /* This is the initialized path, it is safe to release the devices. */
+ list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
+ if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
+ if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
+ &device->dev_state) &&
+ !test_bit(BTRFS_DEV_STATE_MISSING,
+ &device->dev_state) &&
+ (!*latest_dev ||
+ device->generation > (*latest_dev)->generation)) {
+ *latest_dev = device;
+ }
+ continue;
+ }
+
+ /*
+ * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID,
+ * in btrfs_init_dev_replace() so just continue.
+ */
+ if (device->devid == BTRFS_DEV_REPLACE_DEVID)
+ continue;
+
+ if (device->bdev) {
+ blkdev_put(device->bdev, device->mode);
+ device->bdev = NULL;
+ fs_devices->open_devices--;
+ }
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
+ list_del_init(&device->dev_alloc_list);
+ clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+ fs_devices->rw_devices--;
+ }
+ list_del_init(&device->dev_list);
+ fs_devices->num_devices--;
+ btrfs_free_device(device);
+ }
+
+}
+
+/*
+ * After we have read the system tree and know devids belonging to this
+ * filesystem, remove the device which does not belong there.
+ */
+void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
+{
+ struct btrfs_device *latest_dev = NULL;
+ struct btrfs_fs_devices *seed_dev;
+
+ mutex_lock(&uuid_mutex);
+ __btrfs_free_extra_devids(fs_devices, &latest_dev);
+
+ list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
+ __btrfs_free_extra_devids(seed_dev, &latest_dev);
+
+ fs_devices->latest_dev = latest_dev;
+
+ mutex_unlock(&uuid_mutex);
+}
+
+static void btrfs_close_bdev(struct btrfs_device *device)
+{
+ if (!device->bdev)
+ return;
+
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
+ sync_blockdev(device->bdev);
+ invalidate_bdev(device->bdev);
+ }
+
+ blkdev_put(device->bdev, device->mode);
+}
+
+static void btrfs_close_one_device(struct btrfs_device *device)
+{
+ struct btrfs_fs_devices *fs_devices = device->fs_devices;
+
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
+ device->devid != BTRFS_DEV_REPLACE_DEVID) {
+ list_del_init(&device->dev_alloc_list);
+ fs_devices->rw_devices--;
+ }
+
+ if (device->devid == BTRFS_DEV_REPLACE_DEVID)
+ clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
+
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
+ clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
+ fs_devices->missing_devices--;
+ }
+
+ btrfs_close_bdev(device);
+ if (device->bdev) {
+ fs_devices->open_devices--;
+ device->bdev = NULL;
+ }
+ clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+ btrfs_destroy_dev_zone_info(device);
+
+ device->fs_info = NULL;
+ atomic_set(&device->dev_stats_ccnt, 0);
+ extent_io_tree_release(&device->alloc_state);
+
+ /*
+ * Reset the flush error record. We might have a transient flush error
+ * in this mount, and if so we aborted the current transaction and set
+ * the fs to an error state, guaranteeing no super blocks can be further
+ * committed. However that error might be transient and if we unmount the
+ * filesystem and mount it again, we should allow the mount to succeed
+ * (btrfs_check_rw_degradable() should not fail) - if after mounting the
+ * filesystem again we still get flush errors, then we will again abort
+ * any transaction and set the error state, guaranteeing no commits of
+ * unsafe super blocks.
+ */
+ device->last_flush_error = 0;
+
+ /* Verify the device is back in a pristine state */
+ ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
+ ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
+ ASSERT(list_empty(&device->dev_alloc_list));
+ ASSERT(list_empty(&device->post_commit_list));
+}
+
+static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
+{
+ struct btrfs_device *device, *tmp;
+
+ lockdep_assert_held(&uuid_mutex);
+
+ if (--fs_devices->opened > 0)
+ return;
+
+ list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
+ btrfs_close_one_device(device);
+
+ WARN_ON(fs_devices->open_devices);
+ WARN_ON(fs_devices->rw_devices);
+ fs_devices->opened = 0;
+ fs_devices->seeding = false;
+ fs_devices->fs_info = NULL;
+}
+
+void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
+{
+ LIST_HEAD(list);
+ struct btrfs_fs_devices *tmp;
+
+ mutex_lock(&uuid_mutex);
+ close_fs_devices(fs_devices);
+ if (!fs_devices->opened) {
+ list_splice_init(&fs_devices->seed_list, &list);
+
+ /*
+ * If the struct btrfs_fs_devices is not assembled with any
+ * other device, it can be re-initialized during the next mount
+ * without the needing device-scan step. Therefore, it can be
+ * fully freed.
+ */
+ if (fs_devices->num_devices == 1) {
+ list_del(&fs_devices->fs_list);
+ free_fs_devices(fs_devices);
+ }
+ }
+
+
+ list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
+ close_fs_devices(fs_devices);
+ list_del(&fs_devices->seed_list);
+ free_fs_devices(fs_devices);
+ }
+ mutex_unlock(&uuid_mutex);
+}
+
+static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
+ fmode_t flags, void *holder)
+{
+ struct btrfs_device *device;
+ struct btrfs_device *latest_dev = NULL;
+ struct btrfs_device *tmp_device;
+
+ flags |= FMODE_EXCL;
+
+ list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
+ dev_list) {
+ int ret;
+
+ ret = btrfs_open_one_device(fs_devices, device, flags, holder);
+ if (ret == 0 &&
+ (!latest_dev || device->generation > latest_dev->generation)) {
+ latest_dev = device;
+ } else if (ret == -ENODATA) {
+ fs_devices->num_devices--;
+ list_del(&device->dev_list);
+ btrfs_free_device(device);
+ }
+ }
+ if (fs_devices->open_devices == 0)
+ return -EINVAL;
+
+ fs_devices->opened = 1;
+ fs_devices->latest_dev = latest_dev;
+ fs_devices->total_rw_bytes = 0;
+ fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
+ fs_devices->read_policy = BTRFS_READ_POLICY_PID;
+
+ return 0;
+}
+
+static int devid_cmp(void *priv, const struct list_head *a,
+ const struct list_head *b)
+{
+ const struct btrfs_device *dev1, *dev2;
+
+ dev1 = list_entry(a, struct btrfs_device, dev_list);
+ dev2 = list_entry(b, struct btrfs_device, dev_list);
+
+ if (dev1->devid < dev2->devid)
+ return -1;
+ else if (dev1->devid > dev2->devid)
+ return 1;
+ return 0;
+}
+
+int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+ fmode_t flags, void *holder)
+{
+ int ret;
+
+ lockdep_assert_held(&uuid_mutex);
+ /*
+ * The device_list_mutex cannot be taken here in case opening the
+ * underlying device takes further locks like open_mutex.
+ *
+ * We also don't need the lock here as this is called during mount and
+ * exclusion is provided by uuid_mutex
+ */
+
+ if (fs_devices->opened) {
+ fs_devices->opened++;
+ ret = 0;
+ } else {
+ list_sort(NULL, &fs_devices->devices, devid_cmp);
+ ret = open_fs_devices(fs_devices, flags, holder);
+ }
+
+ return ret;
+}
+
+void btrfs_release_disk_super(struct btrfs_super_block *super)
+{
+ struct page *page = virt_to_page(super);
+
+ put_page(page);
+}
+
+static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
+ u64 bytenr, u64 bytenr_orig)
+{
+ struct btrfs_super_block *disk_super;
+ struct page *page;
+ void *p;
+ pgoff_t index;
+
+ /* make sure our super fits in the device */
+ if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev))
+ return ERR_PTR(-EINVAL);
+
+ /* make sure our super fits in the page */
+ if (sizeof(*disk_super) > PAGE_SIZE)
+ return ERR_PTR(-EINVAL);
+
+ /* make sure our super doesn't straddle pages on disk */
+ index = bytenr >> PAGE_SHIFT;
+ if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
+ return ERR_PTR(-EINVAL);
+
+ /* pull in the page with our super */
+ page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);
+
+ if (IS_ERR(page))
+ return ERR_CAST(page);
+
+ p = page_address(page);
+
+ /* align our pointer to the offset of the super block */
+ disk_super = p + offset_in_page(bytenr);
+
+ if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
+ btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
+ btrfs_release_disk_super(p);
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
+ disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;
+
+ return disk_super;
+}
+
+int btrfs_forget_devices(dev_t devt)
+{
+ int ret;
+
+ mutex_lock(&uuid_mutex);
+ ret = btrfs_free_stale_devices(devt, NULL);
+ mutex_unlock(&uuid_mutex);
+
+ return ret;
+}
+
+/*
+ * Look for a btrfs signature on a device. This may be called out of the mount path
+ * and we are not allowed to call set_blocksize during the scan. The superblock
+ * is read via pagecache
+ */
+struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
+ void *holder)
+{
+ struct btrfs_super_block *disk_super;
+ bool new_device_added = false;
+ struct btrfs_device *device = NULL;
+ struct block_device *bdev;
+ u64 bytenr, bytenr_orig;
+ int ret;
+
+ lockdep_assert_held(&uuid_mutex);
+
+ /*
+ * we would like to check all the supers, but that would make
+ * a btrfs mount succeed after a mkfs from a different FS.
+ * So, we need to add a special mount option to scan for
+ * later supers, using BTRFS_SUPER_MIRROR_MAX instead
+ */
+
+ /*
+ * Avoid using flag |= FMODE_EXCL here, as the systemd-udev may
+ * initiate the device scan which may race with the user's mount
+ * or mkfs command, resulting in failure.
+ * Since the device scan is solely for reading purposes, there is
+ * no need for FMODE_EXCL. Additionally, the devices are read again
+ * during the mount process. It is ok to get some inconsistent
+ * values temporarily, as the device paths of the fsid are the only
+ * required information for assembling the volume.
+ */
+ bdev = blkdev_get_by_path(path, flags, holder);
+ if (IS_ERR(bdev))
+ return ERR_CAST(bdev);
+
+ bytenr_orig = btrfs_sb_offset(0);
+ ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
+ if (ret) {
+ device = ERR_PTR(ret);
+ goto error_bdev_put;
+ }
+
+ disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
+ if (IS_ERR(disk_super)) {
+ device = ERR_CAST(disk_super);
+ goto error_bdev_put;
+ }
+
+ device = device_list_add(path, disk_super, &new_device_added);
+ if (!IS_ERR(device) && new_device_added)
+ btrfs_free_stale_devices(device->devt, device);
+
+ btrfs_release_disk_super(disk_super);
+
+error_bdev_put:
+ blkdev_put(bdev, flags);
+
+ return device;
+}
+
+/*
+ * Try to find a chunk that intersects [start, start + len] range and when one
+ * such is found, record the end of it in *start
+ */
+static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
+ u64 len)
+{
+ u64 physical_start, physical_end;
+
+ lockdep_assert_held(&device->fs_info->chunk_mutex);
+
+ if (!find_first_extent_bit(&device->alloc_state, *start,
+ &physical_start, &physical_end,
+ CHUNK_ALLOCATED, NULL)) {
+
+ if (in_range(physical_start, *start, len) ||
+ in_range(*start, physical_start,
+ physical_end - physical_start)) {
+ *start = physical_end + 1;
+ return true;
+ }
+ }
+ return false;
+}
+
+static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
+{
+ switch (device->fs_devices->chunk_alloc_policy) {
+ case BTRFS_CHUNK_ALLOC_REGULAR:
+ return max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);
+ case BTRFS_CHUNK_ALLOC_ZONED:
+ /*
+ * We don't care about the starting region like regular
+ * allocator, because we anyway use/reserve the first two zones
+ * for superblock logging.
+ */
+ return ALIGN(start, device->zone_info->zone_size);
+ default:
+ BUG();
+ }
+}
+
+static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
+ u64 *hole_start, u64 *hole_size,
+ u64 num_bytes)
+{
+ u64 zone_size = device->zone_info->zone_size;
+ u64 pos;
+ int ret;
+ bool changed = false;
+
+ ASSERT(IS_ALIGNED(*hole_start, zone_size));
+
+ while (*hole_size > 0) {
+ pos = btrfs_find_allocatable_zones(device, *hole_start,
+ *hole_start + *hole_size,
+ num_bytes);
+ if (pos != *hole_start) {
+ *hole_size = *hole_start + *hole_size - pos;
+ *hole_start = pos;
+ changed = true;
+ if (*hole_size < num_bytes)
+ break;
+ }
+
+ ret = btrfs_ensure_empty_zones(device, pos, num_bytes);
+
+ /* Range is ensured to be empty */
+ if (!ret)
+ return changed;
+
+ /* Given hole range was invalid (outside of device) */
+ if (ret == -ERANGE) {
+ *hole_start += *hole_size;
+ *hole_size = 0;
+ return true;
+ }
+
+ *hole_start += zone_size;
+ *hole_size -= zone_size;
+ changed = true;
+ }
+
+ return changed;
+}
+
+/**
+ * dev_extent_hole_check - check if specified hole is suitable for allocation
+ * @device: the device which we have the hole
+ * @hole_start: starting position of the hole
+ * @hole_size: the size of the hole
+ * @num_bytes: the size of the free space that we need
+ *
+ * This function may modify @hole_start and @hole_size to reflect the suitable
+ * position for allocation. Returns 1 if hole position is updated, 0 otherwise.
+ */
+static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
+ u64 *hole_size, u64 num_bytes)
+{
+ bool changed = false;
+ u64 hole_end = *hole_start + *hole_size;
+
+ for (;;) {
+ /*
+ * Check before we set max_hole_start, otherwise we could end up
+ * sending back this offset anyway.
+ */
+ if (contains_pending_extent(device, hole_start, *hole_size)) {
+ if (hole_end >= *hole_start)
+ *hole_size = hole_end - *hole_start;
+ else
+ *hole_size = 0;
+ changed = true;
+ }
+
+ switch (device->fs_devices->chunk_alloc_policy) {
+ case BTRFS_CHUNK_ALLOC_REGULAR:
+ /* No extra check */
+ break;
+ case BTRFS_CHUNK_ALLOC_ZONED:
+ if (dev_extent_hole_check_zoned(device, hole_start,
+ hole_size, num_bytes)) {
+ changed = true;
+ /*
+ * The changed hole can contain pending extent.
+ * Loop again to check that.
+ */
+ continue;
+ }
+ break;
+ default:
+ BUG();
+ }
+
+ break;
+ }
+
+ return changed;
+}
+
+/*
+ * find_free_dev_extent_start - find free space in the specified device
+ * @device: the device which we search the free space in
+ * @num_bytes: the size of the free space that we need
+ * @search_start: the position from which to begin the search
+ * @start: store the start of the free space.
+ * @len: the size of the free space. that we find, or the size
+ * of the max free space if we don't find suitable free space
+ *
+ * this uses a pretty simple search, the expectation is that it is
+ * called very infrequently and that a given device has a small number
+ * of extents
+ *
+ * @start is used to store the start of the free space if we find. But if we
+ * don't find suitable free space, it will be used to store the start position
+ * of the max free space.
+ *
+ * @len is used to store the size of the free space that we find.
+ * But if we don't find suitable free space, it is used to store the size of
+ * the max free space.
+ *
+ * NOTE: This function will search *commit* root of device tree, and does extra
+ * check to ensure dev extents are not double allocated.
+ * This makes the function safe to allocate dev extents but may not report
+ * correct usable device space, as device extent freed in current transaction
+ * is not reported as available.
+ */
+static int find_free_dev_extent_start(struct btrfs_device *device,
+ u64 num_bytes, u64 search_start, u64 *start,
+ u64 *len)
+{
+ struct btrfs_fs_info *fs_info = device->fs_info;
+ struct btrfs_root *root = fs_info->dev_root;
+ struct btrfs_key key;
+ struct btrfs_dev_extent *dev_extent;
+ struct btrfs_path *path;
+ u64 hole_size;
+ u64 max_hole_start;
+ u64 max_hole_size;
+ u64 extent_end;
+ u64 search_end = device->total_bytes;
+ int ret;
+ int slot;
+ struct extent_buffer *l;
+
+ search_start = dev_extent_search_start(device, search_start);
+
+ WARN_ON(device->zone_info &&
+ !IS_ALIGNED(num_bytes, device->zone_info->zone_size));
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ max_hole_start = search_start;
+ max_hole_size = 0;
+
+again:
+ if (search_start >= search_end ||
+ test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ path->reada = READA_FORWARD;
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+
+ key.objectid = device->devid;
+ key.offset = search_start;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+
+ ret = btrfs_search_backwards(root, &key, path);
+ if (ret < 0)
+ goto out;
+
+ while (search_start < search_end) {
+ l = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(l)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret == 0)
+ continue;
+ if (ret < 0)
+ goto out;
+
+ break;
+ }
+ btrfs_item_key_to_cpu(l, &key, slot);
+
+ if (key.objectid < device->devid)
+ goto next;
+
+ if (key.objectid > device->devid)
+ break;
+
+ if (key.type != BTRFS_DEV_EXTENT_KEY)
+ goto next;
+
+ if (key.offset > search_end)
+ break;
+
+ if (key.offset > search_start) {
+ hole_size = key.offset - search_start;
+ dev_extent_hole_check(device, &search_start, &hole_size,
+ num_bytes);
+
+ if (hole_size > max_hole_size) {
+ max_hole_start = search_start;
+ max_hole_size = hole_size;
+ }
+
+ /*
+ * If this free space is greater than which we need,
+ * it must be the max free space that we have found
+ * until now, so max_hole_start must point to the start
+ * of this free space and the length of this free space
+ * is stored in max_hole_size. Thus, we return
+ * max_hole_start and max_hole_size and go back to the
+ * caller.
+ */
+ if (hole_size >= num_bytes) {
+ ret = 0;
+ goto out;
+ }
+ }
+
+ dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+ extent_end = key.offset + btrfs_dev_extent_length(l,
+ dev_extent);
+ if (extent_end > search_start)
+ search_start = extent_end;
+next:
+ path->slots[0]++;
+ cond_resched();
+ }
+
+ /*
+ * At this point, search_start should be the end of
+ * allocated dev extents, and when shrinking the device,
+ * search_end may be smaller than search_start.
+ */
+ if (search_end > search_start) {
+ hole_size = search_end - search_start;
+ if (dev_extent_hole_check(device, &search_start, &hole_size,
+ num_bytes)) {
+ btrfs_release_path(path);
+ goto again;
+ }
+
+ if (hole_size > max_hole_size) {
+ max_hole_start = search_start;
+ max_hole_size = hole_size;
+ }
+ }
+
+ /* See above. */
+ if (max_hole_size < num_bytes)
+ ret = -ENOSPC;
+ else
+ ret = 0;
+
+ ASSERT(max_hole_start + max_hole_size <= search_end);
+out:
+ btrfs_free_path(path);
+ *start = max_hole_start;
+ if (len)
+ *len = max_hole_size;
+ return ret;
+}
+
+int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
+ u64 *start, u64 *len)
+{
+ /* FIXME use last free of some kind */
+ return find_free_dev_extent_start(device, num_bytes, 0, start, len);
+}
+
+static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device,
+ u64 start, u64 *dev_extent_len)
+{
+ struct btrfs_fs_info *fs_info = device->fs_info;
+ struct btrfs_root *root = fs_info->dev_root;
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct extent_buffer *leaf = NULL;
+ struct btrfs_dev_extent *extent = NULL;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = device->devid;
+ key.offset = start;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+again:
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0) {
+ ret = btrfs_previous_item(root, path, key.objectid,
+ BTRFS_DEV_EXTENT_KEY);
+ if (ret)
+ goto out;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ extent = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_dev_extent);
+ BUG_ON(found_key.offset > start || found_key.offset +
+ btrfs_dev_extent_length(leaf, extent) < start);
+ key = found_key;
+ btrfs_release_path(path);
+ goto again;
+ } else if (ret == 0) {
+ leaf = path->nodes[0];
+ extent = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_dev_extent);
+ } else {
+ goto out;
+ }
+
+ *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
+
+ ret = btrfs_del_item(trans, root, path);
+ if (ret == 0)
+ set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
+{
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct rb_node *n;
+ u64 ret = 0;
+
+ em_tree = &fs_info->mapping_tree;
+ read_lock(&em_tree->lock);
+ n = rb_last(&em_tree->map.rb_root);
+ if (n) {
+ em = rb_entry(n, struct extent_map, rb_node);
+ ret = em->start + em->len;
+ }
+ read_unlock(&em_tree->lock);
+
+ return ret;
+}
+
+static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
+ u64 *devid_ret)
+{
+ int ret;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_path *path;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+ key.type = BTRFS_DEV_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto error;
+
+ if (ret == 0) {
+ /* Corruption */
+ btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
+ ret = -EUCLEAN;
+ goto error;
+ }
+
+ ret = btrfs_previous_item(fs_info->chunk_root, path,
+ BTRFS_DEV_ITEMS_OBJECTID,
+ BTRFS_DEV_ITEM_KEY);
+ if (ret) {
+ *devid_ret = 1;
+ } else {
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ path->slots[0]);
+ *devid_ret = found_key.offset + 1;
+ }
+ ret = 0;
+error:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * the device information is stored in the chunk root
+ * the btrfs_device struct should be fully filled in
+ */
+static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_dev_item *dev_item;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ unsigned long ptr;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+ key.type = BTRFS_DEV_ITEM_KEY;
+ key.offset = device->devid;
+
+ btrfs_reserve_chunk_metadata(trans, true);
+ ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
+ &key, sizeof(*dev_item));
+ btrfs_trans_release_chunk_metadata(trans);
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
+
+ btrfs_set_device_id(leaf, dev_item, device->devid);
+ btrfs_set_device_generation(leaf, dev_item, 0);
+ btrfs_set_device_type(leaf, dev_item, device->type);
+ btrfs_set_device_io_align(leaf, dev_item, device->io_align);
+ btrfs_set_device_io_width(leaf, dev_item, device->io_width);
+ btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
+ btrfs_set_device_total_bytes(leaf, dev_item,
+ btrfs_device_get_disk_total_bytes(device));
+ btrfs_set_device_bytes_used(leaf, dev_item,
+ btrfs_device_get_bytes_used(device));
+ btrfs_set_device_group(leaf, dev_item, 0);
+ btrfs_set_device_seek_speed(leaf, dev_item, 0);
+ btrfs_set_device_bandwidth(leaf, dev_item, 0);
+ btrfs_set_device_start_offset(leaf, dev_item, 0);
+
+ ptr = btrfs_device_uuid(dev_item);
+ write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
+ ptr = btrfs_device_fsid(dev_item);
+ write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
+ ptr, BTRFS_FSID_SIZE);
+ btrfs_mark_buffer_dirty(leaf);
+
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Function to update ctime/mtime for a given device path.
+ * Mainly used for ctime/mtime based probe like libblkid.
+ *
+ * We don't care about errors here, this is just to be kind to userspace.
+ */
+static void update_dev_time(const char *device_path)
+{
+ struct path path;
+ struct timespec64 now;
+ int ret;
+
+ ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
+ if (ret)
+ return;
+
+ now = current_time(d_inode(path.dentry));
+ inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME);
+ path_put(&path);
+}
+
+static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device)
+{
+ struct btrfs_root *root = device->fs_info->chunk_root;
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+ key.type = BTRFS_DEV_ITEM_KEY;
+ key.offset = device->devid;
+
+ btrfs_reserve_chunk_metadata(trans, false);
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ btrfs_trans_release_chunk_metadata(trans);
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = btrfs_del_item(trans, root, path);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Verify that @num_devices satisfies the RAID profile constraints in the whole
+ * filesystem. It's up to the caller to adjust that number regarding eg. device
+ * replace.
+ */
+static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
+ u64 num_devices)
+{
+ u64 all_avail;
+ unsigned seq;
+ int i;
+
+ do {
+ seq = read_seqbegin(&fs_info->profiles_lock);
+
+ all_avail = fs_info->avail_data_alloc_bits |
+ fs_info->avail_system_alloc_bits |
+ fs_info->avail_metadata_alloc_bits;
+ } while (read_seqretry(&fs_info->profiles_lock, seq));
+
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+ if (!(all_avail & btrfs_raid_array[i].bg_flag))
+ continue;
+
+ if (num_devices < btrfs_raid_array[i].devs_min)
+ return btrfs_raid_array[i].mindev_error;
+ }
+
+ return 0;
+}
+
+static struct btrfs_device * btrfs_find_next_active_device(
+ struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
+{
+ struct btrfs_device *next_device;
+
+ list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
+ if (next_device != device &&
+ !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
+ && next_device->bdev)
+ return next_device;
+ }
+
+ return NULL;
+}
+
+/*
+ * Helper function to check if the given device is part of s_bdev / latest_dev
+ * and replace it with the provided or the next active device, in the context
+ * where this function called, there should be always be another device (or
+ * this_dev) which is active.
+ */
+void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
+ struct btrfs_device *next_device)
+{
+ struct btrfs_fs_info *fs_info = device->fs_info;
+
+ if (!next_device)
+ next_device = btrfs_find_next_active_device(fs_info->fs_devices,
+ device);
+ ASSERT(next_device);
+
+ if (fs_info->sb->s_bdev &&
+ (fs_info->sb->s_bdev == device->bdev))
+ fs_info->sb->s_bdev = next_device->bdev;
+
+ if (fs_info->fs_devices->latest_dev->bdev == device->bdev)
+ fs_info->fs_devices->latest_dev = next_device;
+}
+
+/*
+ * Return btrfs_fs_devices::num_devices excluding the device that's being
+ * currently replaced.
+ */
+static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
+{
+ u64 num_devices = fs_info->fs_devices->num_devices;
+
+ down_read(&fs_info->dev_replace.rwsem);
+ if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
+ ASSERT(num_devices > 1);
+ num_devices--;
+ }
+ up_read(&fs_info->dev_replace.rwsem);
+
+ return num_devices;
+}
+
+void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
+ struct block_device *bdev,
+ const char *device_path)
+{
+ struct btrfs_super_block *disk_super;
+ int copy_num;
+
+ if (!bdev)
+ return;
+
+ for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
+ struct page *page;
+ int ret;
+
+ disk_super = btrfs_read_dev_one_super(bdev, copy_num, false);
+ if (IS_ERR(disk_super))
+ continue;
+
+ if (bdev_is_zoned(bdev)) {
+ btrfs_reset_sb_log_zones(bdev, copy_num);
+ continue;
+ }
+
+ memset(&disk_super->magic, 0, sizeof(disk_super->magic));
+
+ page = virt_to_page(disk_super);
+ set_page_dirty(page);
+ lock_page(page);
+ /* write_on_page() unlocks the page */
+ ret = write_one_page(page);
+ if (ret)
+ btrfs_warn(fs_info,
+ "error clearing superblock number %d (%d)",
+ copy_num, ret);
+ btrfs_release_disk_super(disk_super);
+
+ }
+
+ /* Notify udev that device has changed */
+ btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
+
+ /* Update ctime/mtime for device path for libblkid */
+ update_dev_time(device_path);
+}
+
+int btrfs_rm_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_dev_lookup_args *args,
+ struct block_device **bdev, fmode_t *mode)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *cur_devices;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ u64 num_devices;
+ int ret = 0;
+
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info, "device remove not supported on extent tree v2 yet");
+ return -EINVAL;
+ }
+
+ /*
+ * The device list in fs_devices is accessed without locks (neither
+ * uuid_mutex nor device_list_mutex) as it won't change on a mounted
+ * filesystem and another device rm cannot run.
+ */
+ num_devices = btrfs_num_devices(fs_info);
+
+ ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
+ if (ret)
+ return ret;
+
+ device = btrfs_find_device(fs_info->fs_devices, args);
+ if (!device) {
+ if (args->missing)
+ ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
+ else
+ ret = -ENOENT;
+ return ret;
+ }
+
+ if (btrfs_pinned_by_swapfile(fs_info, device)) {
+ btrfs_warn_in_rcu(fs_info,
+ "cannot remove device %s (devid %llu) due to active swapfile",
+ rcu_str_deref(device->name), device->devid);
+ return -ETXTBSY;
+ }
+
+ if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
+ return BTRFS_ERROR_DEV_TGT_REPLACE;
+
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
+ fs_info->fs_devices->rw_devices == 1)
+ return BTRFS_ERROR_DEV_ONLY_WRITABLE;
+
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
+ mutex_lock(&fs_info->chunk_mutex);
+ list_del_init(&device->dev_alloc_list);
+ device->fs_devices->rw_devices--;
+ mutex_unlock(&fs_info->chunk_mutex);
+ }
+
+ ret = btrfs_shrink_device(device, 0);
+ if (ret)
+ goto error_undo;
+
+ trans = btrfs_start_transaction(fs_info->chunk_root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto error_undo;
+ }
+
+ ret = btrfs_rm_dev_item(trans, device);
+ if (ret) {
+ /* Any error in dev item removal is critical */
+ btrfs_crit(fs_info,
+ "failed to remove device item for devid %llu: %d",
+ device->devid, ret);
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ return ret;
+ }
+
+ clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
+ btrfs_scrub_cancel_dev(device);
+
+ /*
+ * the device list mutex makes sure that we don't change
+ * the device list while someone else is writing out all
+ * the device supers. Whoever is writing all supers, should
+ * lock the device list mutex before getting the number of
+ * devices in the super block (super_copy). Conversely,
+ * whoever updates the number of devices in the super block
+ * (super_copy) should hold the device list mutex.
+ */
+
+ /*
+ * In normal cases the cur_devices == fs_devices. But in case
+ * of deleting a seed device, the cur_devices should point to
+ * its own fs_devices listed under the fs_devices->seed_list.
+ */
+ cur_devices = device->fs_devices;
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_del_rcu(&device->dev_list);
+
+ cur_devices->num_devices--;
+ cur_devices->total_devices--;
+ /* Update total_devices of the parent fs_devices if it's seed */
+ if (cur_devices != fs_devices)
+ fs_devices->total_devices--;
+
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+ cur_devices->missing_devices--;
+
+ btrfs_assign_next_active_device(device, NULL);
+
+ if (device->bdev) {
+ cur_devices->open_devices--;
+ /* remove sysfs entry */
+ btrfs_sysfs_remove_device(device);
+ }
+
+ num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
+ btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ /*
+ * At this point, the device is zero sized and detached from the
+ * devices list. All that's left is to zero out the old supers and
+ * free the device.
+ *
+ * We cannot call btrfs_close_bdev() here because we're holding the sb
+ * write lock, and blkdev_put() will pull in the ->open_mutex on the
+ * block device and it's dependencies. Instead just flush the device
+ * and let the caller do the final blkdev_put.
+ */
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
+ btrfs_scratch_superblocks(fs_info, device->bdev,
+ device->name->str);
+ if (device->bdev) {
+ sync_blockdev(device->bdev);
+ invalidate_bdev(device->bdev);
+ }
+ }
+
+ *bdev = device->bdev;
+ *mode = device->mode;
+ synchronize_rcu();
+ btrfs_free_device(device);
+
+ /*
+ * This can happen if cur_devices is the private seed devices list. We
+ * cannot call close_fs_devices() here because it expects the uuid_mutex
+ * to be held, but in fact we don't need that for the private
+ * seed_devices, we can simply decrement cur_devices->opened and then
+ * remove it from our list and free the fs_devices.
+ */
+ if (cur_devices->num_devices == 0) {
+ list_del_init(&cur_devices->seed_list);
+ ASSERT(cur_devices->opened == 1);
+ cur_devices->opened--;
+ free_fs_devices(cur_devices);
+ }
+
+ ret = btrfs_commit_transaction(trans);
+
+ return ret;
+
+error_undo:
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
+ mutex_lock(&fs_info->chunk_mutex);
+ list_add(&device->dev_alloc_list,
+ &fs_devices->alloc_list);
+ device->fs_devices->rw_devices++;
+ mutex_unlock(&fs_info->chunk_mutex);
+ }
+ return ret;
+}
+
+void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
+{
+ struct btrfs_fs_devices *fs_devices;
+
+ lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
+
+ /*
+ * in case of fs with no seed, srcdev->fs_devices will point
+ * to fs_devices of fs_info. However when the dev being replaced is
+ * a seed dev it will point to the seed's local fs_devices. In short
+ * srcdev will have its correct fs_devices in both the cases.
+ */
+ fs_devices = srcdev->fs_devices;
+
+ list_del_rcu(&srcdev->dev_list);
+ list_del(&srcdev->dev_alloc_list);
+ fs_devices->num_devices--;
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
+ fs_devices->missing_devices--;
+
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
+ fs_devices->rw_devices--;
+
+ if (srcdev->bdev)
+ fs_devices->open_devices--;
+}
+
+void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
+{
+ struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
+
+ mutex_lock(&uuid_mutex);
+
+ btrfs_close_bdev(srcdev);
+ synchronize_rcu();
+ btrfs_free_device(srcdev);
+
+ /* if this is no devs we rather delete the fs_devices */
+ if (!fs_devices->num_devices) {
+ /*
+ * On a mounted FS, num_devices can't be zero unless it's a
+ * seed. In case of a seed device being replaced, the replace
+ * target added to the sprout FS, so there will be no more
+ * device left under the seed FS.
+ */
+ ASSERT(fs_devices->seeding);
+
+ list_del_init(&fs_devices->seed_list);
+ close_fs_devices(fs_devices);
+ free_fs_devices(fs_devices);
+ }
+ mutex_unlock(&uuid_mutex);
+}
+
+void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
+{
+ struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+
+ btrfs_sysfs_remove_device(tgtdev);
+
+ if (tgtdev->bdev)
+ fs_devices->open_devices--;
+
+ fs_devices->num_devices--;
+
+ btrfs_assign_next_active_device(tgtdev, NULL);
+
+ list_del_rcu(&tgtdev->dev_list);
+
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
+ tgtdev->name->str);
+
+ btrfs_close_bdev(tgtdev);
+ synchronize_rcu();
+ btrfs_free_device(tgtdev);
+}
+
+/**
+ * Populate args from device at path
+ *
+ * @fs_info: the filesystem
+ * @args: the args to populate
+ * @path: the path to the device
+ *
+ * This will read the super block of the device at @path and populate @args with
+ * the devid, fsid, and uuid. This is meant to be used for ioctls that need to
+ * lookup a device to operate on, but need to do it before we take any locks.
+ * This properly handles the special case of "missing" that a user may pass in,
+ * and does some basic sanity checks. The caller must make sure that @path is
+ * properly NUL terminated before calling in, and must call
+ * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and
+ * uuid buffers.
+ *
+ * Return: 0 for success, -errno for failure
+ */
+int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
+ struct btrfs_dev_lookup_args *args,
+ const char *path)
+{
+ struct btrfs_super_block *disk_super;
+ struct block_device *bdev;
+ int ret;
+
+ if (!path || !path[0])
+ return -EINVAL;
+ if (!strcmp(path, "missing")) {
+ args->missing = true;
+ return 0;
+ }
+
+ args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL);
+ args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL);
+ if (!args->uuid || !args->fsid) {
+ btrfs_put_dev_args_from_path(args);
+ return -ENOMEM;
+ }
+
+ ret = btrfs_get_bdev_and_sb(path, FMODE_READ, fs_info->bdev_holder, 0,
+ &bdev, &disk_super);
+ if (ret) {
+ btrfs_put_dev_args_from_path(args);
+ return ret;
+ }
+
+ args->devid = btrfs_stack_device_id(&disk_super->dev_item);
+ memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE);
+ if (btrfs_fs_incompat(fs_info, METADATA_UUID))
+ memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE);
+ else
+ memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
+ btrfs_release_disk_super(disk_super);
+ blkdev_put(bdev, FMODE_READ);
+ return 0;
+}
+
+/*
+ * Only use this jointly with btrfs_get_dev_args_from_path() because we will
+ * allocate our ->uuid and ->fsid pointers, everybody else uses local variables
+ * that don't need to be freed.
+ */
+void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args)
+{
+ kfree(args->uuid);
+ kfree(args->fsid);
+ args->uuid = NULL;
+ args->fsid = NULL;
+}
+
+struct btrfs_device *btrfs_find_device_by_devspec(
+ struct btrfs_fs_info *fs_info, u64 devid,
+ const char *device_path)
+{
+ BTRFS_DEV_LOOKUP_ARGS(args);
+ struct btrfs_device *device;
+ int ret;
+
+ if (devid) {
+ args.devid = devid;
+ device = btrfs_find_device(fs_info->fs_devices, &args);
+ if (!device)
+ return ERR_PTR(-ENOENT);
+ return device;
+ }
+
+ ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path);
+ if (ret)
+ return ERR_PTR(ret);
+ device = btrfs_find_device(fs_info->fs_devices, &args);
+ btrfs_put_dev_args_from_path(&args);
+ if (!device)
+ return ERR_PTR(-ENOENT);
+ return device;
+}
+
+static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_fs_devices *old_devices;
+ struct btrfs_fs_devices *seed_devices;
+
+ lockdep_assert_held(&uuid_mutex);
+ if (!fs_devices->seeding)
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * Private copy of the seed devices, anchored at
+ * fs_info->fs_devices->seed_list
+ */
+ seed_devices = alloc_fs_devices(NULL, NULL);
+ if (IS_ERR(seed_devices))
+ return seed_devices;
+
+ /*
+ * It's necessary to retain a copy of the original seed fs_devices in
+ * fs_uuids so that filesystems which have been seeded can successfully
+ * reference the seed device from open_seed_devices. This also supports
+ * multiple fs seed.
+ */
+ old_devices = clone_fs_devices(fs_devices);
+ if (IS_ERR(old_devices)) {
+ kfree(seed_devices);
+ return old_devices;
+ }
+
+ list_add(&old_devices->fs_list, &fs_uuids);
+
+ memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
+ seed_devices->opened = 1;
+ INIT_LIST_HEAD(&seed_devices->devices);
+ INIT_LIST_HEAD(&seed_devices->alloc_list);
+ mutex_init(&seed_devices->device_list_mutex);
+
+ return seed_devices;
+}
+
+/*
+ * Splice seed devices into the sprout fs_devices.
+ * Generate a new fsid for the sprouted read-write filesystem.
+ */
+static void btrfs_setup_sprout(struct btrfs_fs_info *fs_info,
+ struct btrfs_fs_devices *seed_devices)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_super_block *disk_super = fs_info->super_copy;
+ struct btrfs_device *device;
+ u64 super_flags;
+
+ /*
+ * We are updating the fsid, the thread leading to device_list_add()
+ * could race, so uuid_mutex is needed.
+ */
+ lockdep_assert_held(&uuid_mutex);
+
+ /*
+ * The threads listed below may traverse dev_list but can do that without
+ * device_list_mutex:
+ * - All device ops and balance - as we are in btrfs_exclop_start.
+ * - Various dev_list readers - are using RCU.
+ * - btrfs_ioctl_fitrim() - is using RCU.
+ *
+ * For-read threads as below are using device_list_mutex:
+ * - Readonly scrub btrfs_scrub_dev()
+ * - Readonly scrub btrfs_scrub_progress()
+ * - btrfs_get_dev_stats()
+ */
+ lockdep_assert_held(&fs_devices->device_list_mutex);
+
+ list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
+ synchronize_rcu);
+ list_for_each_entry(device, &seed_devices->devices, dev_list)
+ device->fs_devices = seed_devices;
+
+ fs_devices->seeding = false;
+ fs_devices->num_devices = 0;
+ fs_devices->open_devices = 0;
+ fs_devices->missing_devices = 0;
+ fs_devices->rotating = false;
+ list_add(&seed_devices->seed_list, &fs_devices->seed_list);
+
+ generate_random_uuid(fs_devices->fsid);
+ memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
+ memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
+
+ super_flags = btrfs_super_flags(disk_super) &
+ ~BTRFS_SUPER_FLAG_SEEDING;
+ btrfs_set_super_flags(disk_super, super_flags);
+}
+
+/*
+ * Store the expected generation for seed devices in device items.
+ */
+static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
+{
+ BTRFS_DEV_LOOKUP_ARGS(args);
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *root = fs_info->chunk_root;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_dev_item *dev_item;
+ struct btrfs_device *device;
+ struct btrfs_key key;
+ u8 fs_uuid[BTRFS_FSID_SIZE];
+ u8 dev_uuid[BTRFS_UUID_SIZE];
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+ key.offset = 0;
+ key.type = BTRFS_DEV_ITEM_KEY;
+
+ while (1) {
+ btrfs_reserve_chunk_metadata(trans, false);
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ btrfs_trans_release_chunk_metadata(trans);
+ if (ret < 0)
+ goto error;
+
+ leaf = path->nodes[0];
+next_slot:
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret > 0)
+ break;
+ if (ret < 0)
+ goto error;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ btrfs_release_path(path);
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
+ key.type != BTRFS_DEV_ITEM_KEY)
+ break;
+
+ dev_item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_dev_item);
+ args.devid = btrfs_device_id(leaf, dev_item);
+ read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
+ BTRFS_UUID_SIZE);
+ read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
+ BTRFS_FSID_SIZE);
+ args.uuid = dev_uuid;
+ args.fsid = fs_uuid;
+ device = btrfs_find_device(fs_info->fs_devices, &args);
+ BUG_ON(!device); /* Logic error */
+
+ if (device->fs_devices->seeding) {
+ btrfs_set_device_generation(leaf, dev_item,
+ device->generation);
+ btrfs_mark_buffer_dirty(leaf);
+ }
+
+ path->slots[0]++;
+ goto next_slot;
+ }
+ ret = 0;
+error:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
+{
+ struct btrfs_root *root = fs_info->dev_root;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_device *device;
+ struct block_device *bdev;
+ struct super_block *sb = fs_info->sb;
+ struct rcu_string *name;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_fs_devices *seed_devices = NULL;
+ u64 orig_super_total_bytes;
+ u64 orig_super_num_devices;
+ int ret = 0;
+ bool seeding_dev = false;
+ bool locked = false;
+
+ if (sb_rdonly(sb) && !fs_devices->seeding)
+ return -EROFS;
+
+ bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
+ fs_info->bdev_holder);
+ if (IS_ERR(bdev))
+ return PTR_ERR(bdev);
+
+ if (!btrfs_check_device_zone_type(fs_info, bdev)) {
+ ret = -EINVAL;
+ goto error;
+ }
+
+ if (fs_devices->seeding) {
+ seeding_dev = true;
+ down_write(&sb->s_umount);
+ mutex_lock(&uuid_mutex);
+ locked = true;
+ }
+
+ sync_blockdev(bdev);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
+ if (device->bdev == bdev) {
+ ret = -EEXIST;
+ rcu_read_unlock();
+ goto error;
+ }
+ }
+ rcu_read_unlock();
+
+ device = btrfs_alloc_device(fs_info, NULL, NULL);
+ if (IS_ERR(device)) {
+ /* we can safely leave the fs_devices entry around */
+ ret = PTR_ERR(device);
+ goto error;
+ }
+
+ name = rcu_string_strdup(device_path, GFP_KERNEL);
+ if (!name) {
+ ret = -ENOMEM;
+ goto error_free_device;
+ }
+ rcu_assign_pointer(device->name, name);
+
+ device->fs_info = fs_info;
+ device->bdev = bdev;
+ ret = lookup_bdev(device_path, &device->devt);
+ if (ret)
+ goto error_free_device;
+
+ ret = btrfs_get_dev_zone_info(device, false);
+ if (ret)
+ goto error_free_device;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto error_free_zone;
+ }
+
+ set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+ device->generation = trans->transid;
+ device->io_width = fs_info->sectorsize;
+ device->io_align = fs_info->sectorsize;
+ device->sector_size = fs_info->sectorsize;
+ device->total_bytes =
+ round_down(bdev_nr_bytes(bdev), fs_info->sectorsize);
+ device->disk_total_bytes = device->total_bytes;
+ device->commit_total_bytes = device->total_bytes;
+ set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
+ clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
+ device->mode = FMODE_EXCL;
+ device->dev_stats_valid = 1;
+ set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
+
+ if (seeding_dev) {
+ btrfs_clear_sb_rdonly(sb);
+
+ /* GFP_KERNEL allocation must not be under device_list_mutex */
+ seed_devices = btrfs_init_sprout(fs_info);
+ if (IS_ERR(seed_devices)) {
+ ret = PTR_ERR(seed_devices);
+ btrfs_abort_transaction(trans, ret);
+ goto error_trans;
+ }
+ }
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ if (seeding_dev) {
+ btrfs_setup_sprout(fs_info, seed_devices);
+ btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev,
+ device);
+ }
+
+ device->fs_devices = fs_devices;
+
+ mutex_lock(&fs_info->chunk_mutex);
+ list_add_rcu(&device->dev_list, &fs_devices->devices);
+ list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
+ fs_devices->num_devices++;
+ fs_devices->open_devices++;
+ fs_devices->rw_devices++;
+ fs_devices->total_devices++;
+ fs_devices->total_rw_bytes += device->total_bytes;
+
+ atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
+
+ if (!bdev_nonrot(bdev))
+ fs_devices->rotating = true;
+
+ orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
+ btrfs_set_super_total_bytes(fs_info->super_copy,
+ round_down(orig_super_total_bytes + device->total_bytes,
+ fs_info->sectorsize));
+
+ orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
+ btrfs_set_super_num_devices(fs_info->super_copy,
+ orig_super_num_devices + 1);
+
+ /*
+ * we've got more storage, clear any full flags on the space
+ * infos
+ */
+ btrfs_clear_space_info_full(fs_info);
+
+ mutex_unlock(&fs_info->chunk_mutex);
+
+ /* Add sysfs device entry */
+ btrfs_sysfs_add_device(device);
+
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ if (seeding_dev) {
+ mutex_lock(&fs_info->chunk_mutex);
+ ret = init_first_rw_device(trans);
+ mutex_unlock(&fs_info->chunk_mutex);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto error_sysfs;
+ }
+ }
+
+ ret = btrfs_add_dev_item(trans, device);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto error_sysfs;
+ }
+
+ if (seeding_dev) {
+ ret = btrfs_finish_sprout(trans);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto error_sysfs;
+ }
+
+ /*
+ * fs_devices now represents the newly sprouted filesystem and
+ * its fsid has been changed by btrfs_sprout_splice().
+ */
+ btrfs_sysfs_update_sprout_fsid(fs_devices);
+ }
+
+ ret = btrfs_commit_transaction(trans);
+
+ if (seeding_dev) {
+ mutex_unlock(&uuid_mutex);
+ up_write(&sb->s_umount);
+ locked = false;
+
+ if (ret) /* transaction commit */
+ return ret;
+
+ ret = btrfs_relocate_sys_chunks(fs_info);
+ if (ret < 0)
+ btrfs_handle_fs_error(fs_info, ret,
+ "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
+ trans = btrfs_attach_transaction(root);
+ if (IS_ERR(trans)) {
+ if (PTR_ERR(trans) == -ENOENT)
+ return 0;
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto error_sysfs;
+ }
+ ret = btrfs_commit_transaction(trans);
+ }
+
+ /*
+ * Now that we have written a new super block to this device, check all
+ * other fs_devices list if device_path alienates any other scanned
+ * device.
+ * We can ignore the return value as it typically returns -EINVAL and
+ * only succeeds if the device was an alien.
+ */
+ btrfs_forget_devices(device->devt);
+
+ /* Update ctime/mtime for blkid or udev */
+ update_dev_time(device_path);
+
+ return ret;
+
+error_sysfs:
+ btrfs_sysfs_remove_device(device);
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ mutex_lock(&fs_info->chunk_mutex);
+ list_del_rcu(&device->dev_list);
+ list_del(&device->dev_alloc_list);
+ fs_info->fs_devices->num_devices--;
+ fs_info->fs_devices->open_devices--;
+ fs_info->fs_devices->rw_devices--;
+ fs_info->fs_devices->total_devices--;
+ fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
+ atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
+ btrfs_set_super_total_bytes(fs_info->super_copy,
+ orig_super_total_bytes);
+ btrfs_set_super_num_devices(fs_info->super_copy,
+ orig_super_num_devices);
+ mutex_unlock(&fs_info->chunk_mutex);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+error_trans:
+ if (seeding_dev)
+ btrfs_set_sb_rdonly(sb);
+ if (trans)
+ btrfs_end_transaction(trans);
+error_free_zone:
+ btrfs_destroy_dev_zone_info(device);
+error_free_device:
+ btrfs_free_device(device);
+error:
+ blkdev_put(bdev, FMODE_EXCL);
+ if (locked) {
+ mutex_unlock(&uuid_mutex);
+ up_write(&sb->s_umount);
+ }
+ return ret;
+}
+
+static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_root *root = device->fs_info->chunk_root;
+ struct btrfs_dev_item *dev_item;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+ key.type = BTRFS_DEV_ITEM_KEY;
+ key.offset = device->devid;
+
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret < 0)
+ goto out;
+
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
+
+ btrfs_set_device_id(leaf, dev_item, device->devid);
+ btrfs_set_device_type(leaf, dev_item, device->type);
+ btrfs_set_device_io_align(leaf, dev_item, device->io_align);
+ btrfs_set_device_io_width(leaf, dev_item, device->io_width);
+ btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
+ btrfs_set_device_total_bytes(leaf, dev_item,
+ btrfs_device_get_disk_total_bytes(device));
+ btrfs_set_device_bytes_used(leaf, dev_item,
+ btrfs_device_get_bytes_used(device));
+ btrfs_mark_buffer_dirty(leaf);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_grow_device(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 new_size)
+{
+ struct btrfs_fs_info *fs_info = device->fs_info;
+ struct btrfs_super_block *super_copy = fs_info->super_copy;
+ u64 old_total;
+ u64 diff;
+ int ret;
+
+ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
+ return -EACCES;
+
+ new_size = round_down(new_size, fs_info->sectorsize);
+
+ mutex_lock(&fs_info->chunk_mutex);
+ old_total = btrfs_super_total_bytes(super_copy);
+ diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
+
+ if (new_size <= device->total_bytes ||
+ test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
+ mutex_unlock(&fs_info->chunk_mutex);
+ return -EINVAL;
+ }
+
+ btrfs_set_super_total_bytes(super_copy,
+ round_down(old_total + diff, fs_info->sectorsize));
+ device->fs_devices->total_rw_bytes += diff;
+
+ btrfs_device_set_total_bytes(device, new_size);
+ btrfs_device_set_disk_total_bytes(device, new_size);
+ btrfs_clear_space_info_full(device->fs_info);
+ if (list_empty(&device->post_commit_list))
+ list_add_tail(&device->post_commit_list,
+ &trans->transaction->dev_update_list);
+ mutex_unlock(&fs_info->chunk_mutex);
+
+ btrfs_reserve_chunk_metadata(trans, false);
+ ret = btrfs_update_device(trans, device);
+ btrfs_trans_release_chunk_metadata(trans);
+
+ return ret;
+}
+
+static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *root = fs_info->chunk_root;
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+ key.offset = chunk_offset;
+ key.type = BTRFS_CHUNK_ITEM_KEY;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0) { /* Logic error or corruption */
+ btrfs_handle_fs_error(fs_info, -ENOENT,
+ "Failed lookup while freeing chunk.");
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = btrfs_del_item(trans, root, path);
+ if (ret < 0)
+ btrfs_handle_fs_error(fs_info, ret,
+ "Failed to delete chunk item.");
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
+{
+ struct btrfs_super_block *super_copy = fs_info->super_copy;
+ struct btrfs_disk_key *disk_key;
+ struct btrfs_chunk *chunk;
+ u8 *ptr;
+ int ret = 0;
+ u32 num_stripes;
+ u32 array_size;
+ u32 len = 0;
+ u32 cur;
+ struct btrfs_key key;
+
+ lockdep_assert_held(&fs_info->chunk_mutex);
+ array_size = btrfs_super_sys_array_size(super_copy);
+
+ ptr = super_copy->sys_chunk_array;
+ cur = 0;
+
+ while (cur < array_size) {
+ disk_key = (struct btrfs_disk_key *)ptr;
+ btrfs_disk_key_to_cpu(&key, disk_key);
+
+ len = sizeof(*disk_key);
+
+ if (key.type == BTRFS_CHUNK_ITEM_KEY) {
+ chunk = (struct btrfs_chunk *)(ptr + len);
+ num_stripes = btrfs_stack_chunk_num_stripes(chunk);
+ len += btrfs_chunk_item_size(num_stripes);
+ } else {
+ ret = -EIO;
+ break;
+ }
+ if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
+ key.offset == chunk_offset) {
+ memmove(ptr, ptr + len, array_size - (cur + len));
+ array_size -= len;
+ btrfs_set_super_sys_array_size(super_copy, array_size);
+ } else {
+ ptr += len;
+ cur += len;
+ }
+ }
+ return ret;
+}
+
+/*
+ * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
+ * @logical: Logical block offset in bytes.
+ * @length: Length of extent in bytes.
+ *
+ * Return: Chunk mapping or ERR_PTR.
+ */
+struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
+{
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+
+ em_tree = &fs_info->mapping_tree;
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, logical, length);
+ read_unlock(&em_tree->lock);
+
+ if (!em) {
+ btrfs_crit(fs_info,
+ "unable to find chunk map for logical %llu length %llu",
+ logical, length);
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (em->start > logical || em->start + em->len <= logical) {
+ btrfs_crit(fs_info,
+ "found a bad chunk map, wanted %llu-%llu, found %llu-%llu",
+ logical, logical + length, em->start, em->start + em->len);
+ free_extent_map(em);
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* callers are responsible for dropping em's ref. */
+ return em;
+}
+
+static int remove_chunk_item(struct btrfs_trans_handle *trans,
+ struct map_lookup *map, u64 chunk_offset)
+{
+ int i;
+
+ /*
+ * Removing chunk items and updating the device items in the chunks btree
+ * requires holding the chunk_mutex.
+ * See the comment at btrfs_chunk_alloc() for the details.
+ */
+ lockdep_assert_held(&trans->fs_info->chunk_mutex);
+
+ for (i = 0; i < map->num_stripes; i++) {
+ int ret;
+
+ ret = btrfs_update_device(trans, map->stripes[i].dev);
+ if (ret)
+ return ret;
+ }
+
+ return btrfs_free_chunk(trans, chunk_offset);
+}
+
+int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct extent_map *em;
+ struct map_lookup *map;
+ u64 dev_extent_len = 0;
+ int i, ret = 0;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+
+ em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
+ if (IS_ERR(em)) {
+ /*
+ * This is a logic error, but we don't want to just rely on the
+ * user having built with ASSERT enabled, so if ASSERT doesn't
+ * do anything we still error out.
+ */
+ ASSERT(0);
+ return PTR_ERR(em);
+ }
+ map = em->map_lookup;
+
+ /*
+ * First delete the device extent items from the devices btree.
+ * We take the device_list_mutex to avoid racing with the finishing phase
+ * of a device replace operation. See the comment below before acquiring
+ * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex
+ * because that can result in a deadlock when deleting the device extent
+ * items from the devices btree - COWing an extent buffer from the btree
+ * may result in allocating a new metadata chunk, which would attempt to
+ * lock again fs_info->chunk_mutex.
+ */
+ mutex_lock(&fs_devices->device_list_mutex);
+ for (i = 0; i < map->num_stripes; i++) {
+ struct btrfs_device *device = map->stripes[i].dev;
+ ret = btrfs_free_dev_extent(trans, device,
+ map->stripes[i].physical,
+ &dev_extent_len);
+ if (ret) {
+ mutex_unlock(&fs_devices->device_list_mutex);
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ if (device->bytes_used > 0) {
+ mutex_lock(&fs_info->chunk_mutex);
+ btrfs_device_set_bytes_used(device,
+ device->bytes_used - dev_extent_len);
+ atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
+ btrfs_clear_space_info_full(fs_info);
+ mutex_unlock(&fs_info->chunk_mutex);
+ }
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ /*
+ * We acquire fs_info->chunk_mutex for 2 reasons:
+ *
+ * 1) Just like with the first phase of the chunk allocation, we must
+ * reserve system space, do all chunk btree updates and deletions, and
+ * update the system chunk array in the superblock while holding this
+ * mutex. This is for similar reasons as explained on the comment at
+ * the top of btrfs_chunk_alloc();
+ *
+ * 2) Prevent races with the final phase of a device replace operation
+ * that replaces the device object associated with the map's stripes,
+ * because the device object's id can change at any time during that
+ * final phase of the device replace operation
+ * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
+ * replaced device and then see it with an ID of
+ * BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating
+ * the device item, which does not exists on the chunk btree.
+ * The finishing phase of device replace acquires both the
+ * device_list_mutex and the chunk_mutex, in that order, so we are
+ * safe by just acquiring the chunk_mutex.
+ */
+ trans->removing_chunk = true;
+ mutex_lock(&fs_info->chunk_mutex);
+
+ check_system_chunk(trans, map->type);
+
+ ret = remove_chunk_item(trans, map, chunk_offset);
+ /*
+ * Normally we should not get -ENOSPC since we reserved space before
+ * through the call to check_system_chunk().
+ *
+ * Despite our system space_info having enough free space, we may not
+ * be able to allocate extents from its block groups, because all have
+ * an incompatible profile, which will force us to allocate a new system
+ * block group with the right profile, or right after we called
+ * check_system_space() above, a scrub turned the only system block group
+ * with enough free space into RO mode.
+ * This is explained with more detail at do_chunk_alloc().
+ *
+ * So if we get -ENOSPC, allocate a new system chunk and retry once.
+ */
+ if (ret == -ENOSPC) {
+ const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
+ struct btrfs_block_group *sys_bg;
+
+ sys_bg = btrfs_create_chunk(trans, sys_flags);
+ if (IS_ERR(sys_bg)) {
+ ret = PTR_ERR(sys_bg);
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ ret = remove_chunk_item(trans, map, chunk_offset);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ } else if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
+
+ if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
+ ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ }
+
+ mutex_unlock(&fs_info->chunk_mutex);
+ trans->removing_chunk = false;
+
+ /*
+ * We are done with chunk btree updates and deletions, so release the
+ * system space we previously reserved (with check_system_chunk()).
+ */
+ btrfs_trans_release_chunk_metadata(trans);
+
+ ret = btrfs_remove_block_group(trans, chunk_offset, em);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+out:
+ if (trans->removing_chunk) {
+ mutex_unlock(&fs_info->chunk_mutex);
+ trans->removing_chunk = false;
+ }
+ /* once for us */
+ free_extent_map(em);
+ return ret;
+}
+
+int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
+{
+ struct btrfs_root *root = fs_info->chunk_root;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_block_group *block_group;
+ u64 length;
+ int ret;
+
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info,
+ "relocate: not supported on extent tree v2 yet");
+ return -EINVAL;
+ }
+
+ /*
+ * Prevent races with automatic removal of unused block groups.
+ * After we relocate and before we remove the chunk with offset
+ * chunk_offset, automatic removal of the block group can kick in,
+ * resulting in a failure when calling btrfs_remove_chunk() below.
+ *
+ * Make sure to acquire this mutex before doing a tree search (dev
+ * or chunk trees) to find chunks. Otherwise the cleaner kthread might
+ * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
+ * we release the path used to search the chunk/dev tree and before
+ * the current task acquires this mutex and calls us.
+ */
+ lockdep_assert_held(&fs_info->reclaim_bgs_lock);
+
+ /* step one, relocate all the extents inside this chunk */
+ btrfs_scrub_pause(fs_info);
+ ret = btrfs_relocate_block_group(fs_info, chunk_offset);
+ btrfs_scrub_continue(fs_info);
+ if (ret) {
+ /*
+ * If we had a transaction abort, stop all running scrubs.
+ * See transaction.c:cleanup_transaction() why we do it here.
+ */
+ if (BTRFS_FS_ERROR(fs_info))
+ btrfs_scrub_cancel(fs_info);
+ return ret;
+ }
+
+ block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
+ if (!block_group)
+ return -ENOENT;
+ btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
+ length = block_group->length;
+ btrfs_put_block_group(block_group);
+
+ /*
+ * On a zoned file system, discard the whole block group, this will
+ * trigger a REQ_OP_ZONE_RESET operation on the device zone. If
+ * resetting the zone fails, don't treat it as a fatal problem from the
+ * filesystem's point of view.
+ */
+ if (btrfs_is_zoned(fs_info)) {
+ ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL);
+ if (ret)
+ btrfs_info(fs_info,
+ "failed to reset zone %llu after relocation",
+ chunk_offset);
+ }
+
+ trans = btrfs_start_trans_remove_block_group(root->fs_info,
+ chunk_offset);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ btrfs_handle_fs_error(root->fs_info, ret, NULL);
+ return ret;
+ }
+
+ /*
+ * step two, delete the device extents and the
+ * chunk tree entries
+ */
+ ret = btrfs_remove_chunk(trans, chunk_offset);
+ btrfs_end_transaction(trans);
+ return ret;
+}
+
+static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *chunk_root = fs_info->chunk_root;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_chunk *chunk;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ u64 chunk_type;
+ bool retried = false;
+ int failed = 0;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+again:
+ key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+ key.offset = (u64)-1;
+ key.type = BTRFS_CHUNK_ITEM_KEY;
+
+ while (1) {
+ mutex_lock(&fs_info->reclaim_bgs_lock);
+ ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
+ if (ret < 0) {
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ goto error;
+ }
+ BUG_ON(ret == 0); /* Corruption */
+
+ ret = btrfs_previous_item(chunk_root, path, key.objectid,
+ key.type);
+ if (ret)
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ if (ret < 0)
+ goto error;
+ if (ret > 0)
+ break;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ chunk = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_chunk);
+ chunk_type = btrfs_chunk_type(leaf, chunk);
+ btrfs_release_path(path);
+
+ if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
+ ret = btrfs_relocate_chunk(fs_info, found_key.offset);
+ if (ret == -ENOSPC)
+ failed++;
+ else
+ BUG_ON(ret);
+ }
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+
+ if (found_key.offset == 0)
+ break;
+ key.offset = found_key.offset - 1;
+ }
+ ret = 0;
+ if (failed && !retried) {
+ failed = 0;
+ retried = true;
+ goto again;
+ } else if (WARN_ON(failed && retried)) {
+ ret = -ENOSPC;
+ }
+error:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * return 1 : allocate a data chunk successfully,
+ * return <0: errors during allocating a data chunk,
+ * return 0 : no need to allocate a data chunk.
+ */
+static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
+ u64 chunk_offset)
+{
+ struct btrfs_block_group *cache;
+ u64 bytes_used;
+ u64 chunk_type;
+
+ cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+ ASSERT(cache);
+ chunk_type = cache->flags;
+ btrfs_put_block_group(cache);
+
+ if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
+ return 0;
+
+ spin_lock(&fs_info->data_sinfo->lock);
+ bytes_used = fs_info->data_sinfo->bytes_used;
+ spin_unlock(&fs_info->data_sinfo->lock);
+
+ if (!bytes_used) {
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ trans = btrfs_join_transaction(fs_info->tree_root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
+ btrfs_end_transaction(trans);
+ if (ret < 0)
+ return ret;
+ return 1;
+ }
+
+ return 0;
+}
+
+static int insert_balance_item(struct btrfs_fs_info *fs_info,
+ struct btrfs_balance_control *bctl)
+{
+ struct btrfs_root *root = fs_info->tree_root;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_balance_item *item;
+ struct btrfs_disk_balance_args disk_bargs;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ int ret, err;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ btrfs_free_path(path);
+ return PTR_ERR(trans);
+ }
+
+ key.objectid = BTRFS_BALANCE_OBJECTID;
+ key.type = BTRFS_TEMPORARY_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ sizeof(*item));
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
+
+ memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
+
+ btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
+ btrfs_set_balance_data(leaf, item, &disk_bargs);
+ btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
+ btrfs_set_balance_meta(leaf, item, &disk_bargs);
+ btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
+ btrfs_set_balance_sys(leaf, item, &disk_bargs);
+
+ btrfs_set_balance_flags(leaf, item, bctl->flags);
+
+ btrfs_mark_buffer_dirty(leaf);
+out:
+ btrfs_free_path(path);
+ err = btrfs_commit_transaction(trans);
+ if (err && !ret)
+ ret = err;
+ return ret;
+}
+
+static int del_balance_item(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root = fs_info->tree_root;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret, err;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
+ if (IS_ERR(trans)) {
+ btrfs_free_path(path);
+ return PTR_ERR(trans);
+ }
+
+ key.objectid = BTRFS_BALANCE_OBJECTID;
+ key.type = BTRFS_TEMPORARY_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = btrfs_del_item(trans, root, path);
+out:
+ btrfs_free_path(path);
+ err = btrfs_commit_transaction(trans);
+ if (err && !ret)
+ ret = err;
+ return ret;
+}
+
+/*
+ * This is a heuristic used to reduce the number of chunks balanced on
+ * resume after balance was interrupted.
+ */
+static void update_balance_args(struct btrfs_balance_control *bctl)
+{
+ /*
+ * Turn on soft mode for chunk types that were being converted.
+ */
+ if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
+ bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
+ if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
+ bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
+ if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
+ bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
+
+ /*
+ * Turn on usage filter if is not already used. The idea is
+ * that chunks that we have already balanced should be
+ * reasonably full. Don't do it for chunks that are being
+ * converted - that will keep us from relocating unconverted
+ * (albeit full) chunks.
+ */
+ if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
+ !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+ bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
+ bctl->data.usage = 90;
+ }
+ if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
+ !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+ bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
+ bctl->sys.usage = 90;
+ }
+ if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
+ !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+ bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
+ bctl->meta.usage = 90;
+ }
+}
+
+/*
+ * Clear the balance status in fs_info and delete the balance item from disk.
+ */
+static void reset_balance_state(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+ int ret;
+
+ BUG_ON(!fs_info->balance_ctl);
+
+ spin_lock(&fs_info->balance_lock);
+ fs_info->balance_ctl = NULL;
+ spin_unlock(&fs_info->balance_lock);
+
+ kfree(bctl);
+ ret = del_balance_item(fs_info);
+ if (ret)
+ btrfs_handle_fs_error(fs_info, ret, NULL);
+}
+
+/*
+ * Balance filters. Return 1 if chunk should be filtered out
+ * (should not be balanced).
+ */
+static int chunk_profiles_filter(u64 chunk_type,
+ struct btrfs_balance_args *bargs)
+{
+ chunk_type = chunk_to_extended(chunk_type) &
+ BTRFS_EXTENDED_PROFILE_MASK;
+
+ if (bargs->profiles & chunk_type)
+ return 0;
+
+ return 1;
+}
+
+static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+ struct btrfs_balance_args *bargs)
+{
+ struct btrfs_block_group *cache;
+ u64 chunk_used;
+ u64 user_thresh_min;
+ u64 user_thresh_max;
+ int ret = 1;
+
+ cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+ chunk_used = cache->used;
+
+ if (bargs->usage_min == 0)
+ user_thresh_min = 0;
+ else
+ user_thresh_min = div_factor_fine(cache->length,
+ bargs->usage_min);
+
+ if (bargs->usage_max == 0)
+ user_thresh_max = 1;
+ else if (bargs->usage_max > 100)
+ user_thresh_max = cache->length;
+ else
+ user_thresh_max = div_factor_fine(cache->length,
+ bargs->usage_max);
+
+ if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
+ ret = 0;
+
+ btrfs_put_block_group(cache);
+ return ret;
+}
+
+static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
+ u64 chunk_offset, struct btrfs_balance_args *bargs)
+{
+ struct btrfs_block_group *cache;
+ u64 chunk_used, user_thresh;
+ int ret = 1;
+
+ cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+ chunk_used = cache->used;
+
+ if (bargs->usage_min == 0)
+ user_thresh = 1;
+ else if (bargs->usage > 100)
+ user_thresh = cache->length;
+ else
+ user_thresh = div_factor_fine(cache->length, bargs->usage);
+
+ if (chunk_used < user_thresh)
+ ret = 0;
+
+ btrfs_put_block_group(cache);
+ return ret;
+}
+
+static int chunk_devid_filter(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk,
+ struct btrfs_balance_args *bargs)
+{
+ struct btrfs_stripe *stripe;
+ int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+ int i;
+
+ for (i = 0; i < num_stripes; i++) {
+ stripe = btrfs_stripe_nr(chunk, i);
+ if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
+ return 0;
+ }
+
+ return 1;
+}
+
+static u64 calc_data_stripes(u64 type, int num_stripes)
+{
+ const int index = btrfs_bg_flags_to_raid_index(type);
+ const int ncopies = btrfs_raid_array[index].ncopies;
+ const int nparity = btrfs_raid_array[index].nparity;
+
+ return (num_stripes - nparity) / ncopies;
+}
+
+/* [pstart, pend) */
+static int chunk_drange_filter(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk,
+ struct btrfs_balance_args *bargs)
+{
+ struct btrfs_stripe *stripe;
+ int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+ u64 stripe_offset;
+ u64 stripe_length;
+ u64 type;
+ int factor;
+ int i;
+
+ if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
+ return 0;
+
+ type = btrfs_chunk_type(leaf, chunk);
+ factor = calc_data_stripes(type, num_stripes);
+
+ for (i = 0; i < num_stripes; i++) {
+ stripe = btrfs_stripe_nr(chunk, i);
+ if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
+ continue;
+
+ stripe_offset = btrfs_stripe_offset(leaf, stripe);
+ stripe_length = btrfs_chunk_length(leaf, chunk);
+ stripe_length = div_u64(stripe_length, factor);
+
+ if (stripe_offset < bargs->pend &&
+ stripe_offset + stripe_length > bargs->pstart)
+ return 0;
+ }
+
+ return 1;
+}
+
+/* [vstart, vend) */
+static int chunk_vrange_filter(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk,
+ u64 chunk_offset,
+ struct btrfs_balance_args *bargs)
+{
+ if (chunk_offset < bargs->vend &&
+ chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
+ /* at least part of the chunk is inside this vrange */
+ return 0;
+
+ return 1;
+}
+
+static int chunk_stripes_range_filter(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk,
+ struct btrfs_balance_args *bargs)
+{
+ int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+
+ if (bargs->stripes_min <= num_stripes
+ && num_stripes <= bargs->stripes_max)
+ return 0;
+
+ return 1;
+}
+
+static int chunk_soft_convert_filter(u64 chunk_type,
+ struct btrfs_balance_args *bargs)
+{
+ if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
+ return 0;
+
+ chunk_type = chunk_to_extended(chunk_type) &
+ BTRFS_EXTENDED_PROFILE_MASK;
+
+ if (bargs->target == chunk_type)
+ return 1;
+
+ return 0;
+}
+
+static int should_balance_chunk(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk, u64 chunk_offset)
+{
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+ struct btrfs_balance_args *bargs = NULL;
+ u64 chunk_type = btrfs_chunk_type(leaf, chunk);
+
+ /* type filter */
+ if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
+ (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
+ return 0;
+ }
+
+ if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
+ bargs = &bctl->data;
+ else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
+ bargs = &bctl->sys;
+ else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
+ bargs = &bctl->meta;
+
+ /* profiles filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
+ chunk_profiles_filter(chunk_type, bargs)) {
+ return 0;
+ }
+
+ /* usage filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ chunk_usage_filter(fs_info, chunk_offset, bargs)) {
+ return 0;
+ } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
+ chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
+ return 0;
+ }
+
+ /* devid filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
+ chunk_devid_filter(leaf, chunk, bargs)) {
+ return 0;
+ }
+
+ /* drange filter, makes sense only with devid filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
+ chunk_drange_filter(leaf, chunk, bargs)) {
+ return 0;
+ }
+
+ /* vrange filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
+ chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
+ return 0;
+ }
+
+ /* stripes filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
+ chunk_stripes_range_filter(leaf, chunk, bargs)) {
+ return 0;
+ }
+
+ /* soft profile changing mode */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
+ chunk_soft_convert_filter(chunk_type, bargs)) {
+ return 0;
+ }
+
+ /*
+ * limited by count, must be the last filter
+ */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
+ if (bargs->limit == 0)
+ return 0;
+ else
+ bargs->limit--;
+ } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
+ /*
+ * Same logic as the 'limit' filter; the minimum cannot be
+ * determined here because we do not have the global information
+ * about the count of all chunks that satisfy the filters.
+ */
+ if (bargs->limit_max == 0)
+ return 0;
+ else
+ bargs->limit_max--;
+ }
+
+ return 1;
+}
+
+static int __btrfs_balance(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+ struct btrfs_root *chunk_root = fs_info->chunk_root;
+ u64 chunk_type;
+ struct btrfs_chunk *chunk;
+ struct btrfs_path *path = NULL;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct extent_buffer *leaf;
+ int slot;
+ int ret;
+ int enospc_errors = 0;
+ bool counting = true;
+ /* The single value limit and min/max limits use the same bytes in the */
+ u64 limit_data = bctl->data.limit;
+ u64 limit_meta = bctl->meta.limit;
+ u64 limit_sys = bctl->sys.limit;
+ u32 count_data = 0;
+ u32 count_meta = 0;
+ u32 count_sys = 0;
+ int chunk_reserved = 0;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ /* zero out stat counters */
+ spin_lock(&fs_info->balance_lock);
+ memset(&bctl->stat, 0, sizeof(bctl->stat));
+ spin_unlock(&fs_info->balance_lock);
+again:
+ if (!counting) {
+ /*
+ * The single value limit and min/max limits use the same bytes
+ * in the
+ */
+ bctl->data.limit = limit_data;
+ bctl->meta.limit = limit_meta;
+ bctl->sys.limit = limit_sys;
+ }
+ key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+ key.offset = (u64)-1;
+ key.type = BTRFS_CHUNK_ITEM_KEY;
+
+ while (1) {
+ if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
+ atomic_read(&fs_info->balance_cancel_req)) {
+ ret = -ECANCELED;
+ goto error;
+ }
+
+ mutex_lock(&fs_info->reclaim_bgs_lock);
+ ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
+ if (ret < 0) {
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ goto error;
+ }
+
+ /*
+ * this shouldn't happen, it means the last relocate
+ * failed
+ */
+ if (ret == 0)
+ BUG(); /* FIXME break ? */
+
+ ret = btrfs_previous_item(chunk_root, path, 0,
+ BTRFS_CHUNK_ITEM_KEY);
+ if (ret) {
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ ret = 0;
+ break;
+ }
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+ if (found_key.objectid != key.objectid) {
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ break;
+ }
+
+ chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+ chunk_type = btrfs_chunk_type(leaf, chunk);
+
+ if (!counting) {
+ spin_lock(&fs_info->balance_lock);
+ bctl->stat.considered++;
+ spin_unlock(&fs_info->balance_lock);
+ }
+
+ ret = should_balance_chunk(leaf, chunk, found_key.offset);
+
+ btrfs_release_path(path);
+ if (!ret) {
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ goto loop;
+ }
+
+ if (counting) {
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ spin_lock(&fs_info->balance_lock);
+ bctl->stat.expected++;
+ spin_unlock(&fs_info->balance_lock);
+
+ if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
+ count_data++;
+ else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
+ count_sys++;
+ else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
+ count_meta++;
+
+ goto loop;
+ }
+
+ /*
+ * Apply limit_min filter, no need to check if the LIMITS
+ * filter is used, limit_min is 0 by default
+ */
+ if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
+ count_data < bctl->data.limit_min)
+ || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
+ count_meta < bctl->meta.limit_min)
+ || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
+ count_sys < bctl->sys.limit_min)) {
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ goto loop;
+ }
+
+ if (!chunk_reserved) {
+ /*
+ * We may be relocating the only data chunk we have,
+ * which could potentially end up with losing data's
+ * raid profile, so lets allocate an empty one in
+ * advance.
+ */
+ ret = btrfs_may_alloc_data_chunk(fs_info,
+ found_key.offset);
+ if (ret < 0) {
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ goto error;
+ } else if (ret == 1) {
+ chunk_reserved = 1;
+ }
+ }
+
+ ret = btrfs_relocate_chunk(fs_info, found_key.offset);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ if (ret == -ENOSPC) {
+ enospc_errors++;
+ } else if (ret == -ETXTBSY) {
+ btrfs_info(fs_info,
+ "skipping relocation of block group %llu due to active swapfile",
+ found_key.offset);
+ ret = 0;
+ } else if (ret) {
+ goto error;
+ } else {
+ spin_lock(&fs_info->balance_lock);
+ bctl->stat.completed++;
+ spin_unlock(&fs_info->balance_lock);
+ }
+loop:
+ if (found_key.offset == 0)
+ break;
+ key.offset = found_key.offset - 1;
+ }
+
+ if (counting) {
+ btrfs_release_path(path);
+ counting = false;
+ goto again;
+ }
+error:
+ btrfs_free_path(path);
+ if (enospc_errors) {
+ btrfs_info(fs_info, "%d enospc errors during balance",
+ enospc_errors);
+ if (!ret)
+ ret = -ENOSPC;
+ }
+
+ return ret;
+}
+
+/**
+ * alloc_profile_is_valid - see if a given profile is valid and reduced
+ * @flags: profile to validate
+ * @extended: if true @flags is treated as an extended profile
+ */
+static int alloc_profile_is_valid(u64 flags, int extended)
+{
+ u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
+ BTRFS_BLOCK_GROUP_PROFILE_MASK);
+
+ flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
+
+ /* 1) check that all other bits are zeroed */
+ if (flags & ~mask)
+ return 0;
+
+ /* 2) see if profile is reduced */
+ if (flags == 0)
+ return !extended; /* "0" is valid for usual profiles */
+
+ return has_single_bit_set(flags);
+}
+
+/*
+ * Validate target profile against allowed profiles and return true if it's OK.
+ * Otherwise print the error message and return false.
+ */
+static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
+ const struct btrfs_balance_args *bargs,
+ u64 allowed, const char *type)
+{
+ if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
+ return true;
+
+ /* Profile is valid and does not have bits outside of the allowed set */
+ if (alloc_profile_is_valid(bargs->target, 1) &&
+ (bargs->target & ~allowed) == 0)
+ return true;
+
+ btrfs_err(fs_info, "balance: invalid convert %s profile %s",
+ type, btrfs_bg_type_to_raid_name(bargs->target));
+ return false;
+}
+
+/*
+ * Fill @buf with textual description of balance filter flags @bargs, up to
+ * @size_buf including the terminating null. The output may be trimmed if it
+ * does not fit into the provided buffer.
+ */
+static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
+ u32 size_buf)
+{
+ int ret;
+ u32 size_bp = size_buf;
+ char *bp = buf;
+ u64 flags = bargs->flags;
+ char tmp_buf[128] = {'\0'};
+
+ if (!flags)
+ return;
+
+#define CHECK_APPEND_NOARG(a) \
+ do { \
+ ret = snprintf(bp, size_bp, (a)); \
+ if (ret < 0 || ret >= size_bp) \
+ goto out_overflow; \
+ size_bp -= ret; \
+ bp += ret; \
+ } while (0)
+
+#define CHECK_APPEND_1ARG(a, v1) \
+ do { \
+ ret = snprintf(bp, size_bp, (a), (v1)); \
+ if (ret < 0 || ret >= size_bp) \
+ goto out_overflow; \
+ size_bp -= ret; \
+ bp += ret; \
+ } while (0)
+
+#define CHECK_APPEND_2ARG(a, v1, v2) \
+ do { \
+ ret = snprintf(bp, size_bp, (a), (v1), (v2)); \
+ if (ret < 0 || ret >= size_bp) \
+ goto out_overflow; \
+ size_bp -= ret; \
+ bp += ret; \
+ } while (0)
+
+ if (flags & BTRFS_BALANCE_ARGS_CONVERT)
+ CHECK_APPEND_1ARG("convert=%s,",
+ btrfs_bg_type_to_raid_name(bargs->target));
+
+ if (flags & BTRFS_BALANCE_ARGS_SOFT)
+ CHECK_APPEND_NOARG("soft,");
+
+ if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
+ btrfs_describe_block_groups(bargs->profiles, tmp_buf,
+ sizeof(tmp_buf));
+ CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
+ }
+
+ if (flags & BTRFS_BALANCE_ARGS_USAGE)
+ CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
+
+ if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
+ CHECK_APPEND_2ARG("usage=%u..%u,",
+ bargs->usage_min, bargs->usage_max);
+
+ if (flags & BTRFS_BALANCE_ARGS_DEVID)
+ CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
+
+ if (flags & BTRFS_BALANCE_ARGS_DRANGE)
+ CHECK_APPEND_2ARG("drange=%llu..%llu,",
+ bargs->pstart, bargs->pend);
+
+ if (flags & BTRFS_BALANCE_ARGS_VRANGE)
+ CHECK_APPEND_2ARG("vrange=%llu..%llu,",
+ bargs->vstart, bargs->vend);
+
+ if (flags & BTRFS_BALANCE_ARGS_LIMIT)
+ CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
+
+ if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
+ CHECK_APPEND_2ARG("limit=%u..%u,",
+ bargs->limit_min, bargs->limit_max);
+
+ if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
+ CHECK_APPEND_2ARG("stripes=%u..%u,",
+ bargs->stripes_min, bargs->stripes_max);
+
+#undef CHECK_APPEND_2ARG
+#undef CHECK_APPEND_1ARG
+#undef CHECK_APPEND_NOARG
+
+out_overflow:
+
+ if (size_bp < size_buf)
+ buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
+ else
+ buf[0] = '\0';
+}
+
+static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
+{
+ u32 size_buf = 1024;
+ char tmp_buf[192] = {'\0'};
+ char *buf;
+ char *bp;
+ u32 size_bp = size_buf;
+ int ret;
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+
+ buf = kzalloc(size_buf, GFP_KERNEL);
+ if (!buf)
+ return;
+
+ bp = buf;
+
+#define CHECK_APPEND_1ARG(a, v1) \
+ do { \
+ ret = snprintf(bp, size_bp, (a), (v1)); \
+ if (ret < 0 || ret >= size_bp) \
+ goto out_overflow; \
+ size_bp -= ret; \
+ bp += ret; \
+ } while (0)
+
+ if (bctl->flags & BTRFS_BALANCE_FORCE)
+ CHECK_APPEND_1ARG("%s", "-f ");
+
+ if (bctl->flags & BTRFS_BALANCE_DATA) {
+ describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
+ CHECK_APPEND_1ARG("-d%s ", tmp_buf);
+ }
+
+ if (bctl->flags & BTRFS_BALANCE_METADATA) {
+ describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
+ CHECK_APPEND_1ARG("-m%s ", tmp_buf);
+ }
+
+ if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
+ describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
+ CHECK_APPEND_1ARG("-s%s ", tmp_buf);
+ }
+
+#undef CHECK_APPEND_1ARG
+
+out_overflow:
+
+ if (size_bp < size_buf)
+ buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
+ btrfs_info(fs_info, "balance: %s %s",
+ (bctl->flags & BTRFS_BALANCE_RESUME) ?
+ "resume" : "start", buf);
+
+ kfree(buf);
+}
+
+/*
+ * Should be called with balance mutexe held
+ */
+int btrfs_balance(struct btrfs_fs_info *fs_info,
+ struct btrfs_balance_control *bctl,
+ struct btrfs_ioctl_balance_args *bargs)
+{
+ u64 meta_target, data_target;
+ u64 allowed;
+ int mixed = 0;
+ int ret;
+ u64 num_devices;
+ unsigned seq;
+ bool reducing_redundancy;
+ bool paused = false;
+ int i;
+
+ if (btrfs_fs_closing(fs_info) ||
+ atomic_read(&fs_info->balance_pause_req) ||
+ btrfs_should_cancel_balance(fs_info)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ allowed = btrfs_super_incompat_flags(fs_info->super_copy);
+ if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+ mixed = 1;
+
+ /*
+ * In case of mixed groups both data and meta should be picked,
+ * and identical options should be given for both of them.
+ */
+ allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
+ if (mixed && (bctl->flags & allowed)) {
+ if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
+ !(bctl->flags & BTRFS_BALANCE_METADATA) ||
+ memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
+ btrfs_err(fs_info,
+ "balance: mixed groups data and metadata options must be the same");
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ /*
+ * rw_devices will not change at the moment, device add/delete/replace
+ * are exclusive
+ */
+ num_devices = fs_info->fs_devices->rw_devices;
+
+ /*
+ * SINGLE profile on-disk has no profile bit, but in-memory we have a
+ * special bit for it, to make it easier to distinguish. Thus we need
+ * to set it manually, or balance would refuse the profile.
+ */
+ allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+ for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
+ if (num_devices >= btrfs_raid_array[i].devs_min)
+ allowed |= btrfs_raid_array[i].bg_flag;
+
+ if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
+ !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
+ !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Allow to reduce metadata or system integrity only if force set for
+ * profiles with redundancy (copies, parity)
+ */
+ allowed = 0;
+ for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
+ if (btrfs_raid_array[i].ncopies >= 2 ||
+ btrfs_raid_array[i].tolerated_failures >= 1)
+ allowed |= btrfs_raid_array[i].bg_flag;
+ }
+ do {
+ seq = read_seqbegin(&fs_info->profiles_lock);
+
+ if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (fs_info->avail_system_alloc_bits & allowed) &&
+ !(bctl->sys.target & allowed)) ||
+ ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (fs_info->avail_metadata_alloc_bits & allowed) &&
+ !(bctl->meta.target & allowed)))
+ reducing_redundancy = true;
+ else
+ reducing_redundancy = false;
+
+ /* if we're not converting, the target field is uninitialized */
+ meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
+ bctl->meta.target : fs_info->avail_metadata_alloc_bits;
+ data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
+ bctl->data.target : fs_info->avail_data_alloc_bits;
+ } while (read_seqretry(&fs_info->profiles_lock, seq));
+
+ if (reducing_redundancy) {
+ if (bctl->flags & BTRFS_BALANCE_FORCE) {
+ btrfs_info(fs_info,
+ "balance: force reducing metadata redundancy");
+ } else {
+ btrfs_err(fs_info,
+ "balance: reduces metadata redundancy, use --force if you want this");
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
+ btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
+ btrfs_warn(fs_info,
+ "balance: metadata profile %s has lower redundancy than data profile %s",
+ btrfs_bg_type_to_raid_name(meta_target),
+ btrfs_bg_type_to_raid_name(data_target));
+ }
+
+ ret = insert_balance_item(fs_info, bctl);
+ if (ret && ret != -EEXIST)
+ goto out;
+
+ if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
+ BUG_ON(ret == -EEXIST);
+ BUG_ON(fs_info->balance_ctl);
+ spin_lock(&fs_info->balance_lock);
+ fs_info->balance_ctl = bctl;
+ spin_unlock(&fs_info->balance_lock);
+ } else {
+ BUG_ON(ret != -EEXIST);
+ spin_lock(&fs_info->balance_lock);
+ update_balance_args(bctl);
+ spin_unlock(&fs_info->balance_lock);
+ }
+
+ ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
+ set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
+ describe_balance_start_or_resume(fs_info);
+ mutex_unlock(&fs_info->balance_mutex);
+
+ ret = __btrfs_balance(fs_info);
+
+ mutex_lock(&fs_info->balance_mutex);
+ if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) {
+ btrfs_info(fs_info, "balance: paused");
+ btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED);
+ paused = true;
+ }
+ /*
+ * Balance can be canceled by:
+ *
+ * - Regular cancel request
+ * Then ret == -ECANCELED and balance_cancel_req > 0
+ *
+ * - Fatal signal to "btrfs" process
+ * Either the signal caught by wait_reserve_ticket() and callers
+ * got -EINTR, or caught by btrfs_should_cancel_balance() and
+ * got -ECANCELED.
+ * Either way, in this case balance_cancel_req = 0, and
+ * ret == -EINTR or ret == -ECANCELED.
+ *
+ * So here we only check the return value to catch canceled balance.
+ */
+ else if (ret == -ECANCELED || ret == -EINTR)
+ btrfs_info(fs_info, "balance: canceled");
+ else
+ btrfs_info(fs_info, "balance: ended with status: %d", ret);
+
+ clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
+
+ if (bargs) {
+ memset(bargs, 0, sizeof(*bargs));
+ btrfs_update_ioctl_balance_args(fs_info, bargs);
+ }
+
+ /* We didn't pause, we can clean everything up. */
+ if (!paused) {
+ reset_balance_state(fs_info);
+ btrfs_exclop_finish(fs_info);
+ }
+
+ wake_up(&fs_info->balance_wait_q);
+
+ return ret;
+out:
+ if (bctl->flags & BTRFS_BALANCE_RESUME)
+ reset_balance_state(fs_info);
+ else
+ kfree(bctl);
+ btrfs_exclop_finish(fs_info);
+
+ return ret;
+}
+
+static int balance_kthread(void *data)
+{
+ struct btrfs_fs_info *fs_info = data;
+ int ret = 0;
+
+ sb_start_write(fs_info->sb);
+ mutex_lock(&fs_info->balance_mutex);
+ if (fs_info->balance_ctl)
+ ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
+ mutex_unlock(&fs_info->balance_mutex);
+ sb_end_write(fs_info->sb);
+
+ return ret;
+}
+
+int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
+{
+ struct task_struct *tsk;
+
+ mutex_lock(&fs_info->balance_mutex);
+ if (!fs_info->balance_ctl) {
+ mutex_unlock(&fs_info->balance_mutex);
+ return 0;
+ }
+ mutex_unlock(&fs_info->balance_mutex);
+
+ if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
+ btrfs_info(fs_info, "balance: resume skipped");
+ return 0;
+ }
+
+ spin_lock(&fs_info->super_lock);
+ ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
+ fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
+ spin_unlock(&fs_info->super_lock);
+ /*
+ * A ro->rw remount sequence should continue with the paused balance
+ * regardless of who pauses it, system or the user as of now, so set
+ * the resume flag.
+ */
+ spin_lock(&fs_info->balance_lock);
+ fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
+ spin_unlock(&fs_info->balance_lock);
+
+ tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
+ return PTR_ERR_OR_ZERO(tsk);
+}
+
+int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_balance_control *bctl;
+ struct btrfs_balance_item *item;
+ struct btrfs_disk_balance_args disk_bargs;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = BTRFS_BALANCE_OBJECTID;
+ key.type = BTRFS_TEMPORARY_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) { /* ret = -ENOENT; */
+ ret = 0;
+ goto out;
+ }
+
+ bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+ if (!bctl) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
+
+ bctl->flags = btrfs_balance_flags(leaf, item);
+ bctl->flags |= BTRFS_BALANCE_RESUME;
+
+ btrfs_balance_data(leaf, item, &disk_bargs);
+ btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
+ btrfs_balance_meta(leaf, item, &disk_bargs);
+ btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
+ btrfs_balance_sys(leaf, item, &disk_bargs);
+ btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
+
+ /*
+ * This should never happen, as the paused balance state is recovered
+ * during mount without any chance of other exclusive ops to collide.
+ *
+ * This gives the exclusive op status to balance and keeps in paused
+ * state until user intervention (cancel or umount). If the ownership
+ * cannot be assigned, show a message but do not fail. The balance
+ * is in a paused state and must have fs_info::balance_ctl properly
+ * set up.
+ */
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED))
+ btrfs_warn(fs_info,
+ "balance: cannot set exclusive op status, resume manually");
+
+ btrfs_release_path(path);
+
+ mutex_lock(&fs_info->balance_mutex);
+ BUG_ON(fs_info->balance_ctl);
+ spin_lock(&fs_info->balance_lock);
+ fs_info->balance_ctl = bctl;
+ spin_unlock(&fs_info->balance_lock);
+ mutex_unlock(&fs_info->balance_mutex);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
+{
+ int ret = 0;
+
+ mutex_lock(&fs_info->balance_mutex);
+ if (!fs_info->balance_ctl) {
+ mutex_unlock(&fs_info->balance_mutex);
+ return -ENOTCONN;
+ }
+
+ if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
+ atomic_inc(&fs_info->balance_pause_req);
+ mutex_unlock(&fs_info->balance_mutex);
+
+ wait_event(fs_info->balance_wait_q,
+ !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
+
+ mutex_lock(&fs_info->balance_mutex);
+ /* we are good with balance_ctl ripped off from under us */
+ BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
+ atomic_dec(&fs_info->balance_pause_req);
+ } else {
+ ret = -ENOTCONN;
+ }
+
+ mutex_unlock(&fs_info->balance_mutex);
+ return ret;
+}
+
+int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
+{
+ mutex_lock(&fs_info->balance_mutex);
+ if (!fs_info->balance_ctl) {
+ mutex_unlock(&fs_info->balance_mutex);
+ return -ENOTCONN;
+ }
+
+ /*
+ * A paused balance with the item stored on disk can be resumed at
+ * mount time if the mount is read-write. Otherwise it's still paused
+ * and we must not allow cancelling as it deletes the item.
+ */
+ if (sb_rdonly(fs_info->sb)) {
+ mutex_unlock(&fs_info->balance_mutex);
+ return -EROFS;
+ }
+
+ atomic_inc(&fs_info->balance_cancel_req);
+ /*
+ * if we are running just wait and return, balance item is
+ * deleted in btrfs_balance in this case
+ */
+ if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
+ mutex_unlock(&fs_info->balance_mutex);
+ wait_event(fs_info->balance_wait_q,
+ !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
+ mutex_lock(&fs_info->balance_mutex);
+ } else {
+ mutex_unlock(&fs_info->balance_mutex);
+ /*
+ * Lock released to allow other waiters to continue, we'll
+ * reexamine the status again.
+ */
+ mutex_lock(&fs_info->balance_mutex);
+
+ if (fs_info->balance_ctl) {
+ reset_balance_state(fs_info);
+ btrfs_exclop_finish(fs_info);
+ btrfs_info(fs_info, "balance: canceled");
+ }
+ }
+
+ ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
+ atomic_dec(&fs_info->balance_cancel_req);
+ mutex_unlock(&fs_info->balance_mutex);
+ return 0;
+}
+
+int btrfs_uuid_scan_kthread(void *data)
+{
+ struct btrfs_fs_info *fs_info = data;
+ struct btrfs_root *root = fs_info->tree_root;
+ struct btrfs_key key;
+ struct btrfs_path *path = NULL;
+ int ret = 0;
+ struct extent_buffer *eb;
+ int slot;
+ struct btrfs_root_item root_item;
+ u32 item_size;
+ struct btrfs_trans_handle *trans = NULL;
+ bool closing = false;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ key.objectid = 0;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = 0;
+
+ while (1) {
+ if (btrfs_fs_closing(fs_info)) {
+ closing = true;
+ break;
+ }
+ ret = btrfs_search_forward(root, &key, path,
+ BTRFS_OLDEST_GENERATION);
+ if (ret) {
+ if (ret > 0)
+ ret = 0;
+ break;
+ }
+
+ if (key.type != BTRFS_ROOT_ITEM_KEY ||
+ (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
+ key.objectid != BTRFS_FS_TREE_OBJECTID) ||
+ key.objectid > BTRFS_LAST_FREE_OBJECTID)
+ goto skip;
+
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ item_size = btrfs_item_size(eb, slot);
+ if (item_size < sizeof(root_item))
+ goto skip;
+
+ read_extent_buffer(eb, &root_item,
+ btrfs_item_ptr_offset(eb, slot),
+ (int)sizeof(root_item));
+ if (btrfs_root_refs(&root_item) == 0)
+ goto skip;
+
+ if (!btrfs_is_empty_uuid(root_item.uuid) ||
+ !btrfs_is_empty_uuid(root_item.received_uuid)) {
+ if (trans)
+ goto update_tree;
+
+ btrfs_release_path(path);
+ /*
+ * 1 - subvol uuid item
+ * 1 - received_subvol uuid item
+ */
+ trans = btrfs_start_transaction(fs_info->uuid_root, 2);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+ continue;
+ } else {
+ goto skip;
+ }
+update_tree:
+ btrfs_release_path(path);
+ if (!btrfs_is_empty_uuid(root_item.uuid)) {
+ ret = btrfs_uuid_tree_add(trans, root_item.uuid,
+ BTRFS_UUID_KEY_SUBVOL,
+ key.objectid);
+ if (ret < 0) {
+ btrfs_warn(fs_info, "uuid_tree_add failed %d",
+ ret);
+ break;
+ }
+ }
+
+ if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
+ ret = btrfs_uuid_tree_add(trans,
+ root_item.received_uuid,
+ BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+ key.objectid);
+ if (ret < 0) {
+ btrfs_warn(fs_info, "uuid_tree_add failed %d",
+ ret);
+ break;
+ }
+ }
+
+skip:
+ btrfs_release_path(path);
+ if (trans) {
+ ret = btrfs_end_transaction(trans);
+ trans = NULL;
+ if (ret)
+ break;
+ }
+
+ if (key.offset < (u64)-1) {
+ key.offset++;
+ } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
+ key.offset = 0;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ } else if (key.objectid < (u64)-1) {
+ key.offset = 0;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.objectid++;
+ } else {
+ break;
+ }
+ cond_resched();
+ }
+
+out:
+ btrfs_free_path(path);
+ if (trans && !IS_ERR(trans))
+ btrfs_end_transaction(trans);
+ if (ret)
+ btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
+ else if (!closing)
+ set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
+ up(&fs_info->uuid_tree_rescan_sem);
+ return 0;
+}
+
+int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_root *uuid_root;
+ struct task_struct *task;
+ int ret;
+
+ /*
+ * 1 - root node
+ * 1 - root item
+ */
+ trans = btrfs_start_transaction(tree_root, 2);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
+ if (IS_ERR(uuid_root)) {
+ ret = PTR_ERR(uuid_root);
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ return ret;
+ }
+
+ fs_info->uuid_root = uuid_root;
+
+ ret = btrfs_commit_transaction(trans);
+ if (ret)
+ return ret;
+
+ down(&fs_info->uuid_tree_rescan_sem);
+ task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
+ if (IS_ERR(task)) {
+ /* fs_info->update_uuid_tree_gen remains 0 in all error case */
+ btrfs_warn(fs_info, "failed to start uuid_scan task");
+ up(&fs_info->uuid_tree_rescan_sem);
+ return PTR_ERR(task);
+ }
+
+ return 0;
+}
+
+/*
+ * shrinking a device means finding all of the device extents past
+ * the new size, and then following the back refs to the chunks.
+ * The chunk relocation code actually frees the device extent
+ */
+int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
+{
+ struct btrfs_fs_info *fs_info = device->fs_info;
+ struct btrfs_root *root = fs_info->dev_root;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_dev_extent *dev_extent = NULL;
+ struct btrfs_path *path;
+ u64 length;
+ u64 chunk_offset;
+ int ret;
+ int slot;
+ int failed = 0;
+ bool retried = false;
+ struct extent_buffer *l;
+ struct btrfs_key key;
+ struct btrfs_super_block *super_copy = fs_info->super_copy;
+ u64 old_total = btrfs_super_total_bytes(super_copy);
+ u64 old_size = btrfs_device_get_total_bytes(device);
+ u64 diff;
+ u64 start;
+
+ new_size = round_down(new_size, fs_info->sectorsize);
+ start = new_size;
+ diff = round_down(old_size - new_size, fs_info->sectorsize);
+
+ if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
+ return -EINVAL;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->reada = READA_BACK;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ btrfs_free_path(path);
+ return PTR_ERR(trans);
+ }
+
+ mutex_lock(&fs_info->chunk_mutex);
+
+ btrfs_device_set_total_bytes(device, new_size);
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
+ device->fs_devices->total_rw_bytes -= diff;
+ atomic64_sub(diff, &fs_info->free_chunk_space);
+ }
+
+ /*
+ * Once the device's size has been set to the new size, ensure all
+ * in-memory chunks are synced to disk so that the loop below sees them
+ * and relocates them accordingly.
+ */
+ if (contains_pending_extent(device, &start, diff)) {
+ mutex_unlock(&fs_info->chunk_mutex);
+ ret = btrfs_commit_transaction(trans);
+ if (ret)
+ goto done;
+ } else {
+ mutex_unlock(&fs_info->chunk_mutex);
+ btrfs_end_transaction(trans);
+ }
+
+again:
+ key.objectid = device->devid;
+ key.offset = (u64)-1;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+
+ do {
+ mutex_lock(&fs_info->reclaim_bgs_lock);
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ goto done;
+ }
+
+ ret = btrfs_previous_item(root, path, 0, key.type);
+ if (ret) {
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ if (ret < 0)
+ goto done;
+ ret = 0;
+ btrfs_release_path(path);
+ break;
+ }
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(l, &key, path->slots[0]);
+
+ if (key.objectid != device->devid) {
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ btrfs_release_path(path);
+ break;
+ }
+
+ dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+ length = btrfs_dev_extent_length(l, dev_extent);
+
+ if (key.offset + length <= new_size) {
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ btrfs_release_path(path);
+ break;
+ }
+
+ chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
+ btrfs_release_path(path);
+
+ /*
+ * We may be relocating the only data chunk we have,
+ * which could potentially end up with losing data's
+ * raid profile, so lets allocate an empty one in
+ * advance.
+ */
+ ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
+ if (ret < 0) {
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ goto done;
+ }
+
+ ret = btrfs_relocate_chunk(fs_info, chunk_offset);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ if (ret == -ENOSPC) {
+ failed++;
+ } else if (ret) {
+ if (ret == -ETXTBSY) {
+ btrfs_warn(fs_info,
+ "could not shrink block group %llu due to active swapfile",
+ chunk_offset);
+ }
+ goto done;
+ }
+ } while (key.offset-- > 0);
+
+ if (failed && !retried) {
+ failed = 0;
+ retried = true;
+ goto again;
+ } else if (failed && retried) {
+ ret = -ENOSPC;
+ goto done;
+ }
+
+ /* Shrinking succeeded, else we would be at "done". */
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto done;
+ }
+
+ mutex_lock(&fs_info->chunk_mutex);
+ /* Clear all state bits beyond the shrunk device size */
+ clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
+ CHUNK_STATE_MASK);
+
+ btrfs_device_set_disk_total_bytes(device, new_size);
+ if (list_empty(&device->post_commit_list))
+ list_add_tail(&device->post_commit_list,
+ &trans->transaction->dev_update_list);
+
+ WARN_ON(diff > old_total);
+ btrfs_set_super_total_bytes(super_copy,
+ round_down(old_total - diff, fs_info->sectorsize));
+ mutex_unlock(&fs_info->chunk_mutex);
+
+ btrfs_reserve_chunk_metadata(trans, false);
+ /* Now btrfs_update_device() will change the on-disk size. */
+ ret = btrfs_update_device(trans, device);
+ btrfs_trans_release_chunk_metadata(trans);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ } else {
+ ret = btrfs_commit_transaction(trans);
+ }
+done:
+ btrfs_free_path(path);
+ if (ret) {
+ mutex_lock(&fs_info->chunk_mutex);
+ btrfs_device_set_total_bytes(device, old_size);
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
+ device->fs_devices->total_rw_bytes += diff;
+ atomic64_add(diff, &fs_info->free_chunk_space);
+ mutex_unlock(&fs_info->chunk_mutex);
+ }
+ return ret;
+}
+
+static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
+ struct btrfs_key *key,
+ struct btrfs_chunk *chunk, int item_size)
+{
+ struct btrfs_super_block *super_copy = fs_info->super_copy;
+ struct btrfs_disk_key disk_key;
+ u32 array_size;
+ u8 *ptr;
+
+ lockdep_assert_held(&fs_info->chunk_mutex);
+
+ array_size = btrfs_super_sys_array_size(super_copy);
+ if (array_size + item_size + sizeof(disk_key)
+ > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
+ return -EFBIG;
+
+ ptr = super_copy->sys_chunk_array + array_size;
+ btrfs_cpu_key_to_disk(&disk_key, key);
+ memcpy(ptr, &disk_key, sizeof(disk_key));
+ ptr += sizeof(disk_key);
+ memcpy(ptr, chunk, item_size);
+ item_size += sizeof(disk_key);
+ btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
+
+ return 0;
+}
+
+/*
+ * sort the devices in descending order by max_avail, total_avail
+ */
+static int btrfs_cmp_device_info(const void *a, const void *b)
+{
+ const struct btrfs_device_info *di_a = a;
+ const struct btrfs_device_info *di_b = b;
+
+ if (di_a->max_avail > di_b->max_avail)
+ return -1;
+ if (di_a->max_avail < di_b->max_avail)
+ return 1;
+ if (di_a->total_avail > di_b->total_avail)
+ return -1;
+ if (di_a->total_avail < di_b->total_avail)
+ return 1;
+ return 0;
+}
+
+static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
+{
+ if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
+ return;
+
+ btrfs_set_fs_incompat(info, RAID56);
+}
+
+static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
+{
+ if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
+ return;
+
+ btrfs_set_fs_incompat(info, RAID1C34);
+}
+
+/*
+ * Structure used internally for btrfs_create_chunk() function.
+ * Wraps needed parameters.
+ */
+struct alloc_chunk_ctl {
+ u64 start;
+ u64 type;
+ /* Total number of stripes to allocate */
+ int num_stripes;
+ /* sub_stripes info for map */
+ int sub_stripes;
+ /* Stripes per device */
+ int dev_stripes;
+ /* Maximum number of devices to use */
+ int devs_max;
+ /* Minimum number of devices to use */
+ int devs_min;
+ /* ndevs has to be a multiple of this */
+ int devs_increment;
+ /* Number of copies */
+ int ncopies;
+ /* Number of stripes worth of bytes to store parity information */
+ int nparity;
+ u64 max_stripe_size;
+ u64 max_chunk_size;
+ u64 dev_extent_min;
+ u64 stripe_size;
+ u64 chunk_size;
+ int ndevs;
+};
+
+static void init_alloc_chunk_ctl_policy_regular(
+ struct btrfs_fs_devices *fs_devices,
+ struct alloc_chunk_ctl *ctl)
+{
+ struct btrfs_space_info *space_info;
+
+ space_info = btrfs_find_space_info(fs_devices->fs_info, ctl->type);
+ ASSERT(space_info);
+
+ ctl->max_chunk_size = READ_ONCE(space_info->chunk_size);
+ ctl->max_stripe_size = min_t(u64, ctl->max_chunk_size, SZ_1G);
+
+ if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM)
+ ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK);
+
+ /* We don't want a chunk larger than 10% of writable space */
+ ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
+ ctl->max_chunk_size);
+ ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
+}
+
+static void init_alloc_chunk_ctl_policy_zoned(
+ struct btrfs_fs_devices *fs_devices,
+ struct alloc_chunk_ctl *ctl)
+{
+ u64 zone_size = fs_devices->fs_info->zone_size;
+ u64 limit;
+ int min_num_stripes = ctl->devs_min * ctl->dev_stripes;
+ int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies;
+ u64 min_chunk_size = min_data_stripes * zone_size;
+ u64 type = ctl->type;
+
+ ctl->max_stripe_size = zone_size;
+ if (type & BTRFS_BLOCK_GROUP_DATA) {
+ ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE,
+ zone_size);
+ } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
+ ctl->max_chunk_size = ctl->max_stripe_size;
+ } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
+ ctl->max_chunk_size = 2 * ctl->max_stripe_size;
+ ctl->devs_max = min_t(int, ctl->devs_max,
+ BTRFS_MAX_DEVS_SYS_CHUNK);
+ } else {
+ BUG();
+ }
+
+ /* We don't want a chunk larger than 10% of writable space */
+ limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1),
+ zone_size),
+ min_chunk_size);
+ ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
+ ctl->dev_extent_min = zone_size * ctl->dev_stripes;
+}
+
+static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
+ struct alloc_chunk_ctl *ctl)
+{
+ int index = btrfs_bg_flags_to_raid_index(ctl->type);
+
+ ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
+ ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
+ ctl->devs_max = btrfs_raid_array[index].devs_max;
+ if (!ctl->devs_max)
+ ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
+ ctl->devs_min = btrfs_raid_array[index].devs_min;
+ ctl->devs_increment = btrfs_raid_array[index].devs_increment;
+ ctl->ncopies = btrfs_raid_array[index].ncopies;
+ ctl->nparity = btrfs_raid_array[index].nparity;
+ ctl->ndevs = 0;
+
+ switch (fs_devices->chunk_alloc_policy) {
+ case BTRFS_CHUNK_ALLOC_REGULAR:
+ init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
+ break;
+ case BTRFS_CHUNK_ALLOC_ZONED:
+ init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static int gather_device_info(struct btrfs_fs_devices *fs_devices,
+ struct alloc_chunk_ctl *ctl,
+ struct btrfs_device_info *devices_info)
+{
+ struct btrfs_fs_info *info = fs_devices->fs_info;
+ struct btrfs_device *device;
+ u64 total_avail;
+ u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
+ int ret;
+ int ndevs = 0;
+ u64 max_avail;
+ u64 dev_offset;
+
+ /*
+ * in the first pass through the devices list, we gather information
+ * about the available holes on each device.
+ */
+ list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
+ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
+ WARN(1, KERN_ERR
+ "BTRFS: read-only device in alloc_list\n");
+ continue;
+ }
+
+ if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+ &device->dev_state) ||
+ test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
+ continue;
+
+ if (device->total_bytes > device->bytes_used)
+ total_avail = device->total_bytes - device->bytes_used;
+ else
+ total_avail = 0;
+
+ /* If there is no space on this device, skip it. */
+ if (total_avail < ctl->dev_extent_min)
+ continue;
+
+ ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
+ &max_avail);
+ if (ret && ret != -ENOSPC)
+ return ret;
+
+ if (ret == 0)
+ max_avail = dev_extent_want;
+
+ if (max_avail < ctl->dev_extent_min) {
+ if (btrfs_test_opt(info, ENOSPC_DEBUG))
+ btrfs_debug(info,
+ "%s: devid %llu has no free space, have=%llu want=%llu",
+ __func__, device->devid, max_avail,
+ ctl->dev_extent_min);
+ continue;
+ }
+
+ if (ndevs == fs_devices->rw_devices) {
+ WARN(1, "%s: found more than %llu devices\n",
+ __func__, fs_devices->rw_devices);
+ break;
+ }
+ devices_info[ndevs].dev_offset = dev_offset;
+ devices_info[ndevs].max_avail = max_avail;
+ devices_info[ndevs].total_avail = total_avail;
+ devices_info[ndevs].dev = device;
+ ++ndevs;
+ }
+ ctl->ndevs = ndevs;
+
+ /*
+ * now sort the devices by hole size / available space
+ */
+ sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
+ btrfs_cmp_device_info, NULL);
+
+ return 0;
+}
+
+static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
+ struct btrfs_device_info *devices_info)
+{
+ /* Number of stripes that count for block group size */
+ int data_stripes;
+
+ /*
+ * The primary goal is to maximize the number of stripes, so use as
+ * many devices as possible, even if the stripes are not maximum sized.
+ *
+ * The DUP profile stores more than one stripe per device, the
+ * max_avail is the total size so we have to adjust.
+ */
+ ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
+ ctl->dev_stripes);
+ ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
+
+ /* This will have to be fixed for RAID1 and RAID10 over more drives */
+ data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
+
+ /*
+ * Use the number of data stripes to figure out how big this chunk is
+ * really going to be in terms of logical address space, and compare
+ * that answer with the max chunk size. If it's higher, we try to
+ * reduce stripe_size.
+ */
+ if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
+ /*
+ * Reduce stripe_size, round it up to a 16MB boundary again and
+ * then use it, unless it ends up being even bigger than the
+ * previous value we had already.
+ */
+ ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
+ data_stripes), SZ_16M),
+ ctl->stripe_size);
+ }
+
+ /* Stripe size should not go beyond 1G. */
+ ctl->stripe_size = min_t(u64, ctl->stripe_size, SZ_1G);
+
+ /* Align to BTRFS_STRIPE_LEN */
+ ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
+ ctl->chunk_size = ctl->stripe_size * data_stripes;
+
+ return 0;
+}
+
+static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
+ struct btrfs_device_info *devices_info)
+{
+ u64 zone_size = devices_info[0].dev->zone_info->zone_size;
+ /* Number of stripes that count for block group size */
+ int data_stripes;
+
+ /*
+ * It should hold because:
+ * dev_extent_min == dev_extent_want == zone_size * dev_stripes
+ */
+ ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);
+
+ ctl->stripe_size = zone_size;
+ ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
+ data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
+
+ /* stripe_size is fixed in zoned filesysmte. Reduce ndevs instead. */
+ if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
+ ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
+ ctl->stripe_size) + ctl->nparity,
+ ctl->dev_stripes);
+ ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
+ data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
+ ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
+ }
+
+ ctl->chunk_size = ctl->stripe_size * data_stripes;
+
+ return 0;
+}
+
+static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
+ struct alloc_chunk_ctl *ctl,
+ struct btrfs_device_info *devices_info)
+{
+ struct btrfs_fs_info *info = fs_devices->fs_info;
+
+ /*
+ * Round down to number of usable stripes, devs_increment can be any
+ * number so we can't use round_down() that requires power of 2, while
+ * rounddown is safe.
+ */
+ ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
+
+ if (ctl->ndevs < ctl->devs_min) {
+ if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
+ btrfs_debug(info,
+ "%s: not enough devices with free space: have=%d minimum required=%d",
+ __func__, ctl->ndevs, ctl->devs_min);
+ }
+ return -ENOSPC;
+ }
+
+ ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
+
+ switch (fs_devices->chunk_alloc_policy) {
+ case BTRFS_CHUNK_ALLOC_REGULAR:
+ return decide_stripe_size_regular(ctl, devices_info);
+ case BTRFS_CHUNK_ALLOC_ZONED:
+ return decide_stripe_size_zoned(ctl, devices_info);
+ default:
+ BUG();
+ }
+}
+
+static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
+ struct alloc_chunk_ctl *ctl,
+ struct btrfs_device_info *devices_info)
+{
+ struct btrfs_fs_info *info = trans->fs_info;
+ struct map_lookup *map = NULL;
+ struct extent_map_tree *em_tree;
+ struct btrfs_block_group *block_group;
+ struct extent_map *em;
+ u64 start = ctl->start;
+ u64 type = ctl->type;
+ int ret;
+ int i;
+ int j;
+
+ map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
+ if (!map)
+ return ERR_PTR(-ENOMEM);
+ map->num_stripes = ctl->num_stripes;
+
+ for (i = 0; i < ctl->ndevs; ++i) {
+ for (j = 0; j < ctl->dev_stripes; ++j) {
+ int s = i * ctl->dev_stripes + j;
+ map->stripes[s].dev = devices_info[i].dev;
+ map->stripes[s].physical = devices_info[i].dev_offset +
+ j * ctl->stripe_size;
+ }
+ }
+ map->stripe_len = BTRFS_STRIPE_LEN;
+ map->io_align = BTRFS_STRIPE_LEN;
+ map->io_width = BTRFS_STRIPE_LEN;
+ map->type = type;
+ map->sub_stripes = ctl->sub_stripes;
+
+ trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
+
+ em = alloc_extent_map();
+ if (!em) {
+ kfree(map);
+ return ERR_PTR(-ENOMEM);
+ }
+ set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
+ em->map_lookup = map;
+ em->start = start;
+ em->len = ctl->chunk_size;
+ em->block_start = 0;
+ em->block_len = em->len;
+ em->orig_block_len = ctl->stripe_size;
+
+ em_tree = &info->mapping_tree;
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em, 0);
+ if (ret) {
+ write_unlock(&em_tree->lock);
+ free_extent_map(em);
+ return ERR_PTR(ret);
+ }
+ write_unlock(&em_tree->lock);
+
+ block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
+ if (IS_ERR(block_group))
+ goto error_del_extent;
+
+ for (i = 0; i < map->num_stripes; i++) {
+ struct btrfs_device *dev = map->stripes[i].dev;
+
+ btrfs_device_set_bytes_used(dev,
+ dev->bytes_used + ctl->stripe_size);
+ if (list_empty(&dev->post_commit_list))
+ list_add_tail(&dev->post_commit_list,
+ &trans->transaction->dev_update_list);
+ }
+
+ atomic64_sub(ctl->stripe_size * map->num_stripes,
+ &info->free_chunk_space);
+
+ free_extent_map(em);
+ check_raid56_incompat_flag(info, type);
+ check_raid1c34_incompat_flag(info, type);
+
+ return block_group;
+
+error_del_extent:
+ write_lock(&em_tree->lock);
+ remove_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+
+ /* One for our allocation */
+ free_extent_map(em);
+ /* One for the tree reference */
+ free_extent_map(em);
+
+ return block_group;
+}
+
+struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
+ u64 type)
+{
+ struct btrfs_fs_info *info = trans->fs_info;
+ struct btrfs_fs_devices *fs_devices = info->fs_devices;
+ struct btrfs_device_info *devices_info = NULL;
+ struct alloc_chunk_ctl ctl;
+ struct btrfs_block_group *block_group;
+ int ret;
+
+ lockdep_assert_held(&info->chunk_mutex);
+
+ if (!alloc_profile_is_valid(type, 0)) {
+ ASSERT(0);
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (list_empty(&fs_devices->alloc_list)) {
+ if (btrfs_test_opt(info, ENOSPC_DEBUG))
+ btrfs_debug(info, "%s: no writable device", __func__);
+ return ERR_PTR(-ENOSPC);
+ }
+
+ if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ btrfs_err(info, "invalid chunk type 0x%llx requested", type);
+ ASSERT(0);
+ return ERR_PTR(-EINVAL);
+ }
+
+ ctl.start = find_next_chunk(info);
+ ctl.type = type;
+ init_alloc_chunk_ctl(fs_devices, &ctl);
+
+ devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
+ GFP_NOFS);
+ if (!devices_info)
+ return ERR_PTR(-ENOMEM);
+
+ ret = gather_device_info(fs_devices, &ctl, devices_info);
+ if (ret < 0) {
+ block_group = ERR_PTR(ret);
+ goto out;
+ }
+
+ ret = decide_stripe_size(fs_devices, &ctl, devices_info);
+ if (ret < 0) {
+ block_group = ERR_PTR(ret);
+ goto out;
+ }
+
+ block_group = create_chunk(trans, &ctl, devices_info);
+
+out:
+ kfree(devices_info);
+ return block_group;
+}
+
+/*
+ * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the
+ * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system
+ * chunks.
+ *
+ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
+ * phases.
+ */
+int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *bg)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *chunk_root = fs_info->chunk_root;
+ struct btrfs_key key;
+ struct btrfs_chunk *chunk;
+ struct btrfs_stripe *stripe;
+ struct extent_map *em;
+ struct map_lookup *map;
+ size_t item_size;
+ int i;
+ int ret;
+
+ /*
+ * We take the chunk_mutex for 2 reasons:
+ *
+ * 1) Updates and insertions in the chunk btree must be done while holding
+ * the chunk_mutex, as well as updating the system chunk array in the
+ * superblock. See the comment on top of btrfs_chunk_alloc() for the
+ * details;
+ *
+ * 2) To prevent races with the final phase of a device replace operation
+ * that replaces the device object associated with the map's stripes,
+ * because the device object's id can change at any time during that
+ * final phase of the device replace operation
+ * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
+ * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
+ * which would cause a failure when updating the device item, which does
+ * not exists, or persisting a stripe of the chunk item with such ID.
+ * Here we can't use the device_list_mutex because our caller already
+ * has locked the chunk_mutex, and the final phase of device replace
+ * acquires both mutexes - first the device_list_mutex and then the
+ * chunk_mutex. Using any of those two mutexes protects us from a
+ * concurrent device replace.
+ */
+ lockdep_assert_held(&fs_info->chunk_mutex);
+
+ em = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+
+ map = em->map_lookup;
+ item_size = btrfs_chunk_item_size(map->num_stripes);
+
+ chunk = kzalloc(item_size, GFP_NOFS);
+ if (!chunk) {
+ ret = -ENOMEM;
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ for (i = 0; i < map->num_stripes; i++) {
+ struct btrfs_device *device = map->stripes[i].dev;
+
+ ret = btrfs_update_device(trans, device);
+ if (ret)
+ goto out;
+ }
+
+ stripe = &chunk->stripe;
+ for (i = 0; i < map->num_stripes; i++) {
+ struct btrfs_device *device = map->stripes[i].dev;
+ const u64 dev_offset = map->stripes[i].physical;
+
+ btrfs_set_stack_stripe_devid(stripe, device->devid);
+ btrfs_set_stack_stripe_offset(stripe, dev_offset);
+ memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
+ stripe++;
+ }
+
+ btrfs_set_stack_chunk_length(chunk, bg->length);
+ btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID);
+ btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
+ btrfs_set_stack_chunk_type(chunk, map->type);
+ btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
+ btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
+ btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
+ btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
+ btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
+
+ key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+ key.type = BTRFS_CHUNK_ITEM_KEY;
+ key.offset = bg->start;
+
+ ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
+ if (ret)
+ goto out;
+
+ set_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, &bg->runtime_flags);
+
+ if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
+ ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
+ if (ret)
+ goto out;
+ }
+
+out:
+ kfree(chunk);
+ free_extent_map(em);
+ return ret;
+}
+
+static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ u64 alloc_profile;
+ struct btrfs_block_group *meta_bg;
+ struct btrfs_block_group *sys_bg;
+
+ /*
+ * When adding a new device for sprouting, the seed device is read-only
+ * so we must first allocate a metadata and a system chunk. But before
+ * adding the block group items to the extent, device and chunk btrees,
+ * we must first:
+ *
+ * 1) Create both chunks without doing any changes to the btrees, as
+ * otherwise we would get -ENOSPC since the block groups from the
+ * seed device are read-only;
+ *
+ * 2) Add the device item for the new sprout device - finishing the setup
+ * of a new block group requires updating the device item in the chunk
+ * btree, so it must exist when we attempt to do it. The previous step
+ * ensures this does not fail with -ENOSPC.
+ *
+ * After that we can add the block group items to their btrees:
+ * update existing device item in the chunk btree, add a new block group
+ * item to the extent btree, add a new chunk item to the chunk btree and
+ * finally add the new device extent items to the devices btree.
+ */
+
+ alloc_profile = btrfs_metadata_alloc_profile(fs_info);
+ meta_bg = btrfs_create_chunk(trans, alloc_profile);
+ if (IS_ERR(meta_bg))
+ return PTR_ERR(meta_bg);
+
+ alloc_profile = btrfs_system_alloc_profile(fs_info);
+ sys_bg = btrfs_create_chunk(trans, alloc_profile);
+ if (IS_ERR(sys_bg))
+ return PTR_ERR(sys_bg);
+
+ return 0;
+}
+
+static inline int btrfs_chunk_max_errors(struct map_lookup *map)
+{
+ const int index = btrfs_bg_flags_to_raid_index(map->type);
+
+ return btrfs_raid_array[index].tolerated_failures;
+}
+
+bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ int miss_ndevs = 0;
+ int i;
+ bool ret = true;
+
+ em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
+ if (IS_ERR(em))
+ return false;
+
+ map = em->map_lookup;
+ for (i = 0; i < map->num_stripes; i++) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING,
+ &map->stripes[i].dev->dev_state)) {
+ miss_ndevs++;
+ continue;
+ }
+ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
+ &map->stripes[i].dev->dev_state)) {
+ ret = false;
+ goto end;
+ }
+ }
+
+ /*
+ * If the number of missing devices is larger than max errors, we can
+ * not write the data into that chunk successfully.
+ */
+ if (miss_ndevs > btrfs_chunk_max_errors(map))
+ ret = false;
+end:
+ free_extent_map(em);
+ return ret;
+}
+
+void btrfs_mapping_tree_free(struct extent_map_tree *tree)
+{
+ struct extent_map *em;
+
+ while (1) {
+ write_lock(&tree->lock);
+ em = lookup_extent_mapping(tree, 0, (u64)-1);
+ if (em)
+ remove_extent_mapping(tree, em);
+ write_unlock(&tree->lock);
+ if (!em)
+ break;
+ /* once for us */
+ free_extent_map(em);
+ /* once for the tree */
+ free_extent_map(em);
+ }
+}
+
+int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ enum btrfs_raid_types index;
+ int ret = 1;
+
+ em = btrfs_get_chunk_map(fs_info, logical, len);
+ if (IS_ERR(em))
+ /*
+ * We could return errors for these cases, but that could get
+ * ugly and we'd probably do the same thing which is just not do
+ * anything else and exit, so return 1 so the callers don't try
+ * to use other copies.
+ */
+ return 1;
+
+ map = em->map_lookup;
+ index = btrfs_bg_flags_to_raid_index(map->type);
+
+ /* Non-RAID56, use their ncopies from btrfs_raid_array. */
+ if (!(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK))
+ ret = btrfs_raid_array[index].ncopies;
+ else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
+ ret = 2;
+ else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+ /*
+ * There could be two corrupted data stripes, we need
+ * to loop retry in order to rebuild the correct data.
+ *
+ * Fail a stripe at a time on every retry except the
+ * stripe under reconstruction.
+ */
+ ret = map->num_stripes;
+ free_extent_map(em);
+
+ down_read(&fs_info->dev_replace.rwsem);
+ if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
+ fs_info->dev_replace.tgtdev)
+ ret++;
+ up_read(&fs_info->dev_replace.rwsem);
+
+ return ret;
+}
+
+unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
+ u64 logical)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ unsigned long len = fs_info->sectorsize;
+
+ if (!btrfs_fs_incompat(fs_info, RAID56))
+ return len;
+
+ em = btrfs_get_chunk_map(fs_info, logical, len);
+
+ if (!WARN_ON(IS_ERR(em))) {
+ map = em->map_lookup;
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+ len = map->stripe_len * nr_data_stripes(map);
+ free_extent_map(em);
+ }
+ return len;
+}
+
+int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ int ret = 0;
+
+ if (!btrfs_fs_incompat(fs_info, RAID56))
+ return 0;
+
+ em = btrfs_get_chunk_map(fs_info, logical, len);
+
+ if(!WARN_ON(IS_ERR(em))) {
+ map = em->map_lookup;
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+ ret = 1;
+ free_extent_map(em);
+ }
+ return ret;
+}
+
+static int find_live_mirror(struct btrfs_fs_info *fs_info,
+ struct map_lookup *map, int first,
+ int dev_replace_is_ongoing)
+{
+ int i;
+ int num_stripes;
+ int preferred_mirror;
+ int tolerance;
+ struct btrfs_device *srcdev;
+
+ ASSERT((map->type &
+ (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
+
+ if (map->type & BTRFS_BLOCK_GROUP_RAID10)
+ num_stripes = map->sub_stripes;
+ else
+ num_stripes = map->num_stripes;
+
+ switch (fs_info->fs_devices->read_policy) {
+ default:
+ /* Shouldn't happen, just warn and use pid instead of failing */
+ btrfs_warn_rl(fs_info,
+ "unknown read_policy type %u, reset to pid",
+ fs_info->fs_devices->read_policy);
+ fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
+ fallthrough;
+ case BTRFS_READ_POLICY_PID:
+ preferred_mirror = first + (current->pid % num_stripes);
+ break;
+ }
+
+ if (dev_replace_is_ongoing &&
+ fs_info->dev_replace.cont_reading_from_srcdev_mode ==
+ BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
+ srcdev = fs_info->dev_replace.srcdev;
+ else
+ srcdev = NULL;
+
+ /*
+ * try to avoid the drive that is the source drive for a
+ * dev-replace procedure, only choose it if no other non-missing
+ * mirror is available
+ */
+ for (tolerance = 0; tolerance < 2; tolerance++) {
+ if (map->stripes[preferred_mirror].dev->bdev &&
+ (tolerance || map->stripes[preferred_mirror].dev != srcdev))
+ return preferred_mirror;
+ for (i = first; i < first + num_stripes; i++) {
+ if (map->stripes[i].dev->bdev &&
+ (tolerance || map->stripes[i].dev != srcdev))
+ return i;
+ }
+ }
+
+ /* we couldn't find one that doesn't fail. Just return something
+ * and the io error handling code will clean up eventually
+ */
+ return preferred_mirror;
+}
+
+/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
+static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes)
+{
+ int i;
+ int again = 1;
+
+ while (again) {
+ again = 0;
+ for (i = 0; i < num_stripes - 1; i++) {
+ /* Swap if parity is on a smaller index */
+ if (bioc->raid_map[i] > bioc->raid_map[i + 1]) {
+ swap(bioc->stripes[i], bioc->stripes[i + 1]);
+ swap(bioc->raid_map[i], bioc->raid_map[i + 1]);
+ again = 1;
+ }
+ }
+ }
+}
+
+static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
+ int total_stripes,
+ int real_stripes)
+{
+ struct btrfs_io_context *bioc = kzalloc(
+ /* The size of btrfs_io_context */
+ sizeof(struct btrfs_io_context) +
+ /* Plus the variable array for the stripes */
+ sizeof(struct btrfs_io_stripe) * (total_stripes) +
+ /* Plus the variable array for the tgt dev */
+ sizeof(int) * (real_stripes) +
+ /*
+ * Plus the raid_map, which includes both the tgt dev
+ * and the stripes.
+ */
+ sizeof(u64) * (total_stripes),
+ GFP_NOFS|__GFP_NOFAIL);
+
+ refcount_set(&bioc->refs, 1);
+
+ bioc->fs_info = fs_info;
+ bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes);
+ bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes);
+
+ return bioc;
+}
+
+void btrfs_get_bioc(struct btrfs_io_context *bioc)
+{
+ WARN_ON(!refcount_read(&bioc->refs));
+ refcount_inc(&bioc->refs);
+}
+
+void btrfs_put_bioc(struct btrfs_io_context *bioc)
+{
+ if (!bioc)
+ return;
+ if (refcount_dec_and_test(&bioc->refs))
+ kfree(bioc);
+}
+
+/*
+ * Please note that, discard won't be sent to target device of device
+ * replace.
+ */
+struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 *length_ret,
+ u32 *num_stripes)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct btrfs_discard_stripe *stripes;
+ u64 length = *length_ret;
+ u64 offset;
+ u64 stripe_nr;
+ u64 stripe_nr_end;
+ u64 stripe_end_offset;
+ u64 stripe_cnt;
+ u64 stripe_len;
+ u64 stripe_offset;
+ u32 stripe_index;
+ u32 factor = 0;
+ u32 sub_stripes = 0;
+ u64 stripes_per_dev = 0;
+ u32 remaining_stripes = 0;
+ u32 last_stripe = 0;
+ int ret;
+ int i;
+
+ em = btrfs_get_chunk_map(fs_info, logical, length);
+ if (IS_ERR(em))
+ return ERR_CAST(em);
+
+ map = em->map_lookup;
+
+ /* we don't discard raid56 yet */
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ ret = -EOPNOTSUPP;
+ goto out_free_map;
+}
+
+ offset = logical - em->start;
+ length = min_t(u64, em->start + em->len - logical, length);
+ *length_ret = length;
+
+ stripe_len = map->stripe_len;
+ /*
+ * stripe_nr counts the total number of stripes we have to stride
+ * to get to this block
+ */
+ stripe_nr = div64_u64(offset, stripe_len);
+
+ /* stripe_offset is the offset of this block in its stripe */
+ stripe_offset = offset - stripe_nr * stripe_len;
+
+ stripe_nr_end = round_up(offset + length, map->stripe_len);
+ stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
+ stripe_cnt = stripe_nr_end - stripe_nr;
+ stripe_end_offset = stripe_nr_end * map->stripe_len -
+ (offset + length);
+ /*
+ * after this, stripe_nr is the number of stripes on this
+ * device we have to walk to find the data, and stripe_index is
+ * the number of our device in the stripe array
+ */
+ *num_stripes = 1;
+ stripe_index = 0;
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10)) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+ sub_stripes = 1;
+ else
+ sub_stripes = map->sub_stripes;
+
+ factor = map->num_stripes / sub_stripes;
+ *num_stripes = min_t(u64, map->num_stripes,
+ sub_stripes * stripe_cnt);
+ stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
+ stripe_index *= sub_stripes;
+ stripes_per_dev = div_u64_rem(stripe_cnt, factor,
+ &remaining_stripes);
+ div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
+ last_stripe *= sub_stripes;
+ } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
+ BTRFS_BLOCK_GROUP_DUP)) {
+ *num_stripes = map->num_stripes;
+ } else {
+ stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
+ &stripe_index);
+ }
+
+ stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS);
+ if (!stripes) {
+ ret = -ENOMEM;
+ goto out_free_map;
+ }
+
+ for (i = 0; i < *num_stripes; i++) {
+ stripes[i].physical =
+ map->stripes[stripe_index].physical +
+ stripe_offset + stripe_nr * map->stripe_len;
+ stripes[i].dev = map->stripes[stripe_index].dev;
+
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10)) {
+ stripes[i].length = stripes_per_dev * map->stripe_len;
+
+ if (i / sub_stripes < remaining_stripes)
+ stripes[i].length += map->stripe_len;
+
+ /*
+ * Special for the first stripe and
+ * the last stripe:
+ *
+ * |-------|...|-------|
+ * |----------|
+ * off end_off
+ */
+ if (i < sub_stripes)
+ stripes[i].length -= stripe_offset;
+
+ if (stripe_index >= last_stripe &&
+ stripe_index <= (last_stripe +
+ sub_stripes - 1))
+ stripes[i].length -= stripe_end_offset;
+
+ if (i == sub_stripes - 1)
+ stripe_offset = 0;
+ } else {
+ stripes[i].length = length;
+ }
+
+ stripe_index++;
+ if (stripe_index == map->num_stripes) {
+ stripe_index = 0;
+ stripe_nr++;
+ }
+ }
+
+ free_extent_map(em);
+ return stripes;
+out_free_map:
+ free_extent_map(em);
+ return ERR_PTR(ret);
+}
+
+/*
+ * In dev-replace case, for repair case (that's the only case where the mirror
+ * is selected explicitly when calling btrfs_map_block), blocks left of the
+ * left cursor can also be read from the target drive.
+ *
+ * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
+ * array of stripes.
+ * For READ, it also needs to be supported using the same mirror number.
+ *
+ * If the requested block is not left of the left cursor, EIO is returned. This
+ * can happen because btrfs_num_copies() returns one more in the dev-replace
+ * case.
+ */
+static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length,
+ u64 srcdev_devid, int *mirror_num,
+ u64 *physical)
+{
+ struct btrfs_io_context *bioc = NULL;
+ int num_stripes;
+ int index_srcdev = 0;
+ int found = 0;
+ u64 physical_of_found = 0;
+ int i;
+ int ret = 0;
+
+ ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
+ logical, &length, &bioc, NULL, NULL, 0);
+ if (ret) {
+ ASSERT(bioc == NULL);
+ return ret;
+ }
+
+ num_stripes = bioc->num_stripes;
+ if (*mirror_num > num_stripes) {
+ /*
+ * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
+ * that means that the requested area is not left of the left
+ * cursor
+ */
+ btrfs_put_bioc(bioc);
+ return -EIO;
+ }
+
+ /*
+ * process the rest of the function using the mirror_num of the source
+ * drive. Therefore look it up first. At the end, patch the device
+ * pointer to the one of the target drive.
+ */
+ for (i = 0; i < num_stripes; i++) {
+ if (bioc->stripes[i].dev->devid != srcdev_devid)
+ continue;
+
+ /*
+ * In case of DUP, in order to keep it simple, only add the
+ * mirror with the lowest physical address
+ */
+ if (found &&
+ physical_of_found <= bioc->stripes[i].physical)
+ continue;
+
+ index_srcdev = i;
+ found = 1;
+ physical_of_found = bioc->stripes[i].physical;
+ }
+
+ btrfs_put_bioc(bioc);
+
+ ASSERT(found);
+ if (!found)
+ return -EIO;
+
+ *mirror_num = index_srcdev + 1;
+ *physical = physical_of_found;
+ return ret;
+}
+
+static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
+{
+ struct btrfs_block_group *cache;
+ bool ret;
+
+ /* Non zoned filesystem does not use "to_copy" flag */
+ if (!btrfs_is_zoned(fs_info))
+ return false;
+
+ cache = btrfs_lookup_block_group(fs_info, logical);
+
+ ret = test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
+
+ btrfs_put_block_group(cache);
+ return ret;
+}
+
+static void handle_ops_on_dev_replace(enum btrfs_map_op op,
+ struct btrfs_io_context **bioc_ret,
+ struct btrfs_dev_replace *dev_replace,
+ u64 logical,
+ int *num_stripes_ret, int *max_errors_ret)
+{
+ struct btrfs_io_context *bioc = *bioc_ret;
+ u64 srcdev_devid = dev_replace->srcdev->devid;
+ int tgtdev_indexes = 0;
+ int num_stripes = *num_stripes_ret;
+ int max_errors = *max_errors_ret;
+ int i;
+
+ if (op == BTRFS_MAP_WRITE) {
+ int index_where_to_add;
+
+ /*
+ * A block group which have "to_copy" set will eventually
+ * copied by dev-replace process. We can avoid cloning IO here.
+ */
+ if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
+ return;
+
+ /*
+ * duplicate the write operations while the dev replace
+ * procedure is running. Since the copying of the old disk to
+ * the new disk takes place at run time while the filesystem is
+ * mounted writable, the regular write operations to the old
+ * disk have to be duplicated to go to the new disk as well.
+ *
+ * Note that device->missing is handled by the caller, and that
+ * the write to the old disk is already set up in the stripes
+ * array.
+ */
+ index_where_to_add = num_stripes;
+ for (i = 0; i < num_stripes; i++) {
+ if (bioc->stripes[i].dev->devid == srcdev_devid) {
+ /* write to new disk, too */
+ struct btrfs_io_stripe *new =
+ bioc->stripes + index_where_to_add;
+ struct btrfs_io_stripe *old =
+ bioc->stripes + i;
+
+ new->physical = old->physical;
+ new->dev = dev_replace->tgtdev;
+ bioc->tgtdev_map[i] = index_where_to_add;
+ index_where_to_add++;
+ max_errors++;
+ tgtdev_indexes++;
+ }
+ }
+ num_stripes = index_where_to_add;
+ } else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
+ int index_srcdev = 0;
+ int found = 0;
+ u64 physical_of_found = 0;
+
+ /*
+ * During the dev-replace procedure, the target drive can also
+ * be used to read data in case it is needed to repair a corrupt
+ * block elsewhere. This is possible if the requested area is
+ * left of the left cursor. In this area, the target drive is a
+ * full copy of the source drive.
+ */
+ for (i = 0; i < num_stripes; i++) {
+ if (bioc->stripes[i].dev->devid == srcdev_devid) {
+ /*
+ * In case of DUP, in order to keep it simple,
+ * only add the mirror with the lowest physical
+ * address
+ */
+ if (found &&
+ physical_of_found <= bioc->stripes[i].physical)
+ continue;
+ index_srcdev = i;
+ found = 1;
+ physical_of_found = bioc->stripes[i].physical;
+ }
+ }
+ if (found) {
+ struct btrfs_io_stripe *tgtdev_stripe =
+ bioc->stripes + num_stripes;
+
+ tgtdev_stripe->physical = physical_of_found;
+ tgtdev_stripe->dev = dev_replace->tgtdev;
+ bioc->tgtdev_map[index_srcdev] = num_stripes;
+
+ tgtdev_indexes++;
+ num_stripes++;
+ }
+ }
+
+ *num_stripes_ret = num_stripes;
+ *max_errors_ret = max_errors;
+ bioc->num_tgtdevs = tgtdev_indexes;
+ *bioc_ret = bioc;
+}
+
+static bool need_full_stripe(enum btrfs_map_op op)
+{
+ return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
+}
+
+/*
+ * Calculate the geometry of a particular (address, len) tuple. This
+ * information is used to calculate how big a particular bio can get before it
+ * straddles a stripe.
+ *
+ * @fs_info: the filesystem
+ * @em: mapping containing the logical extent
+ * @op: type of operation - write or read
+ * @logical: address that we want to figure out the geometry of
+ * @io_geom: pointer used to return values
+ *
+ * Returns < 0 in case a chunk for the given logical address cannot be found,
+ * usually shouldn't happen unless @logical is corrupted, 0 otherwise.
+ */
+int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
+ enum btrfs_map_op op, u64 logical,
+ struct btrfs_io_geometry *io_geom)
+{
+ struct map_lookup *map;
+ u64 len;
+ u64 offset;
+ u64 stripe_offset;
+ u64 stripe_nr;
+ u32 stripe_len;
+ u64 raid56_full_stripe_start = (u64)-1;
+ int data_stripes;
+
+ ASSERT(op != BTRFS_MAP_DISCARD);
+
+ map = em->map_lookup;
+ /* Offset of this logical address in the chunk */
+ offset = logical - em->start;
+ /* Len of a stripe in a chunk */
+ stripe_len = map->stripe_len;
+ /*
+ * Stripe_nr is where this block falls in
+ * stripe_offset is the offset of this block in its stripe.
+ */
+ stripe_nr = div64_u64_rem(offset, stripe_len, &stripe_offset);
+ ASSERT(stripe_offset < U32_MAX);
+
+ data_stripes = nr_data_stripes(map);
+
+ /* Only stripe based profiles needs to check against stripe length. */
+ if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) {
+ u64 max_len = stripe_len - stripe_offset;
+
+ /*
+ * In case of raid56, we need to know the stripe aligned start
+ */
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ unsigned long full_stripe_len = stripe_len * data_stripes;
+ raid56_full_stripe_start = offset;
+
+ /*
+ * Allow a write of a full stripe, but make sure we
+ * don't allow straddling of stripes
+ */
+ raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
+ full_stripe_len);
+ raid56_full_stripe_start *= full_stripe_len;
+
+ /*
+ * For writes to RAID[56], allow a full stripeset across
+ * all disks. For other RAID types and for RAID[56]
+ * reads, just allow a single stripe (on a single disk).
+ */
+ if (op == BTRFS_MAP_WRITE) {
+ max_len = stripe_len * data_stripes -
+ (offset - raid56_full_stripe_start);
+ }
+ }
+ len = min_t(u64, em->len - offset, max_len);
+ } else {
+ len = em->len - offset;
+ }
+
+ io_geom->len = len;
+ io_geom->offset = offset;
+ io_geom->stripe_len = stripe_len;
+ io_geom->stripe_nr = stripe_nr;
+ io_geom->stripe_offset = stripe_offset;
+ io_geom->raid56_stripe_offset = raid56_full_stripe_start;
+
+ return 0;
+}
+
+static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map,
+ u32 stripe_index, u64 stripe_offset, u64 stripe_nr)
+{
+ dst->dev = map->stripes[stripe_index].dev;
+ dst->physical = map->stripes[stripe_index].physical +
+ stripe_offset + stripe_nr * map->stripe_len;
+}
+
+static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
+ enum btrfs_map_op op, u64 logical, u64 *length,
+ struct btrfs_io_context **bioc_ret,
+ struct btrfs_io_stripe *smap,
+ int *mirror_num_ret, int need_raid_map)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ u64 stripe_offset;
+ u64 stripe_nr;
+ u64 stripe_len;
+ u32 stripe_index;
+ int data_stripes;
+ int i;
+ int ret = 0;
+ int mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
+ int num_stripes;
+ int max_errors = 0;
+ int tgtdev_indexes = 0;
+ struct btrfs_io_context *bioc = NULL;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ int dev_replace_is_ongoing = 0;
+ int num_alloc_stripes;
+ int patch_the_first_stripe_for_dev_replace = 0;
+ u64 physical_to_patch_in_first_stripe = 0;
+ u64 raid56_full_stripe_start = (u64)-1;
+ struct btrfs_io_geometry geom;
+
+ ASSERT(bioc_ret);
+ ASSERT(op != BTRFS_MAP_DISCARD);
+
+ em = btrfs_get_chunk_map(fs_info, logical, *length);
+ ASSERT(!IS_ERR(em));
+
+ ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom);
+ if (ret < 0)
+ return ret;
+
+ map = em->map_lookup;
+
+ *length = geom.len;
+ stripe_len = geom.stripe_len;
+ stripe_nr = geom.stripe_nr;
+ stripe_offset = geom.stripe_offset;
+ raid56_full_stripe_start = geom.raid56_stripe_offset;
+ data_stripes = nr_data_stripes(map);
+
+ down_read(&dev_replace->rwsem);
+ dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
+ /*
+ * Hold the semaphore for read during the whole operation, write is
+ * requested at commit time but must wait.
+ */
+ if (!dev_replace_is_ongoing)
+ up_read(&dev_replace->rwsem);
+
+ if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
+ !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
+ ret = get_extra_mirror_from_replace(fs_info, logical, *length,
+ dev_replace->srcdev->devid,
+ &mirror_num,
+ &physical_to_patch_in_first_stripe);
+ if (ret)
+ goto out;
+ else
+ patch_the_first_stripe_for_dev_replace = 1;
+ } else if (mirror_num > map->num_stripes) {
+ mirror_num = 0;
+ }
+
+ num_stripes = 1;
+ stripe_index = 0;
+ if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+ stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
+ &stripe_index);
+ if (!need_full_stripe(op))
+ mirror_num = 1;
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
+ if (need_full_stripe(op))
+ num_stripes = map->num_stripes;
+ else if (mirror_num)
+ stripe_index = mirror_num - 1;
+ else {
+ stripe_index = find_live_mirror(fs_info, map, 0,
+ dev_replace_is_ongoing);
+ mirror_num = stripe_index + 1;
+ }
+
+ } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
+ if (need_full_stripe(op)) {
+ num_stripes = map->num_stripes;
+ } else if (mirror_num) {
+ stripe_index = mirror_num - 1;
+ } else {
+ mirror_num = 1;
+ }
+
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+ u32 factor = map->num_stripes / map->sub_stripes;
+
+ stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
+ stripe_index *= map->sub_stripes;
+
+ if (need_full_stripe(op))
+ num_stripes = map->sub_stripes;
+ else if (mirror_num)
+ stripe_index += mirror_num - 1;
+ else {
+ int old_stripe_index = stripe_index;
+ stripe_index = find_live_mirror(fs_info, map,
+ stripe_index,
+ dev_replace_is_ongoing);
+ mirror_num = stripe_index - old_stripe_index + 1;
+ }
+
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ ASSERT(map->stripe_len == BTRFS_STRIPE_LEN);
+ if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
+ /* push stripe_nr back to the start of the full stripe */
+ stripe_nr = div64_u64(raid56_full_stripe_start,
+ stripe_len * data_stripes);
+
+ /* RAID[56] write or recovery. Return all stripes */
+ num_stripes = map->num_stripes;
+ max_errors = btrfs_chunk_max_errors(map);
+
+ /* Return the length to the full stripe end */
+ *length = min(logical + *length,
+ raid56_full_stripe_start + em->start +
+ data_stripes * stripe_len) - logical;
+ stripe_index = 0;
+ stripe_offset = 0;
+ } else {
+ /*
+ * Mirror #0 or #1 means the original data block.
+ * Mirror #2 is RAID5 parity block.
+ * Mirror #3 is RAID6 Q block.
+ */
+ stripe_nr = div_u64_rem(stripe_nr,
+ data_stripes, &stripe_index);
+ if (mirror_num > 1)
+ stripe_index = data_stripes + mirror_num - 2;
+
+ /* We distribute the parity blocks across stripes */
+ div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
+ &stripe_index);
+ if (!need_full_stripe(op) && mirror_num <= 1)
+ mirror_num = 1;
+ }
+ } else {
+ /*
+ * after this, stripe_nr is the number of stripes on this
+ * device we have to walk to find the data, and stripe_index is
+ * the number of our device in the stripe array
+ */
+ stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
+ &stripe_index);
+ mirror_num = stripe_index + 1;
+ }
+ if (stripe_index >= map->num_stripes) {
+ btrfs_crit(fs_info,
+ "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
+ stripe_index, map->num_stripes);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ num_alloc_stripes = num_stripes;
+ if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
+ if (op == BTRFS_MAP_WRITE)
+ num_alloc_stripes <<= 1;
+ if (op == BTRFS_MAP_GET_READ_MIRRORS)
+ num_alloc_stripes++;
+ tgtdev_indexes = num_stripes;
+ }
+
+ /*
+ * If this I/O maps to a single device, try to return the device and
+ * physical block information on the stack instead of allocating an
+ * I/O context structure.
+ */
+ if (smap && num_alloc_stripes == 1 &&
+ !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) &&
+ (!need_full_stripe(op) || !dev_replace_is_ongoing ||
+ !dev_replace->tgtdev)) {
+ if (patch_the_first_stripe_for_dev_replace) {
+ smap->dev = dev_replace->tgtdev;
+ smap->physical = physical_to_patch_in_first_stripe;
+ if (mirror_num_ret)
+ *mirror_num_ret = map->num_stripes + 1;
+ } else {
+ set_io_stripe(smap, map, stripe_index, stripe_offset,
+ stripe_nr);
+ if (mirror_num_ret)
+ *mirror_num_ret = mirror_num;
+ }
+ *bioc_ret = NULL;
+ ret = 0;
+ goto out;
+ }
+
+ bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes);
+ if (!bioc) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < num_stripes; i++) {
+ set_io_stripe(&bioc->stripes[i], map, stripe_index, stripe_offset,
+ stripe_nr);
+ stripe_index++;
+ }
+
+ /* Build raid_map */
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
+ (need_full_stripe(op) || mirror_num > 1)) {
+ u64 tmp;
+ unsigned rot;
+
+ /* Work out the disk rotation on this stripe-set */
+ div_u64_rem(stripe_nr, num_stripes, &rot);
+
+ /* Fill in the logical address of each stripe */
+ tmp = stripe_nr * data_stripes;
+ for (i = 0; i < data_stripes; i++)
+ bioc->raid_map[(i + rot) % num_stripes] =
+ em->start + (tmp + i) * map->stripe_len;
+
+ bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE;
+ if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+ bioc->raid_map[(i + rot + 1) % num_stripes] =
+ RAID6_Q_STRIPE;
+
+ sort_parity_stripes(bioc, num_stripes);
+ }
+
+ if (need_full_stripe(op))
+ max_errors = btrfs_chunk_max_errors(map);
+
+ if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
+ need_full_stripe(op)) {
+ handle_ops_on_dev_replace(op, &bioc, dev_replace, logical,
+ &num_stripes, &max_errors);
+ }
+
+ *bioc_ret = bioc;
+ bioc->map_type = map->type;
+ bioc->num_stripes = num_stripes;
+ bioc->max_errors = max_errors;
+ bioc->mirror_num = mirror_num;
+
+ /*
+ * this is the case that REQ_READ && dev_replace_is_ongoing &&
+ * mirror_num == num_stripes + 1 && dev_replace target drive is
+ * available as a mirror
+ */
+ if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
+ WARN_ON(num_stripes > 1);
+ bioc->stripes[0].dev = dev_replace->tgtdev;
+ bioc->stripes[0].physical = physical_to_patch_in_first_stripe;
+ bioc->mirror_num = map->num_stripes + 1;
+ }
+out:
+ if (dev_replace_is_ongoing) {
+ lockdep_assert_held(&dev_replace->rwsem);
+ /* Unlock and let waiting writers proceed */
+ up_read(&dev_replace->rwsem);
+ }
+ free_extent_map(em);
+ return ret;
+}
+
+int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
+ u64 logical, u64 *length,
+ struct btrfs_io_context **bioc_ret, int mirror_num)
+{
+ return __btrfs_map_block(fs_info, op, logical, length, bioc_ret,
+ NULL, &mirror_num, 0);
+}
+
+/* For Scrub/replace */
+int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
+ u64 logical, u64 *length,
+ struct btrfs_io_context **bioc_ret)
+{
+ return __btrfs_map_block(fs_info, op, logical, length, bioc_ret,
+ NULL, NULL, 1);
+}
+
+/*
+ * Initialize a btrfs_bio structure. This skips the embedded bio itself as it
+ * is already initialized by the block layer.
+ */
+static inline void btrfs_bio_init(struct btrfs_bio *bbio,
+ btrfs_bio_end_io_t end_io, void *private)
+{
+ memset(bbio, 0, offsetof(struct btrfs_bio, bio));
+ bbio->end_io = end_io;
+ bbio->private = private;
+}
+
+/*
+ * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for
+ * btrfs, and is used for all I/O submitted through btrfs_submit_bio.
+ *
+ * Just like the underlying bio_alloc_bioset it will not fail as it is backed by
+ * a mempool.
+ */
+struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
+ btrfs_bio_end_io_t end_io, void *private)
+{
+ struct bio *bio;
+
+ bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
+ btrfs_bio_init(btrfs_bio(bio), end_io, private);
+ return bio;
+}
+
+struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size,
+ btrfs_bio_end_io_t end_io, void *private)
+{
+ struct bio *bio;
+ struct btrfs_bio *bbio;
+
+ ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
+
+ bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset);
+ bbio = btrfs_bio(bio);
+ btrfs_bio_init(bbio, end_io, private);
+
+ bio_trim(bio, offset >> 9, size >> 9);
+ bbio->iter = bio->bi_iter;
+ return bio;
+}
+
+static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
+{
+ if (!dev || !dev->bdev)
+ return;
+ if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET)
+ return;
+
+ if (btrfs_op(bio) == BTRFS_MAP_WRITE)
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
+ else if (!(bio->bi_opf & REQ_RAHEAD))
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
+ if (bio->bi_opf & REQ_PREFLUSH)
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
+}
+
+static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info,
+ struct bio *bio)
+{
+ if (bio->bi_opf & REQ_META)
+ return fs_info->endio_meta_workers;
+ return fs_info->endio_workers;
+}
+
+static void btrfs_end_bio_work(struct work_struct *work)
+{
+ struct btrfs_bio *bbio =
+ container_of(work, struct btrfs_bio, end_io_work);
+
+ bbio->end_io(bbio);
+}
+
+static void btrfs_simple_end_io(struct bio *bio)
+{
+ struct btrfs_fs_info *fs_info = bio->bi_private;
+ struct btrfs_bio *bbio = btrfs_bio(bio);
+
+ btrfs_bio_counter_dec(fs_info);
+
+ if (bio->bi_status)
+ btrfs_log_dev_io_error(bio, bbio->device);
+
+ if (bio_op(bio) == REQ_OP_READ) {
+ INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
+ queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
+ } else {
+ bbio->end_io(bbio);
+ }
+}
+
+static void btrfs_raid56_end_io(struct bio *bio)
+{
+ struct btrfs_io_context *bioc = bio->bi_private;
+ struct btrfs_bio *bbio = btrfs_bio(bio);
+
+ btrfs_bio_counter_dec(bioc->fs_info);
+ bbio->mirror_num = bioc->mirror_num;
+ bbio->end_io(bbio);
+
+ btrfs_put_bioc(bioc);
+}
+
+static void btrfs_orig_write_end_io(struct bio *bio)
+{
+ struct btrfs_io_stripe *stripe = bio->bi_private;
+ struct btrfs_io_context *bioc = stripe->bioc;
+ struct btrfs_bio *bbio = btrfs_bio(bio);
+
+ btrfs_bio_counter_dec(bioc->fs_info);
+
+ if (bio->bi_status) {
+ atomic_inc(&bioc->error);
+ btrfs_log_dev_io_error(bio, stripe->dev);
+ }
+
+ /*
+ * Only send an error to the higher layers if it is beyond the tolerance
+ * threshold.
+ */
+ if (atomic_read(&bioc->error) > bioc->max_errors)
+ bio->bi_status = BLK_STS_IOERR;
+ else
+ bio->bi_status = BLK_STS_OK;
+
+ bbio->end_io(bbio);
+ btrfs_put_bioc(bioc);
+}
+
+static void btrfs_clone_write_end_io(struct bio *bio)
+{
+ struct btrfs_io_stripe *stripe = bio->bi_private;
+
+ if (bio->bi_status) {
+ atomic_inc(&stripe->bioc->error);
+ btrfs_log_dev_io_error(bio, stripe->dev);
+ }
+
+ /* Pass on control to the original bio this one was cloned from */
+ bio_endio(stripe->bioc->orig_bio);
+ bio_put(bio);
+}
+
+static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
+{
+ if (!dev || !dev->bdev ||
+ test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
+ (btrfs_op(bio) == BTRFS_MAP_WRITE &&
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
+ bio_io_error(bio);
+ return;
+ }
+
+ bio_set_dev(bio, dev->bdev);
+
+ /*
+ * For zone append writing, bi_sector must point the beginning of the
+ * zone
+ */
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+
+ if (btrfs_dev_is_sequential(dev, physical)) {
+ u64 zone_start = round_down(physical,
+ dev->fs_info->zone_size);
+
+ bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
+ } else {
+ bio->bi_opf &= ~REQ_OP_ZONE_APPEND;
+ bio->bi_opf |= REQ_OP_WRITE;
+ }
+ }
+ btrfs_debug_in_rcu(dev->fs_info,
+ "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
+ __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
+ (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
+ dev->devid, bio->bi_iter.bi_size);
+
+ btrfsic_check_bio(bio);
+ submit_bio(bio);
+}
+
+static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
+{
+ struct bio *orig_bio = bioc->orig_bio, *bio;
+
+ ASSERT(bio_op(orig_bio) != REQ_OP_READ);
+
+ /* Reuse the bio embedded into the btrfs_bio for the last mirror */
+ if (dev_nr == bioc->num_stripes - 1) {
+ bio = orig_bio;
+ bio->bi_end_io = btrfs_orig_write_end_io;
+ } else {
+ bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set);
+ bio_inc_remaining(orig_bio);
+ bio->bi_end_io = btrfs_clone_write_end_io;
+ }
+
+ bio->bi_private = &bioc->stripes[dev_nr];
+ bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
+ bioc->stripes[dev_nr].bioc = bioc;
+ btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
+}
+
+void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num)
+{
+ u64 logical = bio->bi_iter.bi_sector << 9;
+ u64 length = bio->bi_iter.bi_size;
+ u64 map_length = length;
+ struct btrfs_io_context *bioc = NULL;
+ struct btrfs_io_stripe smap;
+ int ret;
+
+ btrfs_bio_counter_inc_blocked(fs_info);
+ ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
+ &bioc, &smap, &mirror_num, 1);
+ if (ret) {
+ btrfs_bio_counter_dec(fs_info);
+ btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
+ return;
+ }
+
+ if (map_length < length) {
+ btrfs_crit(fs_info,
+ "mapping failed logical %llu bio len %llu len %llu",
+ logical, length, map_length);
+ BUG();
+ }
+
+ if (!bioc) {
+ /* Single mirror read/write fast path */
+ btrfs_bio(bio)->mirror_num = mirror_num;
+ btrfs_bio(bio)->device = smap.dev;
+ bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
+ bio->bi_private = fs_info;
+ bio->bi_end_io = btrfs_simple_end_io;
+ btrfs_submit_dev_bio(smap.dev, bio);
+ } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ /* Parity RAID write or read recovery */
+ bio->bi_private = bioc;
+ bio->bi_end_io = btrfs_raid56_end_io;
+ if (bio_op(bio) == REQ_OP_READ)
+ raid56_parity_recover(bio, bioc, mirror_num);
+ else
+ raid56_parity_write(bio, bioc);
+ } else {
+ /* Write to multiple mirrors */
+ int total_devs = bioc->num_stripes;
+ int dev_nr;
+
+ bioc->orig_bio = bio;
+ for (dev_nr = 0; dev_nr < total_devs; dev_nr++)
+ btrfs_submit_mirrored_bio(bioc, dev_nr);
+ }
+}
+
+static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
+ const struct btrfs_fs_devices *fs_devices)
+{
+ if (args->fsid == NULL)
+ return true;
+ if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0)
+ return true;
+ return false;
+}
+
+static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args,
+ const struct btrfs_device *device)
+{
+ if (args->missing) {
+ if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
+ !device->bdev)
+ return true;
+ return false;
+ }
+
+ if (device->devid != args->devid)
+ return false;
+ if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0)
+ return false;
+ return true;
+}
+
+/*
+ * Find a device specified by @devid or @uuid in the list of @fs_devices, or
+ * return NULL.
+ *
+ * If devid and uuid are both specified, the match must be exact, otherwise
+ * only devid is used.
+ */
+struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices,
+ const struct btrfs_dev_lookup_args *args)
+{
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *seed_devs;
+
+ if (dev_args_match_fs_devices(args, fs_devices)) {
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (dev_args_match_device(args, device))
+ return device;
+ }
+ }
+
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
+ if (!dev_args_match_fs_devices(args, seed_devs))
+ continue;
+ list_for_each_entry(device, &seed_devs->devices, dev_list) {
+ if (dev_args_match_device(args, device))
+ return device;
+ }
+ }
+
+ return NULL;
+}
+
+static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
+ u64 devid, u8 *dev_uuid)
+{
+ struct btrfs_device *device;
+ unsigned int nofs_flag;
+
+ /*
+ * We call this under the chunk_mutex, so we want to use NOFS for this
+ * allocation, however we don't want to change btrfs_alloc_device() to
+ * always do NOFS because we use it in a lot of other GFP_KERNEL safe
+ * places.
+ */
+ nofs_flag = memalloc_nofs_save();
+ device = btrfs_alloc_device(NULL, &devid, dev_uuid);
+ memalloc_nofs_restore(nofs_flag);
+ if (IS_ERR(device))
+ return device;
+
+ list_add(&device->dev_list, &fs_devices->devices);
+ device->fs_devices = fs_devices;
+ fs_devices->num_devices++;
+
+ set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
+ fs_devices->missing_devices++;
+
+ return device;
+}
+
+/**
+ * btrfs_alloc_device - allocate struct btrfs_device
+ * @fs_info: used only for generating a new devid, can be NULL if
+ * devid is provided (i.e. @devid != NULL).
+ * @devid: a pointer to devid for this device. If NULL a new devid
+ * is generated.
+ * @uuid: a pointer to UUID for this device. If NULL a new UUID
+ * is generated.
+ *
+ * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
+ * on error. Returned struct is not linked onto any lists and must be
+ * destroyed with btrfs_free_device.
+ */
+struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
+ const u64 *devid,
+ const u8 *uuid)
+{
+ struct btrfs_device *dev;
+ u64 tmp;
+
+ if (WARN_ON(!devid && !fs_info))
+ return ERR_PTR(-EINVAL);
+
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&dev->dev_list);
+ INIT_LIST_HEAD(&dev->dev_alloc_list);
+ INIT_LIST_HEAD(&dev->post_commit_list);
+
+ atomic_set(&dev->dev_stats_ccnt, 0);
+ btrfs_device_data_ordered_init(dev);
+ extent_io_tree_init(fs_info, &dev->alloc_state,
+ IO_TREE_DEVICE_ALLOC_STATE, NULL);
+
+ if (devid)
+ tmp = *devid;
+ else {
+ int ret;
+
+ ret = find_next_devid(fs_info, &tmp);
+ if (ret) {
+ btrfs_free_device(dev);
+ return ERR_PTR(ret);
+ }
+ }
+ dev->devid = tmp;
+
+ if (uuid)
+ memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
+ else
+ generate_random_uuid(dev->uuid);
+
+ return dev;
+}
+
+static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
+ u64 devid, u8 *uuid, bool error)
+{
+ if (error)
+ btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
+ devid, uuid);
+ else
+ btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
+ devid, uuid);
+}
+
+u64 btrfs_calc_stripe_length(const struct extent_map *em)
+{
+ const struct map_lookup *map = em->map_lookup;
+ const int data_stripes = calc_data_stripes(map->type, map->num_stripes);
+
+ return div_u64(em->len, data_stripes);
+}
+
+#if BITS_PER_LONG == 32
+/*
+ * Due to page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE
+ * can't be accessed on 32bit systems.
+ *
+ * This function do mount time check to reject the fs if it already has
+ * metadata chunk beyond that limit.
+ */
+static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length, u64 type)
+{
+ if (!(type & BTRFS_BLOCK_GROUP_METADATA))
+ return 0;
+
+ if (logical + length < MAX_LFS_FILESIZE)
+ return 0;
+
+ btrfs_err_32bit_limit(fs_info);
+ return -EOVERFLOW;
+}
+
+/*
+ * This is to give early warning for any metadata chunk reaching
+ * BTRFS_32BIT_EARLY_WARN_THRESHOLD.
+ * Although we can still access the metadata, it's not going to be possible
+ * once the limit is reached.
+ */
+static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length, u64 type)
+{
+ if (!(type & BTRFS_BLOCK_GROUP_METADATA))
+ return;
+
+ if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD)
+ return;
+
+ btrfs_warn_32bit_limit(fs_info);
+}
+#endif
+
+static struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info,
+ u64 devid, u8 *uuid)
+{
+ struct btrfs_device *dev;
+
+ if (!btrfs_test_opt(fs_info, DEGRADED)) {
+ btrfs_report_missing_device(fs_info, devid, uuid, true);
+ return ERR_PTR(-ENOENT);
+ }
+
+ dev = add_missing_dev(fs_info->fs_devices, devid, uuid);
+ if (IS_ERR(dev)) {
+ btrfs_err(fs_info, "failed to init missing device %llu: %ld",
+ devid, PTR_ERR(dev));
+ return dev;
+ }
+ btrfs_report_missing_device(fs_info, devid, uuid, false);
+
+ return dev;
+}
+
+static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk)
+{
+ BTRFS_DEV_LOOKUP_ARGS(args);
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
+ struct extent_map_tree *map_tree = &fs_info->mapping_tree;
+ struct map_lookup *map;
+ struct extent_map *em;
+ u64 logical;
+ u64 length;
+ u64 devid;
+ u64 type;
+ u8 uuid[BTRFS_UUID_SIZE];
+ int index;
+ int num_stripes;
+ int ret;
+ int i;
+
+ logical = key->offset;
+ length = btrfs_chunk_length(leaf, chunk);
+ type = btrfs_chunk_type(leaf, chunk);
+ index = btrfs_bg_flags_to_raid_index(type);
+ num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+
+#if BITS_PER_LONG == 32
+ ret = check_32bit_meta_chunk(fs_info, logical, length, type);
+ if (ret < 0)
+ return ret;
+ warn_32bit_meta_chunk(fs_info, logical, length, type);
+#endif
+
+ /*
+ * Only need to verify chunk item if we're reading from sys chunk array,
+ * as chunk item in tree block is already verified by tree-checker.
+ */
+ if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
+ ret = btrfs_check_chunk_valid(leaf, chunk, logical);
+ if (ret)
+ return ret;
+ }
+
+ read_lock(&map_tree->lock);
+ em = lookup_extent_mapping(map_tree, logical, 1);
+ read_unlock(&map_tree->lock);
+
+ /* already mapped? */
+ if (em && em->start <= logical && em->start + em->len > logical) {
+ free_extent_map(em);
+ return 0;
+ } else if (em) {
+ free_extent_map(em);
+ }
+
+ em = alloc_extent_map();
+ if (!em)
+ return -ENOMEM;
+ map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
+ if (!map) {
+ free_extent_map(em);
+ return -ENOMEM;
+ }
+
+ set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
+ em->map_lookup = map;
+ em->start = logical;
+ em->len = length;
+ em->orig_start = 0;
+ em->block_start = 0;
+ em->block_len = em->len;
+
+ map->num_stripes = num_stripes;
+ map->io_width = btrfs_chunk_io_width(leaf, chunk);
+ map->io_align = btrfs_chunk_io_align(leaf, chunk);
+ map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
+ map->type = type;
+ /*
+ * We can't use the sub_stripes value, as for profiles other than
+ * RAID10, they may have 0 as sub_stripes for filesystems created by
+ * older mkfs (<v5.4).
+ * In that case, it can cause divide-by-zero errors later.
+ * Since currently sub_stripes is fixed for each profile, let's
+ * use the trusted value instead.
+ */
+ map->sub_stripes = btrfs_raid_array[index].sub_stripes;
+ map->verified_stripes = 0;
+ em->orig_block_len = btrfs_calc_stripe_length(em);
+ for (i = 0; i < num_stripes; i++) {
+ map->stripes[i].physical =
+ btrfs_stripe_offset_nr(leaf, chunk, i);
+ devid = btrfs_stripe_devid_nr(leaf, chunk, i);
+ args.devid = devid;
+ read_extent_buffer(leaf, uuid, (unsigned long)
+ btrfs_stripe_dev_uuid_nr(chunk, i),
+ BTRFS_UUID_SIZE);
+ args.uuid = uuid;
+ map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args);
+ if (!map->stripes[i].dev) {
+ map->stripes[i].dev = handle_missing_device(fs_info,
+ devid, uuid);
+ if (IS_ERR(map->stripes[i].dev)) {
+ ret = PTR_ERR(map->stripes[i].dev);
+ free_extent_map(em);
+ return ret;
+ }
+ }
+
+ set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+ &(map->stripes[i].dev->dev_state));
+ }
+
+ write_lock(&map_tree->lock);
+ ret = add_extent_mapping(map_tree, em, 0);
+ write_unlock(&map_tree->lock);
+ if (ret < 0) {
+ btrfs_err(fs_info,
+ "failed to add chunk map, start=%llu len=%llu: %d",
+ em->start, em->len, ret);
+ }
+ free_extent_map(em);
+
+ return ret;
+}
+
+static void fill_device_from_item(struct extent_buffer *leaf,
+ struct btrfs_dev_item *dev_item,
+ struct btrfs_device *device)
+{
+ unsigned long ptr;
+
+ device->devid = btrfs_device_id(leaf, dev_item);
+ device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
+ device->total_bytes = device->disk_total_bytes;
+ device->commit_total_bytes = device->disk_total_bytes;
+ device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
+ device->commit_bytes_used = device->bytes_used;
+ device->type = btrfs_device_type(leaf, dev_item);
+ device->io_align = btrfs_device_io_align(leaf, dev_item);
+ device->io_width = btrfs_device_io_width(leaf, dev_item);
+ device->sector_size = btrfs_device_sector_size(leaf, dev_item);
+ WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
+ clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
+
+ ptr = btrfs_device_uuid(dev_item);
+ read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
+}
+
+static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
+ u8 *fsid)
+{
+ struct btrfs_fs_devices *fs_devices;
+ int ret;
+
+ lockdep_assert_held(&uuid_mutex);
+ ASSERT(fsid);
+
+ /* This will match only for multi-device seed fs */
+ list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
+ if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
+ return fs_devices;
+
+
+ fs_devices = find_fsid(fsid, NULL);
+ if (!fs_devices) {
+ if (!btrfs_test_opt(fs_info, DEGRADED))
+ return ERR_PTR(-ENOENT);
+
+ fs_devices = alloc_fs_devices(fsid, NULL);
+ if (IS_ERR(fs_devices))
+ return fs_devices;
+
+ fs_devices->seeding = true;
+ fs_devices->opened = 1;
+ return fs_devices;
+ }
+
+ /*
+ * Upon first call for a seed fs fsid, just create a private copy of the
+ * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
+ */
+ fs_devices = clone_fs_devices(fs_devices);
+ if (IS_ERR(fs_devices))
+ return fs_devices;
+
+ ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
+ if (ret) {
+ free_fs_devices(fs_devices);
+ return ERR_PTR(ret);
+ }
+
+ if (!fs_devices->seeding) {
+ close_fs_devices(fs_devices);
+ free_fs_devices(fs_devices);
+ return ERR_PTR(-EINVAL);
+ }
+
+ list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
+
+ return fs_devices;
+}
+
+static int read_one_dev(struct extent_buffer *leaf,
+ struct btrfs_dev_item *dev_item)
+{
+ BTRFS_DEV_LOOKUP_ARGS(args);
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
+ u64 devid;
+ int ret;
+ u8 fs_uuid[BTRFS_FSID_SIZE];
+ u8 dev_uuid[BTRFS_UUID_SIZE];
+
+ devid = btrfs_device_id(leaf, dev_item);
+ args.devid = devid;
+ read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
+ BTRFS_UUID_SIZE);
+ read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
+ BTRFS_FSID_SIZE);
+ args.uuid = dev_uuid;
+ args.fsid = fs_uuid;
+
+ if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
+ fs_devices = open_seed_devices(fs_info, fs_uuid);
+ if (IS_ERR(fs_devices))
+ return PTR_ERR(fs_devices);
+ }
+
+ device = btrfs_find_device(fs_info->fs_devices, &args);
+ if (!device) {
+ if (!btrfs_test_opt(fs_info, DEGRADED)) {
+ btrfs_report_missing_device(fs_info, devid,
+ dev_uuid, true);
+ return -ENOENT;
+ }
+
+ device = add_missing_dev(fs_devices, devid, dev_uuid);
+ if (IS_ERR(device)) {
+ btrfs_err(fs_info,
+ "failed to add missing dev %llu: %ld",
+ devid, PTR_ERR(device));
+ return PTR_ERR(device);
+ }
+ btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
+ } else {
+ if (!device->bdev) {
+ if (!btrfs_test_opt(fs_info, DEGRADED)) {
+ btrfs_report_missing_device(fs_info,
+ devid, dev_uuid, true);
+ return -ENOENT;
+ }
+ btrfs_report_missing_device(fs_info, devid,
+ dev_uuid, false);
+ }
+
+ if (!device->bdev &&
+ !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
+ /*
+ * this happens when a device that was properly setup
+ * in the device info lists suddenly goes bad.
+ * device->bdev is NULL, and so we have to set
+ * device->missing to one here
+ */
+ device->fs_devices->missing_devices++;
+ set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
+ }
+
+ /* Move the device to its own fs_devices */
+ if (device->fs_devices != fs_devices) {
+ ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
+ &device->dev_state));
+
+ list_move(&device->dev_list, &fs_devices->devices);
+ device->fs_devices->num_devices--;
+ fs_devices->num_devices++;
+
+ device->fs_devices->missing_devices--;
+ fs_devices->missing_devices++;
+
+ device->fs_devices = fs_devices;
+ }
+ }
+
+ if (device->fs_devices != fs_info->fs_devices) {
+ BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
+ if (device->generation !=
+ btrfs_device_generation(leaf, dev_item))
+ return -EINVAL;
+ }
+
+ fill_device_from_item(leaf, dev_item, device);
+ if (device->bdev) {
+ u64 max_total_bytes = bdev_nr_bytes(device->bdev);
+
+ if (device->total_bytes > max_total_bytes) {
+ btrfs_err(fs_info,
+ "device total_bytes should be at most %llu but found %llu",
+ max_total_bytes, device->total_bytes);
+ return -EINVAL;
+ }
+ }
+ set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
+ !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
+ device->fs_devices->total_rw_bytes += device->total_bytes;
+ atomic64_add(device->total_bytes - device->bytes_used,
+ &fs_info->free_chunk_space);
+ }
+ ret = 0;
+ return ret;
+}
+
+int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_super_block *super_copy = fs_info->super_copy;
+ struct extent_buffer *sb;
+ struct btrfs_disk_key *disk_key;
+ struct btrfs_chunk *chunk;
+ u8 *array_ptr;
+ unsigned long sb_array_offset;
+ int ret = 0;
+ u32 num_stripes;
+ u32 array_size;
+ u32 len = 0;
+ u32 cur_offset;
+ u64 type;
+ struct btrfs_key key;
+
+ ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
+
+ /*
+ * We allocated a dummy extent, just to use extent buffer accessors.
+ * There will be unused space after BTRFS_SUPER_INFO_SIZE, but
+ * that's fine, we will not go beyond system chunk array anyway.
+ */
+ sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET);
+ if (!sb)
+ return -ENOMEM;
+ set_extent_buffer_uptodate(sb);
+
+ write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
+ array_size = btrfs_super_sys_array_size(super_copy);
+
+ array_ptr = super_copy->sys_chunk_array;
+ sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
+ cur_offset = 0;
+
+ while (cur_offset < array_size) {
+ disk_key = (struct btrfs_disk_key *)array_ptr;
+ len = sizeof(*disk_key);
+ if (cur_offset + len > array_size)
+ goto out_short_read;
+
+ btrfs_disk_key_to_cpu(&key, disk_key);
+
+ array_ptr += len;
+ sb_array_offset += len;
+ cur_offset += len;
+
+ if (key.type != BTRFS_CHUNK_ITEM_KEY) {
+ btrfs_err(fs_info,
+ "unexpected item type %u in sys_array at offset %u",
+ (u32)key.type, cur_offset);
+ ret = -EIO;
+ break;
+ }
+
+ chunk = (struct btrfs_chunk *)sb_array_offset;
+ /*
+ * At least one btrfs_chunk with one stripe must be present,
+ * exact stripe count check comes afterwards
+ */
+ len = btrfs_chunk_item_size(1);
+ if (cur_offset + len > array_size)
+ goto out_short_read;
+
+ num_stripes = btrfs_chunk_num_stripes(sb, chunk);
+ if (!num_stripes) {
+ btrfs_err(fs_info,
+ "invalid number of stripes %u in sys_array at offset %u",
+ num_stripes, cur_offset);
+ ret = -EIO;
+ break;
+ }
+
+ type = btrfs_chunk_type(sb, chunk);
+ if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
+ btrfs_err(fs_info,
+ "invalid chunk type %llu in sys_array at offset %u",
+ type, cur_offset);
+ ret = -EIO;
+ break;
+ }
+
+ len = btrfs_chunk_item_size(num_stripes);
+ if (cur_offset + len > array_size)
+ goto out_short_read;
+
+ ret = read_one_chunk(&key, sb, chunk);
+ if (ret)
+ break;
+
+ array_ptr += len;
+ sb_array_offset += len;
+ cur_offset += len;
+ }
+ clear_extent_buffer_uptodate(sb);
+ free_extent_buffer_stale(sb);
+ return ret;
+
+out_short_read:
+ btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
+ len, cur_offset);
+ clear_extent_buffer_uptodate(sb);
+ free_extent_buffer_stale(sb);
+ return -EIO;
+}
+
+/*
+ * Check if all chunks in the fs are OK for read-write degraded mount
+ *
+ * If the @failing_dev is specified, it's accounted as missing.
+ *
+ * Return true if all chunks meet the minimal RW mount requirements.
+ * Return false if any chunk doesn't meet the minimal RW mount requirements.
+ */
+bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *failing_dev)
+{
+ struct extent_map_tree *map_tree = &fs_info->mapping_tree;
+ struct extent_map *em;
+ u64 next_start = 0;
+ bool ret = true;
+
+ read_lock(&map_tree->lock);
+ em = lookup_extent_mapping(map_tree, 0, (u64)-1);
+ read_unlock(&map_tree->lock);
+ /* No chunk at all? Return false anyway */
+ if (!em) {
+ ret = false;
+ goto out;
+ }
+ while (em) {
+ struct map_lookup *map;
+ int missing = 0;
+ int max_tolerated;
+ int i;
+
+ map = em->map_lookup;
+ max_tolerated =
+ btrfs_get_num_tolerated_disk_barrier_failures(
+ map->type);
+ for (i = 0; i < map->num_stripes; i++) {
+ struct btrfs_device *dev = map->stripes[i].dev;
+
+ if (!dev || !dev->bdev ||
+ test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
+ dev->last_flush_error)
+ missing++;
+ else if (failing_dev && failing_dev == dev)
+ missing++;
+ }
+ if (missing > max_tolerated) {
+ if (!failing_dev)
+ btrfs_warn(fs_info,
+ "chunk %llu missing %d devices, max tolerance is %d for writable mount",
+ em->start, missing, max_tolerated);
+ free_extent_map(em);
+ ret = false;
+ goto out;
+ }
+ next_start = extent_map_end(em);
+ free_extent_map(em);
+
+ read_lock(&map_tree->lock);
+ em = lookup_extent_mapping(map_tree, next_start,
+ (u64)(-1) - next_start);
+ read_unlock(&map_tree->lock);
+ }
+out:
+ return ret;
+}
+
+static void readahead_tree_node_children(struct extent_buffer *node)
+{
+ int i;
+ const int nr_items = btrfs_header_nritems(node);
+
+ for (i = 0; i < nr_items; i++)
+ btrfs_readahead_node_child(node, i);
+}
+
+int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root = fs_info->chunk_root;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ int ret;
+ int slot;
+ int iter_ret = 0;
+ u64 total_dev = 0;
+ u64 last_ra_node = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /*
+ * uuid_mutex is needed only if we are mounting a sprout FS
+ * otherwise we don't need it.
+ */
+ mutex_lock(&uuid_mutex);
+
+ /*
+ * It is possible for mount and umount to race in such a way that
+ * we execute this code path, but open_fs_devices failed to clear
+ * total_rw_bytes. We certainly want it cleared before reading the
+ * device items, so clear it here.
+ */
+ fs_info->fs_devices->total_rw_bytes = 0;
+
+ /*
+ * Lockdep complains about possible circular locking dependency between
+ * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores
+ * used for freeze procection of a fs (struct super_block.s_writers),
+ * which we take when starting a transaction, and extent buffers of the
+ * chunk tree if we call read_one_dev() while holding a lock on an
+ * extent buffer of the chunk tree. Since we are mounting the filesystem
+ * and at this point there can't be any concurrent task modifying the
+ * chunk tree, to keep it simple, just skip locking on the chunk tree.
+ */
+ ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
+ path->skip_locking = 1;
+
+ /*
+ * Read all device items, and then all the chunk items. All
+ * device items are found before any chunk item (their object id
+ * is smaller than the lowest possible object id for a chunk
+ * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
+ */
+ key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+ key.offset = 0;
+ key.type = 0;
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
+ struct extent_buffer *node = path->nodes[1];
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ if (node) {
+ if (last_ra_node != node->start) {
+ readahead_tree_node_children(node);
+ last_ra_node = node->start;
+ }
+ }
+ if (found_key.type == BTRFS_DEV_ITEM_KEY) {
+ struct btrfs_dev_item *dev_item;
+ dev_item = btrfs_item_ptr(leaf, slot,
+ struct btrfs_dev_item);
+ ret = read_one_dev(leaf, dev_item);
+ if (ret)
+ goto error;
+ total_dev++;
+ } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
+ struct btrfs_chunk *chunk;
+
+ /*
+ * We are only called at mount time, so no need to take
+ * fs_info->chunk_mutex. Plus, to avoid lockdep warnings,
+ * we always lock first fs_info->chunk_mutex before
+ * acquiring any locks on the chunk tree. This is a
+ * requirement for chunk allocation, see the comment on
+ * top of btrfs_chunk_alloc() for details.
+ */
+ chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+ ret = read_one_chunk(&found_key, leaf, chunk);
+ if (ret)
+ goto error;
+ }
+ }
+ /* Catch error found during iteration */
+ if (iter_ret < 0) {
+ ret = iter_ret;
+ goto error;
+ }
+
+ /*
+ * After loading chunk tree, we've got all device information,
+ * do another round of validation checks.
+ */
+ if (total_dev != fs_info->fs_devices->total_devices) {
+ btrfs_warn(fs_info,
+"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
+ btrfs_super_num_devices(fs_info->super_copy),
+ total_dev);
+ fs_info->fs_devices->total_devices = total_dev;
+ btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
+ }
+ if (btrfs_super_total_bytes(fs_info->super_copy) <
+ fs_info->fs_devices->total_rw_bytes) {
+ btrfs_err(fs_info,
+ "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
+ btrfs_super_total_bytes(fs_info->super_copy),
+ fs_info->fs_devices->total_rw_bytes);
+ ret = -EINVAL;
+ goto error;
+ }
+ ret = 0;
+error:
+ mutex_unlock(&uuid_mutex);
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
+ struct btrfs_device *device;
+ int ret = 0;
+
+ fs_devices->fs_info = fs_info;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list)
+ device->fs_info = fs_info;
+
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
+ list_for_each_entry(device, &seed_devs->devices, dev_list) {
+ device->fs_info = fs_info;
+ ret = btrfs_get_dev_zone_info(device, false);
+ if (ret)
+ break;
+ }
+
+ seed_devs->fs_info = fs_info;
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ return ret;
+}
+
+static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
+ const struct btrfs_dev_stats_item *ptr,
+ int index)
+{
+ u64 val;
+
+ read_extent_buffer(eb, &val,
+ offsetof(struct btrfs_dev_stats_item, values) +
+ ((unsigned long)ptr) + (index * sizeof(u64)),
+ sizeof(val));
+ return val;
+}
+
+static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
+ struct btrfs_dev_stats_item *ptr,
+ int index, u64 val)
+{
+ write_extent_buffer(eb, &val,
+ offsetof(struct btrfs_dev_stats_item, values) +
+ ((unsigned long)ptr) + (index * sizeof(u64)),
+ sizeof(val));
+}
+
+static int btrfs_device_init_dev_stats(struct btrfs_device *device,
+ struct btrfs_path *path)
+{
+ struct btrfs_dev_stats_item *ptr;
+ struct extent_buffer *eb;
+ struct btrfs_key key;
+ int item_size;
+ int i, ret, slot;
+
+ if (!device->fs_info->dev_root)
+ return 0;
+
+ key.objectid = BTRFS_DEV_STATS_OBJECTID;
+ key.type = BTRFS_PERSISTENT_ITEM_KEY;
+ key.offset = device->devid;
+ ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
+ if (ret) {
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+ btrfs_dev_stat_set(device, i, 0);
+ device->dev_stats_valid = 1;
+ btrfs_release_path(path);
+ return ret < 0 ? ret : 0;
+ }
+ slot = path->slots[0];
+ eb = path->nodes[0];
+ item_size = btrfs_item_size(eb, slot);
+
+ ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
+
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
+ if (item_size >= (1 + i) * sizeof(__le64))
+ btrfs_dev_stat_set(device, i,
+ btrfs_dev_stats_value(eb, ptr, i));
+ else
+ btrfs_dev_stat_set(device, i, 0);
+ }
+
+ device->dev_stats_valid = 1;
+ btrfs_dev_stat_print_on_load(device);
+ btrfs_release_path(path);
+
+ return 0;
+}
+
+int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
+ struct btrfs_device *device;
+ struct btrfs_path *path = NULL;
+ int ret = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ ret = btrfs_device_init_dev_stats(device, path);
+ if (ret)
+ goto out;
+ }
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
+ list_for_each_entry(device, &seed_devs->devices, dev_list) {
+ ret = btrfs_device_init_dev_stats(device, path);
+ if (ret)
+ goto out;
+ }
+ }
+out:
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int update_dev_stat_item(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *dev_root = fs_info->dev_root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *eb;
+ struct btrfs_dev_stats_item *ptr;
+ int ret;
+ int i;
+
+ key.objectid = BTRFS_DEV_STATS_OBJECTID;
+ key.type = BTRFS_PERSISTENT_ITEM_KEY;
+ key.offset = device->devid;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
+ if (ret < 0) {
+ btrfs_warn_in_rcu(fs_info,
+ "error %d while searching for dev_stats item for device %s",
+ ret, rcu_str_deref(device->name));
+ goto out;
+ }
+
+ if (ret == 0 &&
+ btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
+ /* need to delete old one and insert a new one */
+ ret = btrfs_del_item(trans, dev_root, path);
+ if (ret != 0) {
+ btrfs_warn_in_rcu(fs_info,
+ "delete too small dev_stats item for device %s failed %d",
+ rcu_str_deref(device->name), ret);
+ goto out;
+ }
+ ret = 1;
+ }
+
+ if (ret == 1) {
+ /* need to insert a new item */
+ btrfs_release_path(path);
+ ret = btrfs_insert_empty_item(trans, dev_root, path,
+ &key, sizeof(*ptr));
+ if (ret < 0) {
+ btrfs_warn_in_rcu(fs_info,
+ "insert dev_stats item for device %s failed %d",
+ rcu_str_deref(device->name), ret);
+ goto out;
+ }
+ }
+
+ eb = path->nodes[0];
+ ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+ btrfs_set_dev_stats_value(eb, ptr, i,
+ btrfs_dev_stat_read(device, i));
+ btrfs_mark_buffer_dirty(eb);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * called from commit_transaction. Writes all changed device stats to disk.
+ */
+int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
+ int stats_cnt;
+ int ret = 0;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ stats_cnt = atomic_read(&device->dev_stats_ccnt);
+ if (!device->dev_stats_valid || stats_cnt == 0)
+ continue;
+
+
+ /*
+ * There is a LOAD-LOAD control dependency between the value of
+ * dev_stats_ccnt and updating the on-disk values which requires
+ * reading the in-memory counters. Such control dependencies
+ * require explicit read memory barriers.
+ *
+ * This memory barriers pairs with smp_mb__before_atomic in
+ * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
+ * barrier implied by atomic_xchg in
+ * btrfs_dev_stats_read_and_reset
+ */
+ smp_rmb();
+
+ ret = update_dev_stat_item(trans, device);
+ if (!ret)
+ atomic_sub(stats_cnt, &device->dev_stats_ccnt);
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ return ret;
+}
+
+void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
+{
+ btrfs_dev_stat_inc(dev, index);
+
+ if (!dev->dev_stats_valid)
+ return;
+ btrfs_err_rl_in_rcu(dev->fs_info,
+ "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
+ rcu_str_deref(dev->name),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
+}
+
+static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
+{
+ int i;
+
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+ if (btrfs_dev_stat_read(dev, i) != 0)
+ break;
+ if (i == BTRFS_DEV_STAT_VALUES_MAX)
+ return; /* all values == 0, suppress message */
+
+ btrfs_info_in_rcu(dev->fs_info,
+ "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
+ rcu_str_deref(dev->name),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
+}
+
+int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_get_dev_stats *stats)
+{
+ BTRFS_DEV_LOOKUP_ARGS(args);
+ struct btrfs_device *dev;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ int i;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ args.devid = stats->devid;
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ if (!dev) {
+ btrfs_warn(fs_info, "get dev_stats failed, device not found");
+ return -ENODEV;
+ } else if (!dev->dev_stats_valid) {
+ btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
+ return -ENODEV;
+ } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
+ if (stats->nr_items > i)
+ stats->values[i] =
+ btrfs_dev_stat_read_and_reset(dev, i);
+ else
+ btrfs_dev_stat_set(dev, i, 0);
+ }
+ btrfs_info(fs_info, "device stats zeroed by %s (%d)",
+ current->comm, task_pid_nr(current));
+ } else {
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+ if (stats->nr_items > i)
+ stats->values[i] = btrfs_dev_stat_read(dev, i);
+ }
+ if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
+ stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
+ return 0;
+}
+
+/*
+ * Update the size and bytes used for each device where it changed. This is
+ * delayed since we would otherwise get errors while writing out the
+ * superblocks.
+ *
+ * Must be invoked during transaction commit.
+ */
+void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
+{
+ struct btrfs_device *curr, *next;
+
+ ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
+
+ if (list_empty(&trans->dev_update_list))
+ return;
+
+ /*
+ * We don't need the device_list_mutex here. This list is owned by the
+ * transaction and the transaction must complete before the device is
+ * released.
+ */
+ mutex_lock(&trans->fs_info->chunk_mutex);
+ list_for_each_entry_safe(curr, next, &trans->dev_update_list,
+ post_commit_list) {
+ list_del_init(&curr->post_commit_list);
+ curr->commit_total_bytes = curr->disk_total_bytes;
+ curr->commit_bytes_used = curr->bytes_used;
+ }
+ mutex_unlock(&trans->fs_info->chunk_mutex);
+}
+
+/*
+ * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
+ */
+int btrfs_bg_type_to_factor(u64 flags)
+{
+ const int index = btrfs_bg_flags_to_raid_index(flags);
+
+ return btrfs_raid_array[index].ncopies;
+}
+
+
+
+static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
+ u64 chunk_offset, u64 devid,
+ u64 physical_offset, u64 physical_len)
+{
+ struct btrfs_dev_lookup_args args = { .devid = devid };
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct btrfs_device *dev;
+ u64 stripe_len;
+ bool found = false;
+ int ret = 0;
+ int i;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, chunk_offset, 1);
+ read_unlock(&em_tree->lock);
+
+ if (!em) {
+ btrfs_err(fs_info,
+"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
+ physical_offset, devid);
+ ret = -EUCLEAN;
+ goto out;
+ }
+
+ map = em->map_lookup;
+ stripe_len = btrfs_calc_stripe_length(em);
+ if (physical_len != stripe_len) {
+ btrfs_err(fs_info,
+"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
+ physical_offset, devid, em->start, physical_len,
+ stripe_len);
+ ret = -EUCLEAN;
+ goto out;
+ }
+
+ /*
+ * Very old mkfs.btrfs (before v4.1) will not respect the reserved
+ * space. Although kernel can handle it without problem, better to warn
+ * the users.
+ */
+ if (physical_offset < BTRFS_DEVICE_RANGE_RESERVED)
+ btrfs_warn(fs_info,
+ "devid %llu physical %llu len %llu inside the reserved space",
+ devid, physical_offset, physical_len);
+
+ for (i = 0; i < map->num_stripes; i++) {
+ if (map->stripes[i].dev->devid == devid &&
+ map->stripes[i].physical == physical_offset) {
+ found = true;
+ if (map->verified_stripes >= map->num_stripes) {
+ btrfs_err(fs_info,
+ "too many dev extents for chunk %llu found",
+ em->start);
+ ret = -EUCLEAN;
+ goto out;
+ }
+ map->verified_stripes++;
+ break;
+ }
+ }
+ if (!found) {
+ btrfs_err(fs_info,
+ "dev extent physical offset %llu devid %llu has no corresponding chunk",
+ physical_offset, devid);
+ ret = -EUCLEAN;
+ }
+
+ /* Make sure no dev extent is beyond device boundary */
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
+ if (!dev) {
+ btrfs_err(fs_info, "failed to find devid %llu", devid);
+ ret = -EUCLEAN;
+ goto out;
+ }
+
+ if (physical_offset + physical_len > dev->disk_total_bytes) {
+ btrfs_err(fs_info,
+"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
+ devid, physical_offset, physical_len,
+ dev->disk_total_bytes);
+ ret = -EUCLEAN;
+ goto out;
+ }
+
+ if (dev->zone_info) {
+ u64 zone_size = dev->zone_info->zone_size;
+
+ if (!IS_ALIGNED(physical_offset, zone_size) ||
+ !IS_ALIGNED(physical_len, zone_size)) {
+ btrfs_err(fs_info,
+"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
+ devid, physical_offset, physical_len);
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+
+out:
+ free_extent_map(em);
+ return ret;
+}
+
+static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
+{
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree;
+ struct extent_map *em;
+ struct rb_node *node;
+ int ret = 0;
+
+ read_lock(&em_tree->lock);
+ for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
+ em = rb_entry(node, struct extent_map, rb_node);
+ if (em->map_lookup->num_stripes !=
+ em->map_lookup->verified_stripes) {
+ btrfs_err(fs_info,
+ "chunk %llu has missing dev extent, have %d expect %d",
+ em->start, em->map_lookup->verified_stripes,
+ em->map_lookup->num_stripes);
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+out:
+ read_unlock(&em_tree->lock);
+ return ret;
+}
+
+/*
+ * Ensure that all dev extents are mapped to correct chunk, otherwise
+ * later chunk allocation/free would cause unexpected behavior.
+ *
+ * NOTE: This will iterate through the whole device tree, which should be of
+ * the same size level as the chunk tree. This slightly increases mount time.
+ */
+int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_path *path;
+ struct btrfs_root *root = fs_info->dev_root;
+ struct btrfs_key key;
+ u64 prev_devid = 0;
+ u64 prev_dev_ext_end = 0;
+ int ret = 0;
+
+ /*
+ * We don't have a dev_root because we mounted with ignorebadroots and
+ * failed to load the root, so we want to skip the verification in this
+ * case for sure.
+ *
+ * However if the dev root is fine, but the tree itself is corrupted
+ * we'd still fail to mount. This verification is only to make sure
+ * writes can happen safely, so instead just bypass this check
+ * completely in the case of IGNOREBADROOTS.
+ */
+ if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
+ return 0;
+
+ key.objectid = 1;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+ key.offset = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->reada = READA_FORWARD;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ /* No dev extents at all? Not good */
+ if (ret > 0) {
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+ while (1) {
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_dev_extent *dext;
+ int slot = path->slots[0];
+ u64 chunk_offset;
+ u64 physical_offset;
+ u64 physical_len;
+ u64 devid;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.type != BTRFS_DEV_EXTENT_KEY)
+ break;
+ devid = key.objectid;
+ physical_offset = key.offset;
+
+ dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
+ chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
+ physical_len = btrfs_dev_extent_length(leaf, dext);
+
+ /* Check if this dev extent overlaps with the previous one */
+ if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
+ btrfs_err(fs_info,
+"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
+ devid, physical_offset, prev_dev_ext_end);
+ ret = -EUCLEAN;
+ goto out;
+ }
+
+ ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
+ physical_offset, physical_len);
+ if (ret < 0)
+ goto out;
+ prev_devid = devid;
+ prev_dev_ext_end = physical_offset + physical_len;
+
+ ret = btrfs_next_item(root, path);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ }
+
+ /* Ensure all chunks have corresponding dev extents */
+ ret = verify_chunk_dev_extent_mapping(fs_info);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Check whether the given block group or device is pinned by any inode being
+ * used as a swapfile.
+ */
+bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
+{
+ struct btrfs_swapfile_pin *sp;
+ struct rb_node *node;
+
+ spin_lock(&fs_info->swapfile_pins_lock);
+ node = fs_info->swapfile_pins.rb_node;
+ while (node) {
+ sp = rb_entry(node, struct btrfs_swapfile_pin, node);
+ if (ptr < sp->ptr)
+ node = node->rb_left;
+ else if (ptr > sp->ptr)
+ node = node->rb_right;
+ else
+ break;
+ }
+ spin_unlock(&fs_info->swapfile_pins_lock);
+ return node != NULL;
+}
+
+static int relocating_repair_kthread(void *data)
+{
+ struct btrfs_block_group *cache = data;
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ u64 target;
+ int ret = 0;
+
+ target = cache->start;
+ btrfs_put_block_group(cache);
+
+ sb_start_write(fs_info->sb);
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
+ btrfs_info(fs_info,
+ "zoned: skip relocating block group %llu to repair: EBUSY",
+ target);
+ sb_end_write(fs_info->sb);
+ return -EBUSY;
+ }
+
+ mutex_lock(&fs_info->reclaim_bgs_lock);
+
+ /* Ensure block group still exists */
+ cache = btrfs_lookup_block_group(fs_info, target);
+ if (!cache)
+ goto out;
+
+ if (!test_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags))
+ goto out;
+
+ ret = btrfs_may_alloc_data_chunk(fs_info, target);
+ if (ret < 0)
+ goto out;
+
+ btrfs_info(fs_info,
+ "zoned: relocating block group %llu to repair IO failure",
+ target);
+ ret = btrfs_relocate_chunk(fs_info, target);
+
+out:
+ if (cache)
+ btrfs_put_block_group(cache);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ btrfs_exclop_finish(fs_info);
+ sb_end_write(fs_info->sb);
+
+ return ret;
+}
+
+bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
+{
+ struct btrfs_block_group *cache;
+
+ if (!btrfs_is_zoned(fs_info))
+ return false;
+
+ /* Do not attempt to repair in degraded state */
+ if (btrfs_test_opt(fs_info, DEGRADED))
+ return true;
+
+ cache = btrfs_lookup_block_group(fs_info, logical);
+ if (!cache)
+ return true;
+
+ if (test_and_set_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) {
+ btrfs_put_block_group(cache);
+ return true;
+ }
+
+ kthread_run(relocating_repair_kthread, cache,
+ "btrfs-relocating-repair");
+
+ return true;
+}
+
+int __init btrfs_bioset_init(void)
+{
+ if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
+ offsetof(struct btrfs_bio, bio),
+ BIOSET_NEED_BVECS))
+ return -ENOMEM;
+ return 0;
+}
+
+void __cold btrfs_bioset_exit(void)
+{
+ bioset_exit(&btrfs_bioset);
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
new file mode 100644
index 000000000..ec2260177
--- /dev/null
+++ b/fs/btrfs/volumes.h
@@ -0,0 +1,762 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ */
+
+#ifndef BTRFS_VOLUMES_H
+#define BTRFS_VOLUMES_H
+
+#include <linux/bio.h>
+#include <linux/sort.h>
+#include <linux/btrfs.h>
+#include "async-thread.h"
+
+#define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G)
+
+extern struct mutex uuid_mutex;
+
+#define BTRFS_STRIPE_LEN SZ_64K
+
+/* Used by sanity check for btrfs_raid_types. */
+#define const_ffs(n) (__builtin_ctzll(n) + 1)
+
+/*
+ * The conversion from BTRFS_BLOCK_GROUP_* bits to btrfs_raid_type requires
+ * RAID0 always to be the lowest profile bit.
+ * Although it's part of on-disk format and should never change, do extra
+ * compile-time sanity checks.
+ */
+static_assert(const_ffs(BTRFS_BLOCK_GROUP_RAID0) <
+ const_ffs(BTRFS_BLOCK_GROUP_PROFILE_MASK & ~BTRFS_BLOCK_GROUP_RAID0));
+static_assert(const_ilog2(BTRFS_BLOCK_GROUP_RAID0) >
+ ilog2(BTRFS_BLOCK_GROUP_TYPE_MASK));
+
+/* ilog2() can handle both constants and variables */
+#define BTRFS_BG_FLAG_TO_INDEX(profile) \
+ ilog2((profile) >> (ilog2(BTRFS_BLOCK_GROUP_RAID0) - 1))
+
+enum btrfs_raid_types {
+ /* SINGLE is the special one as it doesn't have on-disk bit. */
+ BTRFS_RAID_SINGLE = 0,
+
+ BTRFS_RAID_RAID0 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID0),
+ BTRFS_RAID_RAID1 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1),
+ BTRFS_RAID_DUP = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_DUP),
+ BTRFS_RAID_RAID10 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID10),
+ BTRFS_RAID_RAID5 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID5),
+ BTRFS_RAID_RAID6 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID6),
+ BTRFS_RAID_RAID1C3 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1C3),
+ BTRFS_RAID_RAID1C4 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1C4),
+
+ BTRFS_NR_RAID_TYPES
+};
+
+struct btrfs_io_geometry {
+ /* remaining bytes before crossing a stripe */
+ u64 len;
+ /* offset of logical address in chunk */
+ u64 offset;
+ /* length of single IO stripe */
+ u32 stripe_len;
+ /* offset of address in stripe */
+ u32 stripe_offset;
+ /* number of stripe where address falls */
+ u64 stripe_nr;
+ /* offset of raid56 stripe into the chunk */
+ u64 raid56_stripe_offset;
+};
+
+/*
+ * Use sequence counter to get consistent device stat data on
+ * 32-bit processors.
+ */
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+#include <linux/seqlock.h>
+#define __BTRFS_NEED_DEVICE_DATA_ORDERED
+#define btrfs_device_data_ordered_init(device) \
+ seqcount_init(&device->data_seqcount)
+#else
+#define btrfs_device_data_ordered_init(device) do { } while (0)
+#endif
+
+#define BTRFS_DEV_STATE_WRITEABLE (0)
+#define BTRFS_DEV_STATE_IN_FS_METADATA (1)
+#define BTRFS_DEV_STATE_MISSING (2)
+#define BTRFS_DEV_STATE_REPLACE_TGT (3)
+#define BTRFS_DEV_STATE_FLUSH_SENT (4)
+#define BTRFS_DEV_STATE_NO_READA (5)
+
+struct btrfs_zoned_device_info;
+
+struct btrfs_device {
+ struct list_head dev_list; /* device_list_mutex */
+ struct list_head dev_alloc_list; /* chunk mutex */
+ struct list_head post_commit_list; /* chunk mutex */
+ struct btrfs_fs_devices *fs_devices;
+ struct btrfs_fs_info *fs_info;
+
+ struct rcu_string __rcu *name;
+
+ u64 generation;
+
+ struct block_device *bdev;
+
+ struct btrfs_zoned_device_info *zone_info;
+
+ /* the mode sent to blkdev_get */
+ fmode_t mode;
+
+ /*
+ * Device's major-minor number. Must be set even if the device is not
+ * opened (bdev == NULL), unless the device is missing.
+ */
+ dev_t devt;
+ unsigned long dev_state;
+ blk_status_t last_flush_error;
+
+#ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED
+ seqcount_t data_seqcount;
+#endif
+
+ /* the internal btrfs device id */
+ u64 devid;
+
+ /* size of the device in memory */
+ u64 total_bytes;
+
+ /* size of the device on disk */
+ u64 disk_total_bytes;
+
+ /* bytes used */
+ u64 bytes_used;
+
+ /* optimal io alignment for this device */
+ u32 io_align;
+
+ /* optimal io width for this device */
+ u32 io_width;
+ /* type and info about this device */
+ u64 type;
+
+ /* minimal io size for this device */
+ u32 sector_size;
+
+ /* physical drive uuid (or lvm uuid) */
+ u8 uuid[BTRFS_UUID_SIZE];
+
+ /*
+ * size of the device on the current transaction
+ *
+ * This variant is update when committing the transaction,
+ * and protected by chunk mutex
+ */
+ u64 commit_total_bytes;
+
+ /* bytes used on the current transaction */
+ u64 commit_bytes_used;
+
+ /* Bio used for flushing device barriers */
+ struct bio flush_bio;
+ struct completion flush_wait;
+
+ /* per-device scrub information */
+ struct scrub_ctx *scrub_ctx;
+
+ /* disk I/O failure stats. For detailed description refer to
+ * enum btrfs_dev_stat_values in ioctl.h */
+ int dev_stats_valid;
+
+ /* Counter to record the change of device stats */
+ atomic_t dev_stats_ccnt;
+ atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
+
+ struct extent_io_tree alloc_state;
+
+ struct completion kobj_unregister;
+ /* For sysfs/FSID/devinfo/devid/ */
+ struct kobject devid_kobj;
+
+ /* Bandwidth limit for scrub, in bytes */
+ u64 scrub_speed_max;
+};
+
+/*
+ * Block group or device which contains an active swapfile. Used for preventing
+ * unsafe operations while a swapfile is active.
+ *
+ * These are sorted on (ptr, inode) (note that a block group or device can
+ * contain more than one swapfile). We compare the pointer values because we
+ * don't actually care what the object is, we just need a quick check whether
+ * the object exists in the rbtree.
+ */
+struct btrfs_swapfile_pin {
+ struct rb_node node;
+ void *ptr;
+ struct inode *inode;
+ /*
+ * If true, ptr points to a struct btrfs_block_group. Otherwise, ptr
+ * points to a struct btrfs_device.
+ */
+ bool is_block_group;
+ /*
+ * Only used when 'is_block_group' is true and it is the number of
+ * extents used by a swapfile for this block group ('ptr' field).
+ */
+ int bg_extent_count;
+};
+
+/*
+ * If we read those variants at the context of their own lock, we needn't
+ * use the following helpers, reading them directly is safe.
+ */
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+#define BTRFS_DEVICE_GETSET_FUNCS(name) \
+static inline u64 \
+btrfs_device_get_##name(const struct btrfs_device *dev) \
+{ \
+ u64 size; \
+ unsigned int seq; \
+ \
+ do { \
+ seq = read_seqcount_begin(&dev->data_seqcount); \
+ size = dev->name; \
+ } while (read_seqcount_retry(&dev->data_seqcount, seq)); \
+ return size; \
+} \
+ \
+static inline void \
+btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \
+{ \
+ preempt_disable(); \
+ write_seqcount_begin(&dev->data_seqcount); \
+ dev->name = size; \
+ write_seqcount_end(&dev->data_seqcount); \
+ preempt_enable(); \
+}
+#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION)
+#define BTRFS_DEVICE_GETSET_FUNCS(name) \
+static inline u64 \
+btrfs_device_get_##name(const struct btrfs_device *dev) \
+{ \
+ u64 size; \
+ \
+ preempt_disable(); \
+ size = dev->name; \
+ preempt_enable(); \
+ return size; \
+} \
+ \
+static inline void \
+btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \
+{ \
+ preempt_disable(); \
+ dev->name = size; \
+ preempt_enable(); \
+}
+#else
+#define BTRFS_DEVICE_GETSET_FUNCS(name) \
+static inline u64 \
+btrfs_device_get_##name(const struct btrfs_device *dev) \
+{ \
+ return dev->name; \
+} \
+ \
+static inline void \
+btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \
+{ \
+ dev->name = size; \
+}
+#endif
+
+BTRFS_DEVICE_GETSET_FUNCS(total_bytes);
+BTRFS_DEVICE_GETSET_FUNCS(disk_total_bytes);
+BTRFS_DEVICE_GETSET_FUNCS(bytes_used);
+
+enum btrfs_chunk_allocation_policy {
+ BTRFS_CHUNK_ALLOC_REGULAR,
+ BTRFS_CHUNK_ALLOC_ZONED,
+};
+
+/*
+ * Read policies for mirrored block group profiles, read picks the stripe based
+ * on these policies.
+ */
+enum btrfs_read_policy {
+ /* Use process PID to choose the stripe */
+ BTRFS_READ_POLICY_PID,
+ BTRFS_NR_READ_POLICY,
+};
+
+struct btrfs_fs_devices {
+ u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
+ u8 metadata_uuid[BTRFS_FSID_SIZE];
+ bool fsid_change;
+ struct list_head fs_list;
+
+ /*
+ * Number of devices under this fsid including missing and
+ * replace-target device and excludes seed devices.
+ */
+ u64 num_devices;
+
+ /*
+ * The number of devices that successfully opened, including
+ * replace-target, excludes seed devices.
+ */
+ u64 open_devices;
+
+ /* The number of devices that are under the chunk allocation list. */
+ u64 rw_devices;
+
+ /* Count of missing devices under this fsid excluding seed device. */
+ u64 missing_devices;
+ u64 total_rw_bytes;
+
+ /*
+ * Count of devices from btrfs_super_block::num_devices for this fsid,
+ * which includes the seed device, excludes the transient replace-target
+ * device.
+ */
+ u64 total_devices;
+
+ /* Highest generation number of seen devices */
+ u64 latest_generation;
+
+ /*
+ * The mount device or a device with highest generation after removal
+ * or replace.
+ */
+ struct btrfs_device *latest_dev;
+
+ /* all of the devices in the FS, protected by a mutex
+ * so we can safely walk it to write out the supers without
+ * worrying about add/remove by the multi-device code.
+ * Scrubbing super can kick off supers writing by holding
+ * this mutex lock.
+ */
+ struct mutex device_list_mutex;
+
+ /* List of all devices, protected by device_list_mutex */
+ struct list_head devices;
+
+ /*
+ * Devices which can satisfy space allocation. Protected by
+ * chunk_mutex
+ */
+ struct list_head alloc_list;
+
+ struct list_head seed_list;
+ bool seeding;
+
+ int opened;
+
+ /* set when we find or add a device that doesn't have the
+ * nonrot flag set
+ */
+ bool rotating;
+
+ struct btrfs_fs_info *fs_info;
+ /* sysfs kobjects */
+ struct kobject fsid_kobj;
+ struct kobject *devices_kobj;
+ struct kobject *devinfo_kobj;
+ struct completion kobj_unregister;
+
+ enum btrfs_chunk_allocation_policy chunk_alloc_policy;
+
+ /* Policy used to read the mirrored stripes */
+ enum btrfs_read_policy read_policy;
+};
+
+#define BTRFS_BIO_INLINE_CSUM_SIZE 64
+
+#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \
+ - sizeof(struct btrfs_chunk)) \
+ / sizeof(struct btrfs_stripe) + 1)
+
+#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \
+ - 2 * sizeof(struct btrfs_disk_key) \
+ - 2 * sizeof(struct btrfs_chunk)) \
+ / sizeof(struct btrfs_stripe) + 1)
+
+/*
+ * Maximum number of sectors for a single bio to limit the size of the
+ * checksum array. This matches the number of bio_vecs per bio and thus the
+ * I/O size for buffered I/O.
+ */
+#define BTRFS_MAX_BIO_SECTORS (256)
+
+typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio);
+
+/*
+ * Additional info to pass along bio.
+ *
+ * Mostly for btrfs specific features like csum and mirror_num.
+ */
+struct btrfs_bio {
+ unsigned int mirror_num;
+ struct bvec_iter iter;
+
+ /* for direct I/O */
+ u64 file_offset;
+
+ /* @device is for stripe IO submission. */
+ struct btrfs_device *device;
+ u8 *csum;
+ u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
+
+ /* End I/O information supplied to btrfs_bio_alloc */
+ btrfs_bio_end_io_t end_io;
+ void *private;
+
+ /* For read end I/O handling */
+ struct work_struct end_io_work;
+
+ /*
+ * This member must come last, bio_alloc_bioset will allocate enough
+ * bytes for entire btrfs_bio but relies on bio being last.
+ */
+ struct bio bio;
+};
+
+static inline struct btrfs_bio *btrfs_bio(struct bio *bio)
+{
+ return container_of(bio, struct btrfs_bio, bio);
+}
+
+int __init btrfs_bioset_init(void);
+void __cold btrfs_bioset_exit(void);
+
+struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
+ btrfs_bio_end_io_t end_io, void *private);
+struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size,
+ btrfs_bio_end_io_t end_io, void *private);
+
+static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
+{
+ bbio->bio.bi_status = status;
+ bbio->end_io(bbio);
+}
+
+static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio)
+{
+ if (bbio->csum != bbio->csum_inline) {
+ kfree(bbio->csum);
+ bbio->csum = NULL;
+ }
+}
+
+/*
+ * Iterate through a btrfs_bio (@bbio) on a per-sector basis.
+ *
+ * bvl - struct bio_vec
+ * bbio - struct btrfs_bio
+ * iters - struct bvec_iter
+ * bio_offset - unsigned int
+ */
+#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \
+ for ((iter) = (bbio)->iter, (bio_offset) = 0; \
+ (iter).bi_size && \
+ (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \
+ (bio_offset) += fs_info->sectorsize, \
+ bio_advance_iter_single(&(bbio)->bio, &(iter), \
+ (fs_info)->sectorsize))
+
+struct btrfs_io_stripe {
+ struct btrfs_device *dev;
+ union {
+ /* Block mapping */
+ u64 physical;
+ /* For the endio handler */
+ struct btrfs_io_context *bioc;
+ };
+};
+
+struct btrfs_discard_stripe {
+ struct btrfs_device *dev;
+ u64 physical;
+ u64 length;
+};
+
+/*
+ * Context for IO subsmission for device stripe.
+ *
+ * - Track the unfinished mirrors for mirror based profiles
+ * Mirror based profiles are SINGLE/DUP/RAID1/RAID10.
+ *
+ * - Contain the logical -> physical mapping info
+ * Used by submit_stripe_bio() for mapping logical bio
+ * into physical device address.
+ *
+ * - Contain device replace info
+ * Used by handle_ops_on_dev_replace() to copy logical bios
+ * into the new device.
+ *
+ * - Contain RAID56 full stripe logical bytenrs
+ */
+struct btrfs_io_context {
+ refcount_t refs;
+ struct btrfs_fs_info *fs_info;
+ u64 map_type; /* get from map_lookup->type */
+ struct bio *orig_bio;
+ atomic_t error;
+ int max_errors;
+ int num_stripes;
+ int mirror_num;
+ int num_tgtdevs;
+ int *tgtdev_map;
+ /*
+ * logical block numbers for the start of each stripe
+ * The last one or two are p/q. These are sorted,
+ * so raid_map[0] is the start of our full stripe
+ */
+ u64 *raid_map;
+ struct btrfs_io_stripe stripes[];
+};
+
+struct btrfs_device_info {
+ struct btrfs_device *dev;
+ u64 dev_offset;
+ u64 max_avail;
+ u64 total_avail;
+};
+
+struct btrfs_raid_attr {
+ u8 sub_stripes; /* sub_stripes info for map */
+ u8 dev_stripes; /* stripes per dev */
+ u8 devs_max; /* max devs to use */
+ u8 devs_min; /* min devs needed */
+ u8 tolerated_failures; /* max tolerated fail devs */
+ u8 devs_increment; /* ndevs has to be a multiple of this */
+ u8 ncopies; /* how many copies to data has */
+ u8 nparity; /* number of stripes worth of bytes to store
+ * parity information */
+ u8 mindev_error; /* error code if min devs requisite is unmet */
+ const char raid_name[8]; /* name of the raid */
+ u64 bg_flag; /* block group flag of the raid */
+};
+
+extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES];
+
+struct map_lookup {
+ u64 type;
+ int io_align;
+ int io_width;
+ u32 stripe_len;
+ int num_stripes;
+ int sub_stripes;
+ int verified_stripes; /* For mount time dev extent verification */
+ struct btrfs_io_stripe stripes[];
+};
+
+#define map_lookup_size(n) (sizeof(struct map_lookup) + \
+ (sizeof(struct btrfs_io_stripe) * (n)))
+
+struct btrfs_balance_args;
+struct btrfs_balance_progress;
+struct btrfs_balance_control {
+ struct btrfs_balance_args data;
+ struct btrfs_balance_args meta;
+ struct btrfs_balance_args sys;
+
+ u64 flags;
+
+ struct btrfs_balance_progress stat;
+};
+
+/*
+ * Search for a given device by the set parameters
+ */
+struct btrfs_dev_lookup_args {
+ u64 devid;
+ u8 *uuid;
+ u8 *fsid;
+ bool missing;
+};
+
+/* We have to initialize to -1 because BTRFS_DEV_REPLACE_DEVID is 0 */
+#define BTRFS_DEV_LOOKUP_ARGS_INIT { .devid = (u64)-1 }
+
+#define BTRFS_DEV_LOOKUP_ARGS(name) \
+ struct btrfs_dev_lookup_args name = BTRFS_DEV_LOOKUP_ARGS_INIT
+
+enum btrfs_map_op {
+ BTRFS_MAP_READ,
+ BTRFS_MAP_WRITE,
+ BTRFS_MAP_DISCARD,
+ BTRFS_MAP_GET_READ_MIRRORS,
+};
+
+static inline enum btrfs_map_op btrfs_op(struct bio *bio)
+{
+ switch (bio_op(bio)) {
+ case REQ_OP_DISCARD:
+ return BTRFS_MAP_DISCARD;
+ case REQ_OP_WRITE:
+ case REQ_OP_ZONE_APPEND:
+ return BTRFS_MAP_WRITE;
+ default:
+ WARN_ON_ONCE(1);
+ fallthrough;
+ case REQ_OP_READ:
+ return BTRFS_MAP_READ;
+ }
+}
+
+void btrfs_get_bioc(struct btrfs_io_context *bioc);
+void btrfs_put_bioc(struct btrfs_io_context *bioc);
+int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
+ u64 logical, u64 *length,
+ struct btrfs_io_context **bioc_ret, int mirror_num);
+int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
+ u64 logical, u64 *length,
+ struct btrfs_io_context **bioc_ret);
+struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 *length_ret,
+ u32 *num_stripes);
+int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map,
+ enum btrfs_map_op op, u64 logical,
+ struct btrfs_io_geometry *io_geom);
+int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
+int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
+struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
+ u64 type);
+void btrfs_mapping_tree_free(struct extent_map_tree *tree);
+void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num);
+int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+ fmode_t flags, void *holder);
+struct btrfs_device *btrfs_scan_one_device(const char *path,
+ fmode_t flags, void *holder);
+int btrfs_forget_devices(dev_t devt);
+void btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
+void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices);
+void btrfs_assign_next_active_device(struct btrfs_device *device,
+ struct btrfs_device *this_dev);
+struct btrfs_device *btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info,
+ u64 devid,
+ const char *devpath);
+int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
+ struct btrfs_dev_lookup_args *args,
+ const char *path);
+struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
+ const u64 *devid,
+ const u8 *uuid);
+void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args);
+void btrfs_free_device(struct btrfs_device *device);
+int btrfs_rm_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_dev_lookup_args *args,
+ struct block_device **bdev, fmode_t *mode);
+void __exit btrfs_cleanup_fs_uuids(void);
+int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
+int btrfs_grow_device(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 new_size);
+struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices,
+ const struct btrfs_dev_lookup_args *args);
+int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
+int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path);
+int btrfs_balance(struct btrfs_fs_info *fs_info,
+ struct btrfs_balance_control *bctl,
+ struct btrfs_ioctl_balance_args *bargs);
+void btrfs_describe_block_groups(u64 flags, char *buf, u32 size_buf);
+int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
+int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
+int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
+int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset);
+int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
+int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
+int btrfs_uuid_scan_kthread(void *data);
+bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset);
+int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
+ u64 *start, u64 *max_avail);
+void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
+int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_get_dev_stats *stats);
+int btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
+int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
+int btrfs_run_dev_stats(struct btrfs_trans_handle *trans);
+void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev);
+void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev);
+void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev);
+int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 len);
+unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
+ u64 logical);
+u64 btrfs_calc_stripe_length(const struct extent_map *em);
+int btrfs_nr_parity_stripes(u64 type);
+int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *bg);
+int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
+struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length);
+void btrfs_release_disk_super(struct btrfs_super_block *super);
+
+static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
+ int index)
+{
+ atomic_inc(dev->dev_stat_values + index);
+ /*
+ * This memory barrier orders stores updating statistics before stores
+ * updating dev_stats_ccnt.
+ *
+ * It pairs with smp_rmb() in btrfs_run_dev_stats().
+ */
+ smp_mb__before_atomic();
+ atomic_inc(&dev->dev_stats_ccnt);
+}
+
+static inline int btrfs_dev_stat_read(struct btrfs_device *dev,
+ int index)
+{
+ return atomic_read(dev->dev_stat_values + index);
+}
+
+static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev,
+ int index)
+{
+ int ret;
+
+ ret = atomic_xchg(dev->dev_stat_values + index, 0);
+ /*
+ * atomic_xchg implies a full memory barriers as per atomic_t.txt:
+ * - RMW operations that have a return value are fully ordered;
+ *
+ * This implicit memory barriers is paired with the smp_rmb in
+ * btrfs_run_dev_stats
+ */
+ atomic_inc(&dev->dev_stats_ccnt);
+ return ret;
+}
+
+static inline void btrfs_dev_stat_set(struct btrfs_device *dev,
+ int index, unsigned long val)
+{
+ atomic_set(dev->dev_stat_values + index, val);
+ /*
+ * This memory barrier orders stores updating statistics before stores
+ * updating dev_stats_ccnt.
+ *
+ * It pairs with smp_rmb() in btrfs_run_dev_stats().
+ */
+ smp_mb__before_atomic();
+ atomic_inc(&dev->dev_stats_ccnt);
+}
+
+void btrfs_commit_device_sizes(struct btrfs_transaction *trans);
+
+struct list_head * __attribute_const__ btrfs_get_fs_uuids(void);
+bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *failing_dev);
+void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
+ struct block_device *bdev,
+ const char *device_path);
+
+enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags);
+int btrfs_bg_type_to_factor(u64 flags);
+const char *btrfs_bg_type_to_raid_name(u64 flags);
+int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);
+bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);
+
+bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
+u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb);
+
+#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
new file mode 100644
index 000000000..5bb8d8c86
--- /dev/null
+++ b/fs/btrfs/xattr.c
@@ -0,0 +1,492 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2007 Red Hat. All rights reserved.
+ */
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/rwsem.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/iversion.h>
+#include <linux/sched/mm.h>
+#include "ctree.h"
+#include "btrfs_inode.h"
+#include "transaction.h"
+#include "xattr.h"
+#include "disk-io.h"
+#include "props.h"
+#include "locking.h"
+
+int btrfs_getxattr(struct inode *inode, const char *name,
+ void *buffer, size_t size)
+{
+ struct btrfs_dir_item *di;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ int ret = 0;
+ unsigned long data_ptr;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /* lookup the xattr by name */
+ di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(BTRFS_I(inode)),
+ name, strlen(name), 0);
+ if (!di) {
+ ret = -ENODATA;
+ goto out;
+ } else if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ /* if size is 0, that means we want the size of the attr */
+ if (!size) {
+ ret = btrfs_dir_data_len(leaf, di);
+ goto out;
+ }
+
+ /* now get the data out of our dir_item */
+ if (btrfs_dir_data_len(leaf, di) > size) {
+ ret = -ERANGE;
+ goto out;
+ }
+
+ /*
+ * The way things are packed into the leaf is like this
+ * |struct btrfs_dir_item|name|data|
+ * where name is the xattr name, so security.foo, and data is the
+ * content of the xattr. data_ptr points to the location in memory
+ * where the data starts in the in memory leaf
+ */
+ data_ptr = (unsigned long)((char *)(di + 1) +
+ btrfs_dir_name_len(leaf, di));
+ read_extent_buffer(leaf, buffer, data_ptr,
+ btrfs_dir_data_len(leaf, di));
+ ret = btrfs_dir_data_len(leaf, di);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
+ const char *name, const void *value, size_t size, int flags)
+{
+ struct btrfs_dir_item *di = NULL;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_path *path;
+ size_t name_len = strlen(name);
+ int ret = 0;
+
+ ASSERT(trans);
+
+ if (name_len + size > BTRFS_MAX_XATTR_SIZE(root->fs_info))
+ return -ENOSPC;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->skip_release_on_error = 1;
+
+ if (!value) {
+ di = btrfs_lookup_xattr(trans, root, path,
+ btrfs_ino(BTRFS_I(inode)), name, name_len, -1);
+ if (!di && (flags & XATTR_REPLACE))
+ ret = -ENODATA;
+ else if (IS_ERR(di))
+ ret = PTR_ERR(di);
+ else if (di)
+ ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ goto out;
+ }
+
+ /*
+ * For a replace we can't just do the insert blindly.
+ * Do a lookup first (read-only btrfs_search_slot), and return if xattr
+ * doesn't exist. If it exists, fall down below to the insert/replace
+ * path - we can't race with a concurrent xattr delete, because the VFS
+ * locks the inode's i_mutex before calling setxattr or removexattr.
+ */
+ if (flags & XATTR_REPLACE) {
+ ASSERT(inode_is_locked(inode));
+ di = btrfs_lookup_xattr(NULL, root, path,
+ btrfs_ino(BTRFS_I(inode)), name, name_len, 0);
+ if (!di)
+ ret = -ENODATA;
+ else if (IS_ERR(di))
+ ret = PTR_ERR(di);
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+ di = NULL;
+ }
+
+ ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(BTRFS_I(inode)),
+ name, name_len, value, size);
+ if (ret == -EOVERFLOW) {
+ /*
+ * We have an existing item in a leaf, split_leaf couldn't
+ * expand it. That item might have or not a dir_item that
+ * matches our target xattr, so lets check.
+ */
+ ret = 0;
+ btrfs_assert_tree_write_locked(path->nodes[0]);
+ di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
+ if (!di && !(flags & XATTR_REPLACE)) {
+ ret = -ENOSPC;
+ goto out;
+ }
+ } else if (ret == -EEXIST) {
+ ret = 0;
+ di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
+ ASSERT(di); /* logic error */
+ } else if (ret) {
+ goto out;
+ }
+
+ if (di && (flags & XATTR_CREATE)) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ if (di) {
+ /*
+ * We're doing a replace, and it must be atomic, that is, at
+ * any point in time we have either the old or the new xattr
+ * value in the tree. We don't want readers (getxattr and
+ * listxattrs) to miss a value, this is specially important
+ * for ACLs.
+ */
+ const int slot = path->slots[0];
+ struct extent_buffer *leaf = path->nodes[0];
+ const u16 old_data_len = btrfs_dir_data_len(leaf, di);
+ const u32 item_size = btrfs_item_size(leaf, slot);
+ const u32 data_size = sizeof(*di) + name_len + size;
+ unsigned long data_ptr;
+ char *ptr;
+
+ if (size > old_data_len) {
+ if (btrfs_leaf_free_space(leaf) <
+ (size - old_data_len)) {
+ ret = -ENOSPC;
+ goto out;
+ }
+ }
+
+ if (old_data_len + name_len + sizeof(*di) == item_size) {
+ /* No other xattrs packed in the same leaf item. */
+ if (size > old_data_len)
+ btrfs_extend_item(path, size - old_data_len);
+ else if (size < old_data_len)
+ btrfs_truncate_item(path, data_size, 1);
+ } else {
+ /* There are other xattrs packed in the same item. */
+ ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ if (ret)
+ goto out;
+ btrfs_extend_item(path, data_size);
+ }
+
+ ptr = btrfs_item_ptr(leaf, slot, char);
+ ptr += btrfs_item_size(leaf, slot) - data_size;
+ di = (struct btrfs_dir_item *)ptr;
+ btrfs_set_dir_data_len(leaf, di, size);
+ data_ptr = ((unsigned long)(di + 1)) + name_len;
+ write_extent_buffer(leaf, value, data_ptr, size);
+ btrfs_mark_buffer_dirty(leaf);
+ } else {
+ /*
+ * Insert, and we had space for the xattr, so path->slots[0] is
+ * where our xattr dir_item is and btrfs_insert_xattr_item()
+ * filled it.
+ */
+ }
+out:
+ btrfs_free_path(path);
+ if (!ret) {
+ set_bit(BTRFS_INODE_COPY_EVERYTHING,
+ &BTRFS_I(inode)->runtime_flags);
+ clear_bit(BTRFS_INODE_NO_XATTRS, &BTRFS_I(inode)->runtime_flags);
+ }
+ return ret;
+}
+
+/*
+ * @value: "" makes the attribute to empty, NULL removes it
+ */
+int btrfs_setxattr_trans(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ const bool start_trans = (current->journal_info == NULL);
+ int ret;
+
+ if (start_trans) {
+ /*
+ * 1 unit for inserting/updating/deleting the xattr
+ * 1 unit for the inode item update
+ */
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ } else {
+ /*
+ * This can happen when smack is enabled and a directory is being
+ * created. It happens through d_instantiate_new(), which calls
+ * smack_d_instantiate(), which in turn calls __vfs_setxattr() to
+ * set the transmute xattr (XATTR_NAME_SMACKTRANSMUTE) on the
+ * inode. We have already reserved space for the xattr and inode
+ * update at btrfs_mkdir(), so just use the transaction handle.
+ * We don't join or start a transaction, as that will reset the
+ * block_rsv of the handle and trigger a warning for the start
+ * case.
+ */
+ ASSERT(strncmp(name, XATTR_SECURITY_PREFIX,
+ XATTR_SECURITY_PREFIX_LEN) == 0);
+ trans = current->journal_info;
+ }
+
+ ret = btrfs_setxattr(trans, inode, name, value, size, flags);
+ if (ret)
+ goto out;
+
+ inode_inc_iversion(inode);
+ inode->i_ctime = current_time(inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+out:
+ if (start_trans)
+ btrfs_end_transaction(trans);
+ return ret;
+}
+
+ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+ struct btrfs_key found_key;
+ struct btrfs_key key;
+ struct inode *inode = d_inode(dentry);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_path *path;
+ int iter_ret = 0;
+ int ret = 0;
+ size_t total_size = 0, size_left = size;
+
+ /*
+ * ok we want all objects associated with this id.
+ * NOTE: we set key.offset = 0; because we want to start with the
+ * first xattr that we find and walk forward
+ */
+ key.objectid = btrfs_ino(BTRFS_I(inode));
+ key.type = BTRFS_XATTR_ITEM_KEY;
+ key.offset = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = READA_FORWARD;
+
+ /* search for our xattrs */
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
+ struct extent_buffer *leaf;
+ int slot;
+ struct btrfs_dir_item *di;
+ u32 item_size;
+ u32 cur;
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ /* check to make sure this item is what we want */
+ if (found_key.objectid != key.objectid)
+ break;
+ if (found_key.type > BTRFS_XATTR_ITEM_KEY)
+ break;
+ if (found_key.type < BTRFS_XATTR_ITEM_KEY)
+ continue;
+
+ di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+ item_size = btrfs_item_size(leaf, slot);
+ cur = 0;
+ while (cur < item_size) {
+ u16 name_len = btrfs_dir_name_len(leaf, di);
+ u16 data_len = btrfs_dir_data_len(leaf, di);
+ u32 this_len = sizeof(*di) + name_len + data_len;
+ unsigned long name_ptr = (unsigned long)(di + 1);
+
+ total_size += name_len + 1;
+ /*
+ * We are just looking for how big our buffer needs to
+ * be.
+ */
+ if (!size)
+ goto next;
+
+ if (!buffer || (name_len + 1) > size_left) {
+ iter_ret = -ERANGE;
+ break;
+ }
+
+ read_extent_buffer(leaf, buffer, name_ptr, name_len);
+ buffer[name_len] = '\0';
+
+ size_left -= name_len + 1;
+ buffer += name_len + 1;
+next:
+ cur += this_len;
+ di = (struct btrfs_dir_item *)((char *)di + this_len);
+ }
+ }
+
+ if (iter_ret < 0)
+ ret = iter_ret;
+ else
+ ret = total_size;
+
+ btrfs_free_path(path);
+
+ return ret;
+}
+
+static int btrfs_xattr_handler_get(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *buffer, size_t size)
+{
+ name = xattr_full_name(handler, name);
+ return btrfs_getxattr(inode, name, buffer, size);
+}
+
+static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *buffer,
+ size_t size, int flags)
+{
+ if (btrfs_root_readonly(BTRFS_I(inode)->root))
+ return -EROFS;
+
+ name = xattr_full_name(handler, name);
+ return btrfs_setxattr_trans(inode, name, buffer, size, flags);
+}
+
+static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ int ret;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ name = xattr_full_name(handler, name);
+ ret = btrfs_validate_prop(BTRFS_I(inode), name, value, size);
+ if (ret)
+ return ret;
+
+ if (btrfs_ignore_prop(BTRFS_I(inode), name))
+ return 0;
+
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ ret = btrfs_set_prop(trans, inode, name, value, size, flags);
+ if (!ret) {
+ inode_inc_iversion(inode);
+ inode->i_ctime = current_time(inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ }
+
+ btrfs_end_transaction(trans);
+
+ return ret;
+}
+
+static const struct xattr_handler btrfs_security_xattr_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .get = btrfs_xattr_handler_get,
+ .set = btrfs_xattr_handler_set,
+};
+
+static const struct xattr_handler btrfs_trusted_xattr_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .get = btrfs_xattr_handler_get,
+ .set = btrfs_xattr_handler_set,
+};
+
+static const struct xattr_handler btrfs_user_xattr_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .get = btrfs_xattr_handler_get,
+ .set = btrfs_xattr_handler_set,
+};
+
+static const struct xattr_handler btrfs_btrfs_xattr_handler = {
+ .prefix = XATTR_BTRFS_PREFIX,
+ .get = btrfs_xattr_handler_get,
+ .set = btrfs_xattr_handler_set_prop,
+};
+
+const struct xattr_handler *btrfs_xattr_handlers[] = {
+ &btrfs_security_xattr_handler,
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+#endif
+ &btrfs_trusted_xattr_handler,
+ &btrfs_user_xattr_handler,
+ &btrfs_btrfs_xattr_handler,
+ NULL,
+};
+
+static int btrfs_initxattrs(struct inode *inode,
+ const struct xattr *xattr_array, void *fs_private)
+{
+ struct btrfs_trans_handle *trans = fs_private;
+ const struct xattr *xattr;
+ unsigned int nofs_flag;
+ char *name;
+ int err = 0;
+
+ /*
+ * We're holding a transaction handle, so use a NOFS memory allocation
+ * context to avoid deadlock if reclaim happens.
+ */
+ nofs_flag = memalloc_nofs_save();
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
+ strlen(xattr->name) + 1, GFP_KERNEL);
+ if (!name) {
+ err = -ENOMEM;
+ break;
+ }
+ strcpy(name, XATTR_SECURITY_PREFIX);
+ strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
+ err = btrfs_setxattr(trans, inode, name, xattr->value,
+ xattr->value_len, 0);
+ kfree(name);
+ if (err < 0)
+ break;
+ }
+ memalloc_nofs_restore(nofs_flag);
+ return err;
+}
+
+int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *dir,
+ const struct qstr *qstr)
+{
+ return security_inode_init_security(inode, dir, qstr,
+ &btrfs_initxattrs, trans);
+}
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
new file mode 100644
index 000000000..1cd3fc0a8
--- /dev/null
+++ b/fs/btrfs/xattr.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2007 Red Hat. All rights reserved.
+ */
+
+#ifndef BTRFS_XATTR_H
+#define BTRFS_XATTR_H
+
+#include <linux/xattr.h>
+
+extern const struct xattr_handler *btrfs_xattr_handlers[];
+
+int btrfs_getxattr(struct inode *inode, const char *name,
+ void *buffer, size_t size);
+int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
+ const char *name, const void *value, size_t size, int flags);
+int btrfs_setxattr_trans(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags);
+ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
+
+int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *dir,
+ const struct qstr *qstr);
+
+#endif
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
new file mode 100644
index 000000000..94d06a76e
--- /dev/null
+++ b/fs/btrfs/zlib.c
@@ -0,0 +1,456 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * Based on jffs2 zlib code:
+ * Copyright © 2001-2007 Red Hat, Inc.
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/zlib.h>
+#include <linux/zutil.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+#include <linux/refcount.h>
+#include "compression.h"
+
+/* workspace buffer size for s390 zlib hardware support */
+#define ZLIB_DFLTCC_BUF_SIZE (4 * PAGE_SIZE)
+
+struct workspace {
+ z_stream strm;
+ char *buf;
+ unsigned int buf_size;
+ struct list_head list;
+ int level;
+};
+
+static struct workspace_manager wsm;
+
+struct list_head *zlib_get_workspace(unsigned int level)
+{
+ struct list_head *ws = btrfs_get_workspace(BTRFS_COMPRESS_ZLIB, level);
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
+
+ workspace->level = level;
+
+ return ws;
+}
+
+void zlib_free_workspace(struct list_head *ws)
+{
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
+
+ kvfree(workspace->strm.workspace);
+ kfree(workspace->buf);
+ kfree(workspace);
+}
+
+struct list_head *zlib_alloc_workspace(unsigned int level)
+{
+ struct workspace *workspace;
+ int workspacesize;
+
+ workspace = kzalloc(sizeof(*workspace), GFP_KERNEL);
+ if (!workspace)
+ return ERR_PTR(-ENOMEM);
+
+ workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
+ zlib_inflate_workspacesize());
+ workspace->strm.workspace = kvzalloc(workspacesize, GFP_KERNEL);
+ workspace->level = level;
+ workspace->buf = NULL;
+ /*
+ * In case of s390 zlib hardware support, allocate lager workspace
+ * buffer. If allocator fails, fall back to a single page buffer.
+ */
+ if (zlib_deflate_dfltcc_enabled()) {
+ workspace->buf = kmalloc(ZLIB_DFLTCC_BUF_SIZE,
+ __GFP_NOMEMALLOC | __GFP_NORETRY |
+ __GFP_NOWARN | GFP_NOIO);
+ workspace->buf_size = ZLIB_DFLTCC_BUF_SIZE;
+ }
+ if (!workspace->buf) {
+ workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ workspace->buf_size = PAGE_SIZE;
+ }
+ if (!workspace->strm.workspace || !workspace->buf)
+ goto fail;
+
+ INIT_LIST_HEAD(&workspace->list);
+
+ return &workspace->list;
+fail:
+ zlib_free_workspace(&workspace->list);
+ return ERR_PTR(-ENOMEM);
+}
+
+int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out)
+{
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
+ int ret;
+ char *data_in = NULL;
+ char *cpage_out;
+ int nr_pages = 0;
+ struct page *in_page = NULL;
+ struct page *out_page = NULL;
+ unsigned long bytes_left;
+ unsigned int in_buf_pages;
+ unsigned long len = *total_out;
+ unsigned long nr_dest_pages = *out_pages;
+ const unsigned long max_out = nr_dest_pages * PAGE_SIZE;
+
+ *out_pages = 0;
+ *total_out = 0;
+ *total_in = 0;
+
+ if (Z_OK != zlib_deflateInit(&workspace->strm, workspace->level)) {
+ pr_warn("BTRFS: deflateInit failed\n");
+ ret = -EIO;
+ goto out;
+ }
+
+ workspace->strm.total_in = 0;
+ workspace->strm.total_out = 0;
+
+ out_page = alloc_page(GFP_NOFS);
+ if (out_page == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ cpage_out = page_address(out_page);
+ pages[0] = out_page;
+ nr_pages = 1;
+
+ workspace->strm.next_in = workspace->buf;
+ workspace->strm.avail_in = 0;
+ workspace->strm.next_out = cpage_out;
+ workspace->strm.avail_out = PAGE_SIZE;
+
+ while (workspace->strm.total_in < len) {
+ /*
+ * Get next input pages and copy the contents to
+ * the workspace buffer if required.
+ */
+ if (workspace->strm.avail_in == 0) {
+ bytes_left = len - workspace->strm.total_in;
+ in_buf_pages = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE),
+ workspace->buf_size / PAGE_SIZE);
+ if (in_buf_pages > 1) {
+ int i;
+
+ for (i = 0; i < in_buf_pages; i++) {
+ if (data_in) {
+ kunmap_local(data_in);
+ put_page(in_page);
+ }
+ in_page = find_get_page(mapping,
+ start >> PAGE_SHIFT);
+ data_in = kmap_local_page(in_page);
+ memcpy(workspace->buf + i * PAGE_SIZE,
+ data_in, PAGE_SIZE);
+ start += PAGE_SIZE;
+ }
+ workspace->strm.next_in = workspace->buf;
+ } else {
+ if (data_in) {
+ kunmap_local(data_in);
+ put_page(in_page);
+ }
+ in_page = find_get_page(mapping,
+ start >> PAGE_SHIFT);
+ data_in = kmap_local_page(in_page);
+ start += PAGE_SIZE;
+ workspace->strm.next_in = data_in;
+ }
+ workspace->strm.avail_in = min(bytes_left,
+ (unsigned long) workspace->buf_size);
+ }
+
+ ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH);
+ if (ret != Z_OK) {
+ pr_debug("BTRFS: deflate in loop returned %d\n",
+ ret);
+ zlib_deflateEnd(&workspace->strm);
+ ret = -EIO;
+ goto out;
+ }
+
+ /* we're making it bigger, give up */
+ if (workspace->strm.total_in > 8192 &&
+ workspace->strm.total_in <
+ workspace->strm.total_out) {
+ ret = -E2BIG;
+ goto out;
+ }
+ /* we need another page for writing out. Test this
+ * before the total_in so we will pull in a new page for
+ * the stream end if required
+ */
+ if (workspace->strm.avail_out == 0) {
+ if (nr_pages == nr_dest_pages) {
+ ret = -E2BIG;
+ goto out;
+ }
+ out_page = alloc_page(GFP_NOFS);
+ if (out_page == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ cpage_out = page_address(out_page);
+ pages[nr_pages] = out_page;
+ nr_pages++;
+ workspace->strm.avail_out = PAGE_SIZE;
+ workspace->strm.next_out = cpage_out;
+ }
+ /* we're all done */
+ if (workspace->strm.total_in >= len)
+ break;
+ if (workspace->strm.total_out > max_out)
+ break;
+ }
+ workspace->strm.avail_in = 0;
+ /*
+ * Call deflate with Z_FINISH flush parameter providing more output
+ * space but no more input data, until it returns with Z_STREAM_END.
+ */
+ while (ret != Z_STREAM_END) {
+ ret = zlib_deflate(&workspace->strm, Z_FINISH);
+ if (ret == Z_STREAM_END)
+ break;
+ if (ret != Z_OK && ret != Z_BUF_ERROR) {
+ zlib_deflateEnd(&workspace->strm);
+ ret = -EIO;
+ goto out;
+ } else if (workspace->strm.avail_out == 0) {
+ /* get another page for the stream end */
+ if (nr_pages == nr_dest_pages) {
+ ret = -E2BIG;
+ goto out;
+ }
+ out_page = alloc_page(GFP_NOFS);
+ if (out_page == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ cpage_out = page_address(out_page);
+ pages[nr_pages] = out_page;
+ nr_pages++;
+ workspace->strm.avail_out = PAGE_SIZE;
+ workspace->strm.next_out = cpage_out;
+ }
+ }
+ zlib_deflateEnd(&workspace->strm);
+
+ if (workspace->strm.total_out >= workspace->strm.total_in) {
+ ret = -E2BIG;
+ goto out;
+ }
+
+ ret = 0;
+ *total_out = workspace->strm.total_out;
+ *total_in = workspace->strm.total_in;
+out:
+ *out_pages = nr_pages;
+ if (data_in) {
+ kunmap_local(data_in);
+ put_page(in_page);
+ }
+
+ return ret;
+}
+
+int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
+{
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
+ int ret = 0, ret2;
+ int wbits = MAX_WBITS;
+ char *data_in;
+ size_t total_out = 0;
+ unsigned long page_in_index = 0;
+ size_t srclen = cb->compressed_len;
+ unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
+ unsigned long buf_start;
+ struct page **pages_in = cb->compressed_pages;
+
+ data_in = kmap_local_page(pages_in[page_in_index]);
+ workspace->strm.next_in = data_in;
+ workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE);
+ workspace->strm.total_in = 0;
+
+ workspace->strm.total_out = 0;
+ workspace->strm.next_out = workspace->buf;
+ workspace->strm.avail_out = workspace->buf_size;
+
+ /* If it's deflate, and it's got no preset dictionary, then
+ we can tell zlib to skip the adler32 check. */
+ if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
+ ((data_in[0] & 0x0f) == Z_DEFLATED) &&
+ !(((data_in[0]<<8) + data_in[1]) % 31)) {
+
+ wbits = -((data_in[0] >> 4) + 8);
+ workspace->strm.next_in += 2;
+ workspace->strm.avail_in -= 2;
+ }
+
+ if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
+ pr_warn("BTRFS: inflateInit failed\n");
+ kunmap_local(data_in);
+ return -EIO;
+ }
+ while (workspace->strm.total_in < srclen) {
+ ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH);
+ if (ret != Z_OK && ret != Z_STREAM_END)
+ break;
+
+ buf_start = total_out;
+ total_out = workspace->strm.total_out;
+
+ /* we didn't make progress in this inflate call, we're done */
+ if (buf_start == total_out)
+ break;
+
+ ret2 = btrfs_decompress_buf2page(workspace->buf,
+ total_out - buf_start, cb, buf_start);
+ if (ret2 == 0) {
+ ret = 0;
+ goto done;
+ }
+
+ workspace->strm.next_out = workspace->buf;
+ workspace->strm.avail_out = workspace->buf_size;
+
+ if (workspace->strm.avail_in == 0) {
+ unsigned long tmp;
+ kunmap_local(data_in);
+ page_in_index++;
+ if (page_in_index >= total_pages_in) {
+ data_in = NULL;
+ break;
+ }
+ data_in = kmap_local_page(pages_in[page_in_index]);
+ workspace->strm.next_in = data_in;
+ tmp = srclen - workspace->strm.total_in;
+ workspace->strm.avail_in = min(tmp, PAGE_SIZE);
+ }
+ }
+ if (ret != Z_STREAM_END)
+ ret = -EIO;
+ else
+ ret = 0;
+done:
+ zlib_inflateEnd(&workspace->strm);
+ if (data_in)
+ kunmap_local(data_in);
+ if (!ret)
+ zero_fill_bio(cb->orig_bio);
+ return ret;
+}
+
+int zlib_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen)
+{
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
+ int ret = 0;
+ int wbits = MAX_WBITS;
+ unsigned long bytes_left;
+ unsigned long total_out = 0;
+ unsigned long pg_offset = 0;
+
+ destlen = min_t(unsigned long, destlen, PAGE_SIZE);
+ bytes_left = destlen;
+
+ workspace->strm.next_in = data_in;
+ workspace->strm.avail_in = srclen;
+ workspace->strm.total_in = 0;
+
+ workspace->strm.next_out = workspace->buf;
+ workspace->strm.avail_out = workspace->buf_size;
+ workspace->strm.total_out = 0;
+ /* If it's deflate, and it's got no preset dictionary, then
+ we can tell zlib to skip the adler32 check. */
+ if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
+ ((data_in[0] & 0x0f) == Z_DEFLATED) &&
+ !(((data_in[0]<<8) + data_in[1]) % 31)) {
+
+ wbits = -((data_in[0] >> 4) + 8);
+ workspace->strm.next_in += 2;
+ workspace->strm.avail_in -= 2;
+ }
+
+ if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
+ pr_warn("BTRFS: inflateInit failed\n");
+ return -EIO;
+ }
+
+ while (bytes_left > 0) {
+ unsigned long buf_start;
+ unsigned long buf_offset;
+ unsigned long bytes;
+
+ ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH);
+ if (ret != Z_OK && ret != Z_STREAM_END)
+ break;
+
+ buf_start = total_out;
+ total_out = workspace->strm.total_out;
+
+ if (total_out == buf_start) {
+ ret = -EIO;
+ break;
+ }
+
+ if (total_out <= start_byte)
+ goto next;
+
+ if (total_out > start_byte && buf_start < start_byte)
+ buf_offset = start_byte - buf_start;
+ else
+ buf_offset = 0;
+
+ bytes = min(PAGE_SIZE - pg_offset,
+ PAGE_SIZE - (buf_offset % PAGE_SIZE));
+ bytes = min(bytes, bytes_left);
+
+ memcpy_to_page(dest_page, pg_offset,
+ workspace->buf + buf_offset, bytes);
+
+ pg_offset += bytes;
+ bytes_left -= bytes;
+next:
+ workspace->strm.next_out = workspace->buf;
+ workspace->strm.avail_out = workspace->buf_size;
+ }
+
+ if (ret != Z_STREAM_END && bytes_left != 0)
+ ret = -EIO;
+ else
+ ret = 0;
+
+ zlib_inflateEnd(&workspace->strm);
+
+ /*
+ * this should only happen if zlib returned fewer bytes than we
+ * expected. btrfs_get_block is responsible for zeroing from the
+ * end of the inline extent (destlen) to the end of the page
+ */
+ if (pg_offset < destlen) {
+ memzero_page(dest_page, pg_offset, destlen - pg_offset);
+ }
+ return ret;
+}
+
+const struct btrfs_compress_op btrfs_zlib_compress = {
+ .workspace_manager = &wsm,
+ .max_level = 9,
+ .default_level = BTRFS_ZLIB_DEFAULT_LEVEL,
+};
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
new file mode 100644
index 000000000..99cb690da
--- /dev/null
+++ b/fs/btrfs/zoned.c
@@ -0,0 +1,2364 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/sched/mm.h>
+#include <linux/atomic.h>
+#include <linux/vmalloc.h>
+#include "ctree.h"
+#include "volumes.h"
+#include "zoned.h"
+#include "rcu-string.h"
+#include "disk-io.h"
+#include "block-group.h"
+#include "transaction.h"
+#include "dev-replace.h"
+#include "space-info.h"
+
+/* Maximum number of zones to report per blkdev_report_zones() call */
+#define BTRFS_REPORT_NR_ZONES 4096
+/* Invalid allocation pointer value for missing devices */
+#define WP_MISSING_DEV ((u64)-1)
+/* Pseudo write pointer value for conventional zone */
+#define WP_CONVENTIONAL ((u64)-2)
+
+/*
+ * Location of the first zone of superblock logging zone pairs.
+ *
+ * - primary superblock: 0B (zone 0)
+ * - first copy: 512G (zone starting at that offset)
+ * - second copy: 4T (zone starting at that offset)
+ */
+#define BTRFS_SB_LOG_PRIMARY_OFFSET (0ULL)
+#define BTRFS_SB_LOG_FIRST_OFFSET (512ULL * SZ_1G)
+#define BTRFS_SB_LOG_SECOND_OFFSET (4096ULL * SZ_1G)
+
+#define BTRFS_SB_LOG_FIRST_SHIFT const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
+#define BTRFS_SB_LOG_SECOND_SHIFT const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET)
+
+/* Number of superblock log zones */
+#define BTRFS_NR_SB_LOG_ZONES 2
+
+/*
+ * Minimum of active zones we need:
+ *
+ * - BTRFS_SUPER_MIRROR_MAX zones for superblock mirrors
+ * - 3 zones to ensure at least one zone per SYSTEM, META and DATA block group
+ * - 1 zone for tree-log dedicated block group
+ * - 1 zone for relocation
+ */
+#define BTRFS_MIN_ACTIVE_ZONES (BTRFS_SUPER_MIRROR_MAX + 5)
+
+/*
+ * Minimum / maximum supported zone size. Currently, SMR disks have a zone
+ * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range.
+ * We do not expect the zone size to become larger than 8GiB or smaller than
+ * 4MiB in the near future.
+ */
+#define BTRFS_MAX_ZONE_SIZE SZ_8G
+#define BTRFS_MIN_ZONE_SIZE SZ_4M
+
+#define SUPER_INFO_SECTORS ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)
+
+static inline bool sb_zone_is_full(const struct blk_zone *zone)
+{
+ return (zone->cond == BLK_ZONE_COND_FULL) ||
+ (zone->wp + SUPER_INFO_SECTORS > zone->start + zone->capacity);
+}
+
+static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
+{
+ struct blk_zone *zones = data;
+
+ memcpy(&zones[idx], zone, sizeof(*zone));
+
+ return 0;
+}
+
+static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
+ u64 *wp_ret)
+{
+ bool empty[BTRFS_NR_SB_LOG_ZONES];
+ bool full[BTRFS_NR_SB_LOG_ZONES];
+ sector_t sector;
+ int i;
+
+ for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
+ ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL);
+ empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY);
+ full[i] = sb_zone_is_full(&zones[i]);
+ }
+
+ /*
+ * Possible states of log buffer zones
+ *
+ * Empty[0] In use[0] Full[0]
+ * Empty[1] * 0 1
+ * In use[1] x x 1
+ * Full[1] 0 0 C
+ *
+ * Log position:
+ * *: Special case, no superblock is written
+ * 0: Use write pointer of zones[0]
+ * 1: Use write pointer of zones[1]
+ * C: Compare super blocks from zones[0] and zones[1], use the latest
+ * one determined by generation
+ * x: Invalid state
+ */
+
+ if (empty[0] && empty[1]) {
+ /* Special case to distinguish no superblock to read */
+ *wp_ret = zones[0].start << SECTOR_SHIFT;
+ return -ENOENT;
+ } else if (full[0] && full[1]) {
+ /* Compare two super blocks */
+ struct address_space *mapping = bdev->bd_inode->i_mapping;
+ struct page *page[BTRFS_NR_SB_LOG_ZONES];
+ struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
+ int i;
+
+ for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
+ u64 zone_end = (zones[i].start + zones[i].capacity) << SECTOR_SHIFT;
+ u64 bytenr = ALIGN_DOWN(zone_end, BTRFS_SUPER_INFO_SIZE) -
+ BTRFS_SUPER_INFO_SIZE;
+
+ page[i] = read_cache_page_gfp(mapping,
+ bytenr >> PAGE_SHIFT, GFP_NOFS);
+ if (IS_ERR(page[i])) {
+ if (i == 1)
+ btrfs_release_disk_super(super[0]);
+ return PTR_ERR(page[i]);
+ }
+ super[i] = page_address(page[i]);
+ }
+
+ if (btrfs_super_generation(super[0]) >
+ btrfs_super_generation(super[1]))
+ sector = zones[1].start;
+ else
+ sector = zones[0].start;
+
+ for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
+ btrfs_release_disk_super(super[i]);
+ } else if (!full[0] && (empty[1] || full[1])) {
+ sector = zones[0].wp;
+ } else if (full[0]) {
+ sector = zones[1].wp;
+ } else {
+ return -EUCLEAN;
+ }
+ *wp_ret = sector << SECTOR_SHIFT;
+ return 0;
+}
+
+/*
+ * Get the first zone number of the superblock mirror
+ */
+static inline u32 sb_zone_number(int shift, int mirror)
+{
+ u64 zone;
+
+ ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
+ switch (mirror) {
+ case 0: zone = 0; break;
+ case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break;
+ case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break;
+ }
+
+ ASSERT(zone <= U32_MAX);
+
+ return (u32)zone;
+}
+
+static inline sector_t zone_start_sector(u32 zone_number,
+ struct block_device *bdev)
+{
+ return (sector_t)zone_number << ilog2(bdev_zone_sectors(bdev));
+}
+
+static inline u64 zone_start_physical(u32 zone_number,
+ struct btrfs_zoned_device_info *zone_info)
+{
+ return (u64)zone_number << zone_info->zone_size_shift;
+}
+
+/*
+ * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block
+ * device into static sized chunks and fake a conventional zone on each of
+ * them.
+ */
+static int emulate_report_zones(struct btrfs_device *device, u64 pos,
+ struct blk_zone *zones, unsigned int nr_zones)
+{
+ const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT;
+ sector_t bdev_size = bdev_nr_sectors(device->bdev);
+ unsigned int i;
+
+ pos >>= SECTOR_SHIFT;
+ for (i = 0; i < nr_zones; i++) {
+ zones[i].start = i * zone_sectors + pos;
+ zones[i].len = zone_sectors;
+ zones[i].capacity = zone_sectors;
+ zones[i].wp = zones[i].start + zone_sectors;
+ zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL;
+ zones[i].cond = BLK_ZONE_COND_NOT_WP;
+
+ if (zones[i].wp >= bdev_size) {
+ i++;
+ break;
+ }
+ }
+
+ return i;
+}
+
+static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
+ struct blk_zone *zones, unsigned int *nr_zones)
+{
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ u32 zno;
+ int ret;
+
+ if (!*nr_zones)
+ return 0;
+
+ if (!bdev_is_zoned(device->bdev)) {
+ ret = emulate_report_zones(device, pos, zones, *nr_zones);
+ *nr_zones = ret;
+ return 0;
+ }
+
+ /* Check cache */
+ if (zinfo->zone_cache) {
+ unsigned int i;
+
+ ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
+ zno = pos >> zinfo->zone_size_shift;
+ /*
+ * We cannot report zones beyond the zone end. So, it is OK to
+ * cap *nr_zones to at the end.
+ */
+ *nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno);
+
+ for (i = 0; i < *nr_zones; i++) {
+ struct blk_zone *zone_info;
+
+ zone_info = &zinfo->zone_cache[zno + i];
+ if (!zone_info->len)
+ break;
+ }
+
+ if (i == *nr_zones) {
+ /* Cache hit on all the zones */
+ memcpy(zones, zinfo->zone_cache + zno,
+ sizeof(*zinfo->zone_cache) * *nr_zones);
+ return 0;
+ }
+ }
+
+ ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
+ copy_zone_info_cb, zones);
+ if (ret < 0) {
+ btrfs_err_in_rcu(device->fs_info,
+ "zoned: failed to read zone %llu on %s (devid %llu)",
+ pos, rcu_str_deref(device->name),
+ device->devid);
+ return ret;
+ }
+ *nr_zones = ret;
+ if (!ret)
+ return -EIO;
+
+ /* Populate cache */
+ if (zinfo->zone_cache)
+ memcpy(zinfo->zone_cache + zno, zones,
+ sizeof(*zinfo->zone_cache) * *nr_zones);
+
+ return 0;
+}
+
+/* The emulated zone size is determined from the size of device extent */
+static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_path *path;
+ struct btrfs_root *root = fs_info->dev_root;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct btrfs_dev_extent *dext;
+ int ret = 0;
+
+ key.objectid = 1;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+ key.offset = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ /* No dev extents at all? Not good */
+ if (ret > 0) {
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+
+ leaf = path->nodes[0];
+ dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
+ fs_info->zone_size = btrfs_dev_extent_length(leaf, dext);
+ ret = 0;
+
+out:
+ btrfs_free_path(path);
+
+ return ret;
+}
+
+int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
+ int ret = 0;
+
+ /* fs_info->zone_size might not set yet. Use the incomapt flag here. */
+ if (!btrfs_fs_incompat(fs_info, ZONED))
+ return 0;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ /* We can skip reading of zone info for missing devices */
+ if (!device->bdev)
+ continue;
+
+ ret = btrfs_get_dev_zone_info(device, true);
+ if (ret)
+ break;
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ return ret;
+}
+
+int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
+{
+ struct btrfs_fs_info *fs_info = device->fs_info;
+ struct btrfs_zoned_device_info *zone_info = NULL;
+ struct block_device *bdev = device->bdev;
+ unsigned int max_active_zones;
+ unsigned int nactive;
+ sector_t nr_sectors;
+ sector_t sector = 0;
+ struct blk_zone *zones = NULL;
+ unsigned int i, nreported = 0, nr_zones;
+ sector_t zone_sectors;
+ char *model, *emulated;
+ int ret;
+
+ /*
+ * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not
+ * yet be set.
+ */
+ if (!btrfs_fs_incompat(fs_info, ZONED))
+ return 0;
+
+ if (device->zone_info)
+ return 0;
+
+ zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);
+ if (!zone_info)
+ return -ENOMEM;
+
+ device->zone_info = zone_info;
+
+ if (!bdev_is_zoned(bdev)) {
+ if (!fs_info->zone_size) {
+ ret = calculate_emulated_zone_size(fs_info);
+ if (ret)
+ goto out;
+ }
+
+ ASSERT(fs_info->zone_size);
+ zone_sectors = fs_info->zone_size >> SECTOR_SHIFT;
+ } else {
+ zone_sectors = bdev_zone_sectors(bdev);
+ }
+
+ /* Check if it's power of 2 (see is_power_of_2) */
+ ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0);
+ zone_info->zone_size = zone_sectors << SECTOR_SHIFT;
+
+ /* We reject devices with a zone size larger than 8GB */
+ if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
+ btrfs_err_in_rcu(fs_info,
+ "zoned: %s: zone size %llu larger than supported maximum %llu",
+ rcu_str_deref(device->name),
+ zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
+ ret = -EINVAL;
+ goto out;
+ } else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) {
+ btrfs_err_in_rcu(fs_info,
+ "zoned: %s: zone size %llu smaller than supported minimum %u",
+ rcu_str_deref(device->name),
+ zone_info->zone_size, BTRFS_MIN_ZONE_SIZE);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ nr_sectors = bdev_nr_sectors(bdev);
+ zone_info->zone_size_shift = ilog2(zone_info->zone_size);
+ zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
+ /*
+ * We limit max_zone_append_size also by max_segments *
+ * PAGE_SIZE. Technically, we can have multiple pages per segment. But,
+ * since btrfs adds the pages one by one to a bio, and btrfs cannot
+ * increase the metadata reservation even if it increases the number of
+ * extents, it is safe to stick with the limit.
+ *
+ * With the zoned emulation, we can have non-zoned device on the zoned
+ * mode. In this case, we don't have a valid max zone append size. So,
+ * use max_segments * PAGE_SIZE as the pseudo max_zone_append_size.
+ */
+ if (bdev_is_zoned(bdev)) {
+ zone_info->max_zone_append_size = min_t(u64,
+ (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT,
+ (u64)bdev_max_segments(bdev) << PAGE_SHIFT);
+ } else {
+ zone_info->max_zone_append_size =
+ (u64)bdev_max_segments(bdev) << PAGE_SHIFT;
+ }
+ if (!IS_ALIGNED(nr_sectors, zone_sectors))
+ zone_info->nr_zones++;
+
+ max_active_zones = bdev_max_active_zones(bdev);
+ if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
+ btrfs_err_in_rcu(fs_info,
+"zoned: %s: max active zones %u is too small, need at least %u active zones",
+ rcu_str_deref(device->name), max_active_zones,
+ BTRFS_MIN_ACTIVE_ZONES);
+ ret = -EINVAL;
+ goto out;
+ }
+ zone_info->max_active_zones = max_active_zones;
+
+ zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
+ if (!zone_info->seq_zones) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
+ if (!zone_info->empty_zones) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
+ if (!zone_info->active_zones) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ zones = kvcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
+ if (!zones) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Enable zone cache only for a zoned device. On a non-zoned device, we
+ * fill the zone info with emulated CONVENTIONAL zones, so no need to
+ * use the cache.
+ */
+ if (populate_cache && bdev_is_zoned(device->bdev)) {
+ zone_info->zone_cache = vzalloc(sizeof(struct blk_zone) *
+ zone_info->nr_zones);
+ if (!zone_info->zone_cache) {
+ btrfs_err_in_rcu(device->fs_info,
+ "zoned: failed to allocate zone cache for %s",
+ rcu_str_deref(device->name));
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ /* Get zones type */
+ nactive = 0;
+ while (sector < nr_sectors) {
+ nr_zones = BTRFS_REPORT_NR_ZONES;
+ ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
+ &nr_zones);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < nr_zones; i++) {
+ if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
+ __set_bit(nreported, zone_info->seq_zones);
+ switch (zones[i].cond) {
+ case BLK_ZONE_COND_EMPTY:
+ __set_bit(nreported, zone_info->empty_zones);
+ break;
+ case BLK_ZONE_COND_IMP_OPEN:
+ case BLK_ZONE_COND_EXP_OPEN:
+ case BLK_ZONE_COND_CLOSED:
+ __set_bit(nreported, zone_info->active_zones);
+ nactive++;
+ break;
+ }
+ nreported++;
+ }
+ sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
+ }
+
+ if (nreported != zone_info->nr_zones) {
+ btrfs_err_in_rcu(device->fs_info,
+ "inconsistent number of zones on %s (%u/%u)",
+ rcu_str_deref(device->name), nreported,
+ zone_info->nr_zones);
+ ret = -EIO;
+ goto out;
+ }
+
+ if (max_active_zones) {
+ if (nactive > max_active_zones) {
+ btrfs_err_in_rcu(device->fs_info,
+ "zoned: %u active zones on %s exceeds max_active_zones %u",
+ nactive, rcu_str_deref(device->name),
+ max_active_zones);
+ ret = -EIO;
+ goto out;
+ }
+ atomic_set(&zone_info->active_zones_left,
+ max_active_zones - nactive);
+ set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags);
+ }
+
+ /* Validate superblock log */
+ nr_zones = BTRFS_NR_SB_LOG_ZONES;
+ for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+ u32 sb_zone;
+ u64 sb_wp;
+ int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;
+
+ sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
+ if (sb_zone + 1 >= zone_info->nr_zones)
+ continue;
+
+ ret = btrfs_get_dev_zones(device,
+ zone_start_physical(sb_zone, zone_info),
+ &zone_info->sb_zones[sb_pos],
+ &nr_zones);
+ if (ret)
+ goto out;
+
+ if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
+ btrfs_err_in_rcu(device->fs_info,
+ "zoned: failed to read super block log zone info at devid %llu zone %u",
+ device->devid, sb_zone);
+ ret = -EUCLEAN;
+ goto out;
+ }
+
+ /*
+ * If zones[0] is conventional, always use the beginning of the
+ * zone to record superblock. No need to validate in that case.
+ */
+ if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
+ BLK_ZONE_TYPE_CONVENTIONAL)
+ continue;
+
+ ret = sb_write_pointer(device->bdev,
+ &zone_info->sb_zones[sb_pos], &sb_wp);
+ if (ret != -ENOENT && ret) {
+ btrfs_err_in_rcu(device->fs_info,
+ "zoned: super block log zone corrupted devid %llu zone %u",
+ device->devid, sb_zone);
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+
+
+ kvfree(zones);
+
+ switch (bdev_zoned_model(bdev)) {
+ case BLK_ZONED_HM:
+ model = "host-managed zoned";
+ emulated = "";
+ break;
+ case BLK_ZONED_HA:
+ model = "host-aware zoned";
+ emulated = "";
+ break;
+ case BLK_ZONED_NONE:
+ model = "regular";
+ emulated = "emulated ";
+ break;
+ default:
+ /* Just in case */
+ btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s",
+ bdev_zoned_model(bdev),
+ rcu_str_deref(device->name));
+ ret = -EOPNOTSUPP;
+ goto out_free_zone_info;
+ }
+
+ btrfs_info_in_rcu(fs_info,
+ "%s block device %s, %u %szones of %llu bytes",
+ model, rcu_str_deref(device->name), zone_info->nr_zones,
+ emulated, zone_info->zone_size);
+
+ return 0;
+
+out:
+ kvfree(zones);
+out_free_zone_info:
+ btrfs_destroy_dev_zone_info(device);
+
+ return ret;
+}
+
+void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
+{
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
+
+ if (!zone_info)
+ return;
+
+ bitmap_free(zone_info->active_zones);
+ bitmap_free(zone_info->seq_zones);
+ bitmap_free(zone_info->empty_zones);
+ vfree(zone_info->zone_cache);
+ kfree(zone_info);
+ device->zone_info = NULL;
+}
+
+struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev)
+{
+ struct btrfs_zoned_device_info *zone_info;
+
+ zone_info = kmemdup(orig_dev->zone_info, sizeof(*zone_info), GFP_KERNEL);
+ if (!zone_info)
+ return NULL;
+
+ zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
+ if (!zone_info->seq_zones)
+ goto out;
+
+ bitmap_copy(zone_info->seq_zones, orig_dev->zone_info->seq_zones,
+ zone_info->nr_zones);
+
+ zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
+ if (!zone_info->empty_zones)
+ goto out;
+
+ bitmap_copy(zone_info->empty_zones, orig_dev->zone_info->empty_zones,
+ zone_info->nr_zones);
+
+ zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
+ if (!zone_info->active_zones)
+ goto out;
+
+ bitmap_copy(zone_info->active_zones, orig_dev->zone_info->active_zones,
+ zone_info->nr_zones);
+ zone_info->zone_cache = NULL;
+
+ return zone_info;
+
+out:
+ bitmap_free(zone_info->seq_zones);
+ bitmap_free(zone_info->empty_zones);
+ bitmap_free(zone_info->active_zones);
+ kfree(zone_info);
+ return NULL;
+}
+
+int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
+ struct blk_zone *zone)
+{
+ unsigned int nr_zones = 1;
+ int ret;
+
+ ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones);
+ if (ret != 0 || !nr_zones)
+ return ret ? ret : -EIO;
+
+ return 0;
+}
+
+static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_device *device;
+
+ list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
+ if (device->bdev &&
+ bdev_zoned_model(device->bdev) == BLK_ZONED_HM) {
+ btrfs_err(fs_info,
+ "zoned: mode not enabled but zoned device found: %pg",
+ device->bdev);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_device *device;
+ u64 zone_size = 0;
+ u64 max_zone_append_size = 0;
+ int ret;
+
+ /*
+ * Host-Managed devices can't be used without the ZONED flag. With the
+ * ZONED all devices can be used, using zone emulation if required.
+ */
+ if (!btrfs_fs_incompat(fs_info, ZONED))
+ return btrfs_check_for_zoned_device(fs_info);
+
+ list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
+
+ if (!device->bdev)
+ continue;
+
+ if (!zone_size) {
+ zone_size = zone_info->zone_size;
+ } else if (zone_info->zone_size != zone_size) {
+ btrfs_err(fs_info,
+ "zoned: unequal block device zone sizes: have %llu found %llu",
+ zone_info->zone_size, zone_size);
+ return -EINVAL;
+ }
+ if (!max_zone_append_size ||
+ (zone_info->max_zone_append_size &&
+ zone_info->max_zone_append_size < max_zone_append_size))
+ max_zone_append_size = zone_info->max_zone_append_size;
+ }
+
+ /*
+ * stripe_size is always aligned to BTRFS_STRIPE_LEN in
+ * btrfs_create_chunk(). Since we want stripe_len == zone_size,
+ * check the alignment here.
+ */
+ if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
+ btrfs_err(fs_info,
+ "zoned: zone size %llu not aligned to stripe %u",
+ zone_size, BTRFS_STRIPE_LEN);
+ return -EINVAL;
+ }
+
+ if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
+ btrfs_err(fs_info, "zoned: mixed block groups not supported");
+ return -EINVAL;
+ }
+
+ fs_info->zone_size = zone_size;
+ fs_info->max_zone_append_size = ALIGN_DOWN(max_zone_append_size,
+ fs_info->sectorsize);
+ fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
+ if (fs_info->max_zone_append_size < fs_info->max_extent_size)
+ fs_info->max_extent_size = fs_info->max_zone_append_size;
+
+ /*
+ * Check mount options here, because we might change fs_info->zoned
+ * from fs_info->zone_size.
+ */
+ ret = btrfs_check_mountopts_zoned(fs_info);
+ if (ret)
+ return ret;
+
+ btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
+ return 0;
+}
+
+int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
+{
+ if (!btrfs_is_zoned(info))
+ return 0;
+
+ /*
+ * Space cache writing is not COWed. Disable that to avoid write errors
+ * in sequential zones.
+ */
+ if (btrfs_test_opt(info, SPACE_CACHE)) {
+ btrfs_err(info, "zoned: space cache v1 is not supported");
+ return -EINVAL;
+ }
+
+ if (btrfs_test_opt(info, NODATACOW)) {
+ btrfs_err(info, "zoned: NODATACOW not supported");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
+ int rw, u64 *bytenr_ret)
+{
+ u64 wp;
+ int ret;
+
+ if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
+ *bytenr_ret = zones[0].start << SECTOR_SHIFT;
+ return 0;
+ }
+
+ ret = sb_write_pointer(bdev, zones, &wp);
+ if (ret != -ENOENT && ret < 0)
+ return ret;
+
+ if (rw == WRITE) {
+ struct blk_zone *reset = NULL;
+
+ if (wp == zones[0].start << SECTOR_SHIFT)
+ reset = &zones[0];
+ else if (wp == zones[1].start << SECTOR_SHIFT)
+ reset = &zones[1];
+
+ if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
+ ASSERT(sb_zone_is_full(reset));
+
+ ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
+ reset->start, reset->len,
+ GFP_NOFS);
+ if (ret)
+ return ret;
+
+ reset->cond = BLK_ZONE_COND_EMPTY;
+ reset->wp = reset->start;
+ }
+ } else if (ret != -ENOENT) {
+ /*
+ * For READ, we want the previous one. Move write pointer to
+ * the end of a zone, if it is at the head of a zone.
+ */
+ u64 zone_end = 0;
+
+ if (wp == zones[0].start << SECTOR_SHIFT)
+ zone_end = zones[1].start + zones[1].capacity;
+ else if (wp == zones[1].start << SECTOR_SHIFT)
+ zone_end = zones[0].start + zones[0].capacity;
+ if (zone_end)
+ wp = ALIGN_DOWN(zone_end << SECTOR_SHIFT,
+ BTRFS_SUPER_INFO_SIZE);
+
+ wp -= BTRFS_SUPER_INFO_SIZE;
+ }
+
+ *bytenr_ret = wp;
+ return 0;
+
+}
+
+int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
+ u64 *bytenr_ret)
+{
+ struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES];
+ sector_t zone_sectors;
+ u32 sb_zone;
+ int ret;
+ u8 zone_sectors_shift;
+ sector_t nr_sectors;
+ u32 nr_zones;
+
+ if (!bdev_is_zoned(bdev)) {
+ *bytenr_ret = btrfs_sb_offset(mirror);
+ return 0;
+ }
+
+ ASSERT(rw == READ || rw == WRITE);
+
+ zone_sectors = bdev_zone_sectors(bdev);
+ if (!is_power_of_2(zone_sectors))
+ return -EINVAL;
+ zone_sectors_shift = ilog2(zone_sectors);
+ nr_sectors = bdev_nr_sectors(bdev);
+ nr_zones = nr_sectors >> zone_sectors_shift;
+
+ sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
+ if (sb_zone + 1 >= nr_zones)
+ return -ENOENT;
+
+ ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev),
+ BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
+ zones);
+ if (ret < 0)
+ return ret;
+ if (ret != BTRFS_NR_SB_LOG_ZONES)
+ return -EIO;
+
+ return sb_log_location(bdev, zones, rw, bytenr_ret);
+}
+
+int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
+ u64 *bytenr_ret)
+{
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ u32 zone_num;
+
+ /*
+ * For a zoned filesystem on a non-zoned block device, use the same
+ * super block locations as regular filesystem. Doing so, the super
+ * block can always be retrieved and the zoned flag of the volume
+ * detected from the super block information.
+ */
+ if (!bdev_is_zoned(device->bdev)) {
+ *bytenr_ret = btrfs_sb_offset(mirror);
+ return 0;
+ }
+
+ zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
+ if (zone_num + 1 >= zinfo->nr_zones)
+ return -ENOENT;
+
+ return sb_log_location(device->bdev,
+ &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror],
+ rw, bytenr_ret);
+}
+
+static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
+ int mirror)
+{
+ u32 zone_num;
+
+ if (!zinfo)
+ return false;
+
+ zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
+ if (zone_num + 1 >= zinfo->nr_zones)
+ return false;
+
+ if (!test_bit(zone_num, zinfo->seq_zones))
+ return false;
+
+ return true;
+}
+
+int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
+{
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ struct blk_zone *zone;
+ int i;
+
+ if (!is_sb_log_zone(zinfo, mirror))
+ return 0;
+
+ zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
+ for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
+ /* Advance the next zone */
+ if (zone->cond == BLK_ZONE_COND_FULL) {
+ zone++;
+ continue;
+ }
+
+ if (zone->cond == BLK_ZONE_COND_EMPTY)
+ zone->cond = BLK_ZONE_COND_IMP_OPEN;
+
+ zone->wp += SUPER_INFO_SECTORS;
+
+ if (sb_zone_is_full(zone)) {
+ /*
+ * No room left to write new superblock. Since
+ * superblock is written with REQ_SYNC, it is safe to
+ * finish the zone now.
+ *
+ * If the write pointer is exactly at the capacity,
+ * explicit ZONE_FINISH is not necessary.
+ */
+ if (zone->wp != zone->start + zone->capacity) {
+ int ret;
+
+ ret = blkdev_zone_mgmt(device->bdev,
+ REQ_OP_ZONE_FINISH, zone->start,
+ zone->len, GFP_NOFS);
+ if (ret)
+ return ret;
+ }
+
+ zone->wp = zone->start + zone->len;
+ zone->cond = BLK_ZONE_COND_FULL;
+ }
+ return 0;
+ }
+
+ /* All the zones are FULL. Should not reach here. */
+ ASSERT(0);
+ return -EIO;
+}
+
+int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
+{
+ sector_t zone_sectors;
+ sector_t nr_sectors;
+ u8 zone_sectors_shift;
+ u32 sb_zone;
+ u32 nr_zones;
+
+ zone_sectors = bdev_zone_sectors(bdev);
+ zone_sectors_shift = ilog2(zone_sectors);
+ nr_sectors = bdev_nr_sectors(bdev);
+ nr_zones = nr_sectors >> zone_sectors_shift;
+
+ sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
+ if (sb_zone + 1 >= nr_zones)
+ return -ENOENT;
+
+ return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
+ zone_start_sector(sb_zone, bdev),
+ zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
+}
+
+/**
+ * btrfs_find_allocatable_zones - find allocatable zones within a given region
+ *
+ * @device: the device to allocate a region on
+ * @hole_start: the position of the hole to allocate the region
+ * @num_bytes: size of wanted region
+ * @hole_end: the end of the hole
+ * @return: position of allocatable zones
+ *
+ * Allocatable region should not contain any superblock locations.
+ */
+u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
+ u64 hole_end, u64 num_bytes)
+{
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ const u8 shift = zinfo->zone_size_shift;
+ u64 nzones = num_bytes >> shift;
+ u64 pos = hole_start;
+ u64 begin, end;
+ bool have_sb;
+ int i;
+
+ ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
+ ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));
+
+ while (pos < hole_end) {
+ begin = pos >> shift;
+ end = begin + nzones;
+
+ if (end > zinfo->nr_zones)
+ return hole_end;
+
+ /* Check if zones in the region are all empty */
+ if (btrfs_dev_is_sequential(device, pos) &&
+ find_next_zero_bit(zinfo->empty_zones, end, begin) != end) {
+ pos += zinfo->zone_size;
+ continue;
+ }
+
+ have_sb = false;
+ for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+ u32 sb_zone;
+ u64 sb_pos;
+
+ sb_zone = sb_zone_number(shift, i);
+ if (!(end <= sb_zone ||
+ sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
+ have_sb = true;
+ pos = zone_start_physical(
+ sb_zone + BTRFS_NR_SB_LOG_ZONES, zinfo);
+ break;
+ }
+
+ /* We also need to exclude regular superblock positions */
+ sb_pos = btrfs_sb_offset(i);
+ if (!(pos + num_bytes <= sb_pos ||
+ sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) {
+ have_sb = true;
+ pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,
+ zinfo->zone_size);
+ break;
+ }
+ }
+ if (!have_sb)
+ break;
+ }
+
+ return pos;
+}
+
+static bool btrfs_dev_set_active_zone(struct btrfs_device *device, u64 pos)
+{
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
+ unsigned int zno = (pos >> zone_info->zone_size_shift);
+
+ /* We can use any number of zones */
+ if (zone_info->max_active_zones == 0)
+ return true;
+
+ if (!test_bit(zno, zone_info->active_zones)) {
+ /* Active zone left? */
+ if (atomic_dec_if_positive(&zone_info->active_zones_left) < 0)
+ return false;
+ if (test_and_set_bit(zno, zone_info->active_zones)) {
+ /* Someone already set the bit */
+ atomic_inc(&zone_info->active_zones_left);
+ }
+ }
+
+ return true;
+}
+
+static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
+{
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
+ unsigned int zno = (pos >> zone_info->zone_size_shift);
+
+ /* We can use any number of zones */
+ if (zone_info->max_active_zones == 0)
+ return;
+
+ if (test_and_clear_bit(zno, zone_info->active_zones))
+ atomic_inc(&zone_info->active_zones_left);
+}
+
+int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
+ u64 length, u64 *bytes)
+{
+ int ret;
+
+ *bytes = 0;
+ ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
+ physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
+ GFP_NOFS);
+ if (ret)
+ return ret;
+
+ *bytes = length;
+ while (length) {
+ btrfs_dev_set_zone_empty(device, physical);
+ btrfs_dev_clear_active_zone(device, physical);
+ physical += device->zone_info->zone_size;
+ length -= device->zone_info->zone_size;
+ }
+
+ return 0;
+}
+
+int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
+{
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ const u8 shift = zinfo->zone_size_shift;
+ unsigned long begin = start >> shift;
+ unsigned long end = (start + size) >> shift;
+ u64 pos;
+ int ret;
+
+ ASSERT(IS_ALIGNED(start, zinfo->zone_size));
+ ASSERT(IS_ALIGNED(size, zinfo->zone_size));
+
+ if (end > zinfo->nr_zones)
+ return -ERANGE;
+
+ /* All the zones are conventional */
+ if (find_next_bit(zinfo->seq_zones, end, begin) == end)
+ return 0;
+
+ /* All the zones are sequential and empty */
+ if (find_next_zero_bit(zinfo->seq_zones, end, begin) == end &&
+ find_next_zero_bit(zinfo->empty_zones, end, begin) == end)
+ return 0;
+
+ for (pos = start; pos < start + size; pos += zinfo->zone_size) {
+ u64 reset_bytes;
+
+ if (!btrfs_dev_is_sequential(device, pos) ||
+ btrfs_dev_is_empty_zone(device, pos))
+ continue;
+
+ /* Free regions should be empty */
+ btrfs_warn_in_rcu(
+ device->fs_info,
+ "zoned: resetting device %s (devid %llu) zone %llu for allocation",
+ rcu_str_deref(device->name), device->devid, pos >> shift);
+ WARN_ON_ONCE(1);
+
+ ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
+ &reset_bytes);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * Calculate an allocation pointer from the extent allocation information
+ * for a block group consist of conventional zones. It is pointed to the
+ * end of the highest addressed extent in the block group as an allocation
+ * offset.
+ */
+static int calculate_alloc_pointer(struct btrfs_block_group *cache,
+ u64 *offset_ret, bool new)
+{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct btrfs_root *root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ int ret;
+ u64 length;
+
+ /*
+ * Avoid tree lookups for a new block group, there's no use for it.
+ * It must always be 0.
+ *
+ * Also, we have a lock chain of extent buffer lock -> chunk mutex.
+ * For new a block group, this function is called from
+ * btrfs_make_block_group() which is already taking the chunk mutex.
+ * Thus, we cannot call calculate_alloc_pointer() which takes extent
+ * buffer locks to avoid deadlock.
+ */
+ if (new) {
+ *offset_ret = 0;
+ return 0;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = cache->start + cache->length;
+ key.type = 0;
+ key.offset = 0;
+
+ root = btrfs_extent_root(fs_info, key.objectid);
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ /* We should not find the exact match */
+ if (!ret)
+ ret = -EUCLEAN;
+ if (ret < 0)
+ goto out;
+
+ ret = btrfs_previous_extent_item(root, path, cache->start);
+ if (ret) {
+ if (ret == 1) {
+ ret = 0;
+ *offset_ret = 0;
+ }
+ goto out;
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
+
+ if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
+ length = found_key.offset;
+ else
+ length = fs_info->nodesize;
+
+ if (!(found_key.objectid >= cache->start &&
+ found_key.objectid + length <= cache->start + cache->length)) {
+ ret = -EUCLEAN;
+ goto out;
+ }
+ *offset_ret = found_key.objectid + length - cache->start;
+ ret = 0;
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
+{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct btrfs_device *device;
+ u64 logical = cache->start;
+ u64 length = cache->length;
+ int ret;
+ int i;
+ unsigned int nofs_flag;
+ u64 *alloc_offsets = NULL;
+ u64 *caps = NULL;
+ u64 *physical = NULL;
+ unsigned long *active = NULL;
+ u64 last_alloc = 0;
+ u32 num_sequential = 0, num_conventional = 0;
+
+ if (!btrfs_is_zoned(fs_info))
+ return 0;
+
+ /* Sanity check */
+ if (!IS_ALIGNED(length, fs_info->zone_size)) {
+ btrfs_err(fs_info,
+ "zoned: block group %llu len %llu unaligned to zone size %llu",
+ logical, length, fs_info->zone_size);
+ return -EIO;
+ }
+
+ /* Get the chunk mapping */
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, logical, length);
+ read_unlock(&em_tree->lock);
+
+ if (!em)
+ return -EINVAL;
+
+ map = em->map_lookup;
+
+ cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS);
+ if (!cache->physical_map) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS);
+ if (!alloc_offsets) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ caps = kcalloc(map->num_stripes, sizeof(*caps), GFP_NOFS);
+ if (!caps) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ physical = kcalloc(map->num_stripes, sizeof(*physical), GFP_NOFS);
+ if (!physical) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ active = bitmap_zalloc(map->num_stripes, GFP_NOFS);
+ if (!active) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < map->num_stripes; i++) {
+ bool is_sequential;
+ struct blk_zone zone;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ int dev_replace_is_ongoing = 0;
+
+ device = map->stripes[i].dev;
+ physical[i] = map->stripes[i].physical;
+
+ if (device->bdev == NULL) {
+ alloc_offsets[i] = WP_MISSING_DEV;
+ continue;
+ }
+
+ is_sequential = btrfs_dev_is_sequential(device, physical[i]);
+ if (is_sequential)
+ num_sequential++;
+ else
+ num_conventional++;
+
+ /*
+ * Consider a zone as active if we can allow any number of
+ * active zones.
+ */
+ if (!device->zone_info->max_active_zones)
+ __set_bit(i, active);
+
+ if (!is_sequential) {
+ alloc_offsets[i] = WP_CONVENTIONAL;
+ continue;
+ }
+
+ /*
+ * This zone will be used for allocation, so mark this zone
+ * non-empty.
+ */
+ btrfs_dev_clear_zone_empty(device, physical[i]);
+
+ down_read(&dev_replace->rwsem);
+ dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
+ if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
+ btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical[i]);
+ up_read(&dev_replace->rwsem);
+
+ /*
+ * The group is mapped to a sequential zone. Get the zone write
+ * pointer to determine the allocation offset within the zone.
+ */
+ WARN_ON(!IS_ALIGNED(physical[i], fs_info->zone_size));
+ nofs_flag = memalloc_nofs_save();
+ ret = btrfs_get_dev_zone(device, physical[i], &zone);
+ memalloc_nofs_restore(nofs_flag);
+ if (ret == -EIO || ret == -EOPNOTSUPP) {
+ ret = 0;
+ alloc_offsets[i] = WP_MISSING_DEV;
+ continue;
+ } else if (ret) {
+ goto out;
+ }
+
+ if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
+ btrfs_err_in_rcu(fs_info,
+ "zoned: unexpected conventional zone %llu on device %s (devid %llu)",
+ zone.start << SECTOR_SHIFT,
+ rcu_str_deref(device->name), device->devid);
+ ret = -EIO;
+ goto out;
+ }
+
+ caps[i] = (zone.capacity << SECTOR_SHIFT);
+
+ switch (zone.cond) {
+ case BLK_ZONE_COND_OFFLINE:
+ case BLK_ZONE_COND_READONLY:
+ btrfs_err(fs_info,
+ "zoned: offline/readonly zone %llu on device %s (devid %llu)",
+ physical[i] >> device->zone_info->zone_size_shift,
+ rcu_str_deref(device->name), device->devid);
+ alloc_offsets[i] = WP_MISSING_DEV;
+ break;
+ case BLK_ZONE_COND_EMPTY:
+ alloc_offsets[i] = 0;
+ break;
+ case BLK_ZONE_COND_FULL:
+ alloc_offsets[i] = caps[i];
+ break;
+ default:
+ /* Partially used zone */
+ alloc_offsets[i] =
+ ((zone.wp - zone.start) << SECTOR_SHIFT);
+ __set_bit(i, active);
+ break;
+ }
+ }
+
+ if (num_sequential > 0)
+ set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
+
+ if (num_conventional > 0) {
+ /* Zone capacity is always zone size in emulation */
+ cache->zone_capacity = cache->length;
+ ret = calculate_alloc_pointer(cache, &last_alloc, new);
+ if (ret) {
+ btrfs_err(fs_info,
+ "zoned: failed to determine allocation offset of bg %llu",
+ cache->start);
+ goto out;
+ } else if (map->num_stripes == num_conventional) {
+ cache->alloc_offset = last_alloc;
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
+ goto out;
+ }
+ }
+
+ switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ case 0: /* single */
+ if (alloc_offsets[0] == WP_MISSING_DEV) {
+ btrfs_err(fs_info,
+ "zoned: cannot recover write pointer for zone %llu",
+ physical[0]);
+ ret = -EIO;
+ goto out;
+ }
+ cache->alloc_offset = alloc_offsets[0];
+ cache->zone_capacity = caps[0];
+ if (test_bit(0, active))
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
+ break;
+ case BTRFS_BLOCK_GROUP_DUP:
+ if (map->type & BTRFS_BLOCK_GROUP_DATA) {
+ btrfs_err(fs_info, "zoned: profile DUP not yet supported on data bg");
+ ret = -EINVAL;
+ goto out;
+ }
+ if (alloc_offsets[0] == WP_MISSING_DEV) {
+ btrfs_err(fs_info,
+ "zoned: cannot recover write pointer for zone %llu",
+ physical[0]);
+ ret = -EIO;
+ goto out;
+ }
+ if (alloc_offsets[1] == WP_MISSING_DEV) {
+ btrfs_err(fs_info,
+ "zoned: cannot recover write pointer for zone %llu",
+ physical[1]);
+ ret = -EIO;
+ goto out;
+ }
+ if (alloc_offsets[0] != alloc_offsets[1]) {
+ btrfs_err(fs_info,
+ "zoned: write pointer offset mismatch of zones in DUP profile");
+ ret = -EIO;
+ goto out;
+ }
+ if (test_bit(0, active) != test_bit(1, active)) {
+ if (!btrfs_zone_activate(cache)) {
+ ret = -EIO;
+ goto out;
+ }
+ } else {
+ if (test_bit(0, active))
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ &cache->runtime_flags);
+ }
+ cache->alloc_offset = alloc_offsets[0];
+ cache->zone_capacity = min(caps[0], caps[1]);
+ break;
+ case BTRFS_BLOCK_GROUP_RAID1:
+ case BTRFS_BLOCK_GROUP_RAID0:
+ case BTRFS_BLOCK_GROUP_RAID10:
+ case BTRFS_BLOCK_GROUP_RAID5:
+ case BTRFS_BLOCK_GROUP_RAID6:
+ /* non-single profiles are not supported yet */
+ default:
+ btrfs_err(fs_info, "zoned: profile %s not yet supported",
+ btrfs_bg_type_to_raid_name(map->type));
+ ret = -EINVAL;
+ goto out;
+ }
+
+out:
+ if (cache->alloc_offset > fs_info->zone_size) {
+ btrfs_err(fs_info,
+ "zoned: invalid write pointer %llu in block group %llu",
+ cache->alloc_offset, cache->start);
+ ret = -EIO;
+ }
+
+ if (cache->alloc_offset > cache->zone_capacity) {
+ btrfs_err(fs_info,
+"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
+ cache->alloc_offset, cache->zone_capacity,
+ cache->start);
+ ret = -EIO;
+ }
+
+ /* An extent is allocated after the write pointer */
+ if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
+ btrfs_err(fs_info,
+ "zoned: got wrong write pointer in BG %llu: %llu > %llu",
+ logical, last_alloc, cache->alloc_offset);
+ ret = -EIO;
+ }
+
+ if (!ret) {
+ cache->meta_write_pointer = cache->alloc_offset + cache->start;
+ if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags)) {
+ btrfs_get_block_group(cache);
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ list_add_tail(&cache->active_bg_list,
+ &fs_info->zone_active_bgs);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+ }
+ } else {
+ kfree(cache->physical_map);
+ cache->physical_map = NULL;
+ }
+ bitmap_free(active);
+ kfree(physical);
+ kfree(caps);
+ kfree(alloc_offsets);
+ free_extent_map(em);
+
+ return ret;
+}
+
+void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
+{
+ u64 unusable, free;
+
+ if (!btrfs_is_zoned(cache->fs_info))
+ return;
+
+ WARN_ON(cache->bytes_super != 0);
+ unusable = (cache->alloc_offset - cache->used) +
+ (cache->length - cache->zone_capacity);
+ free = cache->zone_capacity - cache->alloc_offset;
+
+ /* We only need ->free_space in ALLOC_SEQ block groups */
+ cache->cached = BTRFS_CACHE_FINISHED;
+ cache->free_space_ctl->free_space = free;
+ cache->zone_unusable = unusable;
+}
+
+void btrfs_redirty_list_add(struct btrfs_transaction *trans,
+ struct extent_buffer *eb)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+
+ if (!btrfs_is_zoned(fs_info) ||
+ btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN) ||
+ !list_empty(&eb->release_list))
+ return;
+
+ memzero_extent_buffer(eb, 0, eb->len);
+ set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);
+ set_extent_buffer_dirty(eb);
+ set_extent_bits_nowait(&trans->dirty_pages, eb->start,
+ eb->start + eb->len - 1, EXTENT_DIRTY);
+
+ spin_lock(&trans->releasing_ebs_lock);
+ list_add_tail(&eb->release_list, &trans->releasing_ebs);
+ spin_unlock(&trans->releasing_ebs_lock);
+ atomic_inc(&eb->refs);
+}
+
+void btrfs_free_redirty_list(struct btrfs_transaction *trans)
+{
+ spin_lock(&trans->releasing_ebs_lock);
+ while (!list_empty(&trans->releasing_ebs)) {
+ struct extent_buffer *eb;
+
+ eb = list_first_entry(&trans->releasing_ebs,
+ struct extent_buffer, release_list);
+ list_del_init(&eb->release_list);
+ free_extent_buffer(eb);
+ }
+ spin_unlock(&trans->releasing_ebs_lock);
+}
+
+bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_block_group *cache;
+ bool ret = false;
+
+ if (!btrfs_is_zoned(fs_info))
+ return false;
+
+ if (!is_data_inode(&inode->vfs_inode))
+ return false;
+
+ /*
+ * Using REQ_OP_ZONE_APPNED for relocation can break assumptions on the
+ * extent layout the relocation code has.
+ * Furthermore we have set aside own block-group from which only the
+ * relocation "process" can allocate and make sure only one process at a
+ * time can add pages to an extent that gets relocated, so it's safe to
+ * use regular REQ_OP_WRITE for this special case.
+ */
+ if (btrfs_is_data_reloc_root(inode->root))
+ return false;
+
+ cache = btrfs_lookup_block_group(fs_info, start);
+ ASSERT(cache);
+ if (!cache)
+ return false;
+
+ ret = !!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
+ btrfs_put_block_group(cache);
+
+ return ret;
+}
+
+void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset,
+ struct bio *bio)
+{
+ struct btrfs_ordered_extent *ordered;
+ const u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+
+ if (bio_op(bio) != REQ_OP_ZONE_APPEND)
+ return;
+
+ ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_offset);
+ if (WARN_ON(!ordered))
+ return;
+
+ ordered->physical = physical;
+ ordered->bdev = bio->bi_bdev;
+
+ btrfs_put_ordered_extent(ordered);
+}
+
+void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
+{
+ struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct btrfs_ordered_sum *sum;
+ u64 orig_logical = ordered->disk_bytenr;
+ u64 *logical = NULL;
+ int nr, stripe_len;
+
+ /* Zoned devices should not have partitions. So, we can assume it is 0 */
+ ASSERT(!bdev_is_partition(ordered->bdev));
+ if (WARN_ON(!ordered->bdev))
+ return;
+
+ if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, ordered->bdev,
+ ordered->physical, &logical, &nr,
+ &stripe_len)))
+ goto out;
+
+ WARN_ON(nr != 1);
+
+ if (orig_logical == *logical)
+ goto out;
+
+ ordered->disk_bytenr = *logical;
+
+ em_tree = &inode->extent_tree;
+ write_lock(&em_tree->lock);
+ em = search_extent_mapping(em_tree, ordered->file_offset,
+ ordered->num_bytes);
+ em->block_start = *logical;
+ free_extent_map(em);
+ write_unlock(&em_tree->lock);
+
+ list_for_each_entry(sum, &ordered->list, list) {
+ if (*logical < orig_logical)
+ sum->bytenr -= orig_logical - *logical;
+ else
+ sum->bytenr += *logical - orig_logical;
+ }
+
+out:
+ kfree(logical);
+}
+
+bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb,
+ struct btrfs_block_group **cache_ret)
+{
+ struct btrfs_block_group *cache;
+ bool ret = true;
+
+ if (!btrfs_is_zoned(fs_info))
+ return true;
+
+ cache = btrfs_lookup_block_group(fs_info, eb->start);
+ if (!cache)
+ return true;
+
+ if (cache->meta_write_pointer != eb->start) {
+ btrfs_put_block_group(cache);
+ cache = NULL;
+ ret = false;
+ } else {
+ cache->meta_write_pointer = eb->start + eb->len;
+ }
+
+ *cache_ret = cache;
+
+ return ret;
+}
+
+void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
+ struct extent_buffer *eb)
+{
+ if (!btrfs_is_zoned(eb->fs_info) || !cache)
+ return;
+
+ ASSERT(cache->meta_write_pointer == eb->start + eb->len);
+ cache->meta_write_pointer = eb->start;
+}
+
+int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length)
+{
+ if (!btrfs_dev_is_sequential(device, physical))
+ return -EOPNOTSUPP;
+
+ return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT,
+ length >> SECTOR_SHIFT, GFP_NOFS, 0);
+}
+
+static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
+ struct blk_zone *zone)
+{
+ struct btrfs_io_context *bioc = NULL;
+ u64 mapped_length = PAGE_SIZE;
+ unsigned int nofs_flag;
+ int nmirrors;
+ int i, ret;
+
+ ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
+ &mapped_length, &bioc);
+ if (ret || !bioc || mapped_length < PAGE_SIZE) {
+ ret = -EIO;
+ goto out_put_bioc;
+ }
+
+ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ ret = -EINVAL;
+ goto out_put_bioc;
+ }
+
+ nofs_flag = memalloc_nofs_save();
+ nmirrors = (int)bioc->num_stripes;
+ for (i = 0; i < nmirrors; i++) {
+ u64 physical = bioc->stripes[i].physical;
+ struct btrfs_device *dev = bioc->stripes[i].dev;
+
+ /* Missing device */
+ if (!dev->bdev)
+ continue;
+
+ ret = btrfs_get_dev_zone(dev, physical, zone);
+ /* Failing device */
+ if (ret == -EIO || ret == -EOPNOTSUPP)
+ continue;
+ break;
+ }
+ memalloc_nofs_restore(nofs_flag);
+out_put_bioc:
+ btrfs_put_bioc(bioc);
+ return ret;
+}
+
+/*
+ * Synchronize write pointer in a zone at @physical_start on @tgt_dev, by
+ * filling zeros between @physical_pos to a write pointer of dev-replace
+ * source device.
+ */
+int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
+ u64 physical_start, u64 physical_pos)
+{
+ struct btrfs_fs_info *fs_info = tgt_dev->fs_info;
+ struct blk_zone zone;
+ u64 length;
+ u64 wp;
+ int ret;
+
+ if (!btrfs_dev_is_sequential(tgt_dev, physical_pos))
+ return 0;
+
+ ret = read_zone_info(fs_info, logical, &zone);
+ if (ret)
+ return ret;
+
+ wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT);
+
+ if (physical_pos == wp)
+ return 0;
+
+ if (physical_pos > wp)
+ return -EUCLEAN;
+
+ length = wp - physical_pos;
+ return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length);
+}
+
+struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
+{
+ struct btrfs_device *device;
+ struct extent_map *em;
+ struct map_lookup *map;
+
+ em = btrfs_get_chunk_map(fs_info, logical, length);
+ if (IS_ERR(em))
+ return ERR_CAST(em);
+
+ map = em->map_lookup;
+ /* We only support single profile for now */
+ device = map->stripes[0].dev;
+
+ free_extent_map(em);
+
+ return device;
+}
+
+/**
+ * Activate block group and underlying device zones
+ *
+ * @block_group: the block group to activate
+ *
+ * Return: true on success, false otherwise
+ */
+bool btrfs_zone_activate(struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct map_lookup *map;
+ struct btrfs_device *device;
+ u64 physical;
+ bool ret;
+ int i;
+
+ if (!btrfs_is_zoned(block_group->fs_info))
+ return true;
+
+ map = block_group->physical_map;
+
+ spin_lock(&block_group->lock);
+ if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
+ ret = true;
+ goto out_unlock;
+ }
+
+ /* No space left */
+ if (btrfs_zoned_bg_is_full(block_group)) {
+ ret = false;
+ goto out_unlock;
+ }
+
+ for (i = 0; i < map->num_stripes; i++) {
+ device = map->stripes[i].dev;
+ physical = map->stripes[i].physical;
+
+ if (device->zone_info->max_active_zones == 0)
+ continue;
+
+ if (!btrfs_dev_set_active_zone(device, physical)) {
+ /* Cannot activate the zone */
+ ret = false;
+ goto out_unlock;
+ }
+ }
+
+ /* Successfully activated all the zones */
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
+ spin_unlock(&block_group->lock);
+
+ /* For the active block group list */
+ btrfs_get_block_group(block_group);
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+
+ return true;
+
+out_unlock:
+ spin_unlock(&block_group->lock);
+ return ret;
+}
+
+static void wait_eb_writebacks(struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ const u64 end = block_group->start + block_group->length;
+ struct radix_tree_iter iter;
+ struct extent_buffer *eb;
+ void __rcu **slot;
+
+ rcu_read_lock();
+ radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter,
+ block_group->start >> fs_info->sectorsize_bits) {
+ eb = radix_tree_deref_slot(slot);
+ if (!eb)
+ continue;
+ if (radix_tree_deref_retry(eb)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
+
+ if (eb->start < block_group->start)
+ continue;
+ if (eb->start >= end)
+ break;
+
+ slot = radix_tree_iter_resume(slot, &iter);
+ rcu_read_unlock();
+ wait_on_extent_buffer_writeback(eb);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+}
+
+static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct map_lookup *map;
+ const bool is_metadata = (block_group->flags &
+ (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM));
+ int ret = 0;
+ int i;
+
+ spin_lock(&block_group->lock);
+ if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
+
+ /* Check if we have unwritten allocated space */
+ if (is_metadata &&
+ block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) {
+ spin_unlock(&block_group->lock);
+ return -EAGAIN;
+ }
+
+ /*
+ * If we are sure that the block group is full (= no more room left for
+ * new allocation) and the IO for the last usable block is completed, we
+ * don't need to wait for the other IOs. This holds because we ensure
+ * the sequential IO submissions using the ZONE_APPEND command for data
+ * and block_group->meta_write_pointer for metadata.
+ */
+ if (!fully_written) {
+ if (test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
+ spin_unlock(&block_group->lock);
+ return -EAGAIN;
+ }
+ spin_unlock(&block_group->lock);
+
+ ret = btrfs_inc_block_group_ro(block_group, false);
+ if (ret)
+ return ret;
+
+ /* Ensure all writes in this block group finish */
+ btrfs_wait_block_group_reservations(block_group);
+ /* No need to wait for NOCOW writers. Zoned mode does not allow that */
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
+ block_group->length);
+ /* Wait for extent buffers to be written. */
+ if (is_metadata)
+ wait_eb_writebacks(block_group);
+
+ spin_lock(&block_group->lock);
+
+ /*
+ * Bail out if someone already deactivated the block group, or
+ * allocated space is left in the block group.
+ */
+ if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ &block_group->runtime_flags)) {
+ spin_unlock(&block_group->lock);
+ btrfs_dec_block_group_ro(block_group);
+ return 0;
+ }
+
+ if (block_group->reserved ||
+ test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
+ &block_group->runtime_flags)) {
+ spin_unlock(&block_group->lock);
+ btrfs_dec_block_group_ro(block_group);
+ return -EAGAIN;
+ }
+ }
+
+ clear_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
+ block_group->alloc_offset = block_group->zone_capacity;
+ block_group->free_space_ctl->free_space = 0;
+ btrfs_clear_treelog_bg(block_group);
+ btrfs_clear_data_reloc_bg(block_group);
+ spin_unlock(&block_group->lock);
+
+ map = block_group->physical_map;
+ for (i = 0; i < map->num_stripes; i++) {
+ struct btrfs_device *device = map->stripes[i].dev;
+ const u64 physical = map->stripes[i].physical;
+
+ if (device->zone_info->max_active_zones == 0)
+ continue;
+
+ ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
+ physical >> SECTOR_SHIFT,
+ device->zone_info->zone_size >> SECTOR_SHIFT,
+ GFP_NOFS);
+
+ if (ret)
+ return ret;
+
+ btrfs_dev_clear_active_zone(device, physical);
+ }
+
+ if (!fully_written)
+ btrfs_dec_block_group_ro(block_group);
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ ASSERT(!list_empty(&block_group->active_bg_list));
+ list_del_init(&block_group->active_bg_list);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+
+ /* For active_bg_list */
+ btrfs_put_block_group(block_group);
+
+ clear_and_wake_up_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);
+
+ return 0;
+}
+
+int btrfs_zone_finish(struct btrfs_block_group *block_group)
+{
+ if (!btrfs_is_zoned(block_group->fs_info))
+ return 0;
+
+ return do_zone_finish(block_group, false);
+}
+
+bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
+{
+ struct btrfs_fs_info *fs_info = fs_devices->fs_info;
+ struct btrfs_device *device;
+ bool ret = false;
+
+ if (!btrfs_is_zoned(fs_info))
+ return true;
+
+ /* Check if there is a device with active zones left */
+ mutex_lock(&fs_info->chunk_mutex);
+ list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+
+ if (!device->bdev)
+ continue;
+
+ if (!zinfo->max_active_zones) {
+ ret = true;
+ break;
+ }
+
+ switch (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ case 0: /* single */
+ ret = (atomic_read(&zinfo->active_zones_left) >= 1);
+ break;
+ case BTRFS_BLOCK_GROUP_DUP:
+ ret = (atomic_read(&zinfo->active_zones_left) >= 2);
+ break;
+ }
+ if (ret)
+ break;
+ }
+ mutex_unlock(&fs_info->chunk_mutex);
+
+ if (!ret)
+ set_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);
+
+ return ret;
+}
+
+void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
+{
+ struct btrfs_block_group *block_group;
+ u64 min_alloc_bytes;
+
+ if (!btrfs_is_zoned(fs_info))
+ return;
+
+ block_group = btrfs_lookup_block_group(fs_info, logical);
+ ASSERT(block_group);
+
+ /* No MIXED_BG on zoned btrfs. */
+ if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
+ min_alloc_bytes = fs_info->sectorsize;
+ else
+ min_alloc_bytes = fs_info->nodesize;
+
+ /* Bail out if we can allocate more data from this block group. */
+ if (logical + length + min_alloc_bytes <=
+ block_group->start + block_group->zone_capacity)
+ goto out;
+
+ do_zone_finish(block_group, true);
+
+out:
+ btrfs_put_block_group(block_group);
+}
+
+static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
+{
+ struct btrfs_block_group *bg =
+ container_of(work, struct btrfs_block_group, zone_finish_work);
+
+ wait_on_extent_buffer_writeback(bg->last_eb);
+ free_extent_buffer(bg->last_eb);
+ btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length);
+ btrfs_put_block_group(bg);
+}
+
+void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
+ struct extent_buffer *eb)
+{
+ if (!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &bg->runtime_flags) ||
+ eb->start + eb->len * 2 <= bg->start + bg->zone_capacity)
+ return;
+
+ if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) {
+ btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing",
+ bg->start);
+ return;
+ }
+
+ /* For the work */
+ btrfs_get_block_group(bg);
+ atomic_inc(&eb->refs);
+ bg->last_eb = eb;
+ INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
+ queue_work(system_unbound_wq, &bg->zone_finish_work);
+}
+
+void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ spin_lock(&fs_info->relocation_bg_lock);
+ if (fs_info->data_reloc_bg == bg->start)
+ fs_info->data_reloc_bg = 0;
+ spin_unlock(&fs_info->relocation_bg_lock);
+}
+
+void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
+
+ if (!btrfs_is_zoned(fs_info))
+ return;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (device->zone_info) {
+ vfree(device->zone_info->zone_cache);
+ device->zone_info->zone_cache = NULL;
+ }
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+}
+
+bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
+ u64 used = 0;
+ u64 total = 0;
+ u64 factor;
+
+ ASSERT(btrfs_is_zoned(fs_info));
+
+ if (fs_info->bg_reclaim_threshold == 0)
+ return false;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (!device->bdev)
+ continue;
+
+ total += device->disk_total_bytes;
+ used += device->bytes_used;
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ factor = div64_u64(used * 100, total);
+ return factor >= fs_info->bg_reclaim_threshold;
+}
+
+void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
+ u64 length)
+{
+ struct btrfs_block_group *block_group;
+
+ if (!btrfs_is_zoned(fs_info))
+ return;
+
+ block_group = btrfs_lookup_block_group(fs_info, logical);
+ /* It should be called on a previous data relocation block group. */
+ ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA));
+
+ spin_lock(&block_group->lock);
+ if (!test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))
+ goto out;
+
+ /* All relocation extents are written. */
+ if (block_group->start + block_group->alloc_offset == logical + length) {
+ /*
+ * Now, release this block group for further allocations and
+ * zone finish.
+ */
+ clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
+ &block_group->runtime_flags);
+ }
+
+out:
+ spin_unlock(&block_group->lock);
+ btrfs_put_block_group(block_group);
+}
+
+int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_group *block_group;
+ struct btrfs_block_group *min_bg = NULL;
+ u64 min_avail = U64_MAX;
+ int ret;
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ list_for_each_entry(block_group, &fs_info->zone_active_bgs,
+ active_bg_list) {
+ u64 avail;
+
+ spin_lock(&block_group->lock);
+ if (block_group->reserved || block_group->alloc_offset == 0 ||
+ (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) ||
+ test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
+ spin_unlock(&block_group->lock);
+ continue;
+ }
+
+ avail = block_group->zone_capacity - block_group->alloc_offset;
+ if (min_avail > avail) {
+ if (min_bg)
+ btrfs_put_block_group(min_bg);
+ min_bg = block_group;
+ min_avail = avail;
+ btrfs_get_block_group(min_bg);
+ }
+ spin_unlock(&block_group->lock);
+ }
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+
+ if (!min_bg)
+ return 0;
+
+ ret = btrfs_zone_finish(min_bg);
+ btrfs_put_block_group(min_bg);
+
+ return ret < 0 ? ret : 1;
+}
+
+int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ bool do_finish)
+{
+ struct btrfs_block_group *bg;
+ int index;
+
+ if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
+ return 0;
+
+ /* No more block groups to activate */
+ if (space_info->active_total_bytes == space_info->total_bytes)
+ return 0;
+
+ for (;;) {
+ int ret;
+ bool need_finish = false;
+
+ down_read(&space_info->groups_sem);
+ for (index = 0; index < BTRFS_NR_RAID_TYPES; index++) {
+ list_for_each_entry(bg, &space_info->block_groups[index],
+ list) {
+ if (!spin_trylock(&bg->lock))
+ continue;
+ if (btrfs_zoned_bg_is_full(bg) ||
+ test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ &bg->runtime_flags)) {
+ spin_unlock(&bg->lock);
+ continue;
+ }
+ spin_unlock(&bg->lock);
+
+ if (btrfs_zone_activate(bg)) {
+ up_read(&space_info->groups_sem);
+ return 1;
+ }
+
+ need_finish = true;
+ }
+ }
+ up_read(&space_info->groups_sem);
+
+ if (!do_finish || !need_finish)
+ break;
+
+ ret = btrfs_zone_finish_one_bg(fs_info);
+ if (ret == 0)
+ break;
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
new file mode 100644
index 000000000..8bd16d40b
--- /dev/null
+++ b/fs/btrfs/zoned.h
@@ -0,0 +1,420 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_ZONED_H
+#define BTRFS_ZONED_H
+
+#include <linux/types.h>
+#include <linux/blkdev.h>
+#include "volumes.h"
+#include "disk-io.h"
+#include "block-group.h"
+#include "btrfs_inode.h"
+
+#define BTRFS_DEFAULT_RECLAIM_THRESH (75)
+
+struct btrfs_zoned_device_info {
+ /*
+ * Number of zones, zone size and types of zones if bdev is a
+ * zoned block device.
+ */
+ u64 zone_size;
+ u8 zone_size_shift;
+ u64 max_zone_append_size;
+ u32 nr_zones;
+ unsigned int max_active_zones;
+ atomic_t active_zones_left;
+ unsigned long *seq_zones;
+ unsigned long *empty_zones;
+ unsigned long *active_zones;
+ struct blk_zone *zone_cache;
+ struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX];
+};
+
+#ifdef CONFIG_BLK_DEV_ZONED
+int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
+ struct blk_zone *zone);
+int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info);
+int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache);
+void btrfs_destroy_dev_zone_info(struct btrfs_device *device);
+struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev);
+int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info);
+int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info);
+int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
+ u64 *bytenr_ret);
+int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
+ u64 *bytenr_ret);
+int btrfs_advance_sb_log(struct btrfs_device *device, int mirror);
+int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror);
+u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
+ u64 hole_end, u64 num_bytes);
+int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
+ u64 length, u64 *bytes);
+int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size);
+int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new);
+void btrfs_calc_zone_unusable(struct btrfs_block_group *cache);
+void btrfs_redirty_list_add(struct btrfs_transaction *trans,
+ struct extent_buffer *eb);
+void btrfs_free_redirty_list(struct btrfs_transaction *trans);
+bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start);
+void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset,
+ struct bio *bio);
+void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered);
+bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb,
+ struct btrfs_block_group **cache_ret);
+void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
+ struct extent_buffer *eb);
+int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length);
+int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
+ u64 physical_start, u64 physical_pos);
+struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length);
+bool btrfs_zone_activate(struct btrfs_block_group *block_group);
+int btrfs_zone_finish(struct btrfs_block_group *block_group);
+bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags);
+void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
+ u64 length);
+void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
+ struct extent_buffer *eb);
+void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
+void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
+bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info);
+void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
+ u64 length);
+int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info);
+int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info, bool do_finish);
+#else /* CONFIG_BLK_DEV_ZONED */
+static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
+ struct blk_zone *zone)
+{
+ return 0;
+}
+
+static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
+{
+ return 0;
+}
+
+static inline int btrfs_get_dev_zone_info(struct btrfs_device *device,
+ bool populate_cache)
+{
+ return 0;
+}
+
+static inline void btrfs_destroy_dev_zone_info(struct btrfs_device *device) { }
+
+/*
+ * In case the kernel is compiled without CONFIG_BLK_DEV_ZONED we'll never call
+ * into btrfs_clone_dev_zone_info() so it's safe to return NULL here.
+ */
+static inline struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(
+ struct btrfs_device *orig_dev)
+{
+ return NULL;
+}
+
+static inline int btrfs_check_zoned_mode(const struct btrfs_fs_info *fs_info)
+{
+ if (!btrfs_is_zoned(fs_info))
+ return 0;
+
+ btrfs_err(fs_info, "zoned block devices support is not enabled");
+ return -EOPNOTSUPP;
+}
+
+static inline int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
+{
+ return 0;
+}
+
+static inline int btrfs_sb_log_location_bdev(struct block_device *bdev,
+ int mirror, int rw, u64 *bytenr_ret)
+{
+ *bytenr_ret = btrfs_sb_offset(mirror);
+ return 0;
+}
+
+static inline int btrfs_sb_log_location(struct btrfs_device *device, int mirror,
+ int rw, u64 *bytenr_ret)
+{
+ *bytenr_ret = btrfs_sb_offset(mirror);
+ return 0;
+}
+
+static inline int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
+{
+ return 0;
+}
+
+static inline int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
+{
+ return 0;
+}
+
+static inline u64 btrfs_find_allocatable_zones(struct btrfs_device *device,
+ u64 hole_start, u64 hole_end,
+ u64 num_bytes)
+{
+ return hole_start;
+}
+
+static inline int btrfs_reset_device_zone(struct btrfs_device *device,
+ u64 physical, u64 length, u64 *bytes)
+{
+ *bytes = 0;
+ return 0;
+}
+
+static inline int btrfs_ensure_empty_zones(struct btrfs_device *device,
+ u64 start, u64 size)
+{
+ return 0;
+}
+
+static inline int btrfs_load_block_group_zone_info(
+ struct btrfs_block_group *cache, bool new)
+{
+ return 0;
+}
+
+static inline void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) { }
+
+static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans,
+ struct extent_buffer *eb) { }
+static inline void btrfs_free_redirty_list(struct btrfs_transaction *trans) { }
+
+static inline bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
+{
+ return false;
+}
+
+static inline void btrfs_record_physical_zoned(struct inode *inode,
+ u64 file_offset, struct bio *bio)
+{
+}
+
+static inline void btrfs_rewrite_logical_zoned(
+ struct btrfs_ordered_extent *ordered) { }
+
+static inline bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb,
+ struct btrfs_block_group **cache_ret)
+{
+ return true;
+}
+
+static inline void btrfs_revert_meta_write_pointer(
+ struct btrfs_block_group *cache,
+ struct extent_buffer *eb)
+{
+}
+
+static inline int btrfs_zoned_issue_zeroout(struct btrfs_device *device,
+ u64 physical, u64 length)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev,
+ u64 logical, u64 physical_start,
+ u64 physical_pos)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline struct btrfs_device *btrfs_zoned_get_device(
+ struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline bool btrfs_zone_activate(struct btrfs_block_group *block_group)
+{
+ return true;
+}
+
+static inline int btrfs_zone_finish(struct btrfs_block_group *block_group)
+{
+ return 0;
+}
+
+static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
+ u64 flags)
+{
+ return true;
+}
+
+static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length) { }
+
+static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
+ struct extent_buffer *eb) { }
+
+static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { }
+
+static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { }
+
+static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
+{
+ return false;
+}
+
+static inline void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length) { }
+
+static inline int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
+{
+ return 1;
+}
+
+static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ bool do_finish)
+{
+ /* Consider all the block groups are active */
+ return 0;
+}
+
+#endif
+
+static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
+{
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
+
+ if (!zone_info)
+ return false;
+
+ return test_bit(pos >> zone_info->zone_size_shift, zone_info->seq_zones);
+}
+
+static inline bool btrfs_dev_is_empty_zone(struct btrfs_device *device, u64 pos)
+{
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
+
+ if (!zone_info)
+ return true;
+
+ return test_bit(pos >> zone_info->zone_size_shift, zone_info->empty_zones);
+}
+
+static inline void btrfs_dev_set_empty_zone_bit(struct btrfs_device *device,
+ u64 pos, bool set)
+{
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
+ unsigned int zno;
+
+ if (!zone_info)
+ return;
+
+ zno = pos >> zone_info->zone_size_shift;
+ if (set)
+ set_bit(zno, zone_info->empty_zones);
+ else
+ clear_bit(zno, zone_info->empty_zones);
+}
+
+static inline void btrfs_dev_set_zone_empty(struct btrfs_device *device, u64 pos)
+{
+ btrfs_dev_set_empty_zone_bit(device, pos, true);
+}
+
+static inline void btrfs_dev_clear_zone_empty(struct btrfs_device *device, u64 pos)
+{
+ btrfs_dev_set_empty_zone_bit(device, pos, false);
+}
+
+static inline bool btrfs_check_device_zone_type(const struct btrfs_fs_info *fs_info,
+ struct block_device *bdev)
+{
+ if (btrfs_is_zoned(fs_info)) {
+ /*
+ * We can allow a regular device on a zoned filesystem, because
+ * we will emulate the zoned capabilities.
+ */
+ if (!bdev_is_zoned(bdev))
+ return true;
+
+ return fs_info->zone_size ==
+ (bdev_zone_sectors(bdev) << SECTOR_SHIFT);
+ }
+
+ /* Do not allow Host Manged zoned device */
+ return bdev_zoned_model(bdev) != BLK_ZONED_HM;
+}
+
+static inline bool btrfs_check_super_location(struct btrfs_device *device, u64 pos)
+{
+ /*
+ * On a non-zoned device, any address is OK. On a zoned device,
+ * non-SEQUENTIAL WRITE REQUIRED zones are capable.
+ */
+ return device->zone_info == NULL || !btrfs_dev_is_sequential(device, pos);
+}
+
+static inline bool btrfs_can_zone_reset(struct btrfs_device *device,
+ u64 physical, u64 length)
+{
+ u64 zone_size;
+
+ if (!btrfs_dev_is_sequential(device, physical))
+ return false;
+
+ zone_size = device->zone_info->zone_size;
+ if (!IS_ALIGNED(physical, zone_size) || !IS_ALIGNED(length, zone_size))
+ return false;
+
+ return true;
+}
+
+static inline void btrfs_zoned_meta_io_lock(struct btrfs_fs_info *fs_info)
+{
+ if (!btrfs_is_zoned(fs_info))
+ return;
+ mutex_lock(&fs_info->zoned_meta_io_lock);
+}
+
+static inline void btrfs_zoned_meta_io_unlock(struct btrfs_fs_info *fs_info)
+{
+ if (!btrfs_is_zoned(fs_info))
+ return;
+ mutex_unlock(&fs_info->zoned_meta_io_lock);
+}
+
+static inline void btrfs_clear_treelog_bg(struct btrfs_block_group *bg)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ if (!btrfs_is_zoned(fs_info))
+ return;
+
+ spin_lock(&fs_info->treelog_bg_lock);
+ if (fs_info->treelog_bg == bg->start)
+ fs_info->treelog_bg = 0;
+ spin_unlock(&fs_info->treelog_bg_lock);
+}
+
+static inline void btrfs_zoned_data_reloc_lock(struct btrfs_inode *inode)
+{
+ struct btrfs_root *root = inode->root;
+
+ if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info))
+ mutex_lock(&root->fs_info->zoned_data_reloc_io_lock);
+}
+
+static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode)
+{
+ struct btrfs_root *root = inode->root;
+
+ if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info))
+ mutex_unlock(&root->fs_info->zoned_data_reloc_io_lock);
+}
+
+static inline bool btrfs_zoned_bg_is_full(const struct btrfs_block_group *bg)
+{
+ ASSERT(btrfs_is_zoned(bg->fs_info));
+ return (bg->alloc_offset == bg->zone_capacity);
+}
+
+#endif
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
new file mode 100644
index 000000000..35a0224d4
--- /dev/null
+++ b/fs/btrfs/zstd.c
@@ -0,0 +1,703 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2016-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ */
+
+#include <linux/bio.h>
+#include <linux/bitmap.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/sched/mm.h>
+#include <linux/pagemap.h>
+#include <linux/refcount.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/zstd.h>
+#include "misc.h"
+#include "compression.h"
+#include "ctree.h"
+
+#define ZSTD_BTRFS_MAX_WINDOWLOG 17
+#define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG)
+#define ZSTD_BTRFS_DEFAULT_LEVEL 3
+#define ZSTD_BTRFS_MAX_LEVEL 15
+/* 307s to avoid pathologically clashing with transaction commit */
+#define ZSTD_BTRFS_RECLAIM_JIFFIES (307 * HZ)
+
+static zstd_parameters zstd_get_btrfs_parameters(unsigned int level,
+ size_t src_len)
+{
+ zstd_parameters params = zstd_get_params(level, src_len);
+
+ if (params.cParams.windowLog > ZSTD_BTRFS_MAX_WINDOWLOG)
+ params.cParams.windowLog = ZSTD_BTRFS_MAX_WINDOWLOG;
+ WARN_ON(src_len > ZSTD_BTRFS_MAX_INPUT);
+ return params;
+}
+
+struct workspace {
+ void *mem;
+ size_t size;
+ char *buf;
+ unsigned int level;
+ unsigned int req_level;
+ unsigned long last_used; /* jiffies */
+ struct list_head list;
+ struct list_head lru_list;
+ zstd_in_buffer in_buf;
+ zstd_out_buffer out_buf;
+};
+
+/*
+ * Zstd Workspace Management
+ *
+ * Zstd workspaces have different memory requirements depending on the level.
+ * The zstd workspaces are managed by having individual lists for each level
+ * and a global lru. Forward progress is maintained by protecting a max level
+ * workspace.
+ *
+ * Getting a workspace is done by using the bitmap to identify the levels that
+ * have available workspaces and scans up. This lets us recycle higher level
+ * workspaces because of the monotonic memory guarantee. A workspace's
+ * last_used is only updated if it is being used by the corresponding memory
+ * level. Putting a workspace involves adding it back to the appropriate places
+ * and adding it back to the lru if necessary.
+ *
+ * A timer is used to reclaim workspaces if they have not been used for
+ * ZSTD_BTRFS_RECLAIM_JIFFIES. This helps keep only active workspaces around.
+ * The upper bound is provided by the workqueue limit which is 2 (percpu limit).
+ */
+
+struct zstd_workspace_manager {
+ const struct btrfs_compress_op *ops;
+ spinlock_t lock;
+ struct list_head lru_list;
+ struct list_head idle_ws[ZSTD_BTRFS_MAX_LEVEL];
+ unsigned long active_map;
+ wait_queue_head_t wait;
+ struct timer_list timer;
+};
+
+static struct zstd_workspace_manager wsm;
+
+static size_t zstd_ws_mem_sizes[ZSTD_BTRFS_MAX_LEVEL];
+
+static inline struct workspace *list_to_workspace(struct list_head *list)
+{
+ return container_of(list, struct workspace, list);
+}
+
+void zstd_free_workspace(struct list_head *ws);
+struct list_head *zstd_alloc_workspace(unsigned int level);
+
+/**
+ * Timer callback to free unused workspaces.
+ *
+ * @t: timer
+ *
+ * This scans the lru_list and attempts to reclaim any workspace that hasn't
+ * been used for ZSTD_BTRFS_RECLAIM_JIFFIES.
+ *
+ * The context is softirq and does not need the _bh locking primitives.
+ */
+static void zstd_reclaim_timer_fn(struct timer_list *timer)
+{
+ unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES;
+ struct list_head *pos, *next;
+
+ spin_lock(&wsm.lock);
+
+ if (list_empty(&wsm.lru_list)) {
+ spin_unlock(&wsm.lock);
+ return;
+ }
+
+ list_for_each_prev_safe(pos, next, &wsm.lru_list) {
+ struct workspace *victim = container_of(pos, struct workspace,
+ lru_list);
+ unsigned int level;
+
+ if (time_after(victim->last_used, reclaim_threshold))
+ break;
+
+ /* workspace is in use */
+ if (victim->req_level)
+ continue;
+
+ level = victim->level;
+ list_del(&victim->lru_list);
+ list_del(&victim->list);
+ zstd_free_workspace(&victim->list);
+
+ if (list_empty(&wsm.idle_ws[level - 1]))
+ clear_bit(level - 1, &wsm.active_map);
+
+ }
+
+ if (!list_empty(&wsm.lru_list))
+ mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
+
+ spin_unlock(&wsm.lock);
+}
+
+/*
+ * zstd_calc_ws_mem_sizes - calculate monotonic memory bounds
+ *
+ * It is possible based on the level configurations that a higher level
+ * workspace uses less memory than a lower level workspace. In order to reuse
+ * workspaces, this must be made a monotonic relationship. This precomputes
+ * the required memory for each level and enforces the monotonicity between
+ * level and memory required.
+ */
+static void zstd_calc_ws_mem_sizes(void)
+{
+ size_t max_size = 0;
+ unsigned int level;
+
+ for (level = 1; level <= ZSTD_BTRFS_MAX_LEVEL; level++) {
+ zstd_parameters params =
+ zstd_get_btrfs_parameters(level, ZSTD_BTRFS_MAX_INPUT);
+ size_t level_size =
+ max_t(size_t,
+ zstd_cstream_workspace_bound(&params.cParams),
+ zstd_dstream_workspace_bound(ZSTD_BTRFS_MAX_INPUT));
+
+ max_size = max_t(size_t, max_size, level_size);
+ zstd_ws_mem_sizes[level - 1] = max_size;
+ }
+}
+
+void zstd_init_workspace_manager(void)
+{
+ struct list_head *ws;
+ int i;
+
+ zstd_calc_ws_mem_sizes();
+
+ wsm.ops = &btrfs_zstd_compress;
+ spin_lock_init(&wsm.lock);
+ init_waitqueue_head(&wsm.wait);
+ timer_setup(&wsm.timer, zstd_reclaim_timer_fn, 0);
+
+ INIT_LIST_HEAD(&wsm.lru_list);
+ for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++)
+ INIT_LIST_HEAD(&wsm.idle_ws[i]);
+
+ ws = zstd_alloc_workspace(ZSTD_BTRFS_MAX_LEVEL);
+ if (IS_ERR(ws)) {
+ pr_warn(
+ "BTRFS: cannot preallocate zstd compression workspace\n");
+ } else {
+ set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &wsm.active_map);
+ list_add(ws, &wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]);
+ }
+}
+
+void zstd_cleanup_workspace_manager(void)
+{
+ struct workspace *workspace;
+ int i;
+
+ spin_lock_bh(&wsm.lock);
+ for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) {
+ while (!list_empty(&wsm.idle_ws[i])) {
+ workspace = container_of(wsm.idle_ws[i].next,
+ struct workspace, list);
+ list_del(&workspace->list);
+ list_del(&workspace->lru_list);
+ zstd_free_workspace(&workspace->list);
+ }
+ }
+ spin_unlock_bh(&wsm.lock);
+
+ del_timer_sync(&wsm.timer);
+}
+
+/*
+ * zstd_find_workspace - find workspace
+ * @level: compression level
+ *
+ * This iterates over the set bits in the active_map beginning at the requested
+ * compression level. This lets us utilize already allocated workspaces before
+ * allocating a new one. If the workspace is of a larger size, it is used, but
+ * the place in the lru_list and last_used times are not updated. This is to
+ * offer the opportunity to reclaim the workspace in favor of allocating an
+ * appropriately sized one in the future.
+ */
+static struct list_head *zstd_find_workspace(unsigned int level)
+{
+ struct list_head *ws;
+ struct workspace *workspace;
+ int i = level - 1;
+
+ spin_lock_bh(&wsm.lock);
+ for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) {
+ if (!list_empty(&wsm.idle_ws[i])) {
+ ws = wsm.idle_ws[i].next;
+ workspace = list_to_workspace(ws);
+ list_del_init(ws);
+ /* keep its place if it's a lower level using this */
+ workspace->req_level = level;
+ if (level == workspace->level)
+ list_del(&workspace->lru_list);
+ if (list_empty(&wsm.idle_ws[i]))
+ clear_bit(i, &wsm.active_map);
+ spin_unlock_bh(&wsm.lock);
+ return ws;
+ }
+ }
+ spin_unlock_bh(&wsm.lock);
+
+ return NULL;
+}
+
+/*
+ * zstd_get_workspace - zstd's get_workspace
+ * @level: compression level
+ *
+ * If @level is 0, then any compression level can be used. Therefore, we begin
+ * scanning from 1. We first scan through possible workspaces and then after
+ * attempt to allocate a new workspace. If we fail to allocate one due to
+ * memory pressure, go to sleep waiting for the max level workspace to free up.
+ */
+struct list_head *zstd_get_workspace(unsigned int level)
+{
+ struct list_head *ws;
+ unsigned int nofs_flag;
+
+ /* level == 0 means we can use any workspace */
+ if (!level)
+ level = 1;
+
+again:
+ ws = zstd_find_workspace(level);
+ if (ws)
+ return ws;
+
+ nofs_flag = memalloc_nofs_save();
+ ws = zstd_alloc_workspace(level);
+ memalloc_nofs_restore(nofs_flag);
+
+ if (IS_ERR(ws)) {
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(&wsm.wait, &wait, TASK_UNINTERRUPTIBLE);
+ schedule();
+ finish_wait(&wsm.wait, &wait);
+
+ goto again;
+ }
+
+ return ws;
+}
+
+/*
+ * zstd_put_workspace - zstd put_workspace
+ * @ws: list_head for the workspace
+ *
+ * When putting back a workspace, we only need to update the LRU if we are of
+ * the requested compression level. Here is where we continue to protect the
+ * max level workspace or update last_used accordingly. If the reclaim timer
+ * isn't set, it is also set here. Only the max level workspace tries and wakes
+ * up waiting workspaces.
+ */
+void zstd_put_workspace(struct list_head *ws)
+{
+ struct workspace *workspace = list_to_workspace(ws);
+
+ spin_lock_bh(&wsm.lock);
+
+ /* A node is only taken off the lru if we are the corresponding level */
+ if (workspace->req_level == workspace->level) {
+ /* Hide a max level workspace from reclaim */
+ if (list_empty(&wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) {
+ INIT_LIST_HEAD(&workspace->lru_list);
+ } else {
+ workspace->last_used = jiffies;
+ list_add(&workspace->lru_list, &wsm.lru_list);
+ if (!timer_pending(&wsm.timer))
+ mod_timer(&wsm.timer,
+ jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
+ }
+ }
+
+ set_bit(workspace->level - 1, &wsm.active_map);
+ list_add(&workspace->list, &wsm.idle_ws[workspace->level - 1]);
+ workspace->req_level = 0;
+
+ spin_unlock_bh(&wsm.lock);
+
+ if (workspace->level == ZSTD_BTRFS_MAX_LEVEL)
+ cond_wake_up(&wsm.wait);
+}
+
+void zstd_free_workspace(struct list_head *ws)
+{
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
+
+ kvfree(workspace->mem);
+ kfree(workspace->buf);
+ kfree(workspace);
+}
+
+struct list_head *zstd_alloc_workspace(unsigned int level)
+{
+ struct workspace *workspace;
+
+ workspace = kzalloc(sizeof(*workspace), GFP_KERNEL);
+ if (!workspace)
+ return ERR_PTR(-ENOMEM);
+
+ workspace->size = zstd_ws_mem_sizes[level - 1];
+ workspace->level = level;
+ workspace->req_level = level;
+ workspace->last_used = jiffies;
+ workspace->mem = kvmalloc(workspace->size, GFP_KERNEL);
+ workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!workspace->mem || !workspace->buf)
+ goto fail;
+
+ INIT_LIST_HEAD(&workspace->list);
+ INIT_LIST_HEAD(&workspace->lru_list);
+
+ return &workspace->list;
+fail:
+ zstd_free_workspace(&workspace->list);
+ return ERR_PTR(-ENOMEM);
+}
+
+int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out)
+{
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
+ zstd_cstream *stream;
+ int ret = 0;
+ int nr_pages = 0;
+ struct page *in_page = NULL; /* The current page to read */
+ struct page *out_page = NULL; /* The current page to write to */
+ unsigned long tot_in = 0;
+ unsigned long tot_out = 0;
+ unsigned long len = *total_out;
+ const unsigned long nr_dest_pages = *out_pages;
+ unsigned long max_out = nr_dest_pages * PAGE_SIZE;
+ zstd_parameters params = zstd_get_btrfs_parameters(workspace->req_level,
+ len);
+
+ *out_pages = 0;
+ *total_out = 0;
+ *total_in = 0;
+
+ /* Initialize the stream */
+ stream = zstd_init_cstream(&params, len, workspace->mem,
+ workspace->size);
+ if (!stream) {
+ pr_warn("BTRFS: zstd_init_cstream failed\n");
+ ret = -EIO;
+ goto out;
+ }
+
+ /* map in the first page of input data */
+ in_page = find_get_page(mapping, start >> PAGE_SHIFT);
+ workspace->in_buf.src = kmap_local_page(in_page);
+ workspace->in_buf.pos = 0;
+ workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
+
+
+ /* Allocate and map in the output buffer */
+ out_page = alloc_page(GFP_NOFS);
+ if (out_page == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ pages[nr_pages++] = out_page;
+ workspace->out_buf.dst = page_address(out_page);
+ workspace->out_buf.pos = 0;
+ workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
+
+ while (1) {
+ size_t ret2;
+
+ ret2 = zstd_compress_stream(stream, &workspace->out_buf,
+ &workspace->in_buf);
+ if (zstd_is_error(ret2)) {
+ pr_debug("BTRFS: zstd_compress_stream returned %d\n",
+ zstd_get_error_code(ret2));
+ ret = -EIO;
+ goto out;
+ }
+
+ /* Check to see if we are making it bigger */
+ if (tot_in + workspace->in_buf.pos > 8192 &&
+ tot_in + workspace->in_buf.pos <
+ tot_out + workspace->out_buf.pos) {
+ ret = -E2BIG;
+ goto out;
+ }
+
+ /* We've reached the end of our output range */
+ if (workspace->out_buf.pos >= max_out) {
+ tot_out += workspace->out_buf.pos;
+ ret = -E2BIG;
+ goto out;
+ }
+
+ /* Check if we need more output space */
+ if (workspace->out_buf.pos == workspace->out_buf.size) {
+ tot_out += PAGE_SIZE;
+ max_out -= PAGE_SIZE;
+ if (nr_pages == nr_dest_pages) {
+ ret = -E2BIG;
+ goto out;
+ }
+ out_page = alloc_page(GFP_NOFS);
+ if (out_page == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ pages[nr_pages++] = out_page;
+ workspace->out_buf.dst = page_address(out_page);
+ workspace->out_buf.pos = 0;
+ workspace->out_buf.size = min_t(size_t, max_out,
+ PAGE_SIZE);
+ }
+
+ /* We've reached the end of the input */
+ if (workspace->in_buf.pos >= len) {
+ tot_in += workspace->in_buf.pos;
+ break;
+ }
+
+ /* Check if we need more input */
+ if (workspace->in_buf.pos == workspace->in_buf.size) {
+ tot_in += PAGE_SIZE;
+ kunmap_local(workspace->in_buf.src);
+ put_page(in_page);
+ start += PAGE_SIZE;
+ len -= PAGE_SIZE;
+ in_page = find_get_page(mapping, start >> PAGE_SHIFT);
+ workspace->in_buf.src = kmap_local_page(in_page);
+ workspace->in_buf.pos = 0;
+ workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
+ }
+ }
+ while (1) {
+ size_t ret2;
+
+ ret2 = zstd_end_stream(stream, &workspace->out_buf);
+ if (zstd_is_error(ret2)) {
+ pr_debug("BTRFS: zstd_end_stream returned %d\n",
+ zstd_get_error_code(ret2));
+ ret = -EIO;
+ goto out;
+ }
+ if (ret2 == 0) {
+ tot_out += workspace->out_buf.pos;
+ break;
+ }
+ if (workspace->out_buf.pos >= max_out) {
+ tot_out += workspace->out_buf.pos;
+ ret = -E2BIG;
+ goto out;
+ }
+
+ tot_out += PAGE_SIZE;
+ max_out -= PAGE_SIZE;
+ if (nr_pages == nr_dest_pages) {
+ ret = -E2BIG;
+ goto out;
+ }
+ out_page = alloc_page(GFP_NOFS);
+ if (out_page == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ pages[nr_pages++] = out_page;
+ workspace->out_buf.dst = page_address(out_page);
+ workspace->out_buf.pos = 0;
+ workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
+ }
+
+ if (tot_out >= tot_in) {
+ ret = -E2BIG;
+ goto out;
+ }
+
+ ret = 0;
+ *total_in = tot_in;
+ *total_out = tot_out;
+out:
+ *out_pages = nr_pages;
+ if (workspace->in_buf.src) {
+ kunmap_local(workspace->in_buf.src);
+ put_page(in_page);
+ }
+ return ret;
+}
+
+int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
+{
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
+ struct page **pages_in = cb->compressed_pages;
+ size_t srclen = cb->compressed_len;
+ zstd_dstream *stream;
+ int ret = 0;
+ unsigned long page_in_index = 0;
+ unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
+ unsigned long buf_start;
+ unsigned long total_out = 0;
+
+ stream = zstd_init_dstream(
+ ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size);
+ if (!stream) {
+ pr_debug("BTRFS: zstd_init_dstream failed\n");
+ ret = -EIO;
+ goto done;
+ }
+
+ workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]);
+ workspace->in_buf.pos = 0;
+ workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
+
+ workspace->out_buf.dst = workspace->buf;
+ workspace->out_buf.pos = 0;
+ workspace->out_buf.size = PAGE_SIZE;
+
+ while (1) {
+ size_t ret2;
+
+ ret2 = zstd_decompress_stream(stream, &workspace->out_buf,
+ &workspace->in_buf);
+ if (zstd_is_error(ret2)) {
+ pr_debug("BTRFS: zstd_decompress_stream returned %d\n",
+ zstd_get_error_code(ret2));
+ ret = -EIO;
+ goto done;
+ }
+ buf_start = total_out;
+ total_out += workspace->out_buf.pos;
+ workspace->out_buf.pos = 0;
+
+ ret = btrfs_decompress_buf2page(workspace->out_buf.dst,
+ total_out - buf_start, cb, buf_start);
+ if (ret == 0)
+ break;
+
+ if (workspace->in_buf.pos >= srclen)
+ break;
+
+ /* Check if we've hit the end of a frame */
+ if (ret2 == 0)
+ break;
+
+ if (workspace->in_buf.pos == workspace->in_buf.size) {
+ kunmap_local(workspace->in_buf.src);
+ page_in_index++;
+ if (page_in_index >= total_pages_in) {
+ workspace->in_buf.src = NULL;
+ ret = -EIO;
+ goto done;
+ }
+ srclen -= PAGE_SIZE;
+ workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]);
+ workspace->in_buf.pos = 0;
+ workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
+ }
+ }
+ ret = 0;
+ zero_fill_bio(cb->orig_bio);
+done:
+ if (workspace->in_buf.src)
+ kunmap_local(workspace->in_buf.src);
+ return ret;
+}
+
+int zstd_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen)
+{
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
+ zstd_dstream *stream;
+ int ret = 0;
+ size_t ret2;
+ unsigned long total_out = 0;
+ unsigned long pg_offset = 0;
+
+ stream = zstd_init_dstream(
+ ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size);
+ if (!stream) {
+ pr_warn("BTRFS: zstd_init_dstream failed\n");
+ ret = -EIO;
+ goto finish;
+ }
+
+ destlen = min_t(size_t, destlen, PAGE_SIZE);
+
+ workspace->in_buf.src = data_in;
+ workspace->in_buf.pos = 0;
+ workspace->in_buf.size = srclen;
+
+ workspace->out_buf.dst = workspace->buf;
+ workspace->out_buf.pos = 0;
+ workspace->out_buf.size = PAGE_SIZE;
+
+ ret2 = 1;
+ while (pg_offset < destlen
+ && workspace->in_buf.pos < workspace->in_buf.size) {
+ unsigned long buf_start;
+ unsigned long buf_offset;
+ unsigned long bytes;
+
+ /* Check if the frame is over and we still need more input */
+ if (ret2 == 0) {
+ pr_debug("BTRFS: zstd_decompress_stream ended early\n");
+ ret = -EIO;
+ goto finish;
+ }
+ ret2 = zstd_decompress_stream(stream, &workspace->out_buf,
+ &workspace->in_buf);
+ if (zstd_is_error(ret2)) {
+ pr_debug("BTRFS: zstd_decompress_stream returned %d\n",
+ zstd_get_error_code(ret2));
+ ret = -EIO;
+ goto finish;
+ }
+
+ buf_start = total_out;
+ total_out += workspace->out_buf.pos;
+ workspace->out_buf.pos = 0;
+
+ if (total_out <= start_byte)
+ continue;
+
+ if (total_out > start_byte && buf_start < start_byte)
+ buf_offset = start_byte - buf_start;
+ else
+ buf_offset = 0;
+
+ bytes = min_t(unsigned long, destlen - pg_offset,
+ workspace->out_buf.size - buf_offset);
+
+ memcpy_to_page(dest_page, pg_offset,
+ workspace->out_buf.dst + buf_offset, bytes);
+
+ pg_offset += bytes;
+ }
+ ret = 0;
+finish:
+ if (pg_offset < destlen) {
+ memzero_page(dest_page, pg_offset, destlen - pg_offset);
+ }
+ return ret;
+}
+
+const struct btrfs_compress_op btrfs_zstd_compress = {
+ /* ZSTD uses own workspace manager */
+ .workspace_manager = NULL,
+ .max_level = ZSTD_BTRFS_MAX_LEVEL,
+ .default_level = ZSTD_BTRFS_DEFAULT_LEVEL,
+};